From 06eaf7232e9a920468c0f8d74dcf2fe8b555501c Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 13 Apr 2024 14:24:36 +0200 Subject: Adding upstream version 1:10.11.6. Signed-off-by: Daniel Baumann --- storage/innobase/.clang-format-old | 11 + storage/innobase/CMakeLists.txt | 511 + storage/innobase/COPYING.Google | 30 + storage/innobase/COPYING.Percona | 30 + storage/innobase/btr/btr0btr.cc | 5433 +++++ storage/innobase/btr/btr0bulk.cc | 1233 ++ storage/innobase/btr/btr0cur.cc | 7017 ++++++ storage/innobase/btr/btr0defragment.cc | 820 + storage/innobase/btr/btr0pcur.cc | 667 + storage/innobase/btr/btr0sea.cc | 2328 ++ storage/innobase/buf/buf0block_hint.cc | 59 + storage/innobase/buf/buf0buddy.cc | 769 + storage/innobase/buf/buf0buf.cc | 4180 ++++ storage/innobase/buf/buf0checksum.cc | 98 + storage/innobase/buf/buf0dblwr.cc | 779 + storage/innobase/buf/buf0dump.cc | 765 + storage/innobase/buf/buf0flu.cc | 2765 +++ storage/innobase/buf/buf0lru.cc | 1452 ++ storage/innobase/buf/buf0rea.cc | 710 + storage/innobase/data/data0data.cc | 820 + storage/innobase/data/data0type.cc | 212 + storage/innobase/dict/dict0boot.cc | 440 + storage/innobase/dict/dict0crea.cc | 1906 ++ storage/innobase/dict/dict0defrag_bg.cc | 434 + storage/innobase/dict/dict0dict.cc | 4859 +++++ storage/innobase/dict/dict0load.cc | 3213 +++ storage/innobase/dict/dict0mem.cc | 1379 ++ storage/innobase/dict/dict0stats.cc | 4724 +++++ storage/innobase/dict/dict0stats_bg.cc | 424 + storage/innobase/dict/drop.cc | 297 + storage/innobase/eval/eval0eval.cc | 643 + storage/innobase/eval/eval0proc.cc | 286 + storage/innobase/fil/fil0crypt.cc | 2425 +++ storage/innobase/fil/fil0fil.cc | 3282 +++ storage/innobase/fil/fil0pagecompress.cc | 584 + storage/innobase/fsp/fsp0file.cc | 936 + storage/innobase/fsp/fsp0fsp.cc | 3070 +++ storage/innobase/fsp/fsp0space.cc | 224 + storage/innobase/fsp/fsp0sysspace.cc | 1019 + storage/innobase/fts/Makefile.query | 18 + storage/innobase/fts/fts0ast.cc | 816 + storage/innobase/fts/fts0blex.cc | 2177 ++ storage/innobase/fts/fts0blex.l | 74 + storage/innobase/fts/fts0config.cc | 428 + storage/innobase/fts/fts0fts.cc | 6182 ++++++ storage/innobase/fts/fts0opt.cc | 3054 +++ storage/innobase/fts/fts0pars.cc | 2007 ++ storage/innobase/fts/fts0pars.y | 293 + storage/innobase/fts/fts0plugin.cc | 283 + storage/innobase/fts/fts0que.cc | 4612 ++++ storage/innobase/fts/fts0sql.cc | 208 + storage/innobase/fts/fts0tlex.cc | 2169 ++ storage/innobase/fts/fts0tlex.l | 69 + storage/innobase/fts/make_parser.sh | 49 + storage/innobase/fut/fut0lst.cc | 416 + storage/innobase/gis/gis0geo.cc | 650 + storage/innobase/gis/gis0rtree.cc | 1934 ++ storage/innobase/gis/gis0sea.cc | 2403 +++ storage/innobase/ha/ha0storage.cc | 178 + storage/innobase/handler/ha_innodb.cc | 21217 +++++++++++++++++++ storage/innobase/handler/ha_innodb.h | 937 + storage/innobase/handler/handler0alter.cc | 11843 +++++++++++ storage/innobase/handler/i_s.cc | 6506 ++++++ storage/innobase/handler/i_s.h | 91 + storage/innobase/ibuf/ibuf0ibuf.cc | 4617 ++++ storage/innobase/include/btr0btr.h | 543 + storage/innobase/include/btr0btr.inl | 111 + storage/innobase/include/btr0bulk.h | 371 + storage/innobase/include/btr0cur.h | 855 + storage/innobase/include/btr0cur.inl | 170 + storage/innobase/include/btr0defragment.h | 65 + storage/innobase/include/btr0pcur.h | 459 + storage/innobase/include/btr0pcur.inl | 372 + storage/innobase/include/btr0sea.h | 403 + storage/innobase/include/btr0sea.inl | 117 + storage/innobase/include/btr0types.h | 154 + 
storage/innobase/include/buf0block_hint.h | 76 + storage/innobase/include/buf0buddy.h | 91 + storage/innobase/include/buf0buf.h | 2190 ++ storage/innobase/include/buf0buf.inl | 132 + storage/innobase/include/buf0checksum.h | 57 + storage/innobase/include/buf0dblwr.h | 164 + storage/innobase/include/buf0dump.h | 44 + storage/innobase/include/buf0flu.h | 125 + storage/innobase/include/buf0lru.h | 193 + storage/innobase/include/buf0rea.h | 120 + storage/innobase/include/buf0types.h | 235 + storage/innobase/include/data0data.h | 704 + storage/innobase/include/data0data.inl | 633 + storage/innobase/include/data0type.h | 591 + storage/innobase/include/data0type.inl | 487 + storage/innobase/include/data0types.h | 36 + storage/innobase/include/db0err.h | 170 + storage/innobase/include/dict0boot.h | 297 + storage/innobase/include/dict0crea.h | 277 + storage/innobase/include/dict0crea.inl | 136 + storage/innobase/include/dict0defrag_bg.h | 101 + storage/innobase/include/dict0dict.h | 1744 ++ storage/innobase/include/dict0dict.inl | 1217 ++ storage/innobase/include/dict0load.h | 220 + storage/innobase/include/dict0mem.h | 2649 +++ storage/innobase/include/dict0mem.inl | 68 + storage/innobase/include/dict0pagecompress.h | 61 + storage/innobase/include/dict0pagecompress.inl | 81 + storage/innobase/include/dict0stats.h | 238 + storage/innobase/include/dict0stats.inl | 219 + storage/innobase/include/dict0stats_bg.h | 59 + storage/innobase/include/dict0types.h | 176 + storage/innobase/include/dyn0buf.h | 442 + storage/innobase/include/dyn0types.h | 39 + storage/innobase/include/eval0eval.h | 109 + storage/innobase/include/eval0eval.inl | 254 + storage/innobase/include/eval0proc.h | 94 + storage/innobase/include/eval0proc.inl | 88 + storage/innobase/include/fil0crypt.h | 396 + storage/innobase/include/fil0crypt.inl | 81 + storage/innobase/include/fil0fil.h | 1823 ++ storage/innobase/include/fil0pagecompress.h | 57 + storage/innobase/include/fsp0file.h | 509 + storage/innobase/include/fsp0fsp.h | 762 + storage/innobase/include/fsp0space.h | 209 + storage/innobase/include/fsp0sysspace.h | 278 + storage/innobase/include/fsp0types.h | 404 + storage/innobase/include/fts0ast.h | 340 + storage/innobase/include/fts0blex.h | 702 + storage/innobase/include/fts0fts.h | 947 + storage/innobase/include/fts0opt.h | 39 + storage/innobase/include/fts0pars.h | 72 + storage/innobase/include/fts0plugin.h | 50 + storage/innobase/include/fts0priv.h | 485 + storage/innobase/include/fts0priv.inl | 121 + storage/innobase/include/fts0tlex.h | 702 + storage/innobase/include/fts0tokenize.h | 189 + storage/innobase/include/fts0types.h | 354 + storage/innobase/include/fts0types.inl | 231 + storage/innobase/include/fts0vlc.h | 124 + storage/innobase/include/fut0lst.h | 156 + storage/innobase/include/gis0geo.h | 122 + storage/innobase/include/gis0rtree.h | 513 + storage/innobase/include/gis0rtree.inl | 245 + storage/innobase/include/gis0type.h | 146 + storage/innobase/include/ha0ha.h | 60 + storage/innobase/include/ha0ha.inl | 154 + storage/innobase/include/ha0storage.h | 137 + storage/innobase/include/ha0storage.inl | 142 + storage/innobase/include/ha_prototypes.h | 476 + storage/innobase/include/handler0alter.h | 108 + storage/innobase/include/hash0hash.h | 190 + storage/innobase/include/ibuf0ibuf.h | 436 + storage/innobase/include/ibuf0ibuf.inl | 282 + storage/innobase/include/lock0iter.h | 66 + storage/innobase/include/lock0lock.h | 1271 ++ storage/innobase/include/lock0lock.inl | 78 + storage/innobase/include/lock0prdt.h | 192 + 
storage/innobase/include/lock0priv.h | 582 + storage/innobase/include/lock0priv.inl | 255 + storage/innobase/include/lock0types.h | 251 + storage/innobase/include/log0crypt.h | 115 + storage/innobase/include/log0log.h | 529 + storage/innobase/include/log0recv.h | 491 + storage/innobase/include/log0types.h | 38 + storage/innobase/include/mach0data.h | 375 + storage/innobase/include/mach0data.inl | 837 + storage/innobase/include/mariadb_stats.h | 119 + storage/innobase/include/mem0mem.h | 345 + storage/innobase/include/mem0mem.inl | 468 + storage/innobase/include/mtr0log.h | 637 + storage/innobase/include/mtr0mtr.h | 780 + storage/innobase/include/mtr0types.h | 347 + storage/innobase/include/os0file.h | 1188 ++ storage/innobase/include/os0file.inl | 412 + storage/innobase/include/page0cur.h | 303 + storage/innobase/include/page0cur.inl | 203 + storage/innobase/include/page0page.h | 1101 + storage/innobase/include/page0page.inl | 550 + storage/innobase/include/page0types.h | 188 + storage/innobase/include/page0zip.h | 383 + storage/innobase/include/page0zip.inl | 317 + storage/innobase/include/pars0grm.h | 151 + storage/innobase/include/pars0opt.h | 68 + storage/innobase/include/pars0pars.h | 695 + storage/innobase/include/pars0sym.h | 243 + storage/innobase/include/pars0types.h | 50 + storage/innobase/include/que0que.h | 314 + storage/innobase/include/que0que.inl | 245 + storage/innobase/include/que0types.h | 97 + storage/innobase/include/read0types.h | 275 + storage/innobase/include/rem0cmp.h | 286 + storage/innobase/include/rem0rec.h | 1276 ++ storage/innobase/include/rem0rec.inl | 1134 + storage/innobase/include/rem0types.h | 78 + storage/innobase/include/row0ext.h | 101 + storage/innobase/include/row0ext.inl | 87 + storage/innobase/include/row0ftsort.h | 268 + storage/innobase/include/row0import.h | 67 + storage/innobase/include/row0ins.h | 224 + storage/innobase/include/row0log.h | 239 + storage/innobase/include/row0merge.h | 496 + storage/innobase/include/row0mysql.h | 841 + storage/innobase/include/row0purge.h | 149 + storage/innobase/include/row0quiesce.h | 67 + storage/innobase/include/row0row.h | 431 + storage/innobase/include/row0row.inl | 221 + storage/innobase/include/row0sel.h | 457 + storage/innobase/include/row0types.h | 54 + storage/innobase/include/row0uins.h | 50 + storage/innobase/include/row0umod.h | 46 + storage/innobase/include/row0undo.h | 114 + storage/innobase/include/row0upd.h | 559 + storage/innobase/include/row0upd.inl | 153 + storage/innobase/include/row0vers.h | 143 + storage/innobase/include/rw_lock.h | 138 + storage/innobase/include/small_vector.h | 100 + storage/innobase/include/srv0mon.h | 846 + storage/innobase/include/srv0mon.inl | 113 + storage/innobase/include/srv0srv.h | 715 + storage/innobase/include/srv0start.h | 124 + storage/innobase/include/srw_lock.h | 554 + storage/innobase/include/sux_lock.h | 472 + .../innobase/include/transactional_lock_guard.h | 174 + storage/innobase/include/trx0i_s.h | 277 + storage/innobase/include/trx0purge.h | 427 + storage/innobase/include/trx0rec.h | 299 + storage/innobase/include/trx0roll.h | 168 + storage/innobase/include/trx0rseg.h | 301 + storage/innobase/include/trx0sys.h | 1274 ++ storage/innobase/include/trx0trx.h | 1268 ++ storage/innobase/include/trx0trx.inl | 86 + storage/innobase/include/trx0types.h | 131 + storage/innobase/include/trx0undo.h | 514 + storage/innobase/include/trx0undo.inl | 129 + storage/innobase/include/trx0xa.h | 61 + storage/innobase/include/univ.i | 503 + 
storage/innobase/include/ut0byte.h | 107 + storage/innobase/include/ut0byte.inl | 90 + storage/innobase/include/ut0counter.h | 123 + storage/innobase/include/ut0dbg.h | 179 + storage/innobase/include/ut0list.h | 146 + storage/innobase/include/ut0list.inl | 80 + storage/innobase/include/ut0lst.h | 563 + storage/innobase/include/ut0mem.h | 76 + storage/innobase/include/ut0mem.inl | 246 + storage/innobase/include/ut0new.h | 1099 + storage/innobase/include/ut0pool.h | 365 + storage/innobase/include/ut0rbt.h | 254 + storage/innobase/include/ut0rnd.h | 128 + storage/innobase/include/ut0rnd.inl | 128 + storage/innobase/include/ut0sort.h | 104 + storage/innobase/include/ut0stage.h | 499 + storage/innobase/include/ut0ut.h | 444 + storage/innobase/include/ut0ut.inl | 143 + storage/innobase/include/ut0vec.h | 285 + storage/innobase/include/ut0vec.inl | 348 + storage/innobase/include/ut0wqueue.h | 86 + storage/innobase/lock/lock0iter.cc | 88 + storage/innobase/lock/lock0lock.cc | 6812 ++++++ storage/innobase/lock/lock0prdt.cc | 928 + storage/innobase/log/log0crypt.cc | 641 + storage/innobase/log/log0log.cc | 1358 ++ storage/innobase/log/log0recv.cc | 4870 +++++ storage/innobase/log/log0sync.cc | 404 + storage/innobase/log/log0sync.h | 99 + storage/innobase/mem/mem0mem.cc | 436 + storage/innobase/mtr/mtr0mtr.cc | 1667 ++ .../mysql-test/storage_engine/alter_tablespace.opt | 2 + .../storage_engine/autoinc_secondary.rdiff | 30 + .../mysql-test/storage_engine/cache_index.rdiff | 71 + .../storage_engine/checksum_table_live.rdiff | 13 + .../mysql-test/storage_engine/col_opt_not_null.opt | 1 + .../mysql-test/storage_engine/col_opt_null.opt | 1 + .../mysql-test/storage_engine/define_engine.inc | 45 + .../mysql-test/storage_engine/disabled.def | 9 + .../storage_engine/fulltext_search.rdiff | 49 + .../storage_engine/index_enable_disable.rdiff | 33 + .../storage_engine/index_type_hash.rdiff | 60 + .../mysql-test/storage_engine/insert_delayed.rdiff | 26 + .../storage_engine/lock_concurrent.rdiff | 25 + .../mysql-test/storage_engine/optimize_table.rdiff | 37 + .../storage_engine/parts/checksum_table.rdiff | 13 + .../storage_engine/parts/create_table.rdiff | 20 + .../mysql-test/storage_engine/parts/disabled.def | 1 + .../storage_engine/parts/optimize_table.rdiff | 58 + .../storage_engine/parts/repair_table.rdiff | 158 + .../mysql-test/storage_engine/parts/suite.opt | 2 + .../mysql-test/storage_engine/repair_table.rdiff | 139 + .../innobase/mysql-test/storage_engine/suite.opt | 1 + .../storage_engine/tbl_opt_index_dir.rdiff | 23 + .../storage_engine/tbl_opt_insert_method.rdiff | 11 + .../storage_engine/tbl_opt_row_format.rdiff | 44 + .../mysql-test/storage_engine/tbl_opt_union.rdiff | 16 + .../trx/cons_snapshot_serializable.rdiff | 18 + .../storage_engine/trx/level_read_committed.rdiff | 11 + .../trx/level_read_uncommitted.rdiff | 11 + .../mysql-test/storage_engine/trx/suite.opt | 3 + .../mysql-test/storage_engine/type_blob.opt | 1 + .../storage_engine/type_char_indexes.rdiff | 11 + .../storage_engine/type_float_indexes.rdiff | 11 + .../mysql-test/storage_engine/type_text.opt | 1 + storage/innobase/os/os0file.cc | 4270 ++++ storage/innobase/page/page0cur.cc | 3097 +++ storage/innobase/page/page0page.cc | 2523 +++ storage/innobase/page/page0zip.cc | 4666 ++++ storage/innobase/pars/lexyy.cc | 2841 +++ storage/innobase/pars/make_bison.sh | 32 + storage/innobase/pars/make_flex.sh | 50 + storage/innobase/pars/pars0grm.cc | 2504 +++ storage/innobase/pars/pars0grm.y | 609 + storage/innobase/pars/pars0lex.l | 614 + 
storage/innobase/pars/pars0opt.cc | 1263 ++ storage/innobase/pars/pars0pars.cc | 2381 +++ storage/innobase/pars/pars0sym.cc | 413 + storage/innobase/que/que0que.cc | 708 + storage/innobase/read/read0read.cc | 265 + storage/innobase/rem/rem0cmp.cc | 901 + storage/innobase/rem/rem0rec.cc | 2820 +++ storage/innobase/row/row0ext.cc | 132 + storage/innobase/row/row0ftsort.cc | 1791 ++ storage/innobase/row/row0import.cc | 4585 ++++ storage/innobase/row/row0ins.cc | 3843 ++++ storage/innobase/row/row0log.cc | 4134 ++++ storage/innobase/row/row0merge.cc | 5406 +++++ storage/innobase/row/row0mysql.cc | 2916 +++ storage/innobase/row/row0purge.cc | 1304 ++ storage/innobase/row/row0quiesce.cc | 715 + storage/innobase/row/row0row.cc | 1720 ++ storage/innobase/row/row0sel.cc | 6947 ++++++ storage/innobase/row/row0uins.cc | 652 + storage/innobase/row/row0umod.cc | 1288 ++ storage/innobase/row/row0undo.cc | 453 + storage/innobase/row/row0upd.cc | 3002 +++ storage/innobase/row/row0vers.cc | 1419 ++ storage/innobase/srv/srv0mon.cc | 1799 ++ storage/innobase/srv/srv0srv.cc | 1659 ++ storage/innobase/srv/srv0start.cc | 2101 ++ storage/innobase/sync/srw_lock.cc | 550 + storage/innobase/trx/trx0i_s.cc | 1471 ++ storage/innobase/trx/trx0purge.cc | 1480 ++ storage/innobase/trx/trx0rec.cc | 2448 +++ storage/innobase/trx/trx0roll.cc | 933 + storage/innobase/trx/trx0rseg.cc | 727 + storage/innobase/trx/trx0sys.cc | 370 + storage/innobase/trx/trx0trx.cc | 2292 ++ storage/innobase/trx/trx0undo.cc | 1478 ++ storage/innobase/unittest/CMakeLists.txt | 34 + storage/innobase/unittest/innodb_fts-t.cc | 52 + storage/innobase/unittest/innodb_sync-t.cc | 185 + storage/innobase/ut/ut0dbg.cc | 61 + storage/innobase/ut/ut0list.cc | 151 + storage/innobase/ut/ut0mem.cc | 55 + storage/innobase/ut/ut0new.cc | 112 + storage/innobase/ut/ut0rbt.cc | 1142 + storage/innobase/ut/ut0rnd.cc | 93 + storage/innobase/ut/ut0ut.cc | 599 + storage/innobase/ut/ut0vec.cc | 73 + storage/innobase/ut/ut0wqueue.cc | 118 + 355 files changed, 309116 insertions(+) create mode 100644 storage/innobase/.clang-format-old create mode 100644 storage/innobase/CMakeLists.txt create mode 100644 storage/innobase/COPYING.Google create mode 100644 storage/innobase/COPYING.Percona create mode 100644 storage/innobase/btr/btr0btr.cc create mode 100644 storage/innobase/btr/btr0bulk.cc create mode 100644 storage/innobase/btr/btr0cur.cc create mode 100644 storage/innobase/btr/btr0defragment.cc create mode 100644 storage/innobase/btr/btr0pcur.cc create mode 100644 storage/innobase/btr/btr0sea.cc create mode 100644 storage/innobase/buf/buf0block_hint.cc create mode 100644 storage/innobase/buf/buf0buddy.cc create mode 100644 storage/innobase/buf/buf0buf.cc create mode 100644 storage/innobase/buf/buf0checksum.cc create mode 100644 storage/innobase/buf/buf0dblwr.cc create mode 100644 storage/innobase/buf/buf0dump.cc create mode 100644 storage/innobase/buf/buf0flu.cc create mode 100644 storage/innobase/buf/buf0lru.cc create mode 100644 storage/innobase/buf/buf0rea.cc create mode 100644 storage/innobase/data/data0data.cc create mode 100644 storage/innobase/data/data0type.cc create mode 100644 storage/innobase/dict/dict0boot.cc create mode 100644 storage/innobase/dict/dict0crea.cc create mode 100644 storage/innobase/dict/dict0defrag_bg.cc create mode 100644 storage/innobase/dict/dict0dict.cc create mode 100644 storage/innobase/dict/dict0load.cc create mode 100644 storage/innobase/dict/dict0mem.cc create mode 100644 storage/innobase/dict/dict0stats.cc create mode 100644 
storage/innobase/dict/dict0stats_bg.cc create mode 100644 storage/innobase/dict/drop.cc create mode 100644 storage/innobase/eval/eval0eval.cc create mode 100644 storage/innobase/eval/eval0proc.cc create mode 100644 storage/innobase/fil/fil0crypt.cc create mode 100644 storage/innobase/fil/fil0fil.cc create mode 100644 storage/innobase/fil/fil0pagecompress.cc create mode 100644 storage/innobase/fsp/fsp0file.cc create mode 100644 storage/innobase/fsp/fsp0fsp.cc create mode 100644 storage/innobase/fsp/fsp0space.cc create mode 100644 storage/innobase/fsp/fsp0sysspace.cc create mode 100644 storage/innobase/fts/Makefile.query create mode 100644 storage/innobase/fts/fts0ast.cc create mode 100644 storage/innobase/fts/fts0blex.cc create mode 100644 storage/innobase/fts/fts0blex.l create mode 100644 storage/innobase/fts/fts0config.cc create mode 100644 storage/innobase/fts/fts0fts.cc create mode 100644 storage/innobase/fts/fts0opt.cc create mode 100644 storage/innobase/fts/fts0pars.cc create mode 100644 storage/innobase/fts/fts0pars.y create mode 100644 storage/innobase/fts/fts0plugin.cc create mode 100644 storage/innobase/fts/fts0que.cc create mode 100644 storage/innobase/fts/fts0sql.cc create mode 100644 storage/innobase/fts/fts0tlex.cc create mode 100644 storage/innobase/fts/fts0tlex.l create mode 100755 storage/innobase/fts/make_parser.sh create mode 100644 storage/innobase/fut/fut0lst.cc create mode 100644 storage/innobase/gis/gis0geo.cc create mode 100644 storage/innobase/gis/gis0rtree.cc create mode 100644 storage/innobase/gis/gis0sea.cc create mode 100644 storage/innobase/ha/ha0storage.cc create mode 100644 storage/innobase/handler/ha_innodb.cc create mode 100644 storage/innobase/handler/ha_innodb.h create mode 100644 storage/innobase/handler/handler0alter.cc create mode 100644 storage/innobase/handler/i_s.cc create mode 100644 storage/innobase/handler/i_s.h create mode 100644 storage/innobase/ibuf/ibuf0ibuf.cc create mode 100644 storage/innobase/include/btr0btr.h create mode 100644 storage/innobase/include/btr0btr.inl create mode 100644 storage/innobase/include/btr0bulk.h create mode 100644 storage/innobase/include/btr0cur.h create mode 100644 storage/innobase/include/btr0cur.inl create mode 100644 storage/innobase/include/btr0defragment.h create mode 100644 storage/innobase/include/btr0pcur.h create mode 100644 storage/innobase/include/btr0pcur.inl create mode 100644 storage/innobase/include/btr0sea.h create mode 100644 storage/innobase/include/btr0sea.inl create mode 100644 storage/innobase/include/btr0types.h create mode 100644 storage/innobase/include/buf0block_hint.h create mode 100644 storage/innobase/include/buf0buddy.h create mode 100644 storage/innobase/include/buf0buf.h create mode 100644 storage/innobase/include/buf0buf.inl create mode 100644 storage/innobase/include/buf0checksum.h create mode 100644 storage/innobase/include/buf0dblwr.h create mode 100644 storage/innobase/include/buf0dump.h create mode 100644 storage/innobase/include/buf0flu.h create mode 100644 storage/innobase/include/buf0lru.h create mode 100644 storage/innobase/include/buf0rea.h create mode 100644 storage/innobase/include/buf0types.h create mode 100644 storage/innobase/include/data0data.h create mode 100644 storage/innobase/include/data0data.inl create mode 100644 storage/innobase/include/data0type.h create mode 100644 storage/innobase/include/data0type.inl create mode 100644 storage/innobase/include/data0types.h create mode 100644 storage/innobase/include/db0err.h create mode 100644 
storage/innobase/include/dict0boot.h create mode 100644 storage/innobase/include/dict0crea.h create mode 100644 storage/innobase/include/dict0crea.inl create mode 100644 storage/innobase/include/dict0defrag_bg.h create mode 100644 storage/innobase/include/dict0dict.h create mode 100644 storage/innobase/include/dict0dict.inl create mode 100644 storage/innobase/include/dict0load.h create mode 100644 storage/innobase/include/dict0mem.h create mode 100644 storage/innobase/include/dict0mem.inl create mode 100644 storage/innobase/include/dict0pagecompress.h create mode 100644 storage/innobase/include/dict0pagecompress.inl create mode 100644 storage/innobase/include/dict0stats.h create mode 100644 storage/innobase/include/dict0stats.inl create mode 100644 storage/innobase/include/dict0stats_bg.h create mode 100644 storage/innobase/include/dict0types.h create mode 100644 storage/innobase/include/dyn0buf.h create mode 100644 storage/innobase/include/dyn0types.h create mode 100644 storage/innobase/include/eval0eval.h create mode 100644 storage/innobase/include/eval0eval.inl create mode 100644 storage/innobase/include/eval0proc.h create mode 100644 storage/innobase/include/eval0proc.inl create mode 100644 storage/innobase/include/fil0crypt.h create mode 100644 storage/innobase/include/fil0crypt.inl create mode 100644 storage/innobase/include/fil0fil.h create mode 100644 storage/innobase/include/fil0pagecompress.h create mode 100644 storage/innobase/include/fsp0file.h create mode 100644 storage/innobase/include/fsp0fsp.h create mode 100644 storage/innobase/include/fsp0space.h create mode 100644 storage/innobase/include/fsp0sysspace.h create mode 100644 storage/innobase/include/fsp0types.h create mode 100644 storage/innobase/include/fts0ast.h create mode 100644 storage/innobase/include/fts0blex.h create mode 100644 storage/innobase/include/fts0fts.h create mode 100644 storage/innobase/include/fts0opt.h create mode 100644 storage/innobase/include/fts0pars.h create mode 100644 storage/innobase/include/fts0plugin.h create mode 100644 storage/innobase/include/fts0priv.h create mode 100644 storage/innobase/include/fts0priv.inl create mode 100644 storage/innobase/include/fts0tlex.h create mode 100644 storage/innobase/include/fts0tokenize.h create mode 100644 storage/innobase/include/fts0types.h create mode 100644 storage/innobase/include/fts0types.inl create mode 100644 storage/innobase/include/fts0vlc.h create mode 100644 storage/innobase/include/fut0lst.h create mode 100644 storage/innobase/include/gis0geo.h create mode 100644 storage/innobase/include/gis0rtree.h create mode 100644 storage/innobase/include/gis0rtree.inl create mode 100644 storage/innobase/include/gis0type.h create mode 100644 storage/innobase/include/ha0ha.h create mode 100644 storage/innobase/include/ha0ha.inl create mode 100644 storage/innobase/include/ha0storage.h create mode 100644 storage/innobase/include/ha0storage.inl create mode 100644 storage/innobase/include/ha_prototypes.h create mode 100644 storage/innobase/include/handler0alter.h create mode 100644 storage/innobase/include/hash0hash.h create mode 100644 storage/innobase/include/ibuf0ibuf.h create mode 100644 storage/innobase/include/ibuf0ibuf.inl create mode 100644 storage/innobase/include/lock0iter.h create mode 100644 storage/innobase/include/lock0lock.h create mode 100644 storage/innobase/include/lock0lock.inl create mode 100644 storage/innobase/include/lock0prdt.h create mode 100644 storage/innobase/include/lock0priv.h create mode 100644 
storage/innobase/include/lock0priv.inl create mode 100644 storage/innobase/include/lock0types.h create mode 100644 storage/innobase/include/log0crypt.h create mode 100644 storage/innobase/include/log0log.h create mode 100644 storage/innobase/include/log0recv.h create mode 100644 storage/innobase/include/log0types.h create mode 100644 storage/innobase/include/mach0data.h create mode 100644 storage/innobase/include/mach0data.inl create mode 100644 storage/innobase/include/mariadb_stats.h create mode 100644 storage/innobase/include/mem0mem.h create mode 100644 storage/innobase/include/mem0mem.inl create mode 100644 storage/innobase/include/mtr0log.h create mode 100644 storage/innobase/include/mtr0mtr.h create mode 100644 storage/innobase/include/mtr0types.h create mode 100644 storage/innobase/include/os0file.h create mode 100644 storage/innobase/include/os0file.inl create mode 100644 storage/innobase/include/page0cur.h create mode 100644 storage/innobase/include/page0cur.inl create mode 100644 storage/innobase/include/page0page.h create mode 100644 storage/innobase/include/page0page.inl create mode 100644 storage/innobase/include/page0types.h create mode 100644 storage/innobase/include/page0zip.h create mode 100644 storage/innobase/include/page0zip.inl create mode 100644 storage/innobase/include/pars0grm.h create mode 100644 storage/innobase/include/pars0opt.h create mode 100644 storage/innobase/include/pars0pars.h create mode 100644 storage/innobase/include/pars0sym.h create mode 100644 storage/innobase/include/pars0types.h create mode 100644 storage/innobase/include/que0que.h create mode 100644 storage/innobase/include/que0que.inl create mode 100644 storage/innobase/include/que0types.h create mode 100644 storage/innobase/include/read0types.h create mode 100644 storage/innobase/include/rem0cmp.h create mode 100644 storage/innobase/include/rem0rec.h create mode 100644 storage/innobase/include/rem0rec.inl create mode 100644 storage/innobase/include/rem0types.h create mode 100644 storage/innobase/include/row0ext.h create mode 100644 storage/innobase/include/row0ext.inl create mode 100644 storage/innobase/include/row0ftsort.h create mode 100644 storage/innobase/include/row0import.h create mode 100644 storage/innobase/include/row0ins.h create mode 100644 storage/innobase/include/row0log.h create mode 100644 storage/innobase/include/row0merge.h create mode 100644 storage/innobase/include/row0mysql.h create mode 100644 storage/innobase/include/row0purge.h create mode 100644 storage/innobase/include/row0quiesce.h create mode 100644 storage/innobase/include/row0row.h create mode 100644 storage/innobase/include/row0row.inl create mode 100644 storage/innobase/include/row0sel.h create mode 100644 storage/innobase/include/row0types.h create mode 100644 storage/innobase/include/row0uins.h create mode 100644 storage/innobase/include/row0umod.h create mode 100644 storage/innobase/include/row0undo.h create mode 100644 storage/innobase/include/row0upd.h create mode 100644 storage/innobase/include/row0upd.inl create mode 100644 storage/innobase/include/row0vers.h create mode 100644 storage/innobase/include/rw_lock.h create mode 100644 storage/innobase/include/small_vector.h create mode 100644 storage/innobase/include/srv0mon.h create mode 100644 storage/innobase/include/srv0mon.inl create mode 100644 storage/innobase/include/srv0srv.h create mode 100644 storage/innobase/include/srv0start.h create mode 100644 storage/innobase/include/srw_lock.h create mode 100644 storage/innobase/include/sux_lock.h create mode 
100644 storage/innobase/include/transactional_lock_guard.h create mode 100644 storage/innobase/include/trx0i_s.h create mode 100644 storage/innobase/include/trx0purge.h create mode 100644 storage/innobase/include/trx0rec.h create mode 100644 storage/innobase/include/trx0roll.h create mode 100644 storage/innobase/include/trx0rseg.h create mode 100644 storage/innobase/include/trx0sys.h create mode 100644 storage/innobase/include/trx0trx.h create mode 100644 storage/innobase/include/trx0trx.inl create mode 100644 storage/innobase/include/trx0types.h create mode 100644 storage/innobase/include/trx0undo.h create mode 100644 storage/innobase/include/trx0undo.inl create mode 100644 storage/innobase/include/trx0xa.h create mode 100644 storage/innobase/include/univ.i create mode 100644 storage/innobase/include/ut0byte.h create mode 100644 storage/innobase/include/ut0byte.inl create mode 100644 storage/innobase/include/ut0counter.h create mode 100644 storage/innobase/include/ut0dbg.h create mode 100644 storage/innobase/include/ut0list.h create mode 100644 storage/innobase/include/ut0list.inl create mode 100644 storage/innobase/include/ut0lst.h create mode 100644 storage/innobase/include/ut0mem.h create mode 100644 storage/innobase/include/ut0mem.inl create mode 100644 storage/innobase/include/ut0new.h create mode 100644 storage/innobase/include/ut0pool.h create mode 100644 storage/innobase/include/ut0rbt.h create mode 100644 storage/innobase/include/ut0rnd.h create mode 100644 storage/innobase/include/ut0rnd.inl create mode 100644 storage/innobase/include/ut0sort.h create mode 100644 storage/innobase/include/ut0stage.h create mode 100644 storage/innobase/include/ut0ut.h create mode 100644 storage/innobase/include/ut0ut.inl create mode 100644 storage/innobase/include/ut0vec.h create mode 100644 storage/innobase/include/ut0vec.inl create mode 100644 storage/innobase/include/ut0wqueue.h create mode 100644 storage/innobase/lock/lock0iter.cc create mode 100644 storage/innobase/lock/lock0lock.cc create mode 100644 storage/innobase/lock/lock0prdt.cc create mode 100644 storage/innobase/log/log0crypt.cc create mode 100644 storage/innobase/log/log0log.cc create mode 100644 storage/innobase/log/log0recv.cc create mode 100644 storage/innobase/log/log0sync.cc create mode 100644 storage/innobase/log/log0sync.h create mode 100644 storage/innobase/mem/mem0mem.cc create mode 100644 storage/innobase/mtr/mtr0mtr.cc create mode 100644 storage/innobase/mysql-test/storage_engine/alter_tablespace.opt create mode 100644 storage/innobase/mysql-test/storage_engine/autoinc_secondary.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/cache_index.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/checksum_table_live.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/col_opt_not_null.opt create mode 100644 storage/innobase/mysql-test/storage_engine/col_opt_null.opt create mode 100644 storage/innobase/mysql-test/storage_engine/define_engine.inc create mode 100644 storage/innobase/mysql-test/storage_engine/disabled.def create mode 100644 storage/innobase/mysql-test/storage_engine/fulltext_search.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/index_enable_disable.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/index_type_hash.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/insert_delayed.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/lock_concurrent.rdiff create mode 100644 
storage/innobase/mysql-test/storage_engine/optimize_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/parts/checksum_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/parts/create_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/parts/disabled.def create mode 100644 storage/innobase/mysql-test/storage_engine/parts/optimize_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/parts/repair_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/parts/suite.opt create mode 100644 storage/innobase/mysql-test/storage_engine/repair_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/suite.opt create mode 100644 storage/innobase/mysql-test/storage_engine/tbl_opt_index_dir.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/tbl_opt_insert_method.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/tbl_opt_row_format.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/tbl_opt_union.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/trx/cons_snapshot_serializable.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/trx/level_read_committed.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/trx/level_read_uncommitted.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/trx/suite.opt create mode 100644 storage/innobase/mysql-test/storage_engine/type_blob.opt create mode 100644 storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/type_float_indexes.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/type_text.opt create mode 100644 storage/innobase/os/os0file.cc create mode 100644 storage/innobase/page/page0cur.cc create mode 100644 storage/innobase/page/page0page.cc create mode 100644 storage/innobase/page/page0zip.cc create mode 100644 storage/innobase/pars/lexyy.cc create mode 100755 storage/innobase/pars/make_bison.sh create mode 100755 storage/innobase/pars/make_flex.sh create mode 100644 storage/innobase/pars/pars0grm.cc create mode 100644 storage/innobase/pars/pars0grm.y create mode 100644 storage/innobase/pars/pars0lex.l create mode 100644 storage/innobase/pars/pars0opt.cc create mode 100644 storage/innobase/pars/pars0pars.cc create mode 100644 storage/innobase/pars/pars0sym.cc create mode 100644 storage/innobase/que/que0que.cc create mode 100644 storage/innobase/read/read0read.cc create mode 100644 storage/innobase/rem/rem0cmp.cc create mode 100644 storage/innobase/rem/rem0rec.cc create mode 100644 storage/innobase/row/row0ext.cc create mode 100644 storage/innobase/row/row0ftsort.cc create mode 100644 storage/innobase/row/row0import.cc create mode 100644 storage/innobase/row/row0ins.cc create mode 100644 storage/innobase/row/row0log.cc create mode 100644 storage/innobase/row/row0merge.cc create mode 100644 storage/innobase/row/row0mysql.cc create mode 100644 storage/innobase/row/row0purge.cc create mode 100644 storage/innobase/row/row0quiesce.cc create mode 100644 storage/innobase/row/row0row.cc create mode 100644 storage/innobase/row/row0sel.cc create mode 100644 storage/innobase/row/row0uins.cc create mode 100644 storage/innobase/row/row0umod.cc create mode 100644 storage/innobase/row/row0undo.cc create mode 100644 storage/innobase/row/row0upd.cc create mode 100644 storage/innobase/row/row0vers.cc create mode 100644 storage/innobase/srv/srv0mon.cc create mode 
100644 storage/innobase/srv/srv0srv.cc
create mode 100644 storage/innobase/srv/srv0start.cc
create mode 100644 storage/innobase/sync/srw_lock.cc
create mode 100644 storage/innobase/trx/trx0i_s.cc
create mode 100644 storage/innobase/trx/trx0purge.cc
create mode 100644 storage/innobase/trx/trx0rec.cc
create mode 100644 storage/innobase/trx/trx0roll.cc
create mode 100644 storage/innobase/trx/trx0rseg.cc
create mode 100644 storage/innobase/trx/trx0sys.cc
create mode 100644 storage/innobase/trx/trx0trx.cc
create mode 100644 storage/innobase/trx/trx0undo.cc
create mode 100644 storage/innobase/unittest/CMakeLists.txt
create mode 100644 storage/innobase/unittest/innodb_fts-t.cc
create mode 100644 storage/innobase/unittest/innodb_sync-t.cc
create mode 100644 storage/innobase/ut/ut0dbg.cc
create mode 100644 storage/innobase/ut/ut0list.cc
create mode 100644 storage/innobase/ut/ut0mem.cc
create mode 100644 storage/innobase/ut/ut0new.cc
create mode 100644 storage/innobase/ut/ut0rbt.cc
create mode 100644 storage/innobase/ut/ut0rnd.cc
create mode 100644 storage/innobase/ut/ut0ut.cc
create mode 100644 storage/innobase/ut/ut0vec.cc
create mode 100644 storage/innobase/ut/ut0wqueue.cc
diff --git a/storage/innobase/.clang-format-old b/storage/innobase/.clang-format-old
new file mode 100644
index 00000000..54f7b47b
--- /dev/null
+++ b/storage/innobase/.clang-format-old
@@ -0,0 +1,11 @@
+UseTab: Always
+TabWidth: 8
+IndentWidth: 8
+ContinuationIndentWidth: 8
+BreakBeforeBinaryOperators: All
+PointerAlignment: Left
+BreakBeforeBraces: Custom
+ColumnLimit: 79
+BraceWrapping:
+  AfterFunction: true
+AccessModifierOffset: -8
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
new file mode 100644
index 00000000..32c0a437
--- /dev/null
+++ b/storage/innobase/CMakeLists.txt
@@ -0,0 +1,511 @@
+
+# Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2014, 2022, MariaDB Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+# This is the CMakeLists for InnoDB
+
+INCLUDE(CheckFunctionExists)
+INCLUDE(CheckCSourceCompiles)
+INCLUDE(CheckCSourceRuns)
+INCLUDE(numa)
+INCLUDE(TestBigEndian)
+
+MYSQL_CHECK_NUMA()
+
+INCLUDE(${MYSQL_CMAKE_SCRIPT_DIR}/compile_flags.cmake)
+
+IF(CMAKE_CROSSCOMPILING)
+  # Use CHECK_C_SOURCE_COMPILES instead of CHECK_C_SOURCE_RUNS when
+  # cross-compiling. Not as precise, but usually good enough.
+  # This only makes sense for the atomic tests in this file; this trick
+  # doesn't work in the general case.
+ MACRO(CHECK_C_SOURCE SOURCE VAR) + CHECK_C_SOURCE_COMPILES("${SOURCE}" "${VAR}") + ENDMACRO() +ELSE() + MACRO(CHECK_C_SOURCE SOURCE VAR) + CHECK_C_SOURCE_RUNS("${SOURCE}" "${VAR}") + ENDMACRO() +ENDIF() + +# OS tests +IF(UNIX) + IF(CMAKE_SYSTEM_NAME STREQUAL "Linux") + ADD_DEFINITIONS("-D_GNU_SOURCE=1") + IF(HAVE_LIBNUMA) + LINK_LIBRARIES(numa) + ENDIF() + ENDIF() +ENDIF() + +# Enable InnoDB's UNIV_DEBUG in debug builds +SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG") + +OPTION(WITH_INNODB_AHI "Include innodb_adaptive_hash_index" ON) +OPTION(WITH_INNODB_ROOT_GUESS "Cache index root block descriptors" ON) +IF(WITH_INNODB_AHI) + ADD_DEFINITIONS(-DBTR_CUR_HASH_ADAPT -DBTR_CUR_ADAPT) + IF(NOT WITH_INNODB_ROOT_GUESS) + MESSAGE(WARNING "WITH_INNODB_AHI implies WITH_INNODB_ROOT_GUESS") + SET(WITH_INNODB_ROOT_GUESS ON) + ENDIF() +ELSEIF(WITH_INNODB_ROOT_GUESS) + ADD_DEFINITIONS(-DBTR_CUR_ADAPT) +ENDIF() +ADD_FEATURE_INFO(INNODB_AHI WITH_INNODB_AHI "InnoDB Adaptive Hash Index") +ADD_FEATURE_INFO(INNODB_ROOT_GUESS WITH_INNODB_ROOT_GUESS + "Cache index root block descriptors in InnoDB") + +OPTION(WITH_INNODB_EXTRA_DEBUG "Enable extra InnoDB debug checks" OFF) +IF(WITH_INNODB_EXTRA_DEBUG) + ADD_DEFINITIONS(-DUNIV_ZIP_DEBUG) +ENDIF() +ADD_FEATURE_INFO(INNODB_EXTRA_DEBUG WITH_INNODB_EXTRA_DEBUG "Extra InnoDB debug checks") + +IF(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) + ADD_DEFINITIONS(-DHAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE=1) +ENDIF() + +IF (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR + CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion -Wno-sign-conversion") + SET_SOURCE_FILES_PROPERTIES(fts/fts0pars.cc + PROPERTIES COMPILE_FLAGS -Wno-conversion) +ENDIF() + +IF(NOT MSVC) + # Work around MDEV-18417, MDEV-18656, MDEV-18417 + IF(WITH_ASAN AND CMAKE_COMPILER_IS_GNUCC AND + CMAKE_C_COMPILER_VERSION VERSION_LESS "6.0.0") + SET_SOURCE_FILES_PROPERTIES(trx/trx0rec.cc PROPERTIES COMPILE_FLAGS -O1) + ENDIF() +ENDIF(NOT MSVC) + +CHECK_FUNCTION_EXISTS(vasprintf HAVE_VASPRINTF) + +# Include directories under innobase +INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include + ${CMAKE_SOURCE_DIR}/storage/innobase/handler) + +# Sun Studio bug with -xO2 +IF(CMAKE_CXX_COMPILER_ID MATCHES "SunPro" + AND CMAKE_CXX_FLAGS_RELEASE MATCHES "O2" + AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + # Sun Studio 12 crashes with -xO2 flag, but not with higher optimization + # -xO3 + SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.cc + PROPERTIES COMPILE_FLAGS -xO3) +ENDIF() + + +IF(MSVC) + # Avoid "unreferenced label" warning in generated file + GET_FILENAME_COMPONENT(_SRC_DIR ${CMAKE_CURRENT_LIST_FILE} PATH) + SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/pars0grm.c + PROPERTIES COMPILE_FLAGS "/wd4102") + SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/lexyy.c + PROPERTIES COMPILE_FLAGS "/wd4003") +ENDIF() + +# Include directories under innobase +INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include + ${CMAKE_SOURCE_DIR}/storage/innobase/handler + ${CMAKE_SOURCE_DIR}/libbinlogevents/include) +INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/tpool) + +SET(INNOBASE_SOURCES + btr/btr0btr.cc + btr/btr0bulk.cc + btr/btr0cur.cc + btr/btr0pcur.cc + btr/btr0sea.cc + btr/btr0defragment.cc + buf/buf0block_hint.cc + buf/buf0buddy.cc + buf/buf0buf.cc + buf/buf0dblwr.cc + buf/buf0checksum.cc + buf/buf0dump.cc + buf/buf0flu.cc + buf/buf0lru.cc + buf/buf0rea.cc + data/data0data.cc + data/data0type.cc + dict/dict0boot.cc + dict/dict0crea.cc + dict/dict0dict.cc + 
dict/dict0load.cc + dict/dict0mem.cc + dict/dict0stats.cc + dict/dict0stats_bg.cc + dict/dict0defrag_bg.cc + dict/drop.cc + eval/eval0eval.cc + eval/eval0proc.cc + fil/fil0fil.cc + fil/fil0pagecompress.cc + fil/fil0crypt.cc + fsp/fsp0fsp.cc + fsp/fsp0file.cc + fsp/fsp0space.cc + fsp/fsp0sysspace.cc + fut/fut0lst.cc + ha/ha0storage.cc + fts/fts0fts.cc + fts/fts0ast.cc + fts/fts0blex.cc + fts/fts0config.cc + fts/fts0opt.cc + fts/fts0pars.cc + fts/fts0que.cc + fts/fts0sql.cc + fts/fts0tlex.cc + gis/gis0geo.cc + gis/gis0rtree.cc + gis/gis0sea.cc + fts/fts0plugin.cc + handler/ha_innodb.cc + handler/handler0alter.cc + handler/i_s.cc + ibuf/ibuf0ibuf.cc + include/btr0btr.h + include/btr0btr.inl + include/btr0bulk.h + include/btr0cur.h + include/btr0cur.inl + include/btr0defragment.h + include/btr0pcur.h + include/btr0pcur.inl + include/btr0sea.h + include/btr0sea.inl + include/btr0types.h + include/buf0buddy.h + include/buf0buf.h + include/buf0buf.inl + include/buf0checksum.h + include/buf0dblwr.h + include/buf0dump.h + include/buf0flu.h + include/buf0lru.h + include/buf0rea.h + include/buf0types.h + include/data0data.h + include/data0data.inl + include/data0type.h + include/data0type.inl + include/data0types.h + include/db0err.h + include/dict0boot.h + include/dict0crea.h + include/dict0crea.inl + include/dict0defrag_bg.h + include/dict0dict.h + include/dict0dict.inl + include/dict0load.h + include/dict0mem.h + include/dict0mem.inl + include/dict0pagecompress.h + include/dict0pagecompress.inl + include/dict0stats.h + include/dict0stats.inl + include/dict0stats_bg.h + include/dict0types.h + include/dyn0buf.h + include/dyn0types.h + include/eval0eval.h + include/eval0eval.inl + include/eval0proc.h + include/eval0proc.inl + include/fil0crypt.h + include/fil0crypt.inl + include/fil0fil.h + include/fil0pagecompress.h + include/fsp0file.h + include/fsp0fsp.h + include/fsp0space.h + include/fsp0sysspace.h + include/fsp0types.h + include/fts0ast.h + include/fts0blex.h + include/fts0fts.h + include/fts0opt.h + include/fts0pars.h + include/fts0plugin.h + include/fts0priv.h + include/fts0priv.inl + include/fts0tlex.h + include/fts0tokenize.h + include/fts0types.h + include/fts0types.inl + include/fts0vlc.h + include/fut0lst.h + include/gis0geo.h + include/gis0rtree.h + include/gis0rtree.inl + include/gis0type.h + include/ha_prototypes.h + include/ha0ha.h + include/ha0ha.inl + include/ha0storage.h + include/ha0storage.inl + include/handler0alter.h + include/hash0hash.h + include/ibuf0ibuf.h + include/ibuf0ibuf.inl + include/lock0iter.h + include/lock0lock.h + include/lock0lock.inl + include/lock0prdt.h + include/lock0priv.h + include/lock0priv.inl + include/lock0types.h + include/log0crypt.h + include/log0log.h + include/log0recv.h + include/log0types.h + include/mach0data.h + include/mach0data.inl + include/mem0mem.h + include/mem0mem.inl + include/mtr0log.h + include/mtr0mtr.h + include/mtr0types.h + include/os0file.h + include/os0file.inl + include/page0cur.h + include/page0cur.inl + include/page0page.h + include/page0page.inl + include/page0types.h + include/page0zip.h + include/page0zip.inl + include/pars0grm.h + include/pars0opt.h + include/pars0pars.h + include/pars0sym.h + include/pars0types.h + include/que0que.h + include/que0que.inl + include/que0types.h + include/read0types.h + include/rem0cmp.h + include/rem0rec.h + include/rem0rec.inl + include/rem0types.h + include/row0ext.h + include/row0ext.inl + include/row0ftsort.h + include/row0import.h + include/row0ins.h + include/row0log.h + 
include/row0merge.h + include/row0mysql.h + include/row0purge.h + include/row0quiesce.h + include/row0row.h + include/row0row.inl + include/row0sel.h + include/row0types.h + include/row0uins.h + include/row0umod.h + include/row0undo.h + include/row0upd.h + include/row0upd.inl + include/row0vers.h + include/rw_lock.h + include/small_vector.h + include/srv0mon.h + include/srv0mon.inl + include/srv0srv.h + include/srv0start.h + include/srw_lock.h + include/sux_lock.h + include/transactional_lock_guard.h + include/trx0i_s.h + include/trx0purge.h + include/trx0rec.h + include/trx0roll.h + include/trx0rseg.h + include/trx0sys.h + include/trx0trx.h + include/trx0trx.inl + include/trx0types.h + include/trx0undo.h + include/trx0undo.inl + include/trx0xa.h + include/univ.i + include/ut0byte.h + include/ut0byte.inl + include/ut0counter.h + include/ut0dbg.h + include/ut0list.h + include/ut0list.inl + include/ut0lst.h + include/ut0mem.h + include/ut0mem.inl + include/ut0new.h + include/ut0pool.h + include/ut0rbt.h + include/ut0rnd.h + include/ut0rnd.inl + include/ut0sort.h + include/ut0stage.h + include/ut0ut.h + include/ut0ut.inl + include/ut0vec.h + include/ut0vec.inl + include/ut0wqueue.h + lock/lock0iter.cc + lock/lock0prdt.cc + lock/lock0lock.cc + log/log0log.cc + log/log0recv.cc + log/log0crypt.cc + log/log0sync.cc + mem/mem0mem.cc + mtr/mtr0mtr.cc + os/os0file.cc + page/page0cur.cc + page/page0page.cc + page/page0zip.cc + pars/lexyy.cc + pars/pars0grm.cc + pars/pars0opt.cc + pars/pars0pars.cc + pars/pars0sym.cc + que/que0que.cc + read/read0read.cc + rem/rem0cmp.cc + rem/rem0rec.cc + row/row0ext.cc + row/row0ftsort.cc + row/row0import.cc + row/row0ins.cc + row/row0merge.cc + row/row0mysql.cc + row/row0log.cc + row/row0purge.cc + row/row0row.cc + row/row0sel.cc + row/row0uins.cc + row/row0umod.cc + row/row0undo.cc + row/row0upd.cc + row/row0quiesce.cc + row/row0vers.cc + srv/srv0mon.cc + srv/srv0srv.cc + srv/srv0start.cc + sync/srw_lock.cc + trx/trx0i_s.cc + trx/trx0purge.cc + trx/trx0rec.cc + trx/trx0roll.cc + trx/trx0rseg.cc + trx/trx0sys.cc + trx/trx0trx.cc + trx/trx0undo.cc + ut/ut0dbg.cc + ut/ut0list.cc + ut/ut0mem.cc + ut/ut0new.cc + ut/ut0rbt.cc + ut/ut0rnd.cc + ut/ut0ut.cc + ut/ut0vec.cc + ut/ut0wqueue.cc) + +OPTION(WITH_PMEM "Support redo log in persistent memory" OFF) +FIND_PACKAGE(PMEM) +IF(PMEM_FOUND) + INCLUDE_DIRECTORIES(${PMEM_INCLUDES}) + ADD_COMPILE_FLAGS(log/log0log.cc log/log0recv.cc + buf/buf0flu.cc mtr/mtr0mtr.cc trx/trx0trx.cc srv/srv0start.cc + COMPILE_FLAGS "-DHAVE_PMEM") + SET(PMEM_LIBRARY ${PMEM_LIBRARIES}) +ELSE() + IF(WITH_PMEM) + MESSAGE(FATAL_ERROR "WITH_PMEM=ON cannot be satisfied") + ENDIF() +ENDIF() + +MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE + MODULE_OUTPUT_NAME ha_innodb + DEFAULT RECOMPILE_FOR_EMBEDDED + LINK_LIBRARIES + ${ZLIB_LIBRARY} + ${PMEM_LIBRARY} + ${NUMA_LIBRARY} + ${LIBSYSTEMD} + ${LINKER_SCRIPT}) + +IF(NOT TARGET innobase) + RETURN() +ENDIF() + +ADD_DEFINITIONS(${SSL_DEFINES} ${TPOOL_DEFINES}) + +# A GCC bug causes crash when compiling these files on ARM64 with -O1+ +# Compile them with -O0 as a workaround. 
+IF(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64"
+   AND CMAKE_C_COMPILER_VERSION VERSION_LESS "5.2.0")
+  ADD_COMPILE_FLAGS(
+    btr/btr0btr.cc
+    btr/btr0cur.cc
+    buf/buf0buf.cc
+    fts/fts0fts.cc
+    gis/gis0sea.cc
+    handler/handler0alter.cc
+    mtr/mtr0mtr.cc
+    row/row0merge.cc
+    row/row0mysql.cc
+    srv/srv0srv.cc
+    COMPILE_FLAGS "-O0"
+  )
+ENDIF()
+
+# Older gcc versions insist on the -mhtm flag for including the
+# htmxlintrin.h header. This is also true for newer gcc versions,
+# like 11.2.0 in Debian Sid.
+# On s390x, because the high-level intrinsics are defined as
+# not-inline in the header file, the header can only be included
+# by one source file, which must have -mhtm enabled.
+IF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64|powerpc64|s390x"
+   OR CMAKE_SYSTEM_NAME MATCHES "AIX")
+  ADD_COMPILE_FLAGS(
+    sync/srw_lock.cc
+    COMPILE_FLAGS "-mhtm"
+  )
+ENDIF()
+IF(MSVC)
+  IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
+    ADD_COMPILE_FLAGS(
+      pars/lexyy.cc
+      COMPILE_FLAGS "/wd4267")
+  ENDIF()
+  # Silence "switch statement contains 'default' but no 'case' label"
+  # on a generated file.
+  TARGET_COMPILE_OPTIONS(innobase PRIVATE "/wd4065")
+ENDIF()
+
+IF(NOT (PLUGIN_INNOBASE STREQUAL DYNAMIC))
+  TARGET_LINK_LIBRARIES(innobase tpool mysys)
+  ADD_SUBDIRECTORY(${CMAKE_SOURCE_DIR}/extra/mariabackup ${CMAKE_BINARY_DIR}/extra/mariabackup)
+ENDIF()
+
+IF(WITH_UNIT_TESTS)
+  ADD_SUBDIRECTORY(unittest)
+ENDIF()
diff --git a/storage/innobase/COPYING.Google b/storage/innobase/COPYING.Google
new file mode 100644
index 00000000..5ade2b0e
--- /dev/null
+++ b/storage/innobase/COPYING.Google
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Google, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials
+   provided with the distribution.
+ * Neither the name of the Google Inc. nor the names of its
+   contributors may be used to endorse or promote products
+   derived from this software without specific prior written
+   permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/innobase/COPYING.Percona b/storage/innobase/COPYING.Percona
new file mode 100644
index 00000000..8c786811
--- /dev/null
+++ b/storage/innobase/COPYING.Percona
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Percona, Inc.
+These contributions are used with the following license: + +Copyright (c) 2008, 2009, Percona Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the Percona Inc. nor the names of its + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc new file mode 100644 index 00000000..08be1991 --- /dev/null +++ b/storage/innobase/btr/btr0btr.cc @@ -0,0 +1,5433 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0btr.cc +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0btr.h" + +#include "page0page.h" +#include "page0zip.h" +#include "gis0rtree.h" + +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "btr0defragment.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "ibuf0ibuf.h" +#include "trx0trx.h" +#include "srv0mon.h" +#include "gis0geo.h" +#include "dict0boot.h" +#include "row0sel.h" /* row_search_max_autoinc() */ +#include "log.h" + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. 
+@return true if possible to merge. */
+static
+bool
+btr_can_merge_with_page(
+/*====================*/
+  btr_cur_t*    cursor,       /*!< in: cursor on the page to merge */
+  uint32_t      page_no,      /*!< in: a sibling page */
+  buf_block_t** merge_block,  /*!< out: the merge block */
+  mtr_t*        mtr);         /*!< in: mini-transaction */
+
+/*
+Latching strategy of the InnoDB B-tree
+--------------------------------------
+
+Acquisition of node pointer page latches is protected by the index->lock latch.
+
+Before MariaDB 10.2.2, all node pointer pages were protected by index->lock
+either in S (shared) or X (exclusive) mode and block->lock was not acquired on
+node pointer pages.
+
+After MariaDB 10.2.2, a block->lock S-latch or X-latch is used to protect
+node pointer pages, and acquisition of node pointer page latches is protected
+by index->lock.
+
+(0) Definition: B-tree level.
+
+(0.1) The leaf pages of the B-tree are at level 0.
+
+(0.2) The parent of a page at level L has level L+1. (The level of the
+root page is equal to the tree height.)
+
+(0.3) The B-tree lock (index->lock) is the parent of the root page and
+has a level = tree height + 1.
+
+Index->lock has 3 possible locking modes:
+
+(1) S-latch:
+
+(1.1) All latches for pages must be obtained in descending order of tree level.
+
+(1.2) Before obtaining the first node pointer page latch at a given B-tree
+level, the parent latch (at level + 1) must be held.
+
+(1.3) If a node pointer page is already latched at a given level,
+we can only obtain the latch on its right sibling page at the same level.
+
+(1.4) Release of the node pointer page latches must be done in
+child-to-parent order. (This prevents deadlocks when index->lock
+is obtained in SX mode.)
+
+(1.4.1) A level L node pointer page latch can be released only when
+no latches are held at child levels, i.e. at levels < L.
+
+(1.4.2) All node pointer page latches must be released so
+that no latches are obtained in between.
+
+(1.5) [implied by (1.1), (1.2)] The root page latch must be the first
+node pointer latch obtained.
+
+(2) SX-latch:
+
+In this case, rules (1.2) and (1.3) from the S-latch case are relaxed and
+merged into (2.2), and rule (1.4) is removed. Thus, latch acquisition
+can be skipped at some tree levels and latches can be obtained in
+a less restricted order.
+
+(2.1) [identical to (1.1)]: All latches for pages must be obtained in
+descending order of tree level.
+
+(2.2) When a node pointer latch at level L is obtained,
+the left sibling page latch at the same level or some ancestor
+page latch (at a level > L) must be held.
+
+(2.3) [implied by (2.1), (2.2)] The first node pointer page latch obtained
+can be on any node pointer page.
+
+(3) X-latch:
+
+Node pointer latches can be obtained in any order.
+
+NOTE: The new rules introduced in MariaDB 10.2.2 do not affect the latching
+rules of leaf pages:
+
+The index->lock S-latch is needed during reads for the node pointer
+traversal. When the leaf level is reached, index->lock can be released
+(and, with the MariaDB 10.2.2 changes, all node pointer latches).
+Left-to-right index traversal at the leaf page level can be done safely
+by obtaining the right sibling leaf page latch and then releasing the
+old page latch.
+
+Single leaf page modifications (BTR_MODIFY_LEAF) are protected by an
+index->lock S-latch.
+
+B-tree operations involving page splits or merges (BTR_MODIFY_TREE) and page
+allocations are protected by an index->lock X-latch.
+
+Node pointers
+-------------
+Leaf pages of a B-tree contain the index records stored in the
+tree.
On levels n > 0 we store 'node pointers' to pages on level
+n - 1. For each page there is exactly one node pointer stored:
+thus our tree is an ordinary B-tree, not a B-link tree.
+
+A node pointer contains a prefix P of an index record. The prefix
+is long enough so that it determines an index record uniquely.
+The file page number of the child page is added as the last
+field. To the child page we can store node pointers or index records
+which are >= P in the alphabetical order, but < P1 if there is
+a next node pointer on the level, and P1 is its prefix.
+
+If a node pointer with a prefix P points to a non-leaf child,
+then the leftmost record in the child must have the same
+prefix P. If it points to a leaf node, the child is not required
+to contain any record with a prefix equal to P. The leaf case
+is decided this way to allow arbitrary deletions in a leaf node
+without touching upper levels of the tree.
+
+We have predefined a special minimum record which we
+define as the smallest record in any alphabetical order.
+A minimum record is denoted by setting a bit in the record
+header. A minimum record acts as the prefix of a node pointer
+which points to a leftmost node on any level of the tree.
+
+File page allocation
+--------------------
+In the root node of a B-tree there are two file segment headers.
+The leaf pages of a tree are allocated from one file segment, to
+make them consecutive on disk if possible. From the other file segment
+we allocate pages for the non-leaf levels of the tree.
+*/
+
+/** Check a file segment header within a B-tree root page.
+@param offset file segment header offset
+@param block B-tree root page
+@param space tablespace
+@return whether the segment header is valid */
+static bool btr_root_fseg_validate(ulint offset,
+                                   const buf_block_t &block,
+                                   const fil_space_t &space)
+{
+  ut_ad(block.page.id().space() == space.id);
+  const uint16_t hdr= mach_read_from_2(offset + FSEG_HDR_OFFSET +
+                                       block.page.frame);
+  if (FIL_PAGE_DATA <= hdr && hdr <= srv_page_size - FIL_PAGE_DATA_END &&
+      mach_read_from_4(block.page.frame + offset + FSEG_HDR_SPACE) == space.id)
+    return true;
+  sql_print_error("InnoDB: Index root page " UINT32PF " in %s is corrupted "
+                  "at " ULINTPF,
+                  block.page.id().page_no(),
+                  UT_LIST_GET_FIRST(space.chain)->name);
+  return false;
+}
+
+/** Report a decryption failure. */
+ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index)
+{
+  ib_push_warning(static_cast<void*>(nullptr), DB_DECRYPTION_FAILED,
+                  "Table %s is encrypted but encryption service or"
+                  " used key_id is not available. "
+                  " Can't continue reading table.",
+                  index.table->name.m_name);
+  index.table->file_unreadable= true;
+}
+
+/** Get an index page and declare its latching order level.
+@param[in] index index tree +@param[in] page page number +@param[in] mode latch mode +@param[in] merge whether change buffer merge should be attempted +@param[in,out] mtr mini-transaction +@param[out] err error code +@return block */ +buf_block_t *btr_block_get(const dict_index_t &index, + uint32_t page, rw_lock_type_t mode, bool merge, + mtr_t *mtr, dberr_t *err) +{ + ut_ad(mode != RW_NO_LATCH); + dberr_t local_err; + if (!err) + err= &local_err; + buf_block_t *block= + buf_page_get_gen(page_id_t{index.table->space->id, page}, + index.table->space->zip_size(), mode, nullptr, BUF_GET, + mtr, err, merge && !index.is_clust()); + ut_ad(!block == (*err != DB_SUCCESS)); + + if (UNIV_LIKELY(block != nullptr)) + { + if (!!page_is_comp(block->page.frame) != index.table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index.id || + !fil_page_index_page_check(block->page.frame) || + index.is_spatial() != + (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE)) + { + *err= DB_PAGE_CORRUPTED; + block= nullptr; + } + } + else if (*err == DB_DECRYPTION_FAILED) + btr_decryption_failed(index); + + return block; +} + +/**************************************************************//** +Gets the root node of a tree and x- or s-latches it. +@return root page, x- or s-latched */ +buf_block_t* +btr_root_block_get( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + rw_lock_type_t mode, /*!< in: either RW_S_LATCH + or RW_X_LATCH */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + if (!index->table || !index->table->space) + { + *err= DB_TABLESPACE_NOT_FOUND; + return nullptr; + } + + buf_block_t *block; +#ifndef BTR_CUR_ADAPT + static constexpr buf_block_t *guess= nullptr; +#else + buf_block_t *&guess= btr_search_get_info(index)->root_guess; + guess= +#endif + block= + buf_page_get_gen(page_id_t{index->table->space->id, index->page}, + index->table->space->zip_size(), mode, guess, BUF_GET, + mtr, err, false); + ut_ad(!block == (*err != DB_SUCCESS)); + + if (UNIV_LIKELY(block != nullptr)) + { + if (UNIV_UNLIKELY(mode == RW_NO_LATCH)); + else if (!!page_is_comp(block->page.frame) != + index->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index->id || + !fil_page_index_page_check(block->page.frame) || + index->is_spatial() != + (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE)) + { + *err= DB_PAGE_CORRUPTED; + block= nullptr; + } + else if (index->is_ibuf()); + else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, + *block, *index->table->space) || + !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *block, *index->table->space)) + { + *err= DB_CORRUPTION; + block= nullptr; + } + } + else if (*err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + + return block; +} + +/**************************************************************//** +Gets the root node of a tree and sx-latches it for segment access. +@return root page, sx-latched */ +static +page_t* +btr_root_get( +/*=========*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + /* Intended to be used for accessing file segment lists. + Concurrent read of other data is allowed. 
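+
+  A typical caller pattern, in outline (sketch; error handling elided,
+  assuming an active mini-transaction):
+
+    dberr_t err;
+    if (page_t *root= btr_root_get(index, mtr, &err))
+      fseg_header_t *seg= PAGE_HEADER + PAGE_BTR_SEG_TOP + root;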
*/ + if (buf_block_t *root= btr_root_block_get(index, RW_SX_LATCH, mtr, err)) + return root->page.frame; + return nullptr; +} + +/**************************************************************//** +Checks a file segment header within a B-tree root page and updates +the segment header space id. +@return TRUE if valid */ +static +bool +btr_root_fseg_adjust_on_import( +/*===========================*/ + fseg_header_t* seg_header, /*!< in/out: segment header */ + page_zip_des_t* page_zip, /*!< in/out: compressed page, + or NULL */ + ulint space) /*!< in: tablespace identifier */ +{ + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + if (offset < FIL_PAGE_DATA + || offset > srv_page_size - FIL_PAGE_DATA_END) { + return false; + } + + seg_header += FSEG_HDR_SPACE; + + mach_write_to_4(seg_header, space); + if (UNIV_LIKELY_NULL(page_zip)) { + memcpy(page_zip->data + page_offset(seg_header), seg_header, + 4); + } + + return true; +} + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. +@return error code, or DB_SUCCESS */ +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ +{ + dberr_t err; + mtr_t mtr; + page_t* page; + page_zip_des_t* page_zip; + dict_table_t* table = index->table; + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_3", + return(DB_CORRUPTION);); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + buf_block_t* block = buf_page_get_gen( + page_id_t(table->space->id, index->page), + table->space->zip_size(), RW_X_LATCH, NULL, BUF_GET, + &mtr, &err); + if (!block) { + ut_ad(err != DB_SUCCESS); + goto func_exit; + } + + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + if (!fil_page_index_page_check(page) || page_has_siblings(page)) { + err = DB_CORRUPTION; + + } else if (dict_index_is_clust(index)) { + bool page_is_compact_format; + + page_is_compact_format = page_is_comp(page) > 0; + + /* Check if the page format and table format agree. */ + if (page_is_compact_format != dict_table_is_comp(table)) { + err = DB_CORRUPTION; + } else { + /* Check that the table flags and the tablespace + flags match. */ + uint32_t tf = dict_tf_to_fsp_flags(table->flags); + uint32_t sf = table->space->flags; + sf &= ~FSP_FLAGS_MEM_MASK; + tf &= ~FSP_FLAGS_MEM_MASK; + if (fil_space_t::is_flags_equal(tf, sf) + || fil_space_t::is_flags_equal(sf, tf)) { + mysql_mutex_lock(&fil_system.mutex); + table->space->flags = (table->space->flags + & ~FSP_FLAGS_MEM_MASK) + | (tf & FSP_FLAGS_MEM_MASK); + mysql_mutex_unlock(&fil_system.mutex); + err = DB_SUCCESS; + } else { + err = DB_CORRUPTION; + } + } + } else { + err = DB_SUCCESS; + } + + /* Check and adjust the file segment headers, if all OK so far. */ + if (err == DB_SUCCESS + && (!btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + page, page_zip, table->space_id) + || !btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + page, page_zip, table->space_id))) { + + err = DB_CORRUPTION; + } + +func_exit: + mtr_commit(&mtr); + + return(err); +} + +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). 
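+
+A typical allocate-then-create sequence during a page split, in outline
+(simplified sketch of what btr_root_raise_and_insert() does below):
+
+  dberr_t err;
+  buf_block_t* new_block= btr_page_alloc(index, 0, FSP_NO_DIR, level,
+                                         mtr, mtr, &err);
+  if (new_block)
+    btr_page_create(new_block, buf_block_get_page_zip(new_block), index,
+                    level, mtr);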
*/
+void
+btr_page_create(
+/*============*/
+	buf_block_t*	block,	/*!< in/out: page to be created */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: the B-tree level of the page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+  byte *index_id= my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID +
+                                       block->page.frame);
+
+  if (UNIV_LIKELY_NULL(page_zip))
+  {
+    mach_write_to_8(index_id, index->id);
+    page_create_zip(block, index, level, 0, mtr);
+  }
+  else
+  {
+    page_create(block, mtr, dict_table_is_comp(index->table));
+    if (index->is_spatial())
+    {
+      static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+                    FIL_PAGE_RTREE, "compatibility");
+      mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
+                    byte(FIL_PAGE_RTREE));
+      if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM))
+        mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0);
+    }
+    /* Set the level of the new index page */
+    mtr->write<2,mtr_t::MAYBE_NOP>(*block,
+                                   my_assume_aligned<2>(PAGE_HEADER +
+                                                        PAGE_LEVEL +
+                                                        block->page.frame),
+                                   level);
+    mtr->write<8,mtr_t::MAYBE_NOP>(*block, index_id, index->id);
+  }
+}
+
+buf_block_t *
+mtr_t::get_already_latched(const page_id_t id, mtr_memo_type_t type) const
+{
+  ut_ad(is_active());
+  ut_ad(type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX ||
+        type == MTR_MEMO_PAGE_S_FIX);
+  for (ulint i= 0; i < m_memo.size(); i++)
+  {
+    const mtr_memo_slot_t &slot= m_memo[i];
+    const auto slot_type= mtr_memo_type_t(slot.type & ~MTR_MEMO_MODIFY);
+    if (slot_type == MTR_MEMO_PAGE_X_FIX || slot_type == type)
+    {
+      buf_block_t *block= static_cast<buf_block_t*>(slot.object);
+      if (block->page.id() == id)
+        return block;
+    }
+  }
+  return nullptr;
+}
+
+/** Fetch an index root page that was already latched in the
+mini-transaction. */
+static buf_block_t *btr_get_latched_root(const dict_index_t &index, mtr_t *mtr)
+{
+  return mtr->get_already_latched(page_id_t{index.table->space_id, index.page},
+                                  MTR_MEMO_PAGE_SX_FIX);
+}
+
+/** Fetch an index page that should have been already latched in the
+mini-transaction. */
+static buf_block_t *
+btr_block_reget(mtr_t *mtr, const dict_index_t &index,
+                const page_id_t id, dberr_t *err)
+{
+  if (buf_block_t *block= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX))
+  {
+    *err= DB_SUCCESS;
+    return block;
+  }
+
+  ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK));
+  return btr_block_get(index, id.page_no(), RW_X_LATCH, true, mtr, err);
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an ibuf tree. Takes the page from
+the free list of the tree, which must contain pages!
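+In outline, the first free page number is read from the list base node
+in the latched root page (a sketch of what the function body does):
+
+  uint32_t first= mach_read_from_4(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST +
+                                   FLST_FIRST + FIL_ADDR_PAGE +
+                                   root->page.frame);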
+@return new allocated block, x-latched */ +static +buf_block_t* +btr_page_alloc_for_ibuf( +/*====================*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + buf_block_t *root= btr_get_latched_root(*index, mtr); + if (UNIV_UNLIKELY(!root)) + return root; + buf_block_t *new_block= + buf_page_get_gen(page_id_t(IBUF_SPACE_ID, + mach_read_from_4(PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST + + FLST_FIRST + FIL_ADDR_PAGE + + root->page.frame)), + 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err); + if (new_block) + *err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, new_block, + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + ut_d(if (*err == DB_SUCCESS) + flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); + return new_block; +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +buf_block_t* +btr_page_alloc_low( +/*===============*/ + dict_index_t* index, /*!< in: index */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr, /*!< in/out: mtr or another + mini-transaction in which the + page should be initialized. */ + dberr_t* err) /*!< out: error code */ +{ + const auto savepoint= mtr->get_savepoint(); + buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, err); + if (UNIV_UNLIKELY(!root)) + return root; + + const bool have_latch= mtr->have_u_or_x_latch(*root); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!have_latch || !root->index || !root->index->freed()); +#endif + mtr->rollback_to_savepoint(savepoint); + + if (!have_latch && + UNIV_UNLIKELY(!(root= btr_root_block_get(index, RW_SX_LATCH, mtr, err)))) + return root; + + fseg_header_t *seg_header= root->page.frame + + (level ? PAGE_HEADER + PAGE_BTR_SEG_TOP : PAGE_HEADER + PAGE_BTR_SEG_LEAF); + return fseg_alloc_free_page_general(seg_header, hint_page_no, file_direction, + true, mtr, init_mtr, err); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr, /*!< in/out: mini-transaction + for x-latching and initializing + the page */ + dberr_t* err) /*!< out: error code */ +{ + ut_ad(level < BTR_MAX_NODE_LEVEL); + return index->is_ibuf() + ? btr_page_alloc_for_ibuf(index, mtr, err) + : btr_page_alloc_low(index, hint_page_no, file_direction, level, + mtr, init_mtr, err); +} + +/**************************************************************//** +Frees a page used in an ibuf tree. Puts the page to the free list of the +ibuf tree. 
*/ +static +dberr_t +btr_page_free_for_ibuf( +/*===================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + buf_block_t *root= btr_get_latched_root(*index, mtr); + dberr_t err= + flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + ut_d(if (err == DB_SUCCESS) + flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); + return err; +} + +/** Free an index page. +@param[in,out] index index tree +@param[in,out] block block to be freed +@param[in,out] mtr mini-transaction +@param[in] blob whether this is freeing a BLOB page +@param[in] latched whether index->table->space->x_lock() was called +@return error code */ +dberr_t btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, + bool blob, bool space_latched) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +#if defined BTR_CUR_HASH_ADAPT && defined UNIV_DEBUG + if (btr_search_check_marked_free_index(block)) + { + ut_ad(!blob); + ut_ad(page_is_leaf(block->page.frame)); + } +#endif + const uint32_t page{block->page.id().page_no()}; + ut_ad(index->table->space_id == block->page.id().space()); + /* The root page is freed by btr_free_root(). */ + ut_ad(page != index->page); + ut_ad(mtr->is_named_space(index->table->space)); + + /* The page gets invalid for optimistic searches: increment the frame + modify clock */ + buf_block_modify_clock_inc(block); + + /* TODO: Discard any operations for block from mtr->m_log. + The page will be freed, so previous changes to it by this + mini-transaction should not matter. */ + + if (index->is_ibuf()) + return btr_page_free_for_ibuf(index, block, mtr); + + fil_space_t *space= index->table->space; + dberr_t err; + + const auto savepoint= mtr->get_savepoint(); + if (buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, &err)) + { + const bool have_latch= mtr->have_u_or_x_latch(*root); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!have_latch || !root->index || !root->index->freed()); +#endif + mtr->rollback_to_savepoint(savepoint); + if (have_latch || + (root= btr_root_block_get(index, RW_SX_LATCH, mtr, &err))) + err= fseg_free_page(&root->page.frame[blob || + page_is_leaf(block->page.frame) + ? PAGE_HEADER + PAGE_BTR_SEG_LEAF + : PAGE_HEADER + PAGE_BTR_SEG_TOP], + space, page, mtr, space_latched); + } + if (err == DB_SUCCESS) + buf_page_free(space, page, mtr); + + /* The page was marked free in the allocation bitmap, but it + should remain exclusively latched until mtr_t::commit() or until it + is explicitly freed from the mini-transaction. */ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + return err; +} + +/** Set the child page number in a node pointer record. +@param[in,out] block non-leaf index page +@param[in,out] rec node pointer record in the page +@param[in] offsets rec_get_offsets(rec) +@param[in] page_no child page number +@param[in,out] mtr mini-transaction +Sets the child node file address in a node pointer. 
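+The child page number occupies the last REC_NODE_PTR_SIZE (4) bytes of
+the record payload; the assertions in the function body check this layout.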
*/ +inline void btr_node_ptr_set_child_page_no(buf_block_t *block, + rec_t *rec, const rec_offs *offsets, + ulint page_no, mtr_t *mtr) +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!page_rec_is_leaf(rec)); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); + + const ulint offs= rec_offs_data_size(offsets); + ut_ad(rec_offs_nth_size(offsets, rec_offs_n_fields(offsets) - 1) == + REC_NODE_PTR_SIZE); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) + page_zip_write_node_ptr(block, rec, offs, page_no, mtr); + else + mtr->write<4>(*block, rec + offs - REC_NODE_PTR_SIZE, page_no); +} + +MY_ATTRIBUTE((nonnull(1,2,3,4),warn_unused_result)) +/************************************************************//** +Returns the child page of a node pointer and sx-latches it. +@return child page, sx-latched */ +static +buf_block_t* +btr_node_ptr_get_child( +/*===================*/ + const rec_t* node_ptr,/*!< in: node pointer */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err = nullptr) /*!< out: error code */ +{ + ut_ad(rec_offs_validate(node_ptr, index, offsets)); + ut_ad(index->table->space_id + == page_get_space_id(page_align(node_ptr))); + + return btr_block_get( + *index, btr_node_ptr_get_child_page_no(node_ptr, offsets), + RW_SX_LATCH, btr_page_get_level(page_align(node_ptr)) == 1, + mtr, err); +} + +MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result)) +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an sx-latch on the tree. +@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +btr_page_get_father_node_ptr_for_validate( + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + btr_cur_t* cursor, /*!< in: cursor pointing to user record, + out: cursor on node pointer record, + its page x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no(); + dict_index_t* index = btr_cur_get_index(cursor); + ut_ad(!dict_index_is_spatial(index)); + ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); + ut_ad(dict_index_get_page(index) != page_no); + + const auto level = btr_page_get_level(btr_cur_get_page(cursor)); + + const rec_t* user_rec = btr_cur_get_rec(cursor); + ut_a(page_rec_is_user_rec(user_rec)); + + if (btr_cur_search_to_nth_level(level + 1, + dict_index_build_node_ptr(index, + user_rec, 0, + heap, level), + RW_S_LATCH, + cursor, mtr) != DB_SUCCESS) { + return nullptr; + } + + const rec_t* node_ptr = btr_cur_get_rec(cursor); + + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) { + offsets = nullptr; + } + + return(offsets); +} + +MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result)) +/** Return the node pointer to a page. 
+@param offsets work area for the return value +@param heap memory heap +@param cursor in: child page; out: node pointer to it +@param mtr mini-transaction +@return rec_get_offsets() of the node pointer record +@retval nullptr if the parent page had not been latched in mtr */ +static rec_offs *btr_page_get_parent(rec_offs *offsets, mem_heap_t *heap, + btr_cur_t *cursor, mtr_t *mtr) +{ + const uint32_t page_no= cursor->block()->page.id().page_no(); + const dict_index_t *index= cursor->index(); + ut_ad(!index->is_spatial()); + ut_ad(index->page != page_no); + + uint32_t p= index->page; + auto level= btr_page_get_level(cursor->block()->page.frame); + const dtuple_t *tuple= + dict_index_build_node_ptr(index, btr_cur_get_rec(cursor), 0, heap, level); + level++; + + ulint i; + for (i= 0; i < mtr->get_savepoint(); i++) + if (buf_block_t *block= mtr->block_at_savepoint(i)) + if (block->page.id().page_no() == p) + { + ut_ad(block->page.lock.have_u_or_x() || + (!block->page.lock.have_s() && index->lock.have_x())); + ulint up_match= 0, low_match= 0; + cursor->page_cur.block= block; + if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &up_match, + &low_match, &cursor->page_cur, + nullptr)) + return nullptr; + offsets= rec_get_offsets(cursor->page_cur.rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + p= btr_node_ptr_get_child_page_no(cursor->page_cur.rec, offsets); + if (p != page_no) + { + if (btr_page_get_level(block->page.frame) == level) + return nullptr; + i= 0; // MDEV-29835 FIXME: require all pages to be latched in order! + continue; + } + ut_ad(block->page.lock.have_u_or_x()); + if (block->page.lock.have_u_not_x()) + { + /* btr_cur_t::search_leaf(BTR_MODIFY_TREE) only U-latches the + root page initially. */ + ut_ad(block->page.id().page_no() == index->page); + block->page.lock.u_x_upgrade(); + mtr->page_lock_upgrade(*block); + } + return offsets; + } + + return nullptr; +} + +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an x-latch on the tree. +@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +btr_page_get_father_block( +/*======================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* cursor) /*!< out: cursor on node pointer record, + its page x-latched */ +{ + rec_t *rec= + page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame)); + if (UNIV_UNLIKELY(!rec)) + return nullptr; + cursor->page_cur.rec= rec; + return btr_page_get_parent(offsets, heap, cursor, mtr); +} + +/** Seek to the parent page of a B-tree page. +@param[in,out] mtr mini-transaction +@param[in,out] cursor cursor pointing to the x-latched parent page +@return whether the cursor was successfully positioned */ +bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor) +{ + rec_t *rec= + page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame)); + if (UNIV_UNLIKELY(!rec)) + return false; + cursor->page_cur.rec= rec; + mem_heap_t *heap= mem_heap_create(100); + const bool got= btr_page_get_parent(nullptr, heap, cursor, mtr); + mem_heap_free(heap); + return got; +} + +#ifdef UNIV_DEBUG +/** PAGE_INDEX_ID value for freed index B-trees */ +constexpr index_id_t BTR_FREED_INDEX_ID = 0; +#endif + +/** Free a B-tree root page. btr_free_but_not_root() must already +have been called. 
+@param block index root page +@param space tablespace +@param mtr mini-transaction */ +static void btr_free_root(buf_block_t *block, const fil_space_t &space, + mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->is_named_space(&space)); + + btr_search_drop_page_hash_index(block, false); + + if (btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP, *block, space)) + { + /* Free the entire segment in small steps. */ + ut_d(mtr->freeing_tree()); + while (!fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP + + block->page.frame, mtr)); + } +} + +MY_ATTRIBUTE((warn_unused_result)) +/** Prepare to free a B-tree. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] index_id PAGE_INDEX_ID contents +@param[in,out] mtr mini-transaction +@return root block, to invoke btr_free_but_not_root() and btr_free_root() +@retval NULL if the page is no longer a matching B-tree page */ +static +buf_block_t *btr_free_root_check(const page_id_t page_id, ulint zip_size, + index_id_t index_id, mtr_t *mtr) +{ + ut_ad(page_id.space() != SRV_TMP_SPACE_ID); + ut_ad(index_id != BTR_FREED_INDEX_ID); + + buf_block_t *block= buf_page_get_gen(page_id, zip_size, RW_X_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, mtr); + + if (!block); + else if (fil_page_index_page_check(block->page.frame) && + index_id == btr_page_get_index_id(block->page.frame)) + /* This should be a root page. It should not be possible to + reassign the same index_id for some other index in the + tablespace. */ + ut_ad(!page_has_siblings(block->page.frame)); + else + block= nullptr; + + return block; +} + +/** Initialize the root page of the b-tree +@param[in,out] block root block +@param[in] index_id index id +@param[in] index index of root page +@param[in,out] mtr mini-transaction */ +static void btr_root_page_init(buf_block_t *block, index_id_t index_id, + dict_index_t *index, mtr_t *mtr) +{ + constexpr uint16_t field= PAGE_HEADER + PAGE_INDEX_ID; + byte *page_index_id= my_assume_aligned<2>(field + block->page.frame); + + /* Create a new index page on the allocated segment page */ + if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + mach_write_to_8(page_index_id, index_id); + ut_ad(!page_has_siblings(block->page.zip.data)); + page_create_zip(block, index, 0, 0, mtr); + } + else + { + page_create(block, mtr, index && index->table->not_redundant()); + if (index && index->is_spatial()) + { + static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == + FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM)) + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0); + } + /* Set the level of the new index page */ + mtr->write<2,mtr_t::MAYBE_NOP>( + *block, PAGE_HEADER + PAGE_LEVEL + block->page.frame, 0U); + mtr->write<8,mtr_t::MAYBE_NOP>(*block, page_index_id, index_id); + } +} + +/** Create the root node for a new index tree. 
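+
+A minimal creation sketch (illustrative; assumes a started
+mini-transaction on a named tablespace):
+
+  dberr_t err;
+  uint32_t root_no= btr_create(DICT_CLUSTERED, space, index_id, index,
+                               &mtr, &err);
+  if (root_no == FIL_NULL) { /* out of space, or an I/O error */ }
+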
+@param[in] type type of the index +@param[in] index_id index id +@param[in,out] space tablespace where created +@param[in] index index, or NULL to create a system table +@param[in,out] mtr mini-transaction +@param[out] err error code +@return page number of the created root +@retval FIL_NULL if did not succeed */ +uint32_t +btr_create( + ulint type, + fil_space_t* space, + index_id_t index_id, + dict_index_t* index, + mtr_t* mtr, + dberr_t* err) +{ + buf_block_t* block; + + ut_ad(mtr->is_named_space(space)); + ut_ad(index_id != BTR_FREED_INDEX_ID); + ut_ad(index || space == fil_system.sys_space); + + /* Create the two new segments (one, in the case of an ibuf tree) for + the index tree; the segment headers are put on the allocated root page + (for an ibuf tree, not in the root, but on a separate ibuf header + page) */ + + if (UNIV_UNLIKELY(type & DICT_IBUF)) { + /* Allocate first the ibuf header page */ + buf_block_t* ibuf_hdr_block = fseg_create( + space, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr, err); + + if (ibuf_hdr_block == NULL) { + return(FIL_NULL); + } + + ut_ad(ibuf_hdr_block->page.id().page_no() + == IBUF_HEADER_PAGE_NO); + /* Allocate then the next page to the segment: it will be the + tree root page */ + + block = fseg_alloc_free_page_general( + buf_block_get_frame(ibuf_hdr_block) + + IBUF_HEADER + IBUF_TREE_SEG_HEADER, + IBUF_TREE_ROOT_PAGE_NO, + FSP_UP, false, mtr, mtr, err); + + if (block == NULL) { + return(FIL_NULL); + } + + ut_ad(block->page.id() == page_id_t(0,IBUF_TREE_ROOT_PAGE_NO)); + + flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr); + } else { + block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP, + mtr, err); + + if (block == NULL) { + return(FIL_NULL); + } + + if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr, + err, false, block)) { + /* Not enough space for new segment, free root + segment before return. */ + btr_free_root(block, *space, mtr); + return(FIL_NULL); + } + } + + ut_ad(!page_has_siblings(block->page.frame)); + + btr_root_page_init(block, index_id, index, mtr); + + /* We reset the free bits for the page in a separate + mini-transaction to allow creation of several trees in the + same mtr, otherwise the latch on a bitmap page would prevent + it because of the latching order. + + Note: Insert Buffering is disabled for temporary tables given that + most temporary tables are smaller in size and short-lived. */ + if (!(type & DICT_CLUSTERED) + && (!index || !index->table->is_temporary())) { + ibuf_reset_free_bits(block); + } + + /* In the following assertion we test that two records of maximum + allowed size fit on the root page: this fact is needed to ensure + correctness of split algorithms */ + + ut_ad(page_get_max_insert_size(block->page.frame, 2) + > 2 * BTR_PAGE_MAX_REC_SIZE); + + return(block->page.id().page_no()); +} + +/** Free a B-tree except the root page. The root page MUST be freed after +this by calling btr_free_root. 
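+The required call order, as btr_free_if_exists() below performs it:
+
+  btr_free_but_not_root(root, mtr->get_log_mode());
+  mtr->set_named_space(space);
+  btr_free_root(root, *space, mtr);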
+@param[in,out] block root page +@param[in] log_mode mtr logging mode */ +static +void +btr_free_but_not_root( + buf_block_t* block, + mtr_log_t log_mode +#ifdef BTR_CUR_HASH_ADAPT + ,bool ahi=false +#endif + ) +{ + mtr_t mtr; + + ut_ad(fil_page_index_page_check(block->page.frame)); + ut_ad(!page_has_siblings(block->page.frame)); +leaf_loop: + mtr_start(&mtr); + ut_d(mtr.freeing_tree()); + mtr_set_log_mode(&mtr, log_mode); + fil_space_t *space = mtr.set_named_space_id(block->page.id().space()); + + if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, + *block, *space) + || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *block, *space)) { + mtr_commit(&mtr); + return; + } + + /* NOTE: page hash indexes are dropped when a page is freed inside + fsp0fsp. */ + + bool finished = fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_LEAF + + block->page.frame, &mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ); + mtr_commit(&mtr); + + if (!finished) { + + goto leaf_loop; + } +top_loop: + mtr_start(&mtr); + mtr_set_log_mode(&mtr, log_mode); + space = mtr.set_named_space_id(block->page.id().space()); + + finished = !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *block, *space) + || fseg_free_step_not_header(PAGE_HEADER + PAGE_BTR_SEG_TOP + + block->page.frame, &mtr +#ifdef BTR_CUR_HASH_ADAPT + ,ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ); + mtr_commit(&mtr); + + if (!finished) { + goto top_loop; + } +} + +/** Clear the index tree and reinitialize the root page, in the +rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized. +@param thr query thread +@return error code */ +TRANSACTIONAL_TARGET +dberr_t dict_index_t::clear(que_thr_t *thr) +{ + mtr_t mtr; + mtr.start(); + if (table->is_temporary()) + mtr.set_log_mode(MTR_LOG_NO_REDO); + else + set_modified(mtr); + mtr_sx_lock_index(this, &mtr); + + dberr_t err; + if (buf_block_t *root_block= + buf_page_get_gen(page_id_t(table->space->id, page), + table->space->zip_size(), + RW_X_LATCH, nullptr, BUF_GET, &mtr, &err)) + { + btr_free_but_not_root(root_block, mtr.get_log_mode() +#ifdef BTR_CUR_HASH_ADAPT + ,n_ahi_pages() != 0 +#endif + ); + +#ifdef BTR_CUR_HASH_ADAPT + if (root_block->index) + btr_search_drop_page_hash_index(root_block, false); + ut_ad(n_ahi_pages() == 0); +#endif + mtr.memset(root_block, PAGE_HEADER + PAGE_BTR_SEG_LEAF, + FSEG_HEADER_SIZE, 0); + if (fseg_create(table->space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, &mtr, + &err, false, root_block)) + btr_root_page_init(root_block, id, this, &mtr); + } + + mtr.commit(); + return err; +} + +/** Free a persistent index tree if it exists. 
+@param[in,out]	space	tablespace
+@param[in]	page	root page number
+@param[in]	index_id	PAGE_INDEX_ID contents
+@param[in,out]	mtr	mini-transaction */
+void btr_free_if_exists(fil_space_t *space, uint32_t page,
+                        index_id_t index_id, mtr_t *mtr)
+{
+  if (buf_block_t *root= btr_free_root_check(page_id_t(space->id, page),
+                                             space->zip_size(),
+                                             index_id, mtr))
+  {
+    btr_free_but_not_root(root, mtr->get_log_mode());
+    mtr->set_named_space(space);
+    btr_free_root(root, *space, mtr);
+  }
+}
+
+/** Drop a temporary table
+@param table   temporary table */
+void btr_drop_temporary_table(const dict_table_t &table)
+{
+  ut_ad(table.is_temporary());
+  ut_ad(table.space == fil_system.temp_space);
+  mtr_t mtr;
+  mtr.start();
+  for (const dict_index_t *index= table.indexes.start; index;
+       index= dict_table_get_next_index(index))
+  {
+    if (buf_block_t *block= buf_page_get_low({SRV_TMP_SPACE_ID, index->page}, 0,
+                                             RW_X_LATCH, nullptr, BUF_GET, &mtr,
+                                             nullptr, false))
+    {
+      btr_free_but_not_root(block, MTR_LOG_NO_REDO);
+      mtr.set_log_mode(MTR_LOG_NO_REDO);
+      btr_free_root(block, *fil_system.temp_space, &mtr);
+      mtr.commit();
+      mtr.start();
+    }
+  }
+  mtr.commit();
+}
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
+@param[in,out]	index	clustered index
+@return the last used AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc(dict_index_t* index)
+{
+	ut_ad(index->is_primary());
+	ut_ad(index->table->persistent_autoinc);
+	ut_ad(!index->table->is_temporary());
+	mtr_t		mtr;
+	mtr.start();
+	ib_uint64_t	autoinc;
+	if (buf_block_t* block = buf_page_get(
+		    page_id_t(index->table->space_id, index->page),
+		    index->table->space->zip_size(),
+		    RW_S_LATCH, &mtr)) {
+		autoinc = page_get_autoinc(block->page.frame);
+	} else {
+		autoinc = 0;
+	}
+	mtr.commit();
+	return autoinc;
+}
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
+or fall back to MAX(auto_increment_column).
+@param[in]	table	table containing an AUTO_INCREMENT column
+@param[in]	col_no	index of the AUTO_INCREMENT column
+@return the AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
+{
+	ut_ad(table->persistent_autoinc);
+	ut_ad(!table->is_temporary());
+
+	dict_index_t*	index = dict_table_get_first_index(table);
+
+	if (index == NULL) {
+		return 0;
+	}
+
+	mtr_t	mtr;
+	mtr.start();
+	buf_block_t*	block = buf_page_get(
+		page_id_t(index->table->space_id, index->page),
+		index->table->space->zip_size(),
+		RW_S_LATCH, &mtr);
+
+	ib_uint64_t	autoinc = block
+		? page_get_autoinc(block->page.frame) : 0;
+	const bool	retry = block && autoinc == 0
+		&& !page_is_empty(block->page.frame);
+	mtr.commit();
+
+	if (retry) {
+		/* This should be an old data file where
+		PAGE_ROOT_AUTO_INC was initialized to 0.
+		Fall back to reading MAX(autoinc_col).
+		There should be an index on it. */
+		const dict_col_t*	autoinc_col
+			= dict_table_get_nth_col(table, col_no);
+		while (index && index->fields[0].col != autoinc_col) {
+			index = dict_table_get_next_index(index);
+		}
+
+		if (index) {
+			autoinc = row_search_max_autoinc(index);
+		}
+	}
+
+	return autoinc;
+}
+
+/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
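+
+A minimal write/read round trip with btr_read_autoinc() above
+(illustrative; assumes a clustered index with persistent_autoinc):
+
+  btr_write_autoinc(index, 42, false);
+  ib_uint64_t v= btr_read_autoinc(index); /* yields 42 unless a larger
+                                          value was already stored */
+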
+@param[in,out]	index	clustered index
+@param[in]	autoinc	the AUTO_INCREMENT value
+@param[in]	reset	whether to reset the AUTO_INCREMENT
+			to a possibly smaller value than currently
+			exists in the page */
+void
+btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset)
+{
+  ut_ad(index->is_primary());
+  ut_ad(index->table->persistent_autoinc);
+  ut_ad(!index->table->is_temporary());
+
+  mtr_t mtr;
+  mtr.start();
+  fil_space_t *space= index->table->space;
+  if (buf_block_t *root= buf_page_get(page_id_t(space->id, index->page),
+                                      space->zip_size(), RW_SX_LATCH, &mtr))
+  {
+    mtr.set_named_space(space);
+    page_set_autoinc(root, autoinc, &mtr, reset);
+  }
+
+  mtr.commit();
+}
+
+/** Reorganize an index page.
+@param cursor	index page cursor
+@param mtr	mini-transaction */
+static dberr_t btr_page_reorganize_low(page_cur_t *cursor, mtr_t *mtr)
+{
+  buf_block_t *const block= cursor->block;
+
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+  ut_ad(!is_buf_block_get_page_zip(block));
+  ut_ad(fil_page_index_page_check(block->page.frame));
+  ut_ad(cursor->index->is_dummy ||
+        block->page.id().space() == cursor->index->table->space->id);
+  ut_ad(cursor->index->is_dummy ||
+        block->page.id().page_no() != cursor->index->page ||
+        !page_has_siblings(block->page.frame));
+
+  /* Save the cursor position. */
+  const ulint pos= page_rec_get_n_recs_before(cursor->rec);
+
+  if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED))
+    return DB_CORRUPTION;
+
+  btr_search_drop_page_hash_index(block, false);
+
+  buf_block_t *old= buf_block_alloc();
+  /* Copy the old page to temporary space */
+  memcpy_aligned<UNIV_PAGE_SIZE_MIN>(old->page.frame, block->page.frame,
+                                     srv_page_size);
+
+  const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NO_REDO);
+
+  page_create(block, mtr, cursor->index->table->not_redundant());
+  if (cursor->index->is_spatial())
+    block->page.frame[FIL_PAGE_TYPE + 1]= byte(FIL_PAGE_RTREE);
+
+  static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+                FIL_PAGE_RTREE, "compatibility");
+
+  /* Copy the records from the temporary space to the recreated page;
+  do not copy the lock bits yet */
+
+  dberr_t err=
+    page_copy_rec_list_end_no_locks(block, old,
+                                    page_get_infimum_rec(old->page.frame),
+                                    cursor->index, mtr);
+  mtr->set_log_mode(log_mode);
+
+  if (UNIV_UNLIKELY(err != DB_SUCCESS))
+    return err;
+
+  /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
+  ut_ad(!page_get_max_trx_id(block->page.frame));
+  memcpy_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->page.frame,
+                    PAGE_MAX_TRX_ID + PAGE_HEADER + old->page.frame, 8);
+#ifdef UNIV_DEBUG
+  if (page_get_max_trx_id(block->page.frame))
+    /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
+    clustered index root pages. */
+    ut_ad(dict_index_is_sec_or_ibuf(cursor->index)
+          ? page_is_leaf(block->page.frame)
+          : block->page.id().page_no() == cursor->index->page);
+  else
+    /* PAGE_MAX_TRX_ID is unused in clustered index pages (other than
+    the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf
+    pages, and in temporary tables. It was always zero-initialized in
+    page_create(). PAGE_MAX_TRX_ID must be nonzero on
+    dict_index_is_sec_or_ibuf() leaf pages.
*/ + ut_ad(cursor->index->table->is_temporary() || + !page_is_leaf(block->page.frame) || + !dict_index_is_sec_or_ibuf(cursor->index)); +#endif + + const uint16_t data_size1= page_get_data_size(old->page.frame); + const uint16_t data_size2= page_get_data_size(block->page.frame); + const ulint max1= + page_get_max_insert_size_after_reorganize(old->page.frame, 1); + const ulint max2= + page_get_max_insert_size_after_reorganize(block->page.frame, 1); + + if (UNIV_UNLIKELY(data_size1 != data_size2 || max1 != max2)) + { + sql_print_error("InnoDB: Page old data size %u new data size %u" + ", page old max ins size %zu new max ins size %zu", + data_size1, data_size2, max1, max2); + return DB_CORRUPTION; + } + + /* Restore the cursor position. */ + if (!pos) + ut_ad(cursor->rec == page_get_infimum_rec(block->page.frame)); + else if (!(cursor->rec= page_rec_get_nth(block->page.frame, pos))) + return DB_CORRUPTION; + + if (block->page.id().page_no() != cursor->index->page || + fil_page_get_type(old->page.frame) != FIL_PAGE_TYPE_INSTANT) + ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER)); + else if (!cursor->index->is_instant()) + { + ut_ad(!memcmp(old->page.frame, block->page.frame, FIL_PAGE_TYPE)); + ut_ad(!memcmp(old->page.frame + FIL_PAGE_TYPE + 2, + block->page.frame + FIL_PAGE_TYPE + 2, + PAGE_HEADER - FIL_PAGE_TYPE - 2)); + mtr->write<2,mtr_t::FORCED>(*block, FIL_PAGE_TYPE + block->page.frame, + FIL_PAGE_INDEX); + } + else + { + /* Preserve the PAGE_INSTANT information. */ + memcpy_aligned<2>(FIL_PAGE_TYPE + block->page.frame, + FIL_PAGE_TYPE + old->page.frame, 2); + memcpy_aligned<2>(PAGE_HEADER + PAGE_INSTANT + block->page.frame, + PAGE_HEADER + PAGE_INSTANT + old->page.frame, 2); + if (!cursor->index->table->instant); + else if (page_is_comp(block->page.frame)) + { + memcpy(PAGE_NEW_INFIMUM + block->page.frame, + PAGE_NEW_INFIMUM + old->page.frame, 8); + memcpy(PAGE_NEW_SUPREMUM + block->page.frame, + PAGE_NEW_SUPREMUM + old->page.frame, 8); + } + else + { + memcpy(PAGE_OLD_INFIMUM + block->page.frame, + PAGE_OLD_INFIMUM + old->page.frame, 8); + memcpy(PAGE_OLD_SUPREMUM + block->page.frame, + PAGE_OLD_SUPREMUM + old->page.frame, 8); + } + + ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER)); + } + + ut_ad(!memcmp(old->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER, + block->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER, + PAGE_DATA - (PAGE_MAX_TRX_ID + PAGE_HEADER))); + + if (!cursor->index->has_locking()); + else if (cursor->index->page == FIL_NULL) + ut_ad(cursor->index->is_dummy); + else + lock_move_reorganize_page(block, old); + + /* Write log for the changes, if needed. */ + if (log_mode == MTR_LOG_ALL) + { + /* Check and log the changes in the page header. */ + ulint a, e; + for (a= PAGE_HEADER, e= PAGE_MAX_TRX_ID + PAGE_HEADER; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* Write log for the changed page header fields. */ + mtr->memcpy(*block, a, e - a); + break; + } + + const uint16_t top= page_header_get_offs(block->page.frame, PAGE_HEAP_TOP); + + if (page_is_comp(block->page.frame)) + { + /* info_bits=0, n_owned=1, heap_no=0, status */ + ut_ad(!memcmp(PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + + block->page.frame, + PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + + old->page.frame, 3)); + /* If the 'next' pointer of the infimum record has changed, log it. 
*/ + a= PAGE_NEW_INFIMUM - 2; + e= a + 2; + if (block->page.frame[a] == old->page.frame[a]) + a++; + if (--e, block->page.frame[e] != old->page.frame[e]) + e++; + if (ulint len= e - a) + mtr->memcpy(*block, a, len); + /* The infimum record itself must not change. */ + ut_ad(!memcmp(PAGE_NEW_INFIMUM + block->page.frame, + PAGE_NEW_INFIMUM + old->page.frame, 8)); + /* Log any change of the n_owned of the supremum record. */ + a= PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES; + if (block->page.frame[a] != old->page.frame[a]) + mtr->memcpy(*block, a, 1); + /* The rest of the supremum record must not change. */ + ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1], + PAGE_NEW_SUPREMUM_END - PAGE_NEW_SUPREMUM + + REC_N_NEW_EXTRA_BYTES - 1)); + + /* Log the differences in the payload. */ + for (a= PAGE_NEW_SUPREMUM_END, e= top; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* TODO: write MEMMOVE records to minimize this further! */ + mtr->memcpy(*block, a, e - a); + break; + } + } + else + { + /* info_bits=0, n_owned=1, heap_no=0, number of fields, 1-byte format */ + ut_ad(!memcmp(PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + + block->page.frame, + PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + + old->page.frame, 4)); + /* If the 'next' pointer of the infimum record has changed, log it. */ + a= PAGE_OLD_INFIMUM - 2; + e= a + 2; + if (block->page.frame[a] == old->page.frame[a]) + a++; + if (--e, block->page.frame[e] != old->page.frame[e]) + e++; + if (ulint len= e - a) + mtr->memcpy(*block, a, len); + /* The infimum record itself must not change. */ + ut_ad(!memcmp(PAGE_OLD_INFIMUM + block->page.frame, + PAGE_OLD_INFIMUM + old->page.frame, 8)); + /* Log any change of the n_owned of the supremum record. */ + a= PAGE_OLD_SUPREMUM - REC_N_OLD_EXTRA_BYTES; + if (block->page.frame[a] != old->page.frame[a]) + mtr->memcpy(*block, a, 1); + ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1], + PAGE_OLD_SUPREMUM_END - PAGE_OLD_SUPREMUM + + REC_N_OLD_EXTRA_BYTES - 1)); + + /* Log the differences in the payload. */ + for (a= PAGE_OLD_SUPREMUM_END, e= top; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* TODO: write MEMMOVE records to minimize this further! */ + mtr->memcpy(*block, a, e - a); + break; + } + } + + e= srv_page_size - PAGE_DIR; + a= e - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(block->page.frame); + + /* Zero out the payload area. */ + mtr->memset(*block, top, a - top, 0); + + /* Log changes to the page directory. */ + for (; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* Write log for the changed page directory slots. */ + mtr->memcpy(*block, a, e - a); + break; + } + } + + buf_block_free(old); + + MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS); + MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL); + return DB_SUCCESS; +} + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. 
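+
+A minimal caller pattern honoring this (illustrative; same
+mini-transaction, as required above):
+
+  if (btr_page_reorganize_block(page_zip_level, block, index, mtr)
+      == DB_SUCCESS)
+    ibuf_reset_free_bits(block);	/* compressed secondary index leaf */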
+ +@return error code +@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */ +dberr_t +btr_page_reorganize_block( + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + buf_block_t* block, /*!< in/out: B-tree page */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (buf_block_get_page_zip(block)) + return page_zip_reorganize(block, index, z_level, mtr, true); + page_cur_t cur; + page_cur_set_before_first(block, &cur); + cur.index= index; + return btr_page_reorganize_low(&cur, mtr); +} + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@param cursor page cursor +@param mtr mini-transaction +@return error code +@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */ +dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr) +{ + if (!buf_block_get_page_zip(cursor->block)) + return btr_page_reorganize_low(cursor, mtr); + + ulint pos= page_rec_get_n_recs_before(cursor->rec); + if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED)) + return DB_CORRUPTION; + + dberr_t err= page_zip_reorganize(cursor->block, cursor->index, + page_zip_level, mtr, true); + if (err == DB_FAIL); + else if (!pos) + ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->page.frame)); + else if (!(cursor->rec= page_rec_get_nth(cursor->block->page.frame, pos))) + err= DB_CORRUPTION; + + return err; +} + +/** Empty an index page (possibly the root page). @see btr_page_create(). +@param[in,out] block page to be emptied +@param[in,out] page_zip compressed page frame, or NULL +@param[in] index index of the page +@param[in] level B-tree level of the page (0=leaf) +@param[in,out] mtr mini-transaction */ +void +btr_page_empty( + buf_block_t* block, + page_zip_des_t* page_zip, + dict_index_t* index, + ulint level, + mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_zip == buf_block_get_page_zip(block)); + ut_ad(!index->is_dummy); + ut_ad(index->table->space->id == block->page.id().space()); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + btr_search_drop_page_hash_index(block, false); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + /* Preserve PAGE_ROOT_AUTO_INC when creating a clustered index + root page. */ + const ib_uint64_t autoinc + = dict_index_is_clust(index) + && index->page == block->page.id().page_no() + ? 
page_get_autoinc(block->page.frame)
+		: 0;
+
+	if (page_zip) {
+		page_create_zip(block, index, level, autoinc, mtr);
+	} else {
+		page_create(block, mtr, index->table->not_redundant());
+		if (index->is_spatial()) {
+			static_assert(((FIL_PAGE_INDEX & 0xff00)
+				       | byte(FIL_PAGE_RTREE))
+				      == FIL_PAGE_RTREE, "compatibility");
+			mtr->write<1>(*block, FIL_PAGE_TYPE + 1
+				      + block->page.frame,
+				      byte(FIL_PAGE_RTREE));
+			if (mach_read_from_8(block->page.frame
+					     + FIL_RTREE_SPLIT_SEQ_NUM)) {
+				mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
+					    8, 0);
+			}
+		}
+		mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL
+					       + block->page.frame, level);
+		if (autoinc) {
+			mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID
+				      + block->page.frame, autoinc);
+		}
+	}
+}
+
+/** Write instant ALTER TABLE metadata to a root page.
+@param[in,out]	root	clustered index root page
+@param[in]	index	clustered index with instant ALTER TABLE
+@param[in,out]	mtr	mini-transaction */
+void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr)
+{
+	ut_ad(index.n_core_fields > 0);
+	ut_ad(index.n_core_fields < REC_MAX_N_FIELDS);
+	ut_ad(index.is_instant());
+	ut_ad(fil_page_get_type(root->page.frame) == FIL_PAGE_TYPE_INSTANT
+	      || fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX);
+	ut_ad(!page_has_siblings(root->page.frame));
+	ut_ad(root->page.id().page_no() == index.page);
+
+	rec_t* infimum = page_get_infimum_rec(root->page.frame);
+	rec_t* supremum = page_get_supremum_rec(root->page.frame);
+	byte* page_type = root->page.frame + FIL_PAGE_TYPE;
+	uint16_t i = page_header_get_field(root->page.frame, PAGE_INSTANT);
+
+	switch (mach_read_from_2(page_type)) {
+	case FIL_PAGE_TYPE_INSTANT:
+		ut_ad(page_get_instant(root->page.frame)
+		      == index.n_core_fields);
+		if (memcmp(infimum, "infimum", 8)
+		    || memcmp(supremum, "supremum", 8)) {
+			ut_ad(index.table->instant);
+			ut_ad(!memcmp(infimum, field_ref_zero, 8));
+			ut_ad(!memcmp(supremum, field_ref_zero, 7));
+			/* The n_core_null_bytes only matters for
+			ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. */
+			ut_ad(supremum[7] == index.n_core_null_bytes
+			      || !index.table->not_redundant());
+			return;
+		}
+		break;
+	default:
+		ut_ad("wrong page type" == 0);
+		/* fall through */
+	case FIL_PAGE_INDEX:
+		ut_ad(!page_is_comp(root->page.frame)
+		      || !page_get_instant(root->page.frame));
+		ut_ad(!memcmp(infimum, "infimum", 8));
+		ut_ad(!memcmp(supremum, "supremum", 8));
+		mtr->write<2>(*root, page_type, FIL_PAGE_TYPE_INSTANT);
+		ut_ad(i <= PAGE_NO_DIRECTION);
+		i |= static_cast<uint16_t>(index.n_core_fields << 3);
+		mtr->write<2>(*root, PAGE_HEADER + PAGE_INSTANT
+			      + root->page.frame, i);
+		break;
+	}
+
+	if (index.table->instant) {
+		mtr->memset(root, infimum - root->page.frame, 8, 0);
+		mtr->memset(root, supremum - root->page.frame, 7, 0);
+		mtr->write<1,mtr_t::MAYBE_NOP>(*root, &supremum[7],
+					       index.n_core_null_bytes);
+	}
+}
+
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in] index clustered index with instant ALTER TABLE +@param[in] all whether to reset FIL_PAGE_TYPE as well +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD +void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr) +{ + ut_ad(!index.table->is_temporary()); + ut_ad(index.is_primary()); + buf_block_t *root= btr_get_latched_root(index, mtr); + byte *page_type= root->page.frame + FIL_PAGE_TYPE; + if (all) + { + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT || + mach_read_from_2(page_type) == FIL_PAGE_INDEX); + mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX); + byte *instant= PAGE_INSTANT + PAGE_HEADER + root->page.frame; + mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant, + page_ptr_get_direction(instant + 1)); + } + else + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT); + static const byte supremuminfimum[8 + 8] = "supremuminfimum"; + uint16_t infimum, supremum; + if (page_is_comp(root->page.frame)) + { + infimum= PAGE_NEW_INFIMUM; + supremum= PAGE_NEW_SUPREMUM; + } + else + { + infimum= PAGE_OLD_INFIMUM; + supremum= PAGE_OLD_SUPREMUM; + } + ut_ad(!memcmp(&root->page.frame[infimum], supremuminfimum + 8, 8) == + !memcmp(&root->page.frame[supremum], supremuminfimum, 8)); + mtr->memcpy(*root, &root->page.frame[infimum], + supremuminfimum + 8, 8); + mtr->memcpy(*root, &root->page.frame[supremum], + supremuminfimum, 8); +} + +/*************************************************************//** +Makes tree one level higher by splitting the root, and inserts +the tuple. It is assumed that mtr contains an x-latch on the tree. +NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called. 
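+
+The shape of the transformation, in outline (old root R, new page N):
+
+  before:  R[a b c ...]         after:  R[ptr(N)]
+                                             |
+                                        N[a b c ...]   (N is then split)
+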
+@return inserted record */ +rec_t* +btr_root_raise_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + dict_index_t* index; + rec_t* rec; + dtuple_t* node_ptr; + ulint level; + rec_t* node_ptr_rec; + page_cur_t* page_cursor; + page_zip_des_t* root_page_zip; + page_zip_des_t* new_page_zip; + buf_block_t* root; + buf_block_t* new_block; + + root = btr_cur_get_block(cursor); + root_page_zip = buf_block_get_page_zip(root); + ut_ad(!page_is_empty(root->page.frame)); + index = btr_cur_get_index(cursor); + ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!root_page_zip + || page_zip_validate(root_page_zip, root->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + const page_id_t root_id{root->page.id()}; + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(root, MTR_MEMO_PAGE_X_FIX)); + + if (index->page != root_id.page_no()) { + ut_ad("corrupted root page number" == 0); + return nullptr; + } + + if (index->is_ibuf()) { + } else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, + *root, *index->table->space) + || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *root, *index->table->space)) { + return nullptr; + } + + /* Allocate a new page to the tree. Root splitting is done by first + moving the root records to the new page, emptying the root, putting + a node pointer to the new page, and then splitting the new page. */ + + level = btr_page_get_level(root->page.frame); + + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr, err); + + if (!new_block) { + return nullptr; + } + + new_page_zip = buf_block_get_page_zip(new_block); + ut_a(!new_page_zip == !root_page_zip); + ut_a(!new_page_zip + || page_zip_get_size(new_page_zip) + == page_zip_get_size(root_page_zip)); + + btr_page_create(new_block, new_page_zip, index, level, mtr); + if (page_has_siblings(new_block->page.frame)) { + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); + memset_aligned<8>(new_block->page.frame + FIL_PAGE_PREV, + 0xff, 8); + mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff); + if (UNIV_LIKELY_NULL(new_page_zip)) { + memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV, + 0xff, 8); + } + } + + /* Copy the records from root to the new page one by one. */ + if (0 +#ifdef UNIV_ZIP_COPY + || new_page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_copy_rec_list_end(new_block, root, + page_get_infimum_rec(root->page.frame), + index, mtr, err)) { + switch (*err) { + case DB_SUCCESS: + break; + case DB_FAIL: + *err = DB_SUCCESS; + break; + default: + return nullptr; + } + + ut_a(new_page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(new_block, root_page_zip, + root->page.frame, index, mtr); + + /* Update the lock table and possible hash index. 
*/ + if (index->has_locking()) { + lock_move_rec_list_end( + new_block, root, + page_get_infimum_rec(root->page.frame)); + } + + /* Move any existing predicate locks */ + if (dict_index_is_spatial(index)) { + lock_prdt_rec_move(new_block, root_id); + } else { + btr_search_move_or_delete_hash_entries( + new_block, root); + } + } + + constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID; + if (dict_index_is_sec_or_ibuf(index)) { + /* In secondary indexes and the change buffer, + PAGE_MAX_TRX_ID can be reset on the root page, because + the field only matters on leaf pages, and the root no + longer is a leaf page. (Older versions of InnoDB did + set PAGE_MAX_TRX_ID on all secondary index pages.) */ + byte* p = my_assume_aligned<8>( + PAGE_HEADER + PAGE_MAX_TRX_ID + root->page.frame); + if (mach_read_from_8(p)) { + mtr->memset(root, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(root->page.zip.data)) { + memset_aligned<8>(max_trx_id + + root->page.zip.data, 0, 8); + } + } + } else { + /* PAGE_ROOT_AUTO_INC is only present in the clustered index + root page; on other clustered index pages, we want to reserve + the field PAGE_MAX_TRX_ID for future use. */ + byte* p = my_assume_aligned<8>( + PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->page.frame); + if (mach_read_from_8(p)) { + mtr->memset(new_block, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { + memset_aligned<8>(max_trx_id + + new_block->page.zip.data, + 0, 8); + } + } + } + + /* If this is a pessimistic insert which is actually done to + perform a pessimistic update then we have stored the lock + information of the record to be inserted on the infimum of the + root page: we cannot discard the lock structs on the root page */ + + if (index->has_locking()) { + lock_update_root_raise(*new_block, root_id); + } + + /* Create a memory heap where the node pointer is stored */ + if (!*heap) { + *heap = mem_heap_create(1000); + } + + const uint32_t new_page_no = new_block->page.id().page_no(); + rec = page_rec_get_next(page_get_infimum_rec(new_block->page.frame)); + ut_ad(rec); /* We just created the page. */ + + /* Build the node pointer (= node key and page address) for the + child */ + if (dict_index_is_spatial(index)) { + rtr_mbr_t new_mbr; + + rtr_page_cal_mbr(index, new_block, &new_mbr, *heap); + node_ptr = rtr_index_build_node_ptr( + index, &new_mbr, rec, new_page_no, *heap); + } else { + node_ptr = dict_index_build_node_ptr( + index, rec, new_page_no, *heap, level); + } + /* The node pointer must be marked as the predefined minimum record, + as there is no lower alphabetical limit to records in the leftmost + node of a level: */ + dtuple_set_info_bits(node_ptr, + dtuple_get_info_bits(node_ptr) + | REC_INFO_MIN_REC_FLAG); + + /* Rebuild the root page to get free space */ + btr_page_empty(root, root_page_zip, index, level + 1, mtr); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(root->page.frame)); + + if (index->is_instant()) { + ut_ad(!root_page_zip); + btr_set_instant(root, *index, mtr); + } + + ut_ad(!page_has_siblings(root->page.frame)); + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Insert node pointer to the root */ + + page_cur_set_before_first(root, page_cursor); + + node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, + offsets, heap, 0, mtr); + + /* The root page should only contain the node pointer + to new_block at this point. Thus, the data should fit. 
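+	(The root was just emptied by btr_page_empty(), and a node pointer
+	record consists only of a key prefix and a child page number, so a
+	single insert into the empty root is expected to succeed.)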
	*/
+	ut_a(node_ptr_rec);
+
+	/* We play it safe and reset the free bits for the new page */
+
+	if (!dict_index_is_clust(index)
+	    && !index->table->is_temporary()) {
+		ibuf_reset_free_bits(new_block);
+	}
+
+	page_cursor->block = new_block;
+	page_cursor->index = index;
+
+	ut_ad(dtuple_check_typed(tuple));
+	/* Reposition the cursor to the child node */
+	ulint low_match = 0, up_match = 0;
+
+	if (page_cur_search_with_match(tuple, PAGE_CUR_LE,
+				       &up_match, &low_match,
+				       page_cursor, nullptr)) {
+		*err = DB_CORRUPTION;
+		return nullptr;
+	}
+
+	/* Split the child and insert tuple */
+	return btr_page_split_and_insert(flags, cursor, offsets, heap,
+					 tuple, n_ext, mtr, err);
+}
+
+/** Decide if the page should be split at the convergence point of inserts
+converging to the left.
+@param[in]	cursor	insert position
+@return the first record to be moved to the right half page
+@retval	NULL if no split is recommended */
+rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor)
+{
+	rec_t* split_rec = btr_cur_get_rec(cursor);
+	const page_t* page = page_align(split_rec);
+
+	if (page_header_get_ptr(page, PAGE_LAST_INSERT)
+	    != page_rec_get_next(split_rec)) {
+		return NULL;
+	}
+
+	/* The metadata record must be present in the leftmost leaf page
+	of the clustered index, if and only if index->is_instant().
+	However, during innobase_instant_try(), index->is_instant()
+	would already hold when row_ins_clust_index_entry_low()
+	is being invoked to insert the metadata record.
+	So, we can only assert that when the metadata record exists,
+	index->is_instant() must hold. */
+	ut_ad(!page_is_leaf(page) || page_has_prev(page)
+	      || cursor->index()->is_instant()
+	      || !(rec_get_info_bits(page_rec_get_next_const(
+					     page_get_infimum_rec(page)),
+				     cursor->index()->table->not_redundant())
+		   & REC_INFO_MIN_REC_FLAG));
+
+	const rec_t* infimum = page_get_infimum_rec(page);
+
+	/* If the convergence is in the middle of a page, also include
+	the record immediately before the new insert in the upper
+	page. Otherwise, we could repeatedly move lots of records that
+	are smaller than the convergence point from page to page. */
+
+	if (split_rec == infimum
+	    || split_rec == page_rec_get_next_const(infimum)) {
+		split_rec = page_rec_get_next(split_rec);
+	}
+
+	return split_rec;
+}
+
+/** Decide if the page should be split at the convergence point of inserts
+converging to the right.
+@param[in]	cursor		insert position
+@param[out]	split_rec	if split recommended, the first record
+				on the right half page, or
+				NULL if the to-be-inserted record
+				should be first
+@return whether split is recommended */
+bool
+btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec)
+{
+	rec_t* insert_point = btr_cur_get_rec(cursor);
+	const page_t* page = page_align(insert_point);
+
+	/* We use eager heuristics: if the new insert would be right after
+	the previous insert on the same page, we assume that there is a
+	pattern of sequential inserts here. */
+
+	if (page_header_get_ptr(page, PAGE_LAST_INSERT) != insert_point) {
+		return false;
+	}
+
+	insert_point = page_rec_get_next(insert_point);
+
+	if (!insert_point || page_rec_is_supremum(insert_point)) {
+		insert_point = NULL;
+	} else {
+		insert_point = page_rec_get_next(insert_point);
+		if (page_rec_is_supremum(insert_point)) {
+			insert_point = NULL;
+		}
+
+		/* If there are >= 2 user records up from the insert
+		point, split all but 1 off. We want to keep one because
+		then sequential inserts can use the adaptive hash
+		index, as they can do the necessary checks of the right
+		search position just by looking at the records on this
+		page. */
+	}
+
+	*split_rec = insert_point;
+	return true;
+}
+
+/*************************************************************//**
+Calculates a split record such that the tuple will certainly fit on
+its half-page when the split is performed. We assume in this function
+only that the cursor page has at least one user record.
+@return split record, or NULL if tuple will be the first record on
+the lower or upper half-page (determined by btr_page_tuple_smaller()) */
+static
+rec_t*
+btr_page_get_split_rec(
+/*===================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which insert should be made */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+{
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+	ulint		insert_size;
+	ulint		free_space;
+	ulint		total_data;
+	ulint		total_n_recs;
+	ulint		total_space;
+	ulint		incl_data;
+	rec_t*		ins_rec;
+	rec_t*		rec;
+	rec_t*		next_rec;
+	ulint		n;
+	mem_heap_t*	heap;
+	rec_offs*	offsets;
+
+	page = btr_cur_get_page(cursor);
+
+	insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext);
+	free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+	page_zip = btr_cur_get_page_zip(cursor);
+	if (page_zip) {
+		/* Estimate the free space of an empty compressed page. */
+		ulint	free_space_zip = page_zip_empty_size(
+			cursor->index()->n_fields,
+			page_zip_get_size(page_zip));
+
+		if (free_space > (ulint) free_space_zip) {
+			free_space = (ulint) free_space_zip;
+		}
+	}
+
+	/* free_space is now the free space of a created new page */
+
+	total_data = page_get_data_size(page) + insert_size;
+	total_n_recs = ulint(page_get_n_recs(page)) + 1;
+	ut_ad(total_n_recs >= 2);
+	total_space = total_data + page_dir_calc_reserved_space(total_n_recs);
+
+	n = 0;
+	incl_data = 0;
+	ins_rec = btr_cur_get_rec(cursor);
+	rec = page_get_infimum_rec(page);
+
+	heap = NULL;
+	offsets = NULL;
+
+	/* We include records in the left half until the space that
+	they reserve exceeds half of total_space. At that point, if
+	the included records fit on the left page and something was
+	also left over for the right page, the following record will
+	be the first on the right half page; otherwise the last
+	included record will be the first on the right half page. */
+
+	do {
+		/* Decide the next record to include */
+		if (rec == ins_rec) {
+			rec = NULL;	/* NULL denotes that tuple is
+					now included */
+		} else if (rec == NULL) {
+			rec = page_rec_get_next(ins_rec);
+		} else {
+			rec = page_rec_get_next(rec);
+		}
+
+		if (rec == NULL) {
+			/* Include tuple */
+			incl_data += insert_size;
+		} else {
+			offsets = rec_get_offsets(rec, cursor->index(),
+						  offsets, page_is_leaf(page)
+						  ? 
cursor->index() + ->n_core_fields + : 0, + ULINT_UNDEFINED, &heap); + incl_data += rec_offs_size(offsets); + } + + n++; + } while (incl_data + page_dir_calc_reserved_space(n) + < total_space / 2); + + if (incl_data + page_dir_calc_reserved_space(n) <= free_space) { + /* The next record will be the first on + the right half page if it is not the + supremum record of page */ + + if (rec == ins_rec) { + rec = NULL; + + goto func_exit; + } else if (rec == NULL) { + next_rec = page_rec_get_next(ins_rec); + } else { + next_rec = page_rec_get_next(rec); + } + ut_ad(next_rec); + if (!page_rec_is_supremum(next_rec)) { + rec = next_rec; + } + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + return(rec); +} + +#ifdef UNIV_DEBUG +/*************************************************************//** +Returns TRUE if the insert fits on the appropriate half-page with the +chosen split_rec. +@return true if fits */ +static MY_ATTRIBUTE((nonnull(1,3,4,6), warn_unused_result)) +bool +btr_page_insert_fits( +/*=================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert + should be made */ + const rec_t* split_rec,/*!< in: suggestion for first record + on upper half-page, or NULL if + tuple to be inserted should be first */ + rec_offs** offsets,/*!< in: rec_get_offsets( + split_rec, cursor->index()); out: garbage */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mem_heap_t** heap) /*!< in: temporary memory heap */ +{ + page_t* page; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + const rec_t* rec; + const rec_t* end_rec; + + page = btr_cur_get_page(cursor); + + ut_ad(!split_rec + || !page_is_comp(page) == !rec_offs_comp(*offsets)); + ut_ad(!split_rec + || rec_offs_validate(split_rec, cursor->index(), *offsets)); + + insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = ulint(page_get_n_recs(page)) + 1; + + /* We determine which records (from rec to end_rec, not including + end_rec) will end up on the other half page from tuple when it is + inserted. */ + + if (!(end_rec = split_rec)) { + end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); + } else if (cmp_dtuple_rec(tuple, split_rec, cursor->index(), + *offsets) < 0) { + rec = split_rec; + end_rec = page_get_supremum_rec(page); + goto got_rec; + } + + if (!(rec = page_rec_get_next(page_get_infimum_rec(page)))) { + return false; + } + +got_rec: + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + while (rec != end_rec) { + /* In this loop we calculate the amount of reserved + space after rec is removed from page. */ + + *offsets = rec_get_offsets(rec, cursor->index(), *offsets, + page_is_leaf(page) + ? 
cursor->index()->n_core_fields + : 0, + ULINT_UNDEFINED, heap); + + total_data -= rec_offs_size(*offsets); + total_n_recs--; + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + if (!(rec = page_rec_get_next_const(rec))) { + break; + } + } + + return(false); +} +#endif + +/*******************************************************//** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. */ +dberr_t +btr_insert_on_non_leaf_level( + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level, must be > 0 */ + dtuple_t* tuple, /*!< in: the record to be inserted */ + mtr_t* mtr) /*!< in: mtr */ +{ + big_rec_t* dummy_big_rec; + btr_cur_t cursor; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + rtr_info_t rtr_info; + + ut_ad(level > 0); + + flags |= BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG; + cursor.page_cur.index = index; + + dberr_t err; + + if (index->is_spatial()) { + /* For spatial index, initialize structures to track + its parents etc. */ + rtr_init_rtr_info(&rtr_info, false, &cursor, index, false); + + rtr_info_update_btr(&cursor, &rtr_info); + err = rtr_search_to_nth_level(level, tuple, + PAGE_CUR_RTREE_INSERT, + BTR_CONT_MODIFY_TREE, + &cursor, mtr); + } else { + err = btr_cur_search_to_nth_level(level, tuple, RW_X_LATCH, + &cursor, mtr); + } + + ut_ad(cursor.flag == BTR_CUR_BINARY); + ut_ad(btr_cur_get_block(&cursor) + != mtr->at_savepoint(mtr->get_savepoint() - 1) + || index->is_spatial() + || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); + + if (UNIV_LIKELY(err == DB_SUCCESS)) { + err = btr_cur_optimistic_insert(flags, + &cursor, &offsets, &heap, + tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); + } + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert(flags, + &cursor, &offsets, &heap, + tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (index->is_spatial()) { + ut_ad(cursor.rtr_info); + + rtr_clean_rtr_info(&rtr_info, true); + } + + return err; +} + +static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); +static_assert(FIL_PAGE_PREV % 4 == 0, "alignment"); +static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment"); + +MY_ATTRIBUTE((nonnull,warn_unused_result)) +/**************************************************************//** +Attaches the halves of an index page on the appropriate level in an +index tree. 
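+In rough outline: a node pointer to the upper half page is inserted into
+the parent page (which may in turn split), and the FIL_PAGE_PREV and
+FIL_PAGE_NEXT links of the two halves and of their neighbours are updated
+so that the doubly linked list of pages on this level stays intact.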
*/ +static +dberr_t +btr_attach_half_pages( +/*==================*/ + ulint flags, /*!< in: undo logging and + locking flags */ + dict_index_t* index, /*!< in: the index tree */ + buf_block_t* block, /*!< in/out: page to be split */ + const rec_t* split_rec, /*!< in: first record on upper + half page */ + buf_block_t* new_block, /*!< in/out: the new half page */ + ulint direction, /*!< in: FSP_UP or FSP_DOWN */ + mtr_t* mtr) /*!< in: mtr */ +{ + dtuple_t* node_ptr_upper; + mem_heap_t* heap; + buf_block_t* prev_block = nullptr; + buf_block_t* next_block = nullptr; + buf_block_t* lower_block; + buf_block_t* upper_block; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->memo_contains_flagged(new_block, MTR_MEMO_PAGE_X_FIX)); + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); + + /* Based on split direction, decide upper and lower pages */ + if (direction == FSP_DOWN) { + + btr_cur_t cursor; + rec_offs* offsets; + + lower_block = new_block; + upper_block = block; + + cursor.page_cur.block = block; + cursor.page_cur.index = index; + + /* Look up the index for the node pointer to page */ + offsets = btr_page_get_father_block(nullptr, heap, mtr, + &cursor); + + /* Replace the address of the old child node (= page) with the + address of the new lower half */ + + btr_node_ptr_set_child_page_no( + btr_cur_get_block(&cursor), + btr_cur_get_rec(&cursor), + offsets, lower_block->page.id().page_no(), mtr); + mem_heap_empty(heap); + } else { + lower_block = block; + upper_block = new_block; + } + + /* Get the level of the split pages */ + const ulint level = btr_page_get_level(block->page.frame); + ut_ad(level == btr_page_get_level(new_block->page.frame)); + page_id_t id{block->page.id()}; + + /* Get the previous and next pages of page */ + const uint32_t prev_page_no = btr_page_get_prev(block->page.frame); + const uint32_t next_page_no = btr_page_get_next(block->page.frame); + + /* for consistency, both blocks should be locked, before change */ + if (prev_page_no != FIL_NULL && direction == FSP_DOWN) { + id.set_page_no(prev_page_no); + prev_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); +#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ + if (!prev_block) { + ut_ad(mtr->memo_contains(index->lock, + MTR_MEMO_X_LOCK)); + prev_block = btr_block_get(*index, prev_page_no, + RW_X_LATCH, !level, mtr); + } +#endif + } + if (next_page_no != FIL_NULL && direction != FSP_DOWN) { + id.set_page_no(next_page_no); + next_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); +#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ + if (!next_block) { + ut_ad(mtr->memo_contains(index->lock, + MTR_MEMO_X_LOCK)); + next_block = btr_block_get(*index, next_page_no, + RW_X_LATCH, !level, mtr); + } +#endif + } + + /* Build the node pointer (= node key and page address) for the upper + half */ + + node_ptr_upper = dict_index_build_node_ptr( + index, split_rec, upper_block->page.id().page_no(), + heap, level); + + /* Insert it next to the pointer to the lower half. Note that this + may generate recursion leading to a split on the higher level. 
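+	The recursion is bounded by the height of the tree; if the split
+	reaches the root, btr_root_raise_and_insert() will add a level
+	above the root instead of recursing further.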
*/ + + dberr_t err = btr_insert_on_non_leaf_level( + flags, index, level + 1, node_ptr_upper, mtr); + + /* Free the memory heap */ + mem_heap_free(heap); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + + /* Update page links of the level */ + + if (prev_block) { + if (UNIV_UNLIKELY(memcmp_aligned<4>(prev_block->page.frame + + FIL_PAGE_NEXT, + block->page.frame + + FIL_PAGE_OFFSET, + 4))) { + return DB_CORRUPTION; + } + btr_page_set_next(prev_block, lower_block->page.id().page_no(), + mtr); + } + + if (next_block) { + if (UNIV_UNLIKELY(memcmp_aligned<4>(next_block->page.frame + + FIL_PAGE_PREV, + block->page.frame + + FIL_PAGE_OFFSET, + 4))) { + return DB_CORRUPTION; + } + btr_page_set_prev(next_block, upper_block->page.id().page_no(), + mtr); + } + + if (direction == FSP_DOWN) { + ut_ad(lower_block == new_block); + ut_ad(btr_page_get_next(upper_block->page.frame) + == next_page_no); + btr_page_set_prev(lower_block, prev_page_no, mtr); + } else { + ut_ad(upper_block == new_block); + ut_ad(btr_page_get_prev(lower_block->page.frame) + == prev_page_no); + btr_page_set_next(upper_block, next_page_no, mtr); + } + + btr_page_set_prev(upper_block, lower_block->page.id().page_no(), mtr); + btr_page_set_next(lower_block, upper_block->page.id().page_no(), mtr); + + return DB_SUCCESS; +} + +/*************************************************************//** +Determine if a tuple is smaller than any record on the page. +@return TRUE if smaller */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +btr_page_tuple_smaller( +/*===================*/ + btr_cur_t* cursor, /*!< in: b-tree cursor */ + const dtuple_t* tuple, /*!< in: tuple to consider */ + rec_offs** offsets,/*!< in/out: temporary storage */ + ulint n_uniq, /*!< in: number of unique fields + in the index page records */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + buf_block_t* block; + const rec_t* first_rec; + page_cur_t pcur; + + /* Read the first user record in the page. */ + block = btr_cur_get_block(cursor); + page_cur_set_before_first(block, &pcur); + if (UNIV_UNLIKELY(!(first_rec = page_cur_move_to_next(&pcur)))) { + ut_ad("corrupted page" == 0); + return false; + } + + *offsets = rec_get_offsets(first_rec, cursor->index(), *offsets, + page_is_leaf(block->page.frame) + ? cursor->index()->n_core_fields : 0, + n_uniq, heap); + + return cmp_dtuple_rec(tuple, first_rec, cursor->index(), *offsets) < 0; +} + +/** Insert the tuple into the right sibling page, if the cursor is at the end +of a page. +@param[in] flags undo logging and locking flags +@param[in,out] cursor cursor at which to insert; when the function succeeds, + the cursor is positioned before the insert point. 
+@param[out] offsets offsets on inserted record +@param[in,out] heap memory heap for allocating offsets +@param[in] tuple tuple to insert +@param[in] n_ext number of externally stored columns +@param[in,out] mtr mini-transaction +@return inserted record (first record on the right sibling page); + the cursor will be positioned on the page infimum +@retval NULL if the operation was not performed */ +static +rec_t* +btr_insert_into_right_sibling( + ulint flags, + btr_cur_t* cursor, + rec_offs** offsets, + mem_heap_t* heap, + const dtuple_t* tuple, + ulint n_ext, + mtr_t* mtr) +{ + buf_block_t* block = btr_cur_get_block(cursor); + page_t* page = buf_block_get_frame(block); + const uint32_t next_page_no = btr_page_get_next(page); + + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(heap); + ut_ad(dtuple_check_typed(tuple)); + + if (next_page_no == FIL_NULL || !page_rec_is_supremum( + page_rec_get_next(btr_cur_get_rec(cursor)))) { + + return nullptr; + } + + page_cur_t next_page_cursor; + buf_block_t* next_block; + page_t* next_page; + btr_cur_t next_father_cursor; + rec_t* rec = nullptr; + ulint max_size; + + next_block = btr_block_get(*cursor->index(), next_page_no, RW_X_LATCH, + page_is_leaf(page), mtr); + if (UNIV_UNLIKELY(!next_block)) { + return nullptr; + } + next_page = buf_block_get_frame(next_block); + const bool is_leaf = page_is_leaf(next_page); + + next_page_cursor.index = cursor->index(); + next_page_cursor.block = next_block; + next_father_cursor.page_cur = next_page_cursor; + + if (!btr_page_get_father(mtr, &next_father_cursor)) { + return nullptr; + } + + ulint up_match = 0, low_match = 0; + + if (page_cur_search_with_match(tuple, + PAGE_CUR_LE, &up_match, &low_match, + &next_page_cursor, nullptr)) { + return nullptr; + } + + max_size = page_get_max_insert_size_after_reorganize(next_page, 1); + + /* Extends gap lock for the next page */ + if (is_leaf && cursor->index()->has_locking()) { + lock_update_node_pointer(block, next_block); + } + + rec = page_cur_tuple_insert(&next_page_cursor, tuple, offsets, &heap, + n_ext, mtr); + + if (!rec) { + if (is_leaf + && next_block->page.zip.ssize + && !dict_index_is_clust(cursor->index()) + && !cursor->index()->table->is_temporary()) { + /* Reset the IBUF_BITMAP_FREE bits, because + page_cur_tuple_insert() will have attempted page + reorganize before failing. 
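+			The bits are only a hint, but they must never
+			promise more free space than the page really
+			has; resetting them is the conservative choice.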
*/ + ibuf_reset_free_bits(next_block); + } + return nullptr; + } + + ibool compressed; + dberr_t err; + ulint level = btr_page_get_level(next_page); + + /* adjust cursor position */ + *btr_cur_get_page_cur(cursor) = next_page_cursor; + + ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page)); + ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec); + + /* We have to change the parent node pointer */ + + compressed = btr_cur_pessimistic_delete( + &err, TRUE, &next_father_cursor, + BTR_CREATE_FLAG, false, mtr); + + if (err != DB_SUCCESS) { + return nullptr; + } + + if (!compressed) { + btr_cur_compress_if_useful(&next_father_cursor, false, mtr); + } + + dtuple_t* node_ptr = dict_index_build_node_ptr( + cursor->index(), rec, next_block->page.id().page_no(), + heap, level); + + if (btr_insert_on_non_leaf_level(flags, cursor->index(), level + 1, + node_ptr, mtr) != DB_SUCCESS) { + return nullptr; + } + + ut_ad(rec_offs_validate(rec, cursor->index(), *offsets)); + + if (is_leaf + && !dict_index_is_clust(cursor->index()) + && !cursor->index()->table->is_temporary()) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + if (next_block->page.zip.ssize) { + ibuf_update_free_bits_zip(next_block, mtr); + } else { + ibuf_update_free_bits_if_full( + next_block, max_size, + rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE); + } + } + + return(rec); +} + +/*************************************************************//** +Moves record list end to another page. Moved records include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return error code */ +static +dberr_t +page_move_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in: index page from where to move */ + rec_t* split_rec, /*!< in: first record to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + ulint old_data_size; + ulint new_data_size; + ulint old_n_recs; + ulint new_n_recs; + + ut_ad(!dict_index_is_spatial(index)); + + old_data_size = page_get_data_size(new_page); + old_n_recs = page_get_n_recs(new_page); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* new_page_zip + = buf_block_get_page_zip(new_block); + page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!new_page_zip == !page_zip); + ut_a(!new_page_zip + || page_zip_validate(new_page_zip, new_page, index)); + ut_a(!page_zip + || page_zip_validate(page_zip, page_align(split_rec), + index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + dberr_t err; + if (!page_copy_rec_list_end(new_block, block, + split_rec, index, mtr, &err)) { + return err; + } + + new_data_size = page_get_data_size(new_page); + new_n_recs = page_get_n_recs(new_page); + + ut_ad(new_data_size >= old_data_size); + + return page_delete_rec_list_end(split_rec, block, index, + new_n_recs - old_n_recs, + new_data_size - old_data_size, mtr); +} + +/*************************************************************//** +Moves record list start to another page. Moved records do not include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. 
+This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return error code */ +static +dberr_t +page_move_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in/out: page containing split_rec */ + rec_t* split_rec, /*!< in: first record not to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + dberr_t err; + if (page_copy_rec_list_start(new_block, block, split_rec, index, mtr, &err)) + page_delete_rec_list_start(split_rec, block, index, mtr); + return err; +} + +/*************************************************************//** +Splits an index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. +@return inserted record or NULL if run out of space */ +rec_t* +btr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + buf_block_t* new_block; + page_t* new_page; + page_zip_des_t* new_page_zip; + rec_t* split_rec; + buf_block_t* left_block; + buf_block_t* right_block; + page_cur_t* page_cursor; + rec_t* first_rec; + byte* buf = 0; /* remove warning */ + rec_t* move_limit; + ulint n_iterations = 0; + ulint n_uniq; + + ut_ad(*err == DB_SUCCESS); + ut_ad(dtuple_check_typed(tuple)); + + buf_pool.pages_split++; + + if (cursor->index()->is_spatial()) { + /* Split rtree page and update parent */ + return rtr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr, err); + } + + if (!*heap) { + *heap = mem_heap_create(1024); + } + n_uniq = dict_index_get_n_unique_in_tree(cursor->index()); +func_start: + mem_heap_empty(*heap); + *offsets = NULL; + + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(!dict_index_is_online_ddl(cursor->index()) + || (flags & BTR_CREATE_FLAG) + || dict_index_is_clust(cursor->index())); + ut_ad(cursor->index()->lock.have_u_or_x()); + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!page_is_empty(page)); + + /* try to insert to the next page if possible before split */ + if (rec_t* rec = btr_insert_into_right_sibling( + flags, cursor, offsets, *heap, tuple, n_ext, mtr)) { + return(rec); + } + + /* 1. 
Decide the split record; split_rec == NULL means that the + tuple to be inserted should be the first record on the upper + half-page */ + bool insert_left = false; + uint32_t hint_page_no = block->page.id().page_no() + 1; + byte direction = FSP_UP; + + if (n_iterations > 0) { + split_rec = btr_page_get_split_rec(cursor, tuple, n_ext); + + if (split_rec == NULL) { + insert_left = btr_page_tuple_smaller( + cursor, tuple, offsets, n_uniq, heap); + } + } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) { + } else if ((split_rec = btr_page_get_split_rec_to_left(cursor))) { + direction = FSP_DOWN; + hint_page_no -= 2; + } else { + /* If there is only one record in the index page, we + can't split the node in the middle by default. We need + to determine whether the new record will be inserted + to the left or right. */ + + if (page_get_n_recs(page) > 1) { + split_rec = page_get_middle_rec(page); + } else if (btr_page_tuple_smaller(cursor, tuple, + offsets, n_uniq, heap)) { + split_rec = page_rec_get_next( + page_get_infimum_rec(page)); + } else { + split_rec = NULL; + goto got_split_rec; + } + + if (UNIV_UNLIKELY(!split_rec)) { + *err = DB_CORRUPTION; + return nullptr; + } + } + +got_split_rec: + /* 2. Allocate a new page to the index */ + const uint16_t page_level = btr_page_get_level(page); + new_block = btr_page_alloc(cursor->index(), hint_page_no, direction, + page_level, mtr, mtr, err); + + if (!new_block) { + return nullptr; + } + + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + + if (page_level && UNIV_LIKELY_NULL(new_page_zip)) { + /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected + to contain FIL_NULL in FIL_PAGE_PREV at this stage. */ + memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4); + } + btr_page_create(new_block, new_page_zip, cursor->index(), + page_level, mtr); + /* Only record the leaf level page splits. */ + if (!page_level) { + cursor->index()->stat_defrag_n_page_split ++; + cursor->index()->stat_defrag_modified_counter ++; + btr_defragment_save_defrag_stats_if_needed(cursor->index()); + } + + /* 3. Calculate the first record on the upper half-page, and the + first record (move_limit) on original page which ends up on the + upper half */ + + if (split_rec) { + first_rec = move_limit = split_rec; + + *offsets = rec_get_offsets(split_rec, cursor->index(), + *offsets, page_is_leaf(page) + ? cursor->index()->n_core_fields + : 0, + n_uniq, heap); + + insert_left = cmp_dtuple_rec(tuple, split_rec, cursor->index(), + *offsets) < 0; + + if (!insert_left && new_page_zip && n_iterations > 0) { + /* If a compressed page has already been split, + avoid further splits by inserting the record + to an empty page. */ + split_rec = NULL; + goto insert_empty; + } + } else if (insert_left) { + if (UNIV_UNLIKELY(!n_iterations)) { +corrupted: + *err = DB_CORRUPTION; + return nullptr; + } + first_rec = page_rec_get_next(page_get_infimum_rec(page)); +insert_move_limit: + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + if (UNIV_UNLIKELY(!first_rec || !move_limit)) { + goto corrupted; + } + } else { +insert_empty: + ut_ad(!split_rec); + ut_ad(!insert_left); + buf = UT_NEW_ARRAY_NOKEY( + byte, + rec_get_converted_size(cursor->index(), tuple, n_ext)); + + first_rec = rec_convert_dtuple_to_rec(buf, cursor->index(), + tuple, n_ext); + goto insert_move_limit; + } + + /* 4. Do first the modifications in the tree structure */ + + /* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! 
*/ + *err = btr_attach_half_pages(flags, cursor->index(), block, + first_rec, new_block, direction, mtr); + + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } + +#ifdef UNIV_DEBUG + /* If the split is made on the leaf level and the insert will fit + on the appropriate half-page, we may release the tree x-latch. + We can then move the records after releasing the tree latch, + thus reducing the tree latch contention. */ + const bool insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, split_rec, offsets, tuple, + n_ext, heap); +#endif + if (!split_rec && !insert_left) { + UT_DELETE_ARRAY(buf); + buf = NULL; + } + +#if 0 // FIXME: this used to be a no-op, and may cause trouble if enabled + if (insert_will_fit + && page_is_leaf(page) + && !dict_index_is_online_ddl(cursor->index())) { + mtr->release(cursor->index()->lock); + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } +#endif + + /* 5. Move then the records to the new page */ + if (direction == FSP_DOWN) { + /* fputs("Split left\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || (*err = page_move_rec_list_start(new_block, block, + move_limit, + cursor->index(), + mtr))) { + if (*err != DB_FAIL) { + return nullptr; + } + + /* For some reason, compressing new_block failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, page_zip, page, + cursor->index(), mtr); + *err = page_delete_rec_list_end(move_limit + - page + new_page, + new_block, + cursor->index(), + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + if (*err != DB_SUCCESS) { + return nullptr; + } + + /* Update the lock table and possible hash index. */ + if (cursor->index()->has_locking()) { + lock_move_rec_list_start( + new_block, block, move_limit, + new_page + PAGE_NEW_INFIMUM); + } + + btr_search_move_or_delete_hash_entries( + new_block, block); + + /* Delete the records from the source page. */ + + page_delete_rec_list_start(move_limit, block, + cursor->index(), mtr); + } + + left_block = new_block; + right_block = block; + + if (cursor->index()->has_locking()) { + lock_update_split_left(right_block, left_block); + } + } else { + /* fputs("Split right\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || (*err = page_move_rec_list_end(new_block, block, + move_limit, + cursor->index(), mtr))) { + if (*err != DB_FAIL) { + return nullptr; + } + + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, page_zip, page, + cursor->index(), mtr); + page_delete_rec_list_start(move_limit - page + + new_page, new_block, + cursor->index(), mtr); + + /* Update the lock table and possible hash index. */ + if (cursor->index()->has_locking()) { + lock_move_rec_list_end(new_block, block, + move_limit); + } + + btr_search_move_or_delete_hash_entries( + new_block, block); + + /* Delete the records from the source page. 
*/ + + *err = page_delete_rec_list_end(move_limit, block, + cursor->index(), + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + if (*err != DB_SUCCESS) { + return nullptr; + } + } + + left_block = block; + right_block = new_block; + + if (cursor->index()->has_locking()) { + lock_update_split_right(right_block, left_block); + } + } + +#ifdef UNIV_ZIP_DEBUG + if (page_zip) { + ut_a(page_zip_validate(page_zip, page, cursor->index())); + ut_a(page_zip_validate(new_page_zip, new_page, + cursor->index())); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* At this point, split_rec, move_limit and first_rec may point + to garbage on the old page. */ + + /* 6. The split and the tree modification is now completed. Decide the + page where the tuple should be inserted */ + rec_t* rec; + buf_block_t* const insert_block = insert_left + ? left_block : right_block; + + /* 7. Reposition the cursor for insert and try insertion */ + page_cursor = btr_cur_get_page_cur(cursor); + page_cursor->block = insert_block; + + ulint up_match = 0, low_match = 0; + + if (page_cur_search_with_match(tuple, + PAGE_CUR_LE, &up_match, &low_match, + page_cursor, nullptr)) { + *err = DB_CORRUPTION; + return nullptr; + } + + rec = page_cur_tuple_insert(page_cursor, tuple, + offsets, heap, n_ext, mtr); + +#ifdef UNIV_ZIP_DEBUG + { + page_t* insert_page + = buf_block_get_frame(insert_block); + + page_zip_des_t* insert_page_zip + = buf_block_get_page_zip(insert_block); + + ut_a(!insert_page_zip + || page_zip_validate(insert_page_zip, insert_page, + cursor->index())); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (rec != NULL) { + + goto func_exit; + } + + /* 8. If insert did not fit, try page reorganization. + For compressed pages, page_cur_tuple_insert() will have + attempted this already. */ + + if (page_cur_get_page_zip(page_cursor)) { + goto insert_failed; + } + + *err = btr_page_reorganize(page_cursor, mtr); + + if (*err != DB_SUCCESS) { + return nullptr; + } + + rec = page_cur_tuple_insert(page_cursor, tuple, + offsets, heap, n_ext, mtr); + + if (rec == NULL) { + /* The insert did not fit on the page: loop back to the + start of the function for a new split */ +insert_failed: + /* We play safe and reset the free bits for new_page */ + if (!dict_index_is_clust(page_cursor->index) + && !page_cursor->index->table->is_temporary()) { + ibuf_reset_free_bits(new_block); + ibuf_reset_free_bits(block); + } + + n_iterations++; + ut_ad(n_iterations < 2 + || buf_block_get_page_zip(insert_block)); + ut_ad(!insert_will_fit); + + goto func_start; + } + +func_exit: + /* Insert fit on the page: update the free bits for the + left and right pages in the same mtr */ + + if (!dict_index_is_clust(page_cursor->index) + && !page_cursor->index->table->is_temporary() + && page_is_leaf(page)) { + + ibuf_update_free_bits_for_two_pages_low( + left_block, right_block, mtr); + } + + ut_ad(page_validate(buf_block_get_frame(left_block), + page_cursor->index)); + ut_ad(page_validate(buf_block_get_frame(right_block), + page_cursor->index)); + + ut_ad(!rec || rec_offs_validate(rec, page_cursor->index, *offsets)); + return(rec); +} + +/** Remove a page from the level list of pages. 
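+The pages of each level of the tree are chained into a doubly linked
+list through the FIL_PAGE_PREV and FIL_PAGE_NEXT header fields; this
+function unlinks one page by pointing its neighbours at each other.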
+@param[in]	block	page to remove
+@param[in]	index	index tree
+@param[in,out]	mtr	mini-transaction
+@return error code */
+dberr_t btr_level_list_remove(const buf_block_t& block,
+                              const dict_index_t& index, mtr_t* mtr)
+{
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX));
+  ut_ad(block.zip_size() == index.table->space->zip_size());
+  ut_ad(index.table->space->id == block.page.id().space());
+  /* Get the previous and next page numbers of page */
+  const uint32_t prev_page_no= btr_page_get_prev(block.page.frame);
+  const uint32_t next_page_no= btr_page_get_next(block.page.frame);
+  page_id_t id{block.page.id()};
+  buf_block_t *prev= nullptr, *next;
+  dberr_t err;
+
+  /* Update page links of the level */
+  if (prev_page_no != FIL_NULL)
+  {
+    id.set_page_no(prev_page_no);
+    prev= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+    if (!prev)
+    {
+      ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK));
+      prev= btr_block_get(index, id.page_no(), RW_X_LATCH,
+                          page_is_leaf(block.page.frame), mtr, &err);
+      if (UNIV_UNLIKELY(!prev))
+        return err;
+    }
+#endif
+  }
+
+  if (next_page_no != FIL_NULL)
+  {
+    id.set_page_no(next_page_no);
+    next= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+    if (!next)
+    {
+      ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK));
+      next= btr_block_get(index, id.page_no(), RW_X_LATCH,
+                          page_is_leaf(block.page.frame), mtr, &err);
+      if (UNIV_UNLIKELY(!next))
+        return err;
+    }
+#endif
+    btr_page_set_prev(next, prev_page_no, mtr);
+  }
+
+  if (prev)
+    btr_page_set_next(prev, next_page_no, mtr);
+
+  return DB_SUCCESS;
+}
+
+/*************************************************************//**
+If the page is the only one on its level, this function moves its records
+to the father page, thus reducing the tree height.
+@return father block */
+buf_block_t*
+btr_lift_page_up(
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: page which is the only one on its
+				level; must not be empty: use
+				btr_discard_only_page_on_level if the last
+				record from the page should be removed */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	dberr_t*	err)	/*!< out: error code */
+{
+	buf_block_t*	father_block;
+	ulint		page_level;
+	page_zip_des_t*	father_page_zip;
+	page_t*		page		= buf_block_get_frame(block);
+	ulint		root_page_no;
+	buf_block_t*	blocks[BTR_MAX_LEVELS];
+	ulint		n_blocks;	/*!< last used index in blocks[] */
+	ulint		i;
+	bool		lift_father_up;
+	buf_block_t*	block_orig	= block;
+
+	ut_ad(!page_has_siblings(page));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(!page_is_empty(page));
+
+	page_level = btr_page_get_level(page);
+	root_page_no = dict_index_get_page(index);
+
+	{
+		btr_cur_t	cursor;
+		rec_offs*	offsets	= NULL;
+		mem_heap_t*	heap	= mem_heap_create(
+			sizeof(*offsets)
+			* (REC_OFFS_HEADER_SIZE + 1 + 1
+			   + unsigned(index->n_fields)));
+		buf_block_t*	b;
+		cursor.page_cur.index = index;
+		cursor.page_cur.block = block;
+
+		if (index->is_spatial()) {
+			offsets = rtr_page_get_father_block(
+				nullptr, heap, mtr, nullptr, &cursor);
+		} else {
+			offsets = btr_page_get_father_block(offsets, heap,
+							    mtr, &cursor);
+		}
+		father_block = btr_cur_get_block(&cursor);
+		father_page_zip = buf_block_get_page_zip(father_block);
+
+		n_blocks = 0;
+
+		/* Store all ancestor pages so we can reset their
+		levels later on. We have to do all the searches on
+		the tree now, because later on, after we have replaced
+		the first level, the tree would be in an inconsistent
+		state and could not be searched. */
+		for (b = father_block;
+		     b->page.id().page_no() != root_page_no; ) {
+			ut_a(n_blocks < BTR_MAX_LEVELS);
+
+			if (index->is_spatial()) {
+				offsets = rtr_page_get_father_block(
+					nullptr, heap, mtr, nullptr, &cursor);
+			} else {
+				offsets = btr_page_get_father_block(offsets,
+								    heap,
+								    mtr,
+								    &cursor);
+			}
+
+			blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
+		}
+
+		lift_father_up = (n_blocks && page_level == 0);
+		if (lift_father_up) {
+			/* The father page is then also the only page on
+			its level (and it is not the root). In this case,
+			lift up the father page first: leaf records may
+			only be lifted directly into the root, because the
+			file segment for freeing a page is chosen based on
+			page_level (== 0 or != 0), and if the level of a
+			page changed from != 0 to == 0, the later freeing
+			of the page would not find the page allocation to
+			be freed. */
+
+			block = father_block;
+			page = buf_block_get_frame(block);
+			page_level = btr_page_get_level(page);
+
+			ut_ad(!page_has_siblings(page));
+			ut_ad(mtr->memo_contains_flagged(block,
+							 MTR_MEMO_PAGE_X_FIX));
+
+			father_block = blocks[0];
+			father_page_zip = buf_block_get_page_zip(father_block);
+		}
+
+		mem_heap_free(heap);
+	}
+
+	btr_search_drop_page_hash_index(block, false);
+
+	/* Make the father empty */
+	btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
+	/* btr_page_empty() is supposed to zero-initialize the field. */
+	ut_ad(!page_get_instant(father_block->page.frame));
+
+	if (index->is_instant()
+	    && father_block->page.id().page_no() == root_page_no) {
+		ut_ad(!father_page_zip);
+
+		if (page_is_leaf(page)) {
+			const rec_t* rec = page_rec_get_next(
+				page_get_infimum_rec(page));
+			ut_ad(rec_is_metadata(rec, *index));
+			if (rec_is_add_metadata(rec, *index)
+			    && page_get_n_recs(page) == 1) {
+				index->clear_instant_add();
+				goto copied;
+			}
+		}
+
+		btr_set_instant(father_block, *index, mtr);
+	}
+
+	/* Copy the records to the father page one by one. */
+	if (0
+#ifdef UNIV_ZIP_COPY
+	    || father_page_zip
+#endif /* UNIV_ZIP_COPY */
+	    || !page_copy_rec_list_end(father_block, block,
+				       page_get_infimum_rec(page),
+				       index, mtr, err)) {
+		switch (*err) {
+		case DB_SUCCESS:
+			break;
+		case DB_FAIL:
+			*err = DB_SUCCESS;
+			break;
+		default:
+			return nullptr;
+		}
+
+		const page_zip_des_t*	page_zip
+			= buf_block_get_page_zip(block);
+		ut_a(father_page_zip);
+		ut_a(page_zip);
+
+		/* Copy the page byte for byte. */
+		page_zip_copy_recs(father_block,
+				   page_zip, page, index, mtr);
+
+		/* Update the lock table and possible hash index. */
+
+		if (index->has_locking()) {
+			lock_move_rec_list_end(father_block, block,
+					       page_get_infimum_rec(page));
+		}
+
+		/* Also update the predicate locks */
+		if (dict_index_is_spatial(index)) {
+			lock_prdt_rec_move(father_block, block->page.id());
+		} else {
+			btr_search_move_or_delete_hash_entries(
+				father_block, block);
+		}
+	}
+
+copied:
+	if (index->has_locking()) {
+		const page_id_t	id{block->page.id()};
+		/* Free predicate page locks on the block */
+		if (index->is_spatial()) {
+			lock_sys.prdt_page_free_from_discard(id);
+		} else {
+			lock_update_copy_and_discard(*father_block, id);
+		}
+	}
+
+	page_level++;
+
+	/* Go upward to root page, decrementing levels by one. */
+	for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
+		ut_ad(btr_page_get_level(blocks[i]->page.frame)
+		      == page_level + 1);
+		btr_page_set_level(blocks[i], page_level, mtr);
+	}
+
+	if (dict_index_is_spatial(index)) {
+		rtr_check_discard_page(index, NULL, block);
+	}
+
+	/* Free the file page */
+	btr_page_free(index, block, mtr);
+
+	/* We play it safe and reset the free bits for the father */
+	if (!dict_index_is_clust(index)
+	    && !index->table->is_temporary()) {
+		ibuf_reset_free_bits(father_block);
+	}
+	ut_ad(page_validate(father_block->page.frame, index));
+	ut_ad(btr_check_node_ptr(index, father_block, mtr));
+
+	return(lift_father_up ? block_orig : father_block);
+}
+
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the brother
+reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level, lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches on the
+brothers, if they exist.
+@return error code */
+dberr_t
+btr_compress(
+/*=========*/
+	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to merge
+				or lift; the page must not be empty:
+				when deleting records, use btr_discard_page()
+				if the page would become empty */
+	bool		adjust,	/*!< in: whether the cursor position should be
+				adjusted even when compression occurs */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	dict_index_t*	index;
+	buf_block_t*	merge_block = nullptr;
+	page_t*		merge_page = nullptr;
+	page_zip_des_t*	merge_page_zip;
+	ibool		is_left;
+	buf_block_t*	block;
+	page_t*		page;
+	btr_cur_t	father_cursor;
+	mem_heap_t*	heap;
+	rec_offs*	offsets;
+	ulint		nth_rec = 0; /* remove bogus warning */
+	bool		mbr_changed = false;
+#ifdef UNIV_DEBUG
+	bool		leftmost_child;
+#endif
+	DBUG_ENTER("btr_compress");
+
+	block = btr_cur_get_block(cursor);
+	page = btr_cur_get_page(cursor);
+	index = btr_cur_get_index(cursor);
+
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+	MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS);
+
+	const uint32_t left_page_no = btr_page_get_prev(page);
+	const uint32_t right_page_no = btr_page_get_next(page);
+	dberr_t err = DB_SUCCESS;
+
+	ut_ad(page_is_leaf(page) || left_page_no != FIL_NULL
+	      || (REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+			  page_rec_get_next(page_get_infimum_rec(page)),
+			  page_is_comp(page))));
+
+	heap = mem_heap_create(100);
+	father_cursor.page_cur.index = index;
+	father_cursor.page_cur.block = block;
+
+	if (index->is_spatial()) {
+		offsets = rtr_page_get_father_block(
+			NULL, heap, mtr, cursor, &father_cursor);
+		ut_ad(cursor->page_cur.block->page.id() == block->page.id());
+		rec_t*	my_rec = father_cursor.page_cur.rec;
+
+		ulint page_no = btr_node_ptr_get_child_page_no(my_rec, offsets);
+
+		if (page_no != block->page.id().page_no()) {
+			ib::info() << "father positioned on page "
+				<< page_no << " instead of "
+				<< block->page.id().page_no();
+			offsets = btr_page_get_father_block(
+				NULL, heap, mtr, &father_cursor);
+		}
+	} else {
+		offsets = btr_page_get_father_block(
+			NULL, heap, mtr, &father_cursor);
+	}
+
+	if (adjust) {
+		nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
+
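+		/* nth_rec is the position of the cursor record on the
+		page, counting from the page infimum; it is used below
+		to re-position the cursor on the page that remains
+		after the merge. */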
if (UNIV_UNLIKELY(!nth_rec || nth_rec == ULINT_UNDEFINED)) { + corrupted: + err = DB_CORRUPTION; + err_exit: + /* We play it safe and reset the free bits. */ + if (merge_block && merge_block->zip_size() + && page_is_leaf(merge_block->page.frame) + && !index->is_clust()) { + ibuf_reset_free_bits(merge_block); + } + goto func_exit; + } + } + + if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) { + /* The page is the only one on the level, lift the records + to the father */ + + merge_block = btr_lift_page_up(index, block, mtr, &err); +success: + if (adjust) { + ut_ad(nth_rec > 0); + if (rec_t* nth + = page_rec_get_nth(merge_block->page.frame, + nth_rec)) { + btr_cur_position(index, nth, + merge_block, cursor); + } else { + goto corrupted; + } + } + + MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL); +func_exit: + mem_heap_free(heap); + DBUG_RETURN(err); + } + + ut_d(leftmost_child = + left_page_no != FIL_NULL + && (page_rec_get_next( + page_get_infimum_rec( + btr_cur_get_page(&father_cursor))) + == btr_cur_get_rec(&father_cursor))); + + /* Decide the page to which we try to merge and which will inherit + the locks */ + + is_left = btr_can_merge_with_page(cursor, left_page_no, + &merge_block, mtr); + + DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;); +retry: + if (!is_left + && !btr_can_merge_with_page(cursor, right_page_no, &merge_block, + mtr)) { + if (!merge_block) { + merge_page = NULL; + } +cannot_merge: + err = DB_FAIL; + goto err_exit; + } + + merge_page = buf_block_get_frame(merge_block); + + if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_page + (is_left + ? FIL_PAGE_NEXT + : FIL_PAGE_PREV), + block->page.frame + + FIL_PAGE_OFFSET, 4))) { + goto corrupted; + } + + ut_ad(page_validate(merge_page, index)); + + merge_page_zip = buf_block_get_page_zip(merge_block); +#ifdef UNIV_ZIP_DEBUG + if (merge_page_zip) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(page_zip); + ut_a(page_zip_validate(merge_page_zip, merge_page, index)); + ut_a(page_zip_validate(page_zip, page, index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + btr_cur_t cursor2; + cursor2.page_cur.index = index; + cursor2.page_cur.block = merge_block; + + /* Move records to the merge page */ + if (is_left) { + rtr_mbr_t new_mbr; + rec_offs* offsets2 = NULL; + + /* For rtree, we need to update father's mbr. */ + if (index->is_spatial()) { + /* We only support merge pages with the same parent + page */ + if (!rtr_check_same_block( + index, &cursor2, + btr_cur_get_block(&father_cursor), heap)) { + is_left = false; + goto retry; + } + + /* Set rtr_info for cursor2, since it is + necessary in recursive page merge. */ + cursor2.rtr_info = cursor->rtr_info; + cursor2.tree_height = cursor->tree_height; + + offsets2 = rec_get_offsets( + btr_cur_get_rec(&cursor2), index, NULL, + page_is_leaf(btr_cur_get_page(&cursor2)) + ? 
index->n_fields : 0,
+				ULINT_UNDEFINED, &heap);
+
+			/* Check if parent entry needs to be updated */
+			mbr_changed = rtr_merge_mbr_changed(
+				&cursor2, &father_cursor,
+				offsets2, offsets, &new_mbr);
+		}
+
+		rec_t*	orig_pred = page_copy_rec_list_start(
+			merge_block, block, page_get_supremum_rec(page),
+			index, mtr, &err);
+
+		if (!orig_pred) {
+			goto err_exit;
+		}
+
+		btr_search_drop_page_hash_index(block, false);
+
+		/* Remove the page from the level list */
+		err = btr_level_list_remove(*block, *index, mtr);
+
+		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+			goto err_exit;
+		}
+
+		const page_id_t	id{block->page.id()};
+
+		if (index->is_spatial()) {
+			rec_t*	my_rec = father_cursor.page_cur.rec;
+
+			ulint page_no = btr_node_ptr_get_child_page_no(
+				my_rec, offsets);
+
+			if (page_no != block->page.id().page_no()) {
+				ib::fatal() << "father positioned on "
+					<< page_no << " instead of "
+					<< block->page.id().page_no();
+			}
+
+			if (mbr_changed) {
+				rtr_update_mbr_field(
+					&cursor2, offsets2, &father_cursor,
+					merge_page, &new_mbr, NULL, mtr);
+			} else {
+				rtr_node_ptr_delete(&father_cursor, mtr);
+			}
+
+			/* No GAP locks need to be worried about */
+			lock_sys.prdt_page_free_from_discard(id);
+		} else {
+			err = btr_cur_node_ptr_delete(&father_cursor, mtr);
+			if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+				goto err_exit;
+			}
+			if (index->has_locking()) {
+				lock_update_merge_left(
+					*merge_block, orig_pred, id);
+			}
+		}
+
+		if (adjust) {
+			ulint n = page_rec_get_n_recs_before(orig_pred);
+			if (UNIV_UNLIKELY(!n || n == ULINT_UNDEFINED)) {
+				goto corrupted;
+			}
+			nth_rec += n;
+		}
+	} else {
+		rec_t*		orig_succ;
+		ibool		compressed;
+		dberr_t		err;
+		byte		fil_page_prev[4];
+
+		if (index->is_spatial()) {
+			/* For spatial indexes, we disallow merging blocks
+			with different parents, since the merge would need
+			to update the entry (for MBR and primary key) in
+			the parent of the block being merged */
+			if (!rtr_check_same_block(
+				index, &cursor2,
+				btr_cur_get_block(&father_cursor), heap)) {
+				goto cannot_merge;
+			}
+
+			/* Set rtr_info for cursor2, since it is
+			necessary in recursive page merge. */
+			cursor2.rtr_info = cursor->rtr_info;
+			cursor2.tree_height = cursor->tree_height;
+		} else if (!btr_page_get_father(mtr, &cursor2)) {
+			goto cannot_merge;
+		}
+
+		if (merge_page_zip && left_page_no == FIL_NULL) {
+
+			/* The function page_zip_compress(), which will be
+			invoked by page_copy_rec_list_end() below,
+			requires that FIL_PAGE_PREV be FIL_NULL.
+			Clear the field, but prepare to restore it. */
+			static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
+			memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
+			compile_time_assert(FIL_NULL == 0xffffffffU);
+			memset_aligned<4>(merge_page + FIL_PAGE_PREV, 0xff, 4);
+		}
+
+		orig_succ = page_copy_rec_list_end(merge_block, block,
+						   page_get_infimum_rec(page),
+						   cursor->index(), mtr, &err);
+
+		if (!orig_succ) {
+			ut_a(merge_page_zip);
+			if (left_page_no == FIL_NULL) {
+				/* FIL_PAGE_PREV was restored from
+				merge_page_zip. */
+				ut_ad(!memcmp(fil_page_prev,
+					      merge_page + FIL_PAGE_PREV, 4));
+			}
+			goto err_exit;
+		}
+
+		btr_search_drop_page_hash_index(block, false);
+
+		if (merge_page_zip && left_page_no == FIL_NULL) {
+
+			/* Restore FIL_PAGE_PREV in order to avoid an assertion
+			failure in btr_level_list_remove(), which will set
+			the field again to FIL_NULL. Even though this makes
+			merge_page and merge_page_zip inconsistent for a
+			split second, it is harmless, because the pages
+			are X-latched. 
*/ + memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4); + } + + /* Remove the page from the level list */ + err = btr_level_list_remove(*block, *index, mtr); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto err_exit; + } + + ut_ad(btr_node_ptr_get_child_page_no( + btr_cur_get_rec(&father_cursor), offsets) + == block->page.id().page_no()); + + /* Replace the address of the old child node (= page) with the + address of the merge page to the right */ + btr_node_ptr_set_child_page_no( + btr_cur_get_block(&father_cursor), + btr_cur_get_rec(&father_cursor), + offsets, right_page_no, mtr); + +#ifdef UNIV_DEBUG + if (!page_is_leaf(page) && left_page_no == FIL_NULL) { + ut_ad(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec( + buf_block_get_frame(merge_block))), + page_is_comp(page))); + } +#endif /* UNIV_DEBUG */ + + /* For rtree, we need to update father's mbr. */ + if (index->is_spatial()) { + rec_offs* offsets2; + ulint rec_info; + + offsets2 = rec_get_offsets( + btr_cur_get_rec(&cursor2), index, NULL, + page_is_leaf(btr_cur_get_page(&cursor2)) + ? index->n_fields : 0, + ULINT_UNDEFINED, &heap); + + ut_ad(btr_node_ptr_get_child_page_no( + btr_cur_get_rec(&cursor2), offsets2) + == right_page_no); + + rec_info = rec_get_info_bits( + btr_cur_get_rec(&father_cursor), + rec_offs_comp(offsets)); + if (rec_info & REC_INFO_MIN_REC_FLAG) { + /* When the father node ptr is minimal rec, + we will keep it and delete the node ptr of + merge page. */ + rtr_merge_and_update_mbr(&father_cursor, + &cursor2, + offsets, offsets2, + merge_page, mtr); + } else { + /* Otherwise, we will keep the node ptr of + merge page and delete the father node ptr. + This is for keeping the rec order in upper + level. */ + rtr_merge_and_update_mbr(&cursor2, + &father_cursor, + offsets2, offsets, + merge_page, mtr); + } + const page_id_t id{block->page.id()}; + lock_sys.prdt_page_free_from_discard(id); + } else { + + compressed = btr_cur_pessimistic_delete(&err, TRUE, + &cursor2, + BTR_CREATE_FLAG, + false, mtr); + ut_a(err == DB_SUCCESS); + + if (!compressed) { + btr_cur_compress_if_useful(&cursor2, false, + mtr); + } + + if (index->has_locking()) { + lock_update_merge_right( + merge_block, orig_succ, block); + } + } + } + + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(merge_page)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. This has to be done in a + separate mini-transaction that is committed before the + main mini-transaction. We cannot update the insert + buffer bitmap in this mini-transaction, because + btr_compress() can be invoked recursively without + committing the mini-transaction in between. Since + insert buffer bitmap pages have a lower rank than + B-tree pages, we must not access other pages in the + same mini-transaction after accessing an insert buffer + bitmap page. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. 
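+	For example, if a mini-transaction that incremented the bits
+	was made durable before a crash, but the mini-transaction that
+	would have consumed the corresponding free space was not, then
+	recovery would observe bits that promise more space than the
+	page actually has.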
*/ + + if (merge_block->zip_size()) { + /* Because the free bits may be incremented + and we cannot update the insert buffer bitmap + in the same mini-transaction, the only safe + thing we can do here is the pessimistic + approach: reset the free bits. */ + ibuf_reset_free_bits(merge_block); + } else { + /* On uncompressed pages, the free bits will + never increase here. Thus, it is safe to + write the bits accurately in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full(merge_block, + srv_page_size, + ULINT_UNDEFINED); + } + } + + ut_ad(page_validate(merge_page, index)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page, + index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (dict_index_is_spatial(index)) { + rtr_check_discard_page(index, NULL, block); + } + + /* Free the file page */ + err = btr_page_free(index, block, mtr); + if (err == DB_SUCCESS) { + ut_ad(leftmost_child + || btr_check_node_ptr(index, merge_block, mtr)); + goto success; + } else { + goto err_exit; + } +} + +/*************************************************************//** +Discards a page that is the only page on its level. This will empty +the whole B-tree, leaving just an empty root page. This function +should almost never be reached, because btr_compress(), which is invoked in +delete operations, calls btr_lift_page_up() to flatten the B-tree. */ +ATTRIBUTE_COLD +static +void +btr_discard_only_page_on_level( +/*===========================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_level = 0; + + ut_ad(!index->is_dummy); + + /* Save the PAGE_MAX_TRX_ID from the leaf page. */ + const trx_id_t max_trx_id = page_get_max_trx_id(block->page.frame); + const rec_t* r = page_rec_get_next( + page_get_infimum_rec(block->page.frame)); + /* In the caller we checked that a valid key exists in the page, + because we were able to look up a parent page. */ + ut_ad(r); + ut_ad(rec_is_metadata(r, *index) == index->is_instant()); + + while (block->page.id().page_no() != dict_index_get_page(index)) { + btr_cur_t cursor; + buf_block_t* father; + const page_t* page = buf_block_get_frame(block); + + ut_a(page_get_n_recs(page) == 1); + ut_a(page_level == btr_page_get_level(page)); + ut_a(!page_has_siblings(page)); + ut_ad(fil_page_index_page_check(page)); + ut_ad(block->page.id().space() == index->table->space->id); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + btr_search_drop_page_hash_index(block, false); + cursor.page_cur.index = index; + cursor.page_cur.block = block; + + if (index->is_spatial()) { + /* Check any concurrent search having this page */ + rtr_check_discard_page(index, NULL, block); + if (!rtr_page_get_father(mtr, nullptr, &cursor)) { + return; + } + } else { + if (!btr_page_get_father(mtr, &cursor)) { + return; + } + } + father = btr_cur_get_block(&cursor); + + if (index->has_locking()) { + lock_update_discard( + father, PAGE_HEAP_NO_SUPREMUM, block); + } + + /* Free the file page */ + if (btr_page_free(index, block, mtr) != DB_SUCCESS) { + return; + } + + block = father; + page_level++; + } + + /* block is the root page, which must be empty, except + for the node pointer to the (now discarded) block(s). 
*/ + ut_ad(!page_has_siblings(block->page.frame)); + + mem_heap_t* heap = nullptr; + const rec_t* rec = nullptr; + rec_offs* offsets = nullptr; + if (index->table->instant || index->must_avoid_clear_instant_add()) { + if (!rec_is_metadata(r, *index)) { + } else if (!index->table->instant + || rec_is_alter_metadata(r, *index)) { + heap = mem_heap_create(srv_page_size); + offsets = rec_get_offsets(r, index, nullptr, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + rec = rec_copy(mem_heap_alloc(heap, + rec_offs_size(offsets)), + r, offsets); + rec_offs_make_valid(rec, index, true, offsets); + } + } + + btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(block->page.frame)); + + if (index->is_primary()) { + if (rec) { + page_cur_t cur; + page_cur_set_before_first(block, &cur); + cur.index = index; + DBUG_ASSERT(index->table->instant); + DBUG_ASSERT(rec_is_alter_metadata(rec, *index)); + btr_set_instant(block, *index, mtr); + rec = page_cur_insert_rec_low(&cur, rec, offsets, mtr); + ut_ad(rec); + mem_heap_free(heap); + } else if (index->is_instant()) { + index->clear_instant_add(); + } + } else if (!index->table->is_temporary()) { + /* We play it safe and reset the free bits for the root */ + ibuf_reset_free_bits(block); + + ut_a(max_trx_id); + page_set_max_trx_id(block, + buf_block_get_page_zip(block), + max_trx_id, mtr); + } +} + +/*************************************************************//** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. */ +dberr_t +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + buf_block_t* merge_block; + buf_block_t* block; + btr_cur_t parent_cursor; + + block = btr_cur_get_block(cursor); + index = btr_cur_get_index(cursor); + parent_cursor.page_cur = cursor->page_cur; + + ut_ad(dict_index_get_page(index) != block->page.id().page_no()); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + MONITOR_INC(MONITOR_INDEX_DISCARD); + + if (index->is_spatial() + ? !rtr_page_get_father(mtr, cursor, &parent_cursor) + : !btr_page_get_father(mtr, &parent_cursor)) { + return DB_CORRUPTION; + } + + /* Decide the page which will inherit the locks */ + + const uint32_t left_page_no = btr_page_get_prev(block->page.frame); + const uint32_t right_page_no = btr_page_get_next(block->page.frame); + page_id_t merge_page_id{block->page.id()}; + + ut_d(bool parent_is_different = false); + dberr_t err; + if (left_page_no != FIL_NULL) { + merge_page_id.set_page_no(left_page_no); + merge_block = btr_block_reget(mtr, *index, merge_page_id, + &err); + if (UNIV_UNLIKELY(!merge_block)) { + return err; + } +#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. 
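+	Until that is done, the #else branch below documents the
+	intended corruption check; this branch only asserts in debug
+	builds.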
*/ + ut_ad(!memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_NEXT, + block->page.frame + FIL_PAGE_OFFSET, + 4)); +#else + if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_NEXT, + block->page.frame + + FIL_PAGE_OFFSET, 4))) { + return DB_CORRUPTION; + } +#endif + ut_d(parent_is_different = + (page_rec_get_next( + page_get_infimum_rec( + btr_cur_get_page( + &parent_cursor))) + == btr_cur_get_rec(&parent_cursor))); + } else if (right_page_no != FIL_NULL) { + merge_page_id.set_page_no(right_page_no); + merge_block = btr_block_reget(mtr, *index, merge_page_id, + &err); + if (UNIV_UNLIKELY(!merge_block)) { + return err; + } +#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */ + ut_ad(!memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_PREV, + block->page.frame + FIL_PAGE_OFFSET, + 4)); +#else + if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_PREV, + block->page.frame + + FIL_PAGE_OFFSET, 4))) { + return DB_CORRUPTION; + } +#endif + ut_d(parent_is_different = page_rec_is_supremum( + page_rec_get_next(btr_cur_get_rec(&parent_cursor)))); + if (page_is_leaf(merge_block->page.frame)) { + } else if (rec_t* node_ptr = + page_rec_get_next(page_get_infimum_rec( + merge_block->page.frame))) { + ut_ad(page_rec_is_user_rec(node_ptr)); + /* We have to mark the leftmost node pointer as the + predefined minimum record. */ + btr_set_min_rec_mark(node_ptr, *merge_block, + mtr); + } else { + return DB_CORRUPTION; + } + } else { + btr_discard_only_page_on_level(index, block, mtr); + return DB_SUCCESS; + } + + if (UNIV_UNLIKELY(memcmp_aligned<2>(&merge_block->page.frame + [PAGE_HEADER + PAGE_LEVEL], + &block->page.frame + [PAGE_HEADER + PAGE_LEVEL], 2))) { + return DB_CORRUPTION; + } + + btr_search_drop_page_hash_index(block, false); + + if (dict_index_is_spatial(index)) { + rtr_node_ptr_delete(&parent_cursor, mtr); + } else if (dberr_t err = + btr_cur_node_ptr_delete(&parent_cursor, mtr)) { + return err; + } + + /* Remove the page from the level list */ + if (dberr_t err = btr_level_list_remove(*block, *index, mtr)) { + return err; + } + +#ifdef UNIV_ZIP_DEBUG + if (page_zip_des_t* merge_page_zip + = buf_block_get_page_zip(merge_block)) + ut_a(page_zip_validate(merge_page_zip, + merge_block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (index->has_locking()) { + if (left_page_no != FIL_NULL) { + lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM, + block); + } else { + lock_update_discard(merge_block, + lock_get_min_heap_no(merge_block), + block); + } + + if (index->is_spatial()) { + rtr_check_discard_page(index, cursor, block); + } + } + + /* Free the file page */ + err = btr_page_free(index, block, mtr); + + if (err == DB_SUCCESS) { + /* btr_check_node_ptr() needs parent block latched. + If the merge_block's parent block is not same, + we cannot use btr_check_node_ptr() */ + ut_ad(parent_is_different + || btr_check_node_ptr(index, merge_block, mtr)); + + if (btr_cur_get_block(&parent_cursor)->page.id().page_no() + == index->page + && !page_has_siblings(btr_cur_get_page(&parent_cursor)) + && page_get_n_recs(btr_cur_get_page(&parent_cursor)) + == 1) { + btr_lift_page_up(index, merge_block, mtr, &err); + } + } + + return err; +} + +#ifdef UNIV_BTR_PRINT +/*************************************************************//** +Prints size info of a B-tree. 
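+This diagnostic function is only compiled when UNIV_BTR_PRINT is defined.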
*/ +void +btr_print_size( +/*===========*/ + dict_index_t* index) /*!< in: index tree */ +{ + page_t* root; + fseg_header_t* seg; + mtr_t mtr; + + if (dict_index_is_ibuf(index)) { + fputs("Sorry, cannot print info of an ibuf tree:" + " use ibuf functions\n", stderr); + + return; + } + + mtr_start(&mtr); + + root = btr_root_get(index, &mtr); + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + + if (!dict_index_is_ibuf(index)) { + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + } + + mtr_commit(&mtr); +} + +/************************************************************//** +Prints recursively index tree pages. */ +static +void +btr_print_recursive( +/*================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + ulint width, /*!< in: print this many entries from start + and end */ + mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ + rec_offs** offsets,/*!< in/out: buffer for rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page = buf_block_get_frame(block); + page_cur_t cursor; + ulint n_recs; + ulint i = 0; + mtr_t mtr2; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX)); + + ib::info() << "NODE ON LEVEL " << btr_page_get_level(page) + << " page " << block->page.id; + + page_print(block, index, width, width); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + while (!page_cur_is_after_last(&cursor)) { + + if (page_is_leaf(page)) { + + /* If this is the leaf level, do nothing */ + + } else if ((i <= width) || (i >= n_recs - width)) { + + const rec_t* node_ptr; + + mtr_start(&mtr2); + + node_ptr = page_cur_get_rec(&cursor); + + *offsets = rec_get_offsets( + node_ptr, index, *offsets, 0, + ULINT_UNDEFINED, heap); + if (buf_block_t *child = + btr_node_ptr_get_child(node_ptr, index, *offsets, + &mtr2)) { + btr_print_recursive(index, child, width, heap, + offsets, &mtr2); + } + mtr_commit(&mtr2); + } + + page_cur_move_to_next(&cursor); + i++; + } +} + +/**************************************************************//** +Prints directories and other info of all nodes in the tree. */ +void +btr_print_index( +/*============*/ + dict_index_t* index, /*!< in: index */ + ulint width) /*!< in: print this many entries from start + and end */ +{ + mtr_t mtr; + buf_block_t* root; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + fputs("--------------------------\n" + "INDEX TREE PRINT\n", stderr); + + mtr_start(&mtr); + + root = btr_root_block_get(index, RW_SX_LATCH, &mtr); + + btr_print_recursive(index, root, width, &heap, &offsets, &mtr); + if (heap) { + mem_heap_free(heap); + } + + mtr_commit(&mtr); + + ut_ad(btr_validate_index(index, 0)); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +/************************************************************//** +Checks that the node pointer to a page is appropriate. 
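+The caller must hold an X-latch on the page. The check trivially passes
+for the root page; for leaf pages, only the lookup of the father node
+pointer is exercised.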
+@return TRUE */ +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* tuple; + rec_offs* offsets; + btr_cur_t cursor; + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + if (dict_index_get_page(index) == block->page.id().page_no()) { + + return(TRUE); + } + + cursor.page_cur.index = index; + cursor.page_cur.block = block; + + heap = mem_heap_create(256); + + if (dict_index_is_spatial(index)) { + offsets = rtr_page_get_father_block(NULL, heap, mtr, + NULL, &cursor); + } else { + offsets = btr_page_get_father_block(NULL, heap, mtr, &cursor); + } + + ut_ad(offsets); + + if (page_is_leaf(page)) { + + goto func_exit; + } + + tuple = dict_index_build_node_ptr( + index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap, + btr_page_get_level(page)); + + /* For spatial index, the MBR in the parent rec could be different + with that of first rec of child, their relationship should be + "WITHIN" relationship */ + if (dict_index_is_spatial(index)) { + ut_a(!cmp_dtuple_rec_with_gis( + tuple, btr_cur_get_rec(&cursor), + PAGE_CUR_WITHIN)); + } else { + ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), index, + offsets)); + } +func_exit: + mem_heap_free(heap); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************//** +Display identification information for a record. */ +static +void +btr_index_rec_validate_report( +/*==========================*/ + const page_t* page, /*!< in: index page */ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index) /*!< in: index */ +{ + ib::info() << "Record in index " << index->name + << " of table " << index->table->name + << ", page " << page_id_t(page_get_space_id(page), + page_get_page_no(page)) + << ", at offset " << page_offset(rec); +} + +/************************************************************//** +Checks the size and number of fields in a record based on the definition of +the index. 
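+This is invoked for each user record of a page by
+btr_index_page_validate() when an index tree level is checked.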
+@return TRUE if ok */ +ibool +btr_index_rec_validate( +/*===================*/ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index, /*!< in: index */ + ibool dump_on_error) /*!< in: TRUE if the function + should print hex dump of record + and page on error */ +{ + ulint len; + const page_t* page; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + page = page_align(rec); + + ut_ad(index->n_core_fields); + + if (index->is_ibuf()) { + /* The insert buffer index tree can contain records from any + other index: we cannot check the number of fields or + their length */ + + return(TRUE); + } + +#ifdef VIRTUAL_INDEX_DEBUG + if (dict_index_has_virtual(index)) { + fprintf(stderr, "index name is %s\n", index->name()); + } +#endif + if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) { + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "Compact flag=" << !!page_is_comp(page) + << ", should be " << dict_table_is_comp(index->table); + + return(FALSE); + } + + const bool is_alter_metadata = page_is_leaf(page) + && !page_has_prev(page) + && index->is_primary() && index->table->instant + && rec == page_rec_get_next_const(page_get_infimum_rec(page)); + + if (is_alter_metadata + && !rec_is_alter_metadata(rec, page_is_comp(page))) { + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "First record is not ALTER TABLE metadata"; + return FALSE; + } + + if (!page_is_comp(page)) { + const ulint n_rec_fields = rec_get_n_fields_old(rec); + if (n_rec_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD + && index->id == DICT_INDEXES_ID) { + /* A record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. */ + } else if (is_alter_metadata) { + if (n_rec_fields != ulint(index->n_fields) + 1) { + goto n_field_mismatch; + } + } else if (n_rec_fields < index->n_core_fields + || n_rec_fields > index->n_fields) { +n_field_mismatch: + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "Has " << rec_get_n_fields_old(rec) + << " fields, should have " + << index->n_core_fields << ".." + << index->n_fields; + + if (dump_on_error) { + fputs("InnoDB: corrupt record ", stderr); + rec_print_old(stderr, rec); + putc('\n', stderr); + } + return(FALSE); + } + } + + offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + const dict_field_t* field = index->fields; + ut_ad(rec_offs_n_fields(offsets) + == ulint(index->n_fields) + is_alter_metadata); + + for (unsigned i = 0; i < rec_offs_n_fields(offsets); i++) { + rec_get_nth_field_offs(offsets, i, &len); + + ulint fixed_size; + + if (is_alter_metadata && i == index->first_user_field()) { + fixed_size = FIELD_REF_SIZE; + if (len != FIELD_REF_SIZE + || !rec_offs_nth_extern(offsets, i)) { + goto len_mismatch; + } + + continue; + } else { + fixed_size = dict_col_get_fixed_size( + field->col, page_is_comp(page)); + if (rec_offs_nth_extern(offsets, i)) { + const byte* data = rec_get_nth_field( + rec, offsets, i, &len); + len -= BTR_EXTERN_FIELD_REF_SIZE; + ulint extern_len = mach_read_from_4( + data + len + BTR_EXTERN_LEN + 4); + if (fixed_size == extern_len + len) { + goto next_field; + } + } + } + + /* Note that if fixed_size != 0, it equals the + length of a fixed-size column in the clustered index. + We should adjust it here. + A prefix index of the column is of fixed, but different + length. 
When fixed_size == 0, prefix_len is the maximum + length of the prefix index column. */ + + if (len_is_stored(len) + && (field->prefix_len + ? len > field->prefix_len + : (fixed_size && len != fixed_size))) { +len_mismatch: + btr_index_rec_validate_report(page, rec, index); + ib::error error; + + error << "Field " << i << " len is " << len + << ", should be " << fixed_size; + + if (dump_on_error) { + error << "; "; + rec_print(error.m_oss, rec, + rec_get_info_bits( + rec, rec_offs_comp(offsets)), + offsets); + } + if (heap) { + mem_heap_free(heap); + } + return(FALSE); + } +next_field: + field++; + } + +#ifdef VIRTUAL_INDEX_DEBUG + if (dict_index_has_virtual(index)) { + rec_print_new(stderr, rec, offsets); + } +#endif + + if (heap) { + mem_heap_free(heap); + } + return(TRUE); +} + +/************************************************************//** +Checks the size and number of fields in records based on the definition of +the index. +@return true if ok */ +static +bool +btr_index_page_validate( +/*====================*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index) /*!< in: index */ +{ + page_cur_t cur; +#ifndef DBUG_OFF + ulint nth = 1; +#endif /* !DBUG_OFF */ + + page_cur_set_before_first(block, &cur); + + /* Directory slot 0 should only contain the infimum record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(page_rec_get_nth_const( + page_cur_get_page(&cur), 0) + == cur.rec); + ut_a(page_dir_slot_get_n_owned( + page_dir_get_nth_slot( + page_cur_get_page(&cur), 0)) + == 1);); + + while (page_cur_move_to_next(&cur)) { + if (page_cur_is_after_last(&cur)) { + return true; + } + + if (!btr_index_rec_validate(cur.rec, index, TRUE)) { + break; + } + + /* Verify that page_rec_get_nth_const() is correctly + retrieving each record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(cur.rec == page_rec_get_nth_const( + page_cur_get_page(&cur), + page_rec_get_n_recs_before( + cur.rec))); + ut_a(nth++ == page_rec_get_n_recs_before( + cur.rec));); + } + + return false; +} + +/************************************************************//** +Report an error on one page of an index tree. */ +static +void +btr_validate_report1( +/*=================*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block) /*!< in: index page */ +{ + ib::error error; + error << "In page " << block->page.id().page_no() + << " of index " << index->name + << " of table " << index->table->name; + + if (level > 0) { + error << ", index tree level " << level; + } +} + +/************************************************************//** +Report an error on two pages of an index tree. */ +static +void +btr_validate_report2( +/*=================*/ + const dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block1, /*!< in: first index page */ + const buf_block_t* block2) /*!< in: second index page */ +{ + ib::error error; + error << "In pages " << block1->page.id() + << " and " << block2->page.id() << " of index " << index->name + << " of table " << index->table->name; + + if (level) + error << ", index tree level " << level; +} + +/** Validate an index tree level. 
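+@return DB_SUCCESS if ok, error code if not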
*/ +static +dberr_t +btr_validate_level( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + const trx_t* trx, /*!< in: transaction or NULL */ + ulint level) /*!< in: level number */ +{ + buf_block_t* block; + page_t* page; + buf_block_t* right_block = 0; /* remove warning */ + page_t* right_page = 0; /* remove warning */ + page_t* father_page; + btr_cur_t node_cur; + btr_cur_t right_node_cur; + rec_t* rec; + page_cur_t cursor; + dtuple_t* node_ptr_tuple; + mtr_t mtr; + mem_heap_t* heap = mem_heap_create(256); + rec_offs* offsets = NULL; + rec_offs* offsets2= NULL; +#ifdef UNIV_ZIP_DEBUG + page_zip_des_t* page_zip; +#endif /* UNIV_ZIP_DEBUG */ + + mtr.start(); + + mtr_x_lock_index(index, &mtr); + + dberr_t err; + block = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err); + if (!block) { + mtr.commit(); + return err; + } + page = buf_block_get_frame(block); + + fil_space_t* space = index->table->space; + + while (level != btr_page_get_level(page)) { + const rec_t* node_ptr; + switch (dberr_t e = + fseg_page_is_allocated(space, + block->page.id().page_no())) { + case DB_SUCCESS_LOCKED_REC: + break; + case DB_SUCCESS: + btr_validate_report1(index, level, block); + ib::warn() << "Page is free"; + e = DB_CORRUPTION; + /* fall through */ + default: + err = e; + } + ut_ad(index->table->space_id == block->page.id().space()); + ut_ad(block->page.id().space() == page_get_space_id(page)); +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + if (page_is_leaf(page)) { +corrupted: + err = DB_CORRUPTION; + goto invalid_page; + } + + page_cur_set_before_first(block, &cursor); + if (!(node_ptr = page_cur_move_to_next(&cursor))) { + goto corrupted; + } + + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr, + &err); + if (!block) { + break; + } + page = buf_block_get_frame(block); + + /* For R-Tree, since record order might not be the same as + linked index page in the lower level, we need to travers + backwards to get the first page rec in this level. + This is only used for index validation. Spatial index + does not use such scan for any of its DML or query + operations */ + if (dict_index_is_spatial(index)) { + uint32_t left_page_no = btr_page_get_prev(page); + + while (left_page_no != FIL_NULL) { + /* To obey latch order of tree blocks, + we should release the right_block once to + obtain lock of the uncle block. */ + mtr.release_last_page(); + + block = btr_block_get(*index, left_page_no, + RW_SX_LATCH, false, + &mtr, &err); + if (!block) { + goto invalid_page; + } + page = buf_block_get_frame(block); + left_page_no = btr_page_get_prev(page); + } + } + } + + /* Now we are on the desired level. Loop through the pages on that + level. 
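+	Each iteration validates one page, commits the mini-transaction
+	to release its latches, and then follows FIL_PAGE_NEXT to the
+	right sibling.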
*/ + +loop: + if (!block) { +invalid_page: + mtr.commit(); +func_exit: + mem_heap_free(heap); + return err; + } + + mem_heap_empty(heap); + offsets = offsets2 = NULL; + + mtr_x_lock_index(index, &mtr); + + page = block->page.frame; + +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (DB_SUCCESS_LOCKED_REC + != fseg_page_is_allocated(space, block->page.id().page_no())) { + btr_validate_report1(index, level, block); + + ib::warn() << "Page is marked as free"; + err = DB_CORRUPTION; + } else if (btr_page_get_index_id(page) != index->id) { + ib::error() << "Page index id " << btr_page_get_index_id(page) + << " != data dictionary index id " << index->id; + err = DB_CORRUPTION; + } else if (!page_validate(page, index)) { + btr_validate_report1(index, level, block); + err = DB_CORRUPTION; + } else if (btr_page_get_level(page) != level) { + btr_validate_report1(index, level, block); + ib::error() << "Page level is not " << level; + err = DB_CORRUPTION; + } else if (level == 0 && !btr_index_page_validate(block, index)) { + /* We are on level 0. Check that the records have the right + number of fields, and field lengths are right. */ + err = DB_CORRUPTION; + } else if (!page_is_empty(page)) { + } else if (level) { + btr_validate_report1(index, level, block); + ib::error() << "Non-leaf page is empty"; + } else if (block->page.id().page_no() != index->page) { + btr_validate_report1(index, level, block); + ib::error() << "Empty leaf page is not index root"; + } + + uint32_t right_page_no = btr_page_get_next(page); + uint32_t left_page_no = btr_page_get_prev(page); + + if (right_page_no != FIL_NULL) { + const rec_t* right_rec; + + right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH, + !level, &mtr, &err); + if (!right_block) { + btr_validate_report1(index, level, block); + fputs("InnoDB: broken FIL_PAGE_NEXT link\n", stderr); + goto invalid_page; + } + right_page = buf_block_get_frame(right_block); + + if (btr_page_get_prev(right_page) != page_get_page_no(page)) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: broken FIL_PAGE_NEXT" + " or FIL_PAGE_PREV links\n", stderr); + err = DB_CORRUPTION; + } + + if (!(rec = page_rec_get_prev(page_get_supremum_rec(page)))) { +broken_links: + btr_validate_report1(index, level, block); + fputs("InnoDB: broken record links\n", stderr); + goto invalid_page; + } + if (!(right_rec = + page_rec_get_next(page_get_infimum_rec(right_page)))) { + goto broken_links; + } + + offsets = rec_get_offsets(rec, index, offsets, + page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + offsets2 = rec_get_offsets(right_rec, index, offsets2, + page_is_leaf(right_page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + + /* For spatial index, we cannot guarantee the key ordering + across pages, so skip the record compare verification for + now. 
Will enhanced in special R-Tree index validation scheme */ + if (index->is_btree() + && cmp_rec_rec(rec, right_rec, + offsets, offsets2, index) >= 0) { + + btr_validate_report2(index, level, block, right_block); + + fputs("InnoDB: records in wrong order" + " on adjacent pages\n", stderr); + + rec = page_rec_get_prev(page_get_supremum_rec(page)); + if (rec) { + fputs("InnoDB: record ", stderr); + rec_print(stderr, rec, index); + putc('\n', stderr); + } + fputs("InnoDB: record ", stderr); + rec = page_rec_get_next( + page_get_infimum_rec(right_page)); + if (rec) { + rec_print(stderr, rec, index); + } + putc('\n', stderr); + err = DB_CORRUPTION; + } + } + + if (!level || left_page_no != FIL_NULL) { + } else if (const rec_t* first = + page_rec_get_next_const(page_get_infimum_rec(page))) { + if (!(REC_INFO_MIN_REC_FLAG + & rec_get_info_bits(first, page_is_comp(page)))) { + btr_validate_report1(index, level, block); + ib::error() << "Missing REC_INFO_MIN_REC_FLAG"; + err = DB_CORRUPTION; + } + } else { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + /* Similarly skip the father node check for spatial index for now, + for a couple of reasons: + 1) As mentioned, there is no ordering relationship between records + in parent level and linked pages in the child level. + 2) Search parent from root is very costly for R-tree. + We will add special validation mechanism for R-tree later (WL #7520) */ + if (index->is_btree() && block->page.id().page_no() != index->page) { + /* Check father node pointers */ + rec_t* node_ptr + = page_rec_get_next(page_get_infimum_rec(page)); + if (!node_ptr) { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + btr_cur_position(index, node_ptr, block, &node_cur); + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &node_cur, &mtr); + + father_page = btr_cur_get_page(&node_cur); + node_ptr = btr_cur_get_rec(&node_cur); + + rec = page_rec_get_prev(page_get_supremum_rec(page)); + if (rec) { + btr_cur_position(index, rec, block, &node_cur); + + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &node_cur, &mtr); + } else { + offsets = nullptr; + } + + if (!offsets || node_ptr != btr_cur_get_rec(&node_cur) + || btr_node_ptr_get_child_page_no(node_ptr, offsets) + != block->page.id().page_no()) { + + btr_validate_report1(index, level, block); + + fputs("InnoDB: node pointer to the page is wrong\n", + stderr); + + fputs("InnoDB: node ptr ", stderr); + rec_print(stderr, node_ptr, index); + + if (offsets) { + rec = btr_cur_get_rec(&node_cur); + fprintf(stderr, "\n" + "InnoDB: node ptr child page n:o %u\n", + btr_node_ptr_get_child_page_no( + rec, offsets)); + fputs("InnoDB: record on page ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + } + + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + if (page_is_leaf(page)) { + } else if (const rec_t* first_rec = + page_rec_get_next(page_get_infimum_rec(page))) { + node_ptr_tuple = dict_index_build_node_ptr( + index, first_rec, + 0, heap, btr_page_get_level(page)); + + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, index, + offsets)) { + btr_validate_report1(index, level, block); + + ib::error() << "Node ptrs differ on levels > 0"; + + fputs("InnoDB: node ptr ",stderr); + rec_print_new(stderr, node_ptr, offsets); + fputs("InnoDB: first rec ", stderr); + rec_print(stderr, first_rec, index); + putc('\n', stderr); + err = DB_CORRUPTION; + goto node_ptr_fails; + } + } else { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + if (left_page_no == FIL_NULL) { + if 
(page_has_prev(father_page) + || node_ptr != page_rec_get_next( + page_get_infimum_rec(father_page))) { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + } + + if (right_page_no == FIL_NULL) { + if (page_has_next(father_page) + || node_ptr != page_rec_get_prev( + page_get_supremum_rec(father_page))) { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + } else if (const rec_t* right_node_ptr + = page_rec_get_next(node_ptr)) { + btr_cur_position( + index, + page_get_infimum_rec(right_block->page.frame), + right_block, &right_node_cur); + if (!page_cur_move_to_next(&right_node_cur.page_cur)) { + goto node_pointer_corrupted; + } + + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &right_node_cur, &mtr); + + if (right_node_ptr + != page_get_supremum_rec(father_page)) { + + if (btr_cur_get_rec(&right_node_cur) + != right_node_ptr) { +node_pointer_corrupted: + err = DB_CORRUPTION; + fputs("InnoDB: node pointer to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + } else { + page_t* right_father_page + = btr_cur_get_page(&right_node_cur); + + if (btr_cur_get_rec(&right_node_cur) + != page_rec_get_next( + page_get_infimum_rec( + right_father_page))) { + err = DB_CORRUPTION; + fputs("InnoDB: node pointer 2 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + + if (page_get_page_no(right_father_page) + != btr_page_get_next(father_page)) { + + err = DB_CORRUPTION; + fputs("InnoDB: node pointer 3 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + } + } else { + err = DB_CORRUPTION; + } + } + +node_ptr_fails: + /* Commit the mini-transaction to release the latch on 'page'. + Re-acquire the latch on right_page, which will become 'page' + on the next loop. The page has already been checked. */ + mtr.commit(); + + if (trx_is_interrupted(trx)) { + /* On interrupt, return the current status. */ + } else if (right_page_no != FIL_NULL) { + + mtr.start(); + + block = btr_block_get(*index, right_page_no, RW_SX_LATCH, + !level, &mtr, &err); + goto loop; + } + + goto func_exit; +} + +/**************************************************************//** +Checks the consistency of an index tree. +@return DB_SUCCESS if ok, error code if not */ +dberr_t +btr_validate_index( +/*===============*/ + dict_index_t* index, /*!< in: index */ + const trx_t* trx) /*!< in: transaction or NULL */ +{ + mtr_t mtr; + mtr.start(); + + mtr_x_lock_index(index, &mtr); + + dberr_t err; + if (page_t *root= btr_root_get(index, &mtr, &err)) + for (auto level= btr_page_get_level(root);; level--) + { + if (dberr_t err_level= btr_validate_level(index, trx, level)) + err= err_level; + if (!level) + break; + } + + mtr.commit(); + return err; +} + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. +@return true if possible to merge. 
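+Merging requires that all records of the cursor page fit in the sibling
+page after reorganization and, for ROW_FORMAT=COMPRESSED leaf pages, that
+the combined data size stays below the compression padding limit.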
*/ +static +bool +btr_can_merge_with_page( +/*====================*/ + btr_cur_t* cursor, /*!< in: cursor on the page to merge */ + uint32_t page_no, /*!< in: a sibling page */ + buf_block_t** merge_block, /*!< out: the merge block */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + dict_index_t* index; + page_t* page; + ulint n_recs; + ulint data_size; + ulint max_ins_size_reorg; + ulint max_ins_size; + buf_block_t* mblock; + page_t* mpage; + DBUG_ENTER("btr_can_merge_with_page"); + + if (page_no == FIL_NULL) { +error: + *merge_block = NULL; + DBUG_RETURN(false); + } + + index = btr_cur_get_index(cursor); + page = btr_cur_get_page(cursor); + + mblock = btr_block_get(*index, page_no, RW_X_LATCH, page_is_leaf(page), + mtr); + if (!mblock) { + goto error; + } + mpage = buf_block_get_frame(mblock); + + n_recs = page_get_n_recs(page); + data_size = page_get_data_size(page); + + max_ins_size_reorg = page_get_max_insert_size_after_reorganize( + mpage, n_recs); + + if (data_size > max_ins_size_reorg) { + goto error; + } + + /* If compression padding tells us that merging will result in + too packed up page i.e.: which is likely to cause compression + failure then don't merge the pages. */ + if (mblock->page.zip.data && page_is_leaf(mpage) + && (page_get_data_size(mpage) + data_size + >= dict_index_zip_pad_optimal_page_size(index))) { + + goto error; + } + + max_ins_size = page_get_max_insert_size(mpage, n_recs); + + if (data_size > max_ins_size) { + /* We have to reorganize mpage */ + if (btr_page_reorganize_block(page_zip_level, mblock, index, + mtr) != DB_SUCCESS) { + goto error; + } + + max_ins_size = page_get_max_insert_size(mpage, n_recs); + + ut_ad(page_validate(mpage, index)); + ut_ad(max_ins_size == max_ins_size_reorg); + + if (data_size > max_ins_size) { + + /* Add fault tolerance, though this should + never happen */ + + goto error; + } + } + + *merge_block = mblock; + DBUG_RETURN(true); +} diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc new file mode 100644 index 00000000..013cd131 --- /dev/null +++ b/storage/innobase/btr/btr0bulk.cc @@ -0,0 +1,1233 @@ +/***************************************************************************** + +Copyright (c) 2014, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0bulk.cc +The B-tree bulk load + +Created 03/11/2014 Shaohua Wang +*******************************************************/ + +#include "btr0bulk.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "ibuf0ibuf.h" +#include "page0page.h" +#include "trx0trx.h" + +/** Innodb B-tree index fill factor for bulk load. 
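+When set to 100 for a clustered index, PageBulk::init() keeps the
+5.6-compatible reserve of dict_index_get_space_reserve(); otherwise the
+space reserved in each page is
+srv_page_size * (100 - innobase_fill_factor) / 100.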
*/ +uint innobase_fill_factor; + +/** Initialize members, allocate page if needed and start mtr. +Note: we commit all mtrs on failure. +@return error code. */ +dberr_t +PageBulk::init() +{ + buf_block_t* new_block; + page_t* new_page; + + ut_ad(m_heap == NULL); + m_heap = mem_heap_create(1000); + + m_mtr.start(); + m_index->set_modified(m_mtr); + + if (m_page_no == FIL_NULL) { + mtr_t alloc_mtr; + + /* We commit redo log for allocation by a separate mtr, + because we don't guarantee pages are committed following + the allocation order, and we will always generate redo log + for page allocation, even when creating a new tablespace. */ + alloc_mtr.start(); + m_index->set_modified(alloc_mtr); + + uint32_t n_reserved; + dberr_t err = fsp_reserve_free_extents( + &n_reserved, m_index->table->space, 1, FSP_NORMAL, + &alloc_mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { +oom: + alloc_mtr.commit(); + m_mtr.commit(); + return err; + } + + /* Allocate a new page. */ + new_block = btr_page_alloc(m_index, 0, FSP_UP, m_level, + &alloc_mtr, &m_mtr, &err); + if (!new_block) { + goto oom; + } + + m_index->table->space->release_free_extents(n_reserved); + + alloc_mtr.commit(); + + new_page = buf_block_get_frame(new_block); + m_page_no = new_block->page.id().page_no(); + + byte* index_id = my_assume_aligned<2> + (PAGE_HEADER + PAGE_INDEX_ID + new_page); + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + memset_aligned<8>(new_page + FIL_PAGE_PREV, 0xff, 8); + + if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { + mach_write_to_8(index_id, m_index->id); + page_create_zip(new_block, m_index, m_level, 0, + &m_mtr); + } else { + ut_ad(!m_index->is_spatial()); + page_create(new_block, &m_mtr, + m_index->table->not_redundant()); + m_mtr.memset(*new_block, FIL_PAGE_PREV, 8, 0xff); + m_mtr.write<2,mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER + + PAGE_LEVEL + + new_page, m_level); + m_mtr.write<8>(*new_block, index_id, m_index->id); + } + } else { + new_block = btr_block_get(*m_index, m_page_no, RW_X_LATCH, + false, &m_mtr); + if (!new_block) { + m_mtr.commit(); + return(DB_CORRUPTION); + } + + new_page = buf_block_get_frame(new_block); + + ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW); + + btr_page_set_level(new_block, m_level, &m_mtr); + } + + m_page_zip = buf_block_get_page_zip(new_block); + + if (!m_level && dict_index_is_sec_or_ibuf(m_index)) { + page_update_max_trx_id(new_block, m_page_zip, m_trx_id, + &m_mtr); + } + + m_block = new_block; + m_page = new_page; + m_cur_rec = page_get_infimum_rec(new_page); + ut_ad(m_is_comp == !!page_is_comp(new_page)); + m_free_space = page_get_free_space_of_empty(m_is_comp); + + if (innobase_fill_factor == 100 && dict_index_is_clust(m_index)) { + /* Keep default behavior compatible with 5.6 */ + m_reserved_space = dict_index_get_space_reserve(); + } else { + m_reserved_space = + srv_page_size * (100 - innobase_fill_factor) / 100; + } + + m_padding_space = + srv_page_size - dict_index_zip_pad_optimal_page_size(m_index); + m_heap_top = page_header_get_ptr(new_page, PAGE_HEAP_TOP); + m_rec_no = page_header_get_field(new_page, PAGE_N_RECS); + /* Temporarily reset PAGE_DIRECTION_B from PAGE_NO_DIRECTION to 0, + without writing redo log, to ensure that needs_finish() will hold + on an empty page. */ + ut_ad(m_page[PAGE_HEADER + PAGE_DIRECTION_B] == PAGE_NO_DIRECTION); + m_page[PAGE_HEADER + PAGE_DIRECTION_B] = 0; + ut_d(m_total_data = 0); + + return(DB_SUCCESS); +} + +/** Insert a record in the page. 
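+The record is appended at the heap top of the page and linked in after
+m_cur_rec; how the write is redo-logged depends on the page format.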
+@tparam fmt	the page format
+@param[in,out]	rec	record
+@param[in]	offsets	record offsets */
+template<PageBulk::format fmt>
+inline void PageBulk::insertPage(rec_t *rec, rec_offs *offsets)
+{
+  ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED));
+  ut_ad((fmt != REDUNDANT) == m_is_comp);
+  ut_ad(page_align(m_heap_top) == m_page);
+  ut_ad(m_heap);
+
+  const ulint rec_size= rec_offs_size(offsets);
+  const ulint extra_size= rec_offs_extra_size(offsets);
+  ut_ad(page_align(m_heap_top + rec_size) == m_page);
+  ut_d(const bool is_leaf= page_rec_is_leaf(m_cur_rec));
+
+#ifdef UNIV_DEBUG
+  /* Check whether records are in order. */
+  if (page_offset(m_cur_rec) !=
+      (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
+  {
+    const rec_t *old_rec = m_cur_rec;
+    rec_offs *old_offsets= rec_get_offsets(old_rec, m_index, nullptr, is_leaf
+                                           ? m_index->n_core_fields : 0,
+                                           ULINT_UNDEFINED, &m_heap);
+    ut_ad(cmp_rec_rec(rec, old_rec, offsets, old_offsets, m_index) > 0);
+  }
+
+  m_total_data+= rec_size;
+#endif /* UNIV_DEBUG */
+
+  rec_t* const insert_rec= m_heap_top + extra_size;
+
+  /* Insert the record in the linked list. */
+  if (fmt != REDUNDANT)
+  {
+    const rec_t *next_rec= m_page +
+      page_offset(m_cur_rec + mach_read_from_2(m_cur_rec - REC_NEXT));
+    if (fmt != COMPRESSED)
+      m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT,
+                     static_cast<uint16_t>(insert_rec - m_cur_rec));
+    else
+    {
+      mach_write_to_2(m_cur_rec - REC_NEXT,
+                      static_cast<uint16_t>(insert_rec - m_cur_rec));
+      memcpy(m_heap_top, rec - extra_size, rec_size);
+    }
+
+    rec_t * const this_rec= fmt != COMPRESSED
+      ? const_cast<rec_t*>(rec) : insert_rec;
+    rec_set_bit_field_1(this_rec, 0, REC_NEW_N_OWNED, REC_N_OWNED_MASK,
+                        REC_N_OWNED_SHIFT);
+    rec_set_bit_field_2(this_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no,
+                        REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+    mach_write_to_2(this_rec - REC_NEXT,
+                    static_cast<uint16_t>(next_rec - insert_rec));
+  }
+  else
+  {
+    memcpy(const_cast<rec_t*>(rec) - REC_NEXT, m_cur_rec - REC_NEXT, 2);
+    m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, page_offset(insert_rec));
+    rec_set_bit_field_1(const_cast<rec_t*>(rec), 0,
+                        REC_OLD_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+    rec_set_bit_field_2(const_cast<rec_t*>(rec),
+                        PAGE_HEAP_NO_USER_LOW + m_rec_no,
+                        REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+  }
+
+  if (fmt == COMPRESSED)
+    /* We already wrote the record. Log is written in PageBulk::compress(). */;
+  else if (page_offset(m_cur_rec) ==
+           (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
+    m_mtr.memcpy(*m_block, m_heap_top, rec - extra_size, rec_size);
+  else
+  {
+    /* Try to copy common prefix from the preceding record. */
+    const byte *r= rec - extra_size;
+    const byte * const insert_rec_end= m_heap_top + rec_size;
+    byte *b= m_heap_top;
+
+    /* Skip any unchanged prefix of the record. */
+    for (; *b == *r; b++, r++);
+
+    ut_ad(b < insert_rec_end);
+
+    const byte *c= m_cur_rec - (rec - r);
+    const byte * const c_end= std::min(m_cur_rec + rec_offs_data_size(offsets),
+                                       m_heap_top);
+
+    /* Try to copy any bytes of the preceding record. */
+    if (UNIV_LIKELY(c >= m_page && c < c_end))
+    {
+      const byte *cm= c;
+      byte *bm= b;
+      const byte *rm= r;
+      for (; cm < c_end && *rm == *cm; cm++, bm++, rm++);
+      ut_ad(bm <= insert_rec_end);
+      size_t len= static_cast<size_t>(rm - r);
+      ut_ad(!memcmp(r, c, len));
+      if (len > 2)
+      {
+        memcpy(b, c, len);
+        m_mtr.memmove(*m_block, page_offset(b), page_offset(c), len);
+        c= cm;
+        b= bm;
+        r= rm;
+      }
+    }
+
+    if (c < m_cur_rec)
+    {
+      if (!rec_offs_data_size(offsets))
+      {
+no_data:
+        m_mtr.memcpy(*m_block, b, r, m_cur_rec - c);
+        goto rec_done;
+      }
+      /* Some header bytes differ. Compare the data separately. */
+      const byte *cd= m_cur_rec;
+      byte *bd= insert_rec;
+      const byte *rd= rec;
+      /* Skip any unchanged prefix of the record. */
+      for (;; cd++, bd++, rd++)
+        if (bd == insert_rec_end)
+          goto no_data;
+        else if (*bd != *rd)
+          break;
+
+      /* Try to copy any data bytes of the preceding record. */
+      if (c_end - cd > 2)
+      {
+        const byte *cdm= cd;
+        const byte *rdm= rd;
+        for (; cdm < c_end && *rdm == *cdm; cdm++, rdm++)
+          ut_ad(rdm - rd + bd <= insert_rec_end);
+        size_t len= static_cast<size_t>(rdm - rd);
+        ut_ad(!memcmp(rd, cd, len));
+        if (len > 2)
+        {
+          m_mtr.memcpy(*m_block, b, r, m_cur_rec - c);
+          memcpy(bd, cd, len);
+          m_mtr.memmove(*m_block, page_offset(bd), page_offset(cd), len);
+          c= cdm;
+          b= rdm - rd + bd;
+          r= rdm;
+        }
+      }
+    }
+
+    if (size_t len= static_cast<size_t>(insert_rec_end - b))
+      m_mtr.memcpy(*m_block, b, r, len);
+  }
+
+rec_done:
+  ut_ad(fmt == COMPRESSED || !memcmp(m_heap_top, rec - extra_size, rec_size));
+  rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets);
+
+  /* Update the member variables. */
+  ulint slot_size= page_dir_calc_reserved_space(m_rec_no + 1) -
+    page_dir_calc_reserved_space(m_rec_no);
+
+  ut_ad(m_free_space >= rec_size + slot_size);
+  ut_ad(m_heap_top + rec_size < m_page + srv_page_size);
+
+  m_free_space-= rec_size + slot_size;
+  m_heap_top+= rec_size;
+  m_rec_no++;
+  m_cur_rec= insert_rec;
+}
+
+/** Insert a record in the page.
+@param[in]	rec	record
+@param[in]	offsets	record offsets */
+inline void PageBulk::insert(const rec_t *rec, rec_offs *offsets)
+{
+  byte rec_hdr[REC_N_OLD_EXTRA_BYTES];
+  static_assert(REC_N_OLD_EXTRA_BYTES > REC_N_NEW_EXTRA_BYTES, "file format");
+
+  if (UNIV_LIKELY_NULL(m_page_zip))
+    insertPage<COMPRESSED>(const_cast<rec_t*>(rec), offsets);
+  else if (m_is_comp)
+  {
+    memcpy(rec_hdr, rec - REC_N_NEW_EXTRA_BYTES, REC_N_NEW_EXTRA_BYTES);
+    insertPage<DYNAMIC>(const_cast<rec_t*>(rec), offsets);
+    memcpy(const_cast<rec_t*>(rec) - REC_N_NEW_EXTRA_BYTES, rec_hdr,
+           REC_N_NEW_EXTRA_BYTES);
+  }
+  else
+  {
+    memcpy(rec_hdr, rec - REC_N_OLD_EXTRA_BYTES, REC_N_OLD_EXTRA_BYTES);
+    insertPage<REDUNDANT>(const_cast<rec_t*>(rec), offsets);
+    memcpy(const_cast<rec_t*>(rec) - REC_N_OLD_EXTRA_BYTES, rec_hdr,
+           REC_N_OLD_EXTRA_BYTES);
+  }
+}
+
+/** Set the number of owned records in the uncompressed page of
+a ROW_FORMAT=COMPRESSED record without redo-logging. */
+static void rec_set_n_owned_zip(rec_t *rec, ulint n_owned)
+{
+  rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+                      REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+}
+
+/** Mark end of insertion to the page. Scan all records to set page dirs,
+and set page header members.
+@tparam fmt	page format */
+template<PageBulk::format fmt>
+inline void PageBulk::finishPage()
+{
+  ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED));
+  ut_ad((fmt != REDUNDANT) == m_is_comp);
+
+  ulint count= 0;
+  byte *slot= my_assume_aligned<2>(m_page + srv_page_size -
+                                   (PAGE_DIR + PAGE_DIR_SLOT_SIZE));
+  const page_dir_slot_t *const slot0 = slot;
+  compile_time_assert(PAGE_DIR_SLOT_SIZE == 2);
+  if (fmt != REDUNDANT)
+  {
+    uint16_t offset= mach_read_from_2(PAGE_NEW_INFIMUM - REC_NEXT + m_page);
+    ut_ad(offset >= PAGE_NEW_SUPREMUM - PAGE_NEW_INFIMUM);
+    offset= static_cast<uint16_t>(offset + PAGE_NEW_INFIMUM);
+    /* Set owner & dir. */
+    while (offset != PAGE_NEW_SUPREMUM)
+    {
+      ut_ad(offset >= PAGE_NEW_SUPREMUM);
+      ut_ad(offset < page_offset(slot));
+      count++;
+
+      if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
+      {
+        slot-= PAGE_DIR_SLOT_SIZE;
+        mach_write_to_2(slot, offset);
+
+        if (fmt != COMPRESSED)
+          page_rec_set_n_owned(m_block, m_page + offset, count, true,
+                               &m_mtr);
+        else
+          rec_set_n_owned_zip(m_page + offset, count);
+
+        count= 0;
+      }
+
+      uint16_t next= static_cast<uint16_t>
+        ((mach_read_from_2(m_page + offset - REC_NEXT) + offset) &
+         (srv_page_size - 1));
+      ut_ad(next);
+      offset= next;
+    }
+
+    if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <=
+                          PAGE_DIR_SLOT_MAX_N_OWNED))
+    {
+      /* Merge the last two slots, like page_cur_insert_rec_low() does. */
+      count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+      rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot));
+      if (fmt != COMPRESSED)
+        page_rec_set_n_owned(m_block, rec, 0, true, &m_mtr);
+      else
+        rec_set_n_owned_zip(rec, 0);
+    }
+    else
+      slot-= PAGE_DIR_SLOT_SIZE;
+
+    mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
+    if (fmt != COMPRESSED)
+      page_rec_set_n_owned(m_block, m_page + PAGE_NEW_SUPREMUM,
+                           count + 1, true, &m_mtr);
+    else
+      rec_set_n_owned_zip(m_page + PAGE_NEW_SUPREMUM, count + 1);
+  }
+  else
+  {
+    rec_t *insert_rec= m_page +
+      mach_read_from_2(PAGE_OLD_INFIMUM - REC_NEXT + m_page);
+
+    /* Set owner & dir. */
+    while (insert_rec != m_page + PAGE_OLD_SUPREMUM)
+    {
+      count++;
+
+      if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
+      {
+        slot-= PAGE_DIR_SLOT_SIZE;
+        mach_write_to_2(slot, page_offset(insert_rec));
+        page_rec_set_n_owned(m_block, insert_rec, count, false, &m_mtr);
+        count= 0;
+      }
+
+      insert_rec= m_page + mach_read_from_2(insert_rec - REC_NEXT);
+    }
+
+    if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <=
+                          PAGE_DIR_SLOT_MAX_N_OWNED))
+    {
+      /* Merge the last two slots, like page_cur_insert_rec_low() does. */
+      count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+      rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot));
+      page_rec_set_n_owned(m_block, rec, 0, false, &m_mtr);
+    }
+    else
+      slot-= PAGE_DIR_SLOT_SIZE;
+
+    mach_write_to_2(slot, PAGE_OLD_SUPREMUM);
+    page_rec_set_n_owned(m_block, m_page + PAGE_OLD_SUPREMUM, count + 1,
+                         false, &m_mtr);
+  }
+
+  if (!m_rec_no);
+  else if (fmt != COMPRESSED)
+  {
+    static_assert(PAGE_N_DIR_SLOTS == 0, "compatibility");
+    alignas(8) byte page_header[PAGE_N_HEAP + 2];
+    mach_write_to_2(page_header + PAGE_N_DIR_SLOTS,
+                    1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE);
+    mach_write_to_2(page_header + PAGE_HEAP_TOP, m_heap_top - m_page);
+    mach_write_to_2(page_header + PAGE_N_HEAP,
+                    (PAGE_HEAP_NO_USER_LOW + m_rec_no) |
+                    uint16_t{fmt != REDUNDANT} << 15);
+    m_mtr.memcpy(*m_block, PAGE_HEADER + m_page, page_header,
+                 sizeof page_header);
+    m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
+    m_mtr.memcpy(*m_block, page_offset(slot), slot0 - slot);
+  }
+  else
+  {
+    /* For ROW_FORMAT=COMPRESSED, redo log may be written in
+    PageBulk::compress(). */
+    mach_write_to_2(PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page,
+                    1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE);
+    mach_write_to_2(PAGE_HEADER + PAGE_HEAP_TOP + m_page,
+                    static_cast<ulint>(m_heap_top - m_page));
+    mach_write_to_2(PAGE_HEADER + PAGE_N_HEAP + m_page,
+                    (PAGE_HEAP_NO_USER_LOW + m_rec_no) | 1U << 15);
+    mach_write_to_2(PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
+  }
+}
+
+inline bool PageBulk::needs_finish() const
+{
+  ut_ad(page_align(m_cur_rec) == m_block->page.frame);
+  ut_ad(m_page == m_block->page.frame);
+  if (!m_page[PAGE_HEADER + PAGE_DIRECTION_B])
+    return true;
+  ulint heap_no, n_heap= page_header_get_field(m_page, PAGE_N_HEAP);
+  ut_ad((n_heap & 0x7fff) >= PAGE_HEAP_NO_USER_LOW);
+  if (n_heap & 0x8000)
+  {
+    n_heap&= 0x7fff;
+    heap_no= rec_get_heap_no_new(m_cur_rec);
+    if (heap_no == PAGE_HEAP_NO_INFIMUM &&
+        page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_NEW_SUPREMUM_END)
+      return false;
+  }
+  else
+  {
+    heap_no= rec_get_heap_no_old(m_cur_rec);
+    if (heap_no == PAGE_HEAP_NO_INFIMUM &&
+        page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_OLD_SUPREMUM_END)
+      return false;
+  }
+  return heap_no != n_heap - 1;
+}
+
+/** Mark end of insertion to the page. Scan all records to set page dirs,
+and set page header members. */
+inline void PageBulk::finish()
+{
+  ut_ad(!m_index->is_spatial());
+
+  if (!needs_finish());
+  else if (UNIV_LIKELY_NULL(m_page_zip))
+    finishPage<COMPRESSED>();
+  else if (m_is_comp)
+    finishPage<DYNAMIC>();
+  else
+    finishPage<REDUNDANT>();
+
+  /* In MariaDB 10.2, 10.3, 10.4, we would initialize
+  PAGE_DIRECTION_B, PAGE_N_DIRECTION, PAGE_LAST_INSERT
+  in the same way as we would during normal INSERT operations.
+  Starting with MariaDB Server 10.5, bulk insert will not
+  touch those fields. */
+  ut_ad(!m_page[PAGE_HEADER + PAGE_INSTANT]);
+  /* Restore the temporary change of PageBulk::init() that was necessary to
+  ensure that PageBulk::needs_finish() holds on an empty page.
*/
+  m_page[PAGE_HEADER + PAGE_DIRECTION_B]= PAGE_NO_DIRECTION;
+
+  ut_ad(!page_header_get_field(m_page, PAGE_FREE));
+  ut_ad(!page_header_get_field(m_page, PAGE_GARBAGE));
+  ut_ad(!page_header_get_field(m_page, PAGE_LAST_INSERT));
+  ut_ad(!page_header_get_field(m_page, PAGE_N_DIRECTION));
+  ut_ad(m_total_data + page_dir_calc_reserved_space(m_rec_no) <=
+        page_get_free_space_of_empty(m_is_comp));
+  ut_ad(!needs_finish());
+  ut_ad(page_validate(m_page, m_index));
+}
+
+/** Commit inserts done to the page
+@param[in]	success	Flag whether all inserts succeeded. */
+void PageBulk::commit(bool success)
+{
+  finish();
+  if (success && !m_index->is_clust() && page_is_leaf(m_page))
+    ibuf_set_bitmap_for_bulk_load(m_block, &m_mtr,
+                                  innobase_fill_factor == 100);
+  m_mtr.commit();
+}
+
+/** Compress a page of a compressed table
+@return true if compression succeeded or was not needed
+@return false if compression failed */
+bool
+PageBulk::compress()
+{
+	ut_ad(m_page_zip != NULL);
+
+	return page_zip_compress(m_block, m_index, page_zip_level, &m_mtr);
+}
+
+/** Get node pointer
+@return node pointer */
+dtuple_t*
+PageBulk::getNodePtr()
+{
+	rec_t*		first_rec;
+	dtuple_t*	node_ptr;
+
+	/* Create node pointer */
+	first_rec = page_rec_get_next(page_get_infimum_rec(m_page));
+	ut_a(page_rec_is_user_rec(first_rec));
+	node_ptr = dict_index_build_node_ptr(m_index, first_rec, m_page_no,
+					     m_heap, m_level);
+
+	return(node_ptr);
+}
+
+/** Get the split rec in the left page. We split a page in half when
+compression fails, and the split rec will be copied to the right page.
+@return split rec */
+rec_t*
+PageBulk::getSplitRec()
+{
+	rec_t*		rec;
+	rec_offs*	offsets;
+	ulint		total_used_size;
+	ulint		total_recs_size;
+	ulint		n_recs;
+
+	ut_ad(m_page_zip != NULL);
+	ut_ad(m_rec_no >= 2);
+	ut_ad(!m_index->is_instant());
+
+	ut_ad(page_get_free_space_of_empty(m_is_comp) > m_free_space);
+	total_used_size = page_get_free_space_of_empty(m_is_comp)
+		- m_free_space;
+
+	total_recs_size = 0;
+	n_recs = 0;
+	offsets = NULL;
+	rec = page_get_infimum_rec(m_page);
+	const ulint n_core = page_is_leaf(m_page) ? m_index->n_core_fields : 0;
+
+	do {
+		rec = page_rec_get_next(rec);
+		ut_ad(page_rec_is_user_rec(rec));
+
+		offsets = rec_get_offsets(rec, m_index, offsets, n_core,
+					  ULINT_UNDEFINED, &m_heap);
+		total_recs_size += rec_offs_size(offsets);
+		n_recs++;
+	} while (total_recs_size + page_dir_calc_reserved_space(n_recs)
+		 < total_used_size / 2);
+
+	/* Keep at least one record on left page */
+	if (page_rec_is_first(rec, m_page)) {
+		rec = page_rec_get_next(rec);
+		ut_ad(page_rec_is_user_rec(rec));
+	}
+
+	return(rec);
+}
+
+/** Copy all records after the split rec, including itself.
+@param[in]	split_rec	split rec */
+void
+PageBulk::copyIn(
+	rec_t*	split_rec)
+{
+	rec_t*		rec = split_rec;
+	rec_offs*	offsets = NULL;
+
+	ut_ad(m_rec_no == 0);
+	ut_ad(page_rec_is_user_rec(rec));
+
+	const ulint n_core = page_rec_is_leaf(rec)
+		? m_index->n_core_fields : 0;
+
+	do {
+		offsets = rec_get_offsets(rec, m_index, offsets, n_core,
+					  ULINT_UNDEFINED, &m_heap);
+
+		insert(rec, offsets);
+
+		rec = page_rec_get_next(rec);
+	} while (!page_rec_is_supremum(rec));
+
+	ut_ad(m_rec_no > 0);
+}
+
+/** Remove all records after the split rec, including itself.
+@param[in]	split_rec	split rec */
+void
+PageBulk::copyOut(
+	rec_t*	split_rec)
+{
+	/* Suppose before copyOut, we have 5 records on the page:
+	infimum->r1->r2->r3->r4->r5->supremum, and r3 is the split rec.
+
+	After copyOut, we have 2 records on the page:
+	infimum->r1->r2->supremum. Slot adjustment is not done.
*/ + + rec_t *rec = page_get_infimum_rec(m_page); + ulint n; + + for (n = 0;; n++) { + rec_t *next = page_rec_get_next(rec); + if (next == split_rec) { + break; + } + rec = next; + } + + ut_ad(n > 0); + + const rec_t *last_rec = split_rec; + for (;;) { + const rec_t *next = page_rec_get_next_const(last_rec); + if (page_rec_is_supremum(next)) { + break; + } + last_rec = next; + } + + /* Set last record's next in page */ + const ulint n_core = page_rec_is_leaf(split_rec) + ? m_index->n_core_fields : 0; + + rec_offs* offsets = rec_get_offsets(rec, m_index, nullptr, n_core, + ULINT_UNDEFINED, &m_heap); + mach_write_to_2(rec - REC_NEXT, m_is_comp + ? static_cast<uint16_t> + (PAGE_NEW_SUPREMUM - page_offset(rec)) + : PAGE_OLD_SUPREMUM); + + /* Set related members */ + m_cur_rec = rec; + m_heap_top = rec_get_end(rec, offsets); + + offsets = rec_get_offsets(last_rec, m_index, offsets, n_core, + ULINT_UNDEFINED, &m_heap); + + m_free_space += ulint(rec_get_end(last_rec, offsets) - m_heap_top) + + page_dir_calc_reserved_space(m_rec_no) + - page_dir_calc_reserved_space(n); + ut_ad(lint(m_free_space) > 0); + m_rec_no = n; + +#ifdef UNIV_DEBUG + m_total_data -= ulint(rec_get_end(last_rec, offsets) - m_heap_top); +#endif /* UNIV_DEBUG */ +} + +/** Set next page +@param[in] next_page_no next page no */ +inline void PageBulk::setNext(ulint next_page_no) +{ + if (UNIV_LIKELY_NULL(m_page_zip)) + /* For ROW_FORMAT=COMPRESSED, redo log may be written + in PageBulk::compress(). */ + mach_write_to_4(m_page + FIL_PAGE_NEXT, next_page_no); + else + m_mtr.write<4>(*m_block, m_page + FIL_PAGE_NEXT, next_page_no); +} + +/** Set previous page +@param[in] prev_page_no previous page no */ +inline void PageBulk::setPrev(ulint prev_page_no) +{ + if (UNIV_LIKELY_NULL(m_page_zip)) + /* For ROW_FORMAT=COMPRESSED, redo log may be written + in PageBulk::compress(). */ + mach_write_to_4(m_page + FIL_PAGE_PREV, prev_page_no); + else + m_mtr.write<4>(*m_block, m_page + FIL_PAGE_PREV, prev_page_no); +} + +/** Check if the required space is available in the page for the rec +to be inserted. We check fill factor & padding here. +@param[in] rec_size required space +@return true if space is available */ +bool +PageBulk::isSpaceAvailable( + ulint rec_size) +{ + if (m_rec_no >= 8190) { + ut_ad(srv_page_size == 65536); + return false; + } + + ulint slot_size; + ulint required_space; + + slot_size = page_dir_calc_reserved_space(m_rec_no + 1) + - page_dir_calc_reserved_space(m_rec_no); + + required_space = rec_size + slot_size; + + if (required_space > m_free_space) { + ut_ad(m_rec_no > 0); + return false; + } + + /* Fillfactor & Padding apply to both leaf and non-leaf pages. + Note: we keep at least 2 records in a page to avoid the B-tree + level growing too high. */ + if (m_rec_no >= 2 + && ((m_page_zip == NULL && m_free_space - required_space + < m_reserved_space) + || (m_page_zip != NULL && m_free_space - required_space + < m_padding_space))) { + return(false); + } + + return(true); +} + +/** Check whether the record needs to be stored externally. +@return false if the entire record can be stored locally on the page */ +bool +PageBulk::needExt( + const dtuple_t* tuple, + ulint rec_size) +{ + return page_zip_rec_needs_ext(rec_size, m_is_comp, + dtuple_get_n_fields(tuple), + m_block->zip_size()); +} + +/** Store an external record. +Since the record is not logged yet, we do not log the update to the record; +the BLOB data is logged first, and then the record is logged in bulk mode.
+@param[in] big_rec external record +@param[in] offsets record offsets +@return error code */ +dberr_t +PageBulk::storeExt( + const big_rec_t* big_rec, + rec_offs* offsets) +{ + finish(); + + /* Note: not all fields are initialized in btr_pcur. */ + btr_pcur_t btr_pcur; + btr_pcur.pos_state = BTR_PCUR_IS_POSITIONED; + btr_pcur.latch_mode = BTR_MODIFY_LEAF; + btr_pcur.btr_cur.page_cur.index = m_index; + btr_pcur.btr_cur.page_cur.rec = m_cur_rec; + btr_pcur.btr_cur.page_cur.offsets = offsets; + btr_pcur.btr_cur.page_cur.block = m_block; + + dberr_t err = btr_store_big_rec_extern_fields( + &btr_pcur, offsets, big_rec, &m_mtr, BTR_STORE_INSERT_BULK); + + return(err); +} + +/** Release the block by committing the mtr. +Note: log_free_check requires holding no lock/latch in the current thread. */ +void +PageBulk::release() +{ + finish(); + + /* We fix the block because we will re-pin it soon. */ + m_block->page.fix(); + + /* No other threads can modify this block. */ + m_modify_clock = buf_block_get_modify_clock(m_block); + + m_mtr.commit(); +} + +/** Start mtr and latch the block */ +void PageBulk::latch() +{ + m_mtr.start(); + m_index->set_modified(m_mtr); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!m_block->index); +#endif + m_block->page.lock.x_lock(); + ut_ad(m_block->page.buf_fix_count()); + m_mtr.memo_push(m_block, MTR_MEMO_PAGE_X_FIX); + + ut_ad(m_cur_rec > m_page); + ut_ad(m_cur_rec < m_heap_top); +} + +/** Split a page +@param[in] page_bulk page to split +@param[in] next_page_bulk next page +@return error code */ +dberr_t +BtrBulk::pageSplit( + PageBulk* page_bulk, + PageBulk* next_page_bulk) +{ + ut_ad(page_bulk->getPageZip() != NULL); + + if (page_bulk->getRecNo() <= 1) { + return(DB_TOO_BIG_RECORD); + } + + /* Initialize a new page */ + PageBulk new_page_bulk(m_index, m_trx->id, FIL_NULL, + page_bulk->getLevel()); + dberr_t err = new_page_bulk.init(); + if (err != DB_SUCCESS) { + return(err); + } + + /* Copy the upper half to the new page. */ + rec_t* split_rec = page_bulk->getSplitRec(); + new_page_bulk.copyIn(split_rec); + page_bulk->copyOut(split_rec); + + /* Commit the pages after the split. */ + err = pageCommit(page_bulk, &new_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(&new_page_bulk); + return(err); + } + + err = pageCommit(&new_page_bulk, next_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(&new_page_bulk); + return(err); + } + + return(err); +} + +/** Commit (finish) a page. We set the next/prev page no, compress a page +of a compressed table and split the page if compression fails, insert a +node pointer to the father page if needed, and commit the mini-transaction. +@param[in] page_bulk page to commit +@param[in] next_page_bulk next page +@param[in] insert_father false when page_bulk is a root page and + true when it's a non-root page +@return error code */ +dberr_t +BtrBulk::pageCommit( + PageBulk* page_bulk, + PageBulk* next_page_bulk, + bool insert_father) +{ + page_bulk->finish(); + + /* Set page links */ + if (next_page_bulk != NULL) { + ut_ad(page_bulk->getLevel() == next_page_bulk->getLevel()); + + page_bulk->setNext(next_page_bulk->getPageNo()); + next_page_bulk->setPrev(page_bulk->getPageNo()); + } else { + ut_ad(!page_has_next(page_bulk->getPage())); + /* If a page is released and latched again, we need to + mark it modified in the mini-transaction. */ + page_bulk->set_modified(); + } + + ut_ad(!m_index->lock.have_any()); + + /* Compress the page if it belongs to a compressed table.
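+ If page_zip_compress() fails, pageSplit() below moves the upper half + of the records to a new right sibling (see PageBulk::getSplitRec()) + and commits both halves through pageCommit() again; each half should + then compress.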
*/ + if (page_bulk->getPageZip() != NULL && !page_bulk->compress()) { + return(pageSplit(page_bulk, next_page_bulk)); + } + + /* Insert node pointer to father page. */ + if (insert_father) { + dtuple_t* node_ptr = page_bulk->getNodePtr(); + dberr_t err = insert(node_ptr, page_bulk->getLevel()+1); + + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Commit mtr. */ + page_bulk->commit(true); + + return(DB_SUCCESS); +} + +/** Log free check */ +inline void BtrBulk::logFreeCheck() +{ + if (log_sys.check_flush_or_checkpoint()) { + release(); + + log_check_margins(); + + latch(); + } +} + +/** Release all latches */ +void +BtrBulk::release() +{ + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + + page_bulk->release(); + } +} + +/** Re-latch all latches */ +void +BtrBulk::latch() +{ + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + page_bulk->latch(); + } +} + +/** Insert a tuple to page in a level +@param[in] tuple tuple to insert +@param[in] level B-tree level +@return error code */ +dberr_t +BtrBulk::insert( + dtuple_t* tuple, + ulint level) +{ + bool is_left_most = false; + dberr_t err = DB_SUCCESS; + + /* Check if we need to create a PageBulk for the level. */ + if (level + 1 > m_page_bulks.size()) { + PageBulk* new_page_bulk + = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id, FIL_NULL, + level)); + err = new_page_bulk->init(); + if (err != DB_SUCCESS) { + UT_DELETE(new_page_bulk); + return(err); + } + + m_page_bulks.push_back(new_page_bulk); + ut_ad(level + 1 == m_page_bulks.size()); + m_root_level = level; + + is_left_most = true; + } + + ut_ad(m_page_bulks.size() > level); + + PageBulk* page_bulk = m_page_bulks.at(level); + + if (is_left_most && level > 0 && page_bulk->getRecNo() == 0) { + /* The node pointer must be marked as the predefined minimum + record, as there is no lower alphabetical limit to records in + the leftmost node of a level: */ + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) + | REC_INFO_MIN_REC_FLAG); + } + + ulint n_ext = 0; + ulint rec_size = rec_get_converted_size(m_index, tuple, n_ext); + big_rec_t* big_rec = NULL; + rec_t* rec = NULL; + rec_offs* offsets = NULL; + + if (page_bulk->needExt(tuple, rec_size)) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + big_rec = dtuple_convert_big_rec(m_index, 0, tuple, &n_ext); + + if (big_rec == NULL) { + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(m_index, tuple, n_ext); + } + + if (page_bulk->getPageZip() != NULL + && page_zip_is_too_big(m_index, tuple)) { + err = DB_TOO_BIG_RECORD; + goto func_exit; + } + + if (!page_bulk->isSpaceAvailable(rec_size)) { + /* Create a sibling page_bulk. */ + PageBulk* sibling_page_bulk; + sibling_page_bulk = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id, + FIL_NULL, level)); + err = sibling_page_bulk->init(); + if (err != DB_SUCCESS) { + UT_DELETE(sibling_page_bulk); + goto func_exit; + } + + /* Commit page bulk. */ + err = pageCommit(page_bulk, sibling_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(sibling_page_bulk); + UT_DELETE(sibling_page_bulk); + goto func_exit; + } + + /* Set new page bulk to page_bulks. 
*/ + ut_ad(sibling_page_bulk->getLevel() <= m_root_level); + m_page_bulks.at(level) = sibling_page_bulk; + + UT_DELETE(page_bulk); + page_bulk = sibling_page_bulk; + + /* Important: invoke logFreeCheck() to check whether we need a + checkpoint. */ + if (page_is_leaf(sibling_page_bulk->getPage())) { + if (trx_is_interrupted(m_trx)) { + err = DB_INTERRUPTED; + goto func_exit; + } + + srv_inc_activity_count(); + logFreeCheck(); + } + } + + /* Convert the tuple to a rec. */ + rec = rec_convert_dtuple_to_rec(static_cast<byte*>(mem_heap_alloc( + page_bulk->m_heap, rec_size)), m_index, tuple, n_ext); + offsets = rec_get_offsets(rec, m_index, offsets, level + ? 0 : m_index->n_core_fields, + ULINT_UNDEFINED, &page_bulk->m_heap); + + page_bulk->insert(rec, offsets); + + if (big_rec != NULL) { + ut_ad(dict_index_is_clust(m_index)); + ut_ad(page_bulk->getLevel() == 0); + ut_ad(page_bulk == m_page_bulks.at(0)); + + /* Release all pages above the leaf level */ + for (ulint level = 1; level <= m_root_level; level++) { + m_page_bulks.at(level)->release(); + } + + err = page_bulk->storeExt(big_rec, offsets); + + /* Latch */ + for (ulint level = 1; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + page_bulk->latch(); + } + } + +func_exit: + if (big_rec != NULL) { + dtuple_convert_back_big_rec(m_index, tuple, big_rec); + } + + return(err); +} + +/** B-tree bulk load finish. We commit the last page in each level, +and copy the last page of the top level to the root page of the index +if no error occurs. +@param[in] err whether the bulk load was successful until now +@return error code */ +dberr_t +BtrBulk::finish(dberr_t err) +{ + uint32_t last_page_no = FIL_NULL; + + ut_ad(!m_index->table->is_temporary()); + + if (m_page_bulks.size() == 0) { + /* The table is empty. The root page of the index tree + is already in a consistent state. No need to flush. */ + return(err); + } + + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + /* Finish all page bulks */ + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + + last_page_no = page_bulk->getPageNo(); + + if (err == DB_SUCCESS) { + err = pageCommit(page_bulk, NULL, + level != m_root_level); + } + + if (err != DB_SUCCESS) { + pageAbort(page_bulk); + } + + UT_DELETE(page_bulk); + } + + if (err == DB_SUCCESS) { + rec_t* first_rec; + mtr_t mtr; + buf_block_t* last_block; + PageBulk root_page_bulk(m_index, m_trx->id, + m_index->page, m_root_level); + + mtr.start(); + m_index->set_modified(mtr); + mtr_x_lock_index(m_index, &mtr); + + ut_ad(last_page_no != FIL_NULL); + last_block = btr_block_get(*m_index, last_page_no, RW_X_LATCH, + false, &mtr); + if (!last_block) { + err = DB_CORRUPTION; +err_exit: + mtr.commit(); + return err; + } + + first_rec = page_rec_get_next( + page_get_infimum_rec(last_block->page.frame)); + /* Because this index tree is being created by this thread, + we assume that it cannot be corrupted. */ + ut_ad(first_rec); + ut_ad(page_rec_is_user_rec(first_rec)); + + /* Copy the last page to the root page. */ + err = root_page_bulk.init(); + if (err != DB_SUCCESS) { + goto err_exit; + } + root_page_bulk.copyIn(first_rec); + root_page_bulk.finish(); + + /* Remove the last page.
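+ Its records now live in the root (copied via copyIn() above); the + B-tree root page number must remain index->page, which is why the + top level is copied into the root instead of being linked in.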
*/ + err = btr_page_free(m_index, last_block, &mtr); + mtr.commit(); + + if (dberr_t e = pageCommit(&root_page_bulk, NULL, false)) { + err = e; + } + ut_ad(err == DB_SUCCESS); + } + + ut_ad(err != DB_SUCCESS + || btr_validate_index(m_index, NULL) == DB_SUCCESS); + return(err); +} diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc new file mode 100644 index 00000000..e736f338 --- /dev/null +++ b/storage/innobase/btr/btr0cur.cc @@ -0,0 +1,7017 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0cur.cc +The index tree cursor + +All changes that row operations make to a B-tree or the records +there must go through this module! Undo log records are written here +of every modify or insert of a clustered index record. + + NOTE!!! +To make sure we do not run out of disk space during a pessimistic +insert or update, we have to reserve 2 x the height of the index tree +many pages in the tablespace before we start the operation, because +if leaf splitting has been started, it is difficult to undo, except +by crashing the database and doing a roll-forward. + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0cur.h" +#include "row0upd.h" +#include "mtr0log.h" +#include "page0page.h" +#include "page0zip.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "row0log.h" +#include "row0purge.h" +#include "row0upd.h" +#include "trx0rec.h" +#include "trx0roll.h" +#include "que0que.h" +#include "row0row.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "zlib.h" +#include "srv0start.h" +#include "mysql_com.h" +#include "dict0stats.h" +#include "row0ins.h" +#ifdef WITH_WSREP +#include "mysql/service_wsrep.h" +#endif /* WITH_WSREP */ +#include "log.h" + +/** Buffered B-tree operation types, introduced as part of delete buffering. */ +enum btr_op_t { + BTR_NO_OP = 0, /*!< Not buffered */ + BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */ + BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */ + BTR_DELETE_OP, /*!< Purge a delete-marked record */ + BTR_DELMARK_OP /*!< Mark a record for deletion */ +}; + +/** Modification types for the B-tree operation. + Note that the order must be DELETE, BOTH, INSERT !! 
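+ The ordering allows range checks such as + + lock_intention <= BTR_INTENTION_BOTH /* DELETE or BOTH */ + lock_intention >= BTR_INTENTION_BOTH /* BOTH or INSERT */ + + on which btr_cur_will_modify_tree() below relies.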
+ */ +enum btr_intention_t { + BTR_INTENTION_DELETE, + BTR_INTENTION_BOTH, + BTR_INTENTION_INSERT +}; + +/** For the index->lock scalability improvement, the only clear +performance regression observed was caused by a hugely grown history +list. That is because the exclusive use of index->lock also worked as +reserving free blocks and read I/O bandwidth with priority. To keep the +history list from growing much larger than it could under the previous +implementation, we prioritize pessimistic tree operations issued by +purge, as before, when the list seems to be growing huge. + + Experimentally, the history list length starts to clearly affect +performance throughput from about 100000. */ +#define BTR_CUR_FINE_HISTORY_LENGTH 100000 + +#ifdef BTR_CUR_HASH_ADAPT +/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */ +ib_counter_t<ulint> btr_cur_n_non_sea; +/** Old value of btr_cur_n_non_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +ulint btr_cur_n_non_sea_old; +/** Number of successful adaptive hash index lookups in +btr_cur_t::search_leaf(). */ +ib_counter_t<ulint> btr_cur_n_sea; +/** Old value of btr_cur_n_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +ulint btr_cur_n_sea_old; +#endif /* BTR_CUR_HASH_ADAPT */ + +#ifdef UNIV_DEBUG +/* Flag to limit optimistic insert records */ +uint btr_cur_limit_optimistic_insert_debug; +#endif /* UNIV_DEBUG */ + +/** In the optimistic insert, if the insert does not fit, but this much space +can be released by page reorganize, then it is reorganized */ +#define BTR_CUR_PAGE_REORGANIZE_LIMIT (srv_page_size / 32) + +/** The structure of a BLOB part header */ +/* @{ */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this + page */ +#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no, + FIL_NULL if none */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB + part header, in bytes */ + +/* @} */ + +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a record that +is not delete-marked always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr); /*!< in: mtr, or NULL if not logged */ +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in: record */ + buf_block_t* block, /*!< in: index page of rec */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +/***********************************************************//** +Frees the externally stored fields for a record.
*/ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in: record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + buf_block_t* block, /*!< in: index page of rec */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ + +/*==================== B-TREE SEARCH =========================*/ + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] index clustered index definition +@param[in,out] mtr mini-transaction +@return error code +@retval DB_SUCCESS if no error occurred +@retval DB_CORRUPTION if any corruption was noticed */ +static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr) +{ + ut_ad(index->is_primary()); + ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES); + ut_ad(index->table->supports_instant()); + ut_ad(index->table->is_readable()); + + dberr_t err; + const fil_space_t* space = index->table->space; + if (!space) { +corrupted: + err = DB_CORRUPTION; +unreadable: + ib::error() << "Table " << index->table->name + << " has an unreadable root page"; + index->table->corrupted = true; + index->table->file_unreadable = true; + return err; + } + + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err); + if (!root) { + goto unreadable; + } + + if (btr_cur_instant_root_init(index, root->page.frame)) { + goto corrupted; + } + + ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES); + + if (fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX) { + ut_ad(!index->is_instant()); + return DB_SUCCESS; + } + + btr_cur_t cur; + /* Relax the assertion in rec_init_offsets(). */ + ut_ad(!index->in_instant_init); + ut_d(index->in_instant_init = true); + err = cur.open_leaf(true, index, BTR_SEARCH_LEAF, mtr); + ut_d(index->in_instant_init = false); + if (err != DB_SUCCESS) { + index->table->file_unreadable = true; + index->table->corrupted = true; + return err; + } + + ut_ad(page_cur_is_before_first(&cur.page_cur)); + ut_ad(page_is_leaf(cur.page_cur.block->page.frame)); + + const rec_t* rec = page_cur_move_to_next(&cur.page_cur); + const ulint comp = dict_table_is_comp(index->table); + const ulint info_bits = rec ? rec_get_info_bits(rec, comp) : 0; + + if (page_rec_is_supremum(rec) + || !(info_bits & REC_INFO_MIN_REC_FLAG)) { + if (rec && !index->is_instant()) { + /* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be + assigned even if instant ADD COLUMN was not + committed. Changes to these page header fields are not + undo-logged, but changes to the hidden metadata record + are. If the server is killed and restarted, the page + header fields could remain set even though no metadata + record is present. */ + return DB_SUCCESS; + } + + ib::error() << "Table " << index->table->name + << " is missing instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG + || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) { +incompatible: + ib::error() << "Table " << index->table->name + << " contains unrecognizable instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + /* Read the metadata. 
We can get here on server restart + or when the table was evicted from the data dictionary cache + and is now being accessed again. + + Here, READ COMMITTED and REPEATABLE READ should be equivalent. + Committing the ADD COLUMN operation would acquire + MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any + concurrent operations on the table, including table eviction + from the cache. */ + + if (info_bits & REC_INFO_DELETED_FLAG) { + /* This metadata record includes a BLOB that identifies + any dropped or reordered columns. */ + ulint trx_id_offset = index->trx_id_offset; + /* If !index->trx_id_offset, the PRIMARY KEY contains + variable-length columns. For the metadata record, + variable-length columns should be written with zero + length. However, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of type + CHAR, we wrote more than zero bytes. That is why we + must determine the actual length of each PRIMARY KEY + column. The DB_TRX_ID will start right after any + PRIMARY KEY columns. */ + ut_ad(index->n_uniq); + + /* We cannot invoke rec_get_offsets() before + index->table->deserialise_columns(). Therefore, + we must duplicate some logic here. */ + if (trx_id_offset) { + } else if (index->table->not_redundant()) { + /* The PRIMARY KEY contains variable-length columns. + For the metadata record, variable-length columns are + always written with zero length. The DB_TRX_ID will + start right after any fixed-length columns. */ + + /* OK, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of + type CHAR, we wrote more than zero bytes. In + order to allow affected tables to be accessed, + it would be nice to determine the actual + length of each PRIMARY KEY column. However, to + be able to do that, we should determine the + size of the null-bit bitmap in the metadata + record. And we cannot know that before reading + the metadata BLOB, whose starting point we are + trying to find here. (Although the PRIMARY KEY + columns cannot be NULL, we would have to know + where the lengths of variable-length PRIMARY KEY + columns start.) + + So, unfortunately we cannot help users who + were affected by MDEV-21088 on a ROW_FORMAT=COMPACT + or ROW_FORMAT=DYNAMIC table. 
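+ + As a worked example (hypothetical schema): for a table with + PRIMARY KEY(a INT, b INT) in ROW_FORMAT=DYNAMIC, both key fields + have fixed_len = 4, so the loop below computes trx_id_offset = 8, + and DB_TRX_ID starts at rec + 8.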
*/ + + for (uint i = index->n_uniq; i--; ) { + trx_id_offset += index->fields[i].fixed_len; + } + } else if (rec_get_1byte_offs_flag(rec)) { + trx_id_offset = rec_1_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_1BYTE_SQL_NULL_MASK; + } else { + trx_id_offset = rec_2_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_2BYTE_SQL_NULL_MASK; + } + + const byte* ptr = rec + trx_id_offset + + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) { + goto incompatible; + } + + uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4); + if (!len + || mach_read_from_4(ptr + BTR_EXTERN_OFFSET) + != FIL_PAGE_DATA + || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID) + != space->id) { + goto incompatible; + } + + buf_block_t* block = buf_page_get( + page_id_t(space->id, + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)), + 0, RW_S_LATCH, mtr); + if (!block) { + goto incompatible; + } + + if (fil_page_get_type(block->page.frame) != FIL_PAGE_TYPE_BLOB + || mach_read_from_4(&block->page.frame + [FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO]) + != FIL_NULL + || mach_read_from_4(&block->page.frame + [FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN]) + != len) { + goto incompatible; + } + + /* The unused part of the BLOB page should be zero-filled. */ + for (const byte* b = block->page.frame + + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len, + * const end = block->page.frame + srv_page_size + - BTR_EXTERN_LEN; + b < end; ) { + if (*b++) { + goto incompatible; + } + } + + if (index->table->deserialise_columns( + &block->page.frame + [FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], len)) { + goto incompatible; + } + + /* Proceed to initialize the default values of + any instantly added columns. */ + } + + mem_heap_t* heap = NULL; + rec_offs* offsets = rec_get_offsets(rec, index, NULL, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + if (rec_offs_any_default(offsets)) { +inconsistent: + mem_heap_free(heap); + goto incompatible; + } + + /* In fact, because we only ever append fields to the metadata + record, it is also OK to perform READ UNCOMMITTED and + then ignore any extra fields, provided that + trx_sys.is_registered(DB_TRX_ID). 
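+ The check below implements this: a record carrying more fields than + index->n_fields (plus the metadata flag) is tolerated only while the + DB_TRX_ID of the metadata record is still registered in trx_sys.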
*/ + if (rec_offs_n_fields(offsets) + > ulint(index->n_fields) + !!index->table->instant + && !trx_sys.is_registered(current_trx(), + row_get_rec_trx_id(rec, index, + offsets))) { + goto inconsistent; + } + + for (unsigned i = index->n_core_fields; i < index->n_fields; i++) { + dict_col_t* col = index->fields[i].col; + const unsigned o = i + !!index->table->instant; + ulint len; + const byte* data = rec_get_nth_field(rec, offsets, o, &len); + ut_ad(!col->is_added()); + ut_ad(!col->def_val.data); + col->def_val.len = len; + switch (len) { + case UNIV_SQL_NULL: + continue; + case 0: + col->def_val.data = field_ref_zero; + continue; + } + ut_ad(len != UNIV_SQL_DEFAULT); + if (!rec_offs_nth_extern(offsets, o)) { + col->def_val.data = mem_heap_dup( + index->table->heap, data, len); + } else if (len < BTR_EXTERN_FIELD_REF_SIZE + || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + col->def_val.len = UNIV_SQL_DEFAULT; + goto inconsistent; + } else { + col->def_val.data = btr_copy_externally_stored_field( + &col->def_val.len, data, + cur.page_cur.block->zip_size(), + len, index->table->heap); + } + } + + mem_heap_free(heap); + return DB_SUCCESS; +} + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] table table definition from the data dictionary +@return error code +@retval DB_SUCCESS if no error occurred */ +dberr_t +btr_cur_instant_init(dict_table_t* table) +{ + mtr_t mtr; + dict_index_t* index = dict_table_get_first_index(table); + mtr.start(); + dberr_t err = index + ? btr_cur_instant_init_low(index, &mtr) + : DB_CORRUPTION; + mtr.commit(); + return(err); +} + +/** Initialize the n_core_null_bytes on first access to a clustered +index root page. +@param[in] index clustered index that is on its first access +@param[in] page clustered index root page +@return whether the page is corrupted */ +bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page) +{ + ut_ad(!index->is_dummy); + ut_ad(index->is_primary()); + ut_ad(!index->is_instant()); + ut_ad(index->table->supports_instant()); + + if (page_has_siblings(page)) { + return true; + } + + /* This is normally executed as part of btr_cur_instant_init() + when dict_load_table_one() is loading a table definition. + Other threads should not access or modify the n_core_null_bytes, + n_core_fields before dict_load_table_one() returns. + + This can also be executed during IMPORT TABLESPACE, where the + table definition is exclusively locked. */ + + switch (fil_page_get_type(page)) { + default: + return true; + case FIL_PAGE_INDEX: + /* The field PAGE_INSTANT is guaranteed 0 on clustered + index root pages of ROW_FORMAT=COMPACT or + ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */ + if (page_is_comp(page) && page_get_instant(page)) { + return true; + } + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(index->n_nullable))); + return false; + case FIL_PAGE_TYPE_INSTANT: + break; + } + + const uint16_t n = page_get_instant(page); + + if (n < index->n_uniq + DATA_ROLL_PTR) { + /* The PRIMARY KEY (or hidden DB_ROW_ID) and + DB_TRX_ID,DB_ROLL_PTR columns must always be present + as 'core' fields.
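+ For example (hypothetical single-column PRIMARY KEY): n_uniq = 1 + and DATA_ROLL_PTR == 2, so the check on n above rejects any + n < 3, because at least the key column plus DB_TRX_ID and + DB_ROLL_PTR must be core fields.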
*/ + return true; + } + + if (n > REC_MAX_N_FIELDS) { + return true; + } + + index->n_core_fields = n & dict_index_t::MAX_N_FIELDS; + + const rec_t* infimum = page_get_infimum_rec(page); + const rec_t* supremum = page_get_supremum_rec(page); + + if (!memcmp(infimum, "infimum", 8) + && !memcmp(supremum, "supremum", 8)) { + if (n > index->n_fields) { + /* All fields, including those for instantly + added columns, must be present in the + data dictionary. */ + return true; + } + + ut_ad(!index->is_dummy); + ut_d(index->is_dummy = true); + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(index->get_n_nullable(n))); + ut_d(index->is_dummy = false); + return false; + } + + if (memcmp(infimum, field_ref_zero, 8) + || memcmp(supremum, field_ref_zero, 7)) { + /* The infimum and supremum records must either contain + the original strings, or they must be filled with zero + bytes, except for the bytes that we have repurposed. */ + return true; + } + + index->n_core_null_bytes = supremum[7]; + return index->n_core_null_bytes > 128; +} + +/** +Gets the intention in btr_intention_t from latch_mode, and clears the +intention in latch_mode. +@param latch_mode in/out: pointer to latch_mode +@return intention for latching the tree */ +static +btr_intention_t btr_cur_get_and_clear_intention(btr_latch_mode *latch_mode) +{ + btr_intention_t intention; + + switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) { + case BTR_LATCH_FOR_INSERT: + intention = BTR_INTENTION_INSERT; + break; + case BTR_LATCH_FOR_DELETE: + intention = BTR_INTENTION_DELETE; + break; + default: + /* both or unknown */ + intention = BTR_INTENTION_BOTH; + } + *latch_mode = btr_latch_mode( + *latch_mode & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)); + + return(intention); +} + +/** @return whether the distance between two records is at most the +specified value */ +static bool +page_rec_distance_is_at_most(const rec_t *left, const rec_t *right, ulint val) +{ + do + { + if (left == right) + return true; + left= page_rec_get_next_const(left); + } + while (left && val--); + return false; +} + +/** Detects whether modifying the given record might require modifying +the tree structure. +@param[in] index index +@param[in] page page +@param[in] lock_intention lock intention for the tree operation +@param[in] rec record (current node_ptr) +@param[in] rec_size size of the record or max size of node_ptr +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] mtr mtr +@return true if tree modification is needed */ +static +bool +btr_cur_will_modify_tree( + dict_index_t* index, + const page_t* page, + btr_intention_t lock_intention, + const rec_t* rec, + ulint rec_size, + ulint zip_size, + mtr_t* mtr) +{ + ut_ad(!page_is_leaf(page)); + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + /* A pessimistic delete of the first record causes a delete & insert + of the node_ptr at the upper level, and a subsequent page shrink is + possible, causing another node_ptr delete at the upper level. So we + should pay attention not only to the first and last records but also + to the 2nd record: if the "delete & insert" lands on a different + page, the 2nd record becomes the first record, and a following + compress might delete that record, causing an upper-level node_ptr + modification.
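+ + For example, deletes propagating upward from beneath a level-3 page + may in the worst case remove up to 2 ^ (3 - 1) = 4 node pointer + records from that page; see max_nodes_deleted below.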
*/ + + const ulint n_recs = page_get_n_recs(page); + + if (lock_intention <= BTR_INTENTION_BOTH) { + compile_time_assert(BTR_INTENTION_DELETE < BTR_INTENTION_BOTH); + compile_time_assert(BTR_INTENTION_BOTH < BTR_INTENTION_INSERT); + + if (!page_has_siblings(page)) { + return true; + } + + ulint margin = rec_size; + + if (lock_intention == BTR_INTENTION_BOTH) { + ulint level = btr_page_get_level(page); + + /* This value is the worst-case expectation for the + number of node_ptr records deleted from this page. It + is used to estimate whether the cursor position could + become the leftmost record in this page. */ + ulint max_nodes_deleted = 0; + + /* Tree-modifying operations coming from below this + level can logically delete at most (2 ^ (level - 1)) + records here, even in the extremely rare worst case. */ + if (level > 7) { + /* TODO: adjust this practical limit. */ + max_nodes_deleted = 64; + } else if (level > 0) { + max_nodes_deleted = (ulint)1 << (level - 1); + } + /* Check what a delete may cause (BTR_INTENTION_BOTH + or BTR_INTENTION_DELETE). */ + if (n_recs <= max_nodes_deleted * 2 + || page_rec_is_first(rec, page)) { + /* The cursor record can be the leftmost record + in this page. */ + return true; + } + + if (page_has_prev(page) + && page_rec_distance_is_at_most( + page_get_infimum_rec(page), rec, + max_nodes_deleted)) { + return true; + } + + if (page_has_next(page) + && page_rec_distance_is_at_most( + rec, page_get_supremum_rec(page), + max_nodes_deleted)) { + return true; + } + + /* A delete of the leftmost record in a page causes a + delete & insert at its parent page. After that, the + delete might cause btr_compress() and a record delete + at its parent page. Thus we should consider the + maximum number of deletes. */ + margin *= max_nodes_deleted; + } + + /* Safe because we already have the SX latch of the index + tree */ + if (page_get_data_size(page) + < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)) { + return(true); + } + } + + if (lock_intention >= BTR_INTENTION_BOTH) { + /* Check what an insert may cause (BTR_INTENTION_BOTH + or BTR_INTENTION_INSERT). */ + + /* Once we invoke btr_cur_limit_optimistic_insert_debug, + we should check it here in advance, since the max allowable + number of records in a page is limited. */ + LIMIT_OPTIMISTIC_INSERT_DEBUG(n_recs, return true); + + /* We need 2 records' worth of space in case a single + split-and-insert cannot fit. + page_get_max_insert_size_after_reorganize() includes space + for the page directory already. */ + ulint max_size + = page_get_max_insert_size_after_reorganize(page, 2); + + if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size + || max_size < rec_size * 2) { + return(true); + } + + /* TODO: optimize this condition for ROW_FORMAT=COMPRESSED. + This is based on the worst case, and we could invoke + page_zip_available() on the block->page.zip. */ + /* We need 2 records' space also for the worst compression + rate. */ + if (zip_size + && page_zip_empty_size(index->n_fields, zip_size) + <= rec_size * 2 + page_get_data_size(page) + + page_dir_calc_reserved_space(n_recs + 2)) { + return(true); + } + } + + return(false); +} + +/** Detects whether modifying the record might require a modification +opposite to the intention.
+@param bpage buffer pool page +@param is_clust whether this is a clustered index +@param lock_intention lock intention for the tree operation +@param node_ptr_max_size the maximum size of a node pointer +@param compress_limit BTR_CUR_PAGE_COMPRESS_LIMIT(index) +@param rec record (current node_ptr) +@return true if tree modification is needed */ +static bool btr_cur_need_opposite_intention(const buf_page_t &bpage, + bool is_clust, + btr_intention_t lock_intention, + ulint node_ptr_max_size, + ulint compress_limit, + const rec_t *rec) +{ + if (UNIV_LIKELY_NULL(bpage.zip.data) && + !page_zip_available(&bpage.zip, is_clust, node_ptr_max_size, 1)) + return true; + const page_t *const page= bpage.frame; + if (lock_intention != BTR_INTENTION_INSERT) + { + /* We compensate also for btr_cur_compress_recommendation() */ + if (!page_has_siblings(page) || + page_rec_is_first(rec, page) || page_rec_is_last(rec, page) || + page_get_data_size(page) < node_ptr_max_size + compress_limit) + return true; + if (lock_intention == BTR_INTENTION_DELETE) + return false; + } + else if (page_has_next(page) && page_rec_is_last(rec, page)) + return true; + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), return true); + const ulint max_size= page_get_max_insert_size_after_reorganize(page, 2); + return max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + node_ptr_max_size || + max_size < node_ptr_max_size * 2; +} + +/** +@param[in] index b-tree +@return maximum size of a node pointer record in bytes */ +static ulint btr_node_ptr_max_size(const dict_index_t* index) +{ + if (dict_index_is_ibuf(index)) { + /* cannot estimate accurately */ + /* This is universal index for change buffer. + The max size of the entry is about max key length * 2. + (index key + primary key to be inserted to the index) + (The max key length is UNIV_PAGE_SIZE / 16 * 3 at + ha_innobase::max_supported_key_length(), + considering MAX_KEY_LENGTH = 3072 at MySQL imposes + the 3500 historical InnoDB value for 16K page size case.) + For the universal index, node_ptr contains most of the entry. + And 512 is enough to contain ibuf columns and meta-data */ + return srv_page_size / 8 * 3 + 512; + } + + /* Each record has page_no, length of page_no and header. */ + ulint comp = dict_table_is_comp(index->table); + ulint rec_max_size = comp + ? REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES + + UT_BITS_IN_BYTES(index->n_nullable) + : REC_NODE_PTR_SIZE + 2 + REC_N_OLD_EXTRA_BYTES + + 2 * index->n_fields; + + /* Compute the maximum possible record size. */ + for (ulint i = 0; i < dict_index_get_n_unique_in_tree(index); i++) { + const dict_field_t* field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = dict_field_get_col(field); + ulint field_max_size; + ulint field_ext_max_size; + + /* Determine the maximum length of the index field. */ + + field_max_size = dict_col_get_fixed_size(col, comp); + if (field_max_size) { + /* dict_index_add_col() should guarantee this */ + ut_ad(!field->prefix_len + || field->fixed_len == field->prefix_len); + /* Fixed lengths are not encoded + in ROW_FORMAT=COMPACT. 
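+ + As a worked example (hypothetical index): for a node pointer on a + NOT NULL 4-byte INT key column in ROW_FORMAT=COMPACT, the base size + computed above is REC_NODE_PTR_SIZE (4) + 1 + REC_N_NEW_EXTRA_BYTES + (5) + 0 null-flag bytes = 10 bytes, and the fixed length added below + brings the maximum node pointer record size to 14 bytes.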
*/ + rec_max_size += field_max_size; + continue; + } + + field_max_size = dict_col_get_max_size(col); + if (UNIV_UNLIKELY(!field_max_size)) { + switch (col->mtype) { + case DATA_VARCHAR: + if (!comp + && (!strcmp(index->table->name.m_name, + "SYS_FOREIGN") + || !strcmp(index->table->name.m_name, + "SYS_FOREIGN_COLS"))) { + break; + } + /* fall through */ + case DATA_FIXBINARY: + case DATA_BINARY: + case DATA_VARMYSQL: + case DATA_CHAR: + case DATA_MYSQL: + /* BINARY(0), VARBINARY(0), + CHAR(0) and VARCHAR(0) are possible + data type definitions in MariaDB. + The InnoDB internal SQL parser maps + CHAR to DATA_VARCHAR, so DATA_CHAR (or + DATA_MYSQL) is only coming from the + MariaDB SQL layer. */ + if (comp) { + /* Add a length byte, because + fixed-length empty field are + encoded as variable-length. + For ROW_FORMAT=REDUNDANT, + these bytes were added to + rec_max_size before this loop. */ + rec_max_size++; + } + continue; + } + + /* SYS_FOREIGN.ID is defined as CHAR in the + InnoDB internal SQL parser, which translates + into the incorrect VARCHAR(0). InnoDB does + not enforce maximum lengths of columns, so + that is why any data can be inserted in the + first place. + + Likewise, SYS_FOREIGN.FOR_NAME, + SYS_FOREIGN.REF_NAME, SYS_FOREIGN_COLS.ID, are + defined as CHAR, and also they are part of a key. */ + + ut_ad(!strcmp(index->table->name.m_name, + "SYS_FOREIGN") + || !strcmp(index->table->name.m_name, + "SYS_FOREIGN_COLS")); + ut_ad(!comp); + ut_ad(col->mtype == DATA_VARCHAR); + + rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX) + ? REDUNDANT_REC_MAX_DATA_SIZE + : page_get_free_space_of_empty(FALSE) / 2; + } else if (field_max_size == NAME_LEN && i == 1 + && (!strcmp(index->table->name.m_name, + TABLE_STATS_NAME) + || !strcmp(index->table->name.m_name, + INDEX_STATS_NAME))) { + /* Interpret "table_name" as VARCHAR(199) even + if it was incorrectly defined as VARCHAR(64). + While the caller of ha_innobase enforces the + maximum length on any data written, the InnoDB + internal SQL parser will happily write as much + data as is provided. The purpose of this hack + is to avoid InnoDB hangs after persistent + statistics on partitioned tables are + deleted. */ + field_max_size = 199 * SYSTEM_CHARSET_MBMAXLEN; + } + field_ext_max_size = field_max_size < 256 ? 1 : 2; + + if (field->prefix_len + && field->prefix_len < field_max_size) { + field_max_size = field->prefix_len; + } + + if (comp) { + /* Add the extra size for ROW_FORMAT=COMPACT. + For ROW_FORMAT=REDUNDANT, these bytes were + added to rec_max_size before this loop. */ + rec_max_size += field_ext_max_size; + } + + rec_max_size += field_max_size; + } + + return rec_max_size; +} + +/** @return a B-tree search mode suitable for non-leaf pages +@param mode leaf page search mode */ +static inline page_cur_mode_t btr_cur_nonleaf_mode(page_cur_mode_t mode) +{ + if (mode > PAGE_CUR_GE) + { + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE); + return mode; + } + if (mode == PAGE_CUR_GE) + return PAGE_CUR_L; + ut_ad(mode == PAGE_CUR_G); + return PAGE_CUR_LE; +} + +static MY_ATTRIBUTE((nonnull)) +/** Acquire a latch on the previous page without violating the latching order. 
+@param block index page +@param page_id page identifier with valid space identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param rw_latch the latch on block (RW_S_LATCH or RW_X_LATCH) +@param mtr mini-transaction +@param err error code +@retval 0 if an error occurred +@retval 1 if the page could be latched in the wrong order +@retval -1 if the latch on block was temporarily released */ +int btr_latch_prev(buf_block_t *block, page_id_t page_id, ulint zip_size, + rw_lock_type_t rw_latch, mtr_t *mtr, dberr_t *err) +{ + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); + ut_ad(page_id.space() == block->page.id().space()); + + const auto prev_savepoint= mtr->get_savepoint(); + ut_ad(block == mtr->at_savepoint(prev_savepoint - 1)); + + page_id.set_page_no(btr_page_get_prev(block->page.frame)); + buf_block_t *prev= buf_page_get_gen(page_id, zip_size, RW_NO_LATCH, nullptr, + BUF_GET, mtr, err, false); + if (UNIV_UNLIKELY(!prev)) + return 0; + + int ret= 1; + if (UNIV_UNLIKELY(rw_latch == RW_S_LATCH)) + { + if (UNIV_LIKELY(prev->page.lock.s_lock_try())) + { + mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_S_FIX); + goto prev_latched; + } + block->page.lock.s_unlock(); + } + else + { + if (UNIV_LIKELY(prev->page.lock.x_lock_try())) + { + mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_X_FIX); + goto prev_latched; + } + block->page.lock.x_unlock(); + } + + ret= -1; + mtr->lock_register(prev_savepoint - 1, MTR_MEMO_BUF_FIX); + mtr->rollback_to_savepoint(prev_savepoint); + prev= buf_page_get_gen(page_id, zip_size, rw_latch, prev, + BUF_GET, mtr, err, false); + if (UNIV_UNLIKELY(!prev)) + return 0; + mtr->upgrade_buffer_fix(prev_savepoint - 1, rw_latch); + + prev_latched: + if (memcmp_aligned<2>(FIL_PAGE_TYPE + prev->page.frame, + FIL_PAGE_TYPE + block->page.frame, 2) || + memcmp_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + prev->page.frame, + PAGE_HEADER + PAGE_INDEX_ID + block->page.frame, 8) || + page_is_comp(prev->page.frame) != page_is_comp(block->page.frame)) + { + ut_ad("corrupted" == 0); // FIXME: remove this + *err= DB_CORRUPTION; + ret= 0; + } + + return ret; +} + +dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, + btr_latch_mode latch_mode, mtr_t *mtr) +{ + ut_ad(index()->is_btree() || index()->is_ibuf()); + ut_ad(!index()->is_ibuf() || ibuf_inside(mtr)); + + buf_block_t *guess; + btr_op_t btr_op; + btr_intention_t lock_intention; + bool detected_same_key_root= false; + + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets2_; + rec_offs_init(offsets_); + rec_offs_init(offsets2_); + + ut_ad(dict_index_check_search_tuple(index(), tuple)); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(index()->page != FIL_NULL); + + MEM_UNDEFINED(&up_match, sizeof up_match); + MEM_UNDEFINED(&up_bytes, sizeof up_bytes); + MEM_UNDEFINED(&low_match, sizeof low_match); + MEM_UNDEFINED(&low_bytes, sizeof low_bytes); + ut_d(up_match= ULINT_UNDEFINED); + ut_d(low_match= ULINT_UNDEFINED); + + ut_ad(!(latch_mode & BTR_ALREADY_S_LATCHED) || + mtr->memo_contains_flagged(&index()->lock, + MTR_MEMO_S_LOCK | MTR_MEMO_SX_LOCK | + MTR_MEMO_X_LOCK)); + + /* These flags are mutually exclusive, they are lumped together + with the latch mode for historical reasons. It's possible for + none of the flags to be set. 
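+ For example, a change-buffered insert into a secondary index may pass + latch_mode = BTR_MODIFY_LEAF | BTR_INSERT, which the switch below maps + to btr_op = BTR_INSERT_OP (or to BTR_INSERT_IGNORE_UNIQUE_OP when + BTR_IGNORE_SEC_UNIQUE is also set), while a plain BTR_SEARCH_LEAF + leaves btr_op = BTR_NO_OP.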
*/ + switch (UNIV_EXPECT(latch_mode & BTR_DELETE, 0)) { + default: + btr_op= BTR_NO_OP; + break; + case BTR_INSERT: + btr_op= (latch_mode & BTR_IGNORE_SEC_UNIQUE) + ? BTR_INSERT_IGNORE_UNIQUE_OP + : BTR_INSERT_OP; + break; + case BTR_DELETE: + btr_op= BTR_DELETE_OP; + ut_a(purge_node); + break; + case BTR_DELETE_MARK: + btr_op= BTR_DELMARK_OP; + break; + } + + /* Operations on the insert buffer tree cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index()->is_ibuf()); + /* Operations on the clustered index cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index()->is_clust()); + /* Operations on the temporary table(indexes) cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index()->table->is_temporary()); + + const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; + lock_intention= btr_cur_get_and_clear_intention(&latch_mode); + latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + ut_ad(!latch_by_caller + || latch_mode == BTR_SEARCH_LEAF + || latch_mode == BTR_MODIFY_LEAF + || latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_MODIFY_ROOT_AND_LEAF); + + flag= BTR_CUR_BINARY; +#ifndef BTR_CUR_ADAPT + guess= nullptr; +#else + btr_search_t *info= btr_search_get_info(index()); + guess= info->root_guess; + +# ifdef BTR_CUR_HASH_ADAPT +# ifdef UNIV_SEARCH_PERF_STAT + info->n_searches++; +# endif + bool ahi_enabled= btr_search_enabled && !index()->is_ibuf(); + /* We do a dirty read of btr_search_enabled below, + and btr_search_guess_on_hash() will have to check it again. */ + if (!ahi_enabled); + else if (btr_search_guess_on_hash(index(), info, tuple, mode, + latch_mode, this, mtr)) + { + /* Search using the hash index succeeded */ + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ++btr_cur_n_sea; + + return DB_SUCCESS; + } + else + ++btr_cur_n_non_sea; +# endif +#endif + + /* If the hash search did not succeed, do binary search down the + tree */ + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ + + const ulint savepoint= mtr->get_savepoint(); + + ulint node_ptr_max_size= 0, compress_limit= 0; + rw_lock_type_t rw_latch= RW_S_LATCH; + + switch (latch_mode) { + case BTR_MODIFY_TREE: + rw_latch= RW_X_LATCH; + node_ptr_max_size= btr_node_ptr_max_size(index()); + if (latch_by_caller) + { + ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK)); + break; + } + if (lock_intention == BTR_INTENTION_DELETE) + { + compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index()); + if (os_aio_pending_reads_approx() && + trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) + { + /* Most delete-intended operations are due to the purge of history. + Prioritize them when the history list is growing huge. */ + mtr_x_lock_index(index(), mtr); + break; + } + } + mtr_sx_lock_index(index(), mtr); + break; +#ifdef UNIV_DEBUG + case BTR_CONT_MODIFY_TREE: + ut_ad("invalid mode" == 0); + break; +#endif + case BTR_MODIFY_ROOT_AND_LEAF: + rw_latch= RW_SX_LATCH; + /* fall through */ + default: + if (!latch_by_caller) + mtr_s_lock_index(index(), mtr); + } + + const ulint zip_size= index()->table->space->zip_size(); + + /* Start with the root page. 
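+ Each iteration of search_loop below latches one page: starting from + the root (index()->page), the cursor is positioned within the node + and the descent follows btr_node_ptr_get_child_page_no() until height + reaches 0, the leaf level.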
*/ + page_id_t page_id(index()->table->space_id, index()->page); + + const page_cur_mode_t page_mode= btr_cur_nonleaf_mode(mode); + ulint height= ULINT_UNDEFINED; + up_match= 0; + up_bytes= 0; + low_match= 0; + low_bytes= 0; + ulint buf_mode= BUF_GET; + search_loop: + dberr_t err; + auto block_savepoint= mtr->get_savepoint(); + buf_block_t *block= + buf_page_get_gen(page_id, zip_size, rw_latch, guess, buf_mode, mtr, + &err, height == 0 && !index()->is_clust()); + if (!block) + { + switch (err) { + case DB_DECRYPTION_FAILED: + btr_decryption_failed(*index()); + /* fall through */ + default: + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return err; + case DB_SUCCESS: + /* This must be a search to perform an insert, delete mark, or delete; + try using the change buffer */ + ut_ad(height == 0); + ut_ad(thr); + break; + } + + switch (btr_op) { + default: + MY_ASSERT_UNREACHABLE(); + break; + case BTR_INSERT_OP: + case BTR_INSERT_IGNORE_UNIQUE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + + if (ibuf_insert(IBUF_OP_INSERT, tuple, index(), page_id, zip_size, thr)) + { + flag= BTR_CUR_INSERT_TO_IBUF; + goto func_exit; + } + break; + + case BTR_DELMARK_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + + if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple, + index(), page_id, zip_size, thr)) + { + flag = BTR_CUR_DEL_MARK_IBUF; + goto func_exit; + } + + break; + + case BTR_DELETE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH); + auto& chain = buf_pool.page_hash.cell_get(page_id.fold()); + + if (!row_purge_poss_sec(purge_node, index(), tuple)) + /* The record cannot be purged yet. */ + flag= BTR_CUR_DELETE_REF; + else if (ibuf_insert(IBUF_OP_DELETE, tuple, index(), + page_id, zip_size, thr)) + /* The purge was buffered. */ + flag= BTR_CUR_DELETE_IBUF; + else + { + /* The purge could not be buffered. */ + buf_pool.watch_unset(page_id, chain); + break; + } + + buf_pool.watch_unset(page_id, chain); + goto func_exit; + } + + /* Change buffering did not succeed, we must read the page. */ + buf_mode= BUF_GET; + goto search_loop; + } + + if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index()->id || + fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || + !fil_page_index_page_check(block->page.frame)) + { + corrupted: + ut_ad("corrupted" == 0); // FIXME: remove this + err= DB_CORRUPTION; + goto func_exit; + } + + page_cur.block= block; + ut_ad(block == mtr->at_savepoint(block_savepoint)); + ut_ad(rw_latch != RW_NO_LATCH); +#ifdef UNIV_ZIP_DEBUG + if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block)) + ut_a(page_zip_validate(page_zip, block->page.frame, index())); +#endif /* UNIV_ZIP_DEBUG */ + + const uint32_t page_level= btr_page_get_level(block->page.frame); + + if (height == ULINT_UNDEFINED) + { + /* We are in the B-tree index root page. */ +#ifdef BTR_CUR_ADAPT + info->root_guess= block; +#endif + height= page_level; + tree_height= height + 1; + + if (!height) + { + /* The root page is also a leaf page. + We may have to reacquire the page latch in a different mode. 
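+ For example (assuming that the low bits of btr_latch_mode encode the + rw_lock_type_t, as the masking below suggests), a BTR_MODIFY_LEAF + search that latched a single-page tree with RW_S_LATCH must roll back + to the savepoint and restart with rw_latch = RW_X_LATCH; see relatch_x + below.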
*/ + switch (rw_latch) { + case RW_S_LATCH: + if ((latch_mode & ~12) != RW_S_LATCH) + { + ut_ad(rw_lock_type_t(latch_mode & ~12) == RW_X_LATCH); + goto relatch_x; + } + if (latch_mode != BTR_MODIFY_PREV) + { + if (!latch_by_caller) + /* Release the tree s-latch */ + mtr->rollback_to_savepoint(savepoint, savepoint + 1); + goto reached_latched_leaf; + } + /* fall through */ + case RW_SX_LATCH: + ut_ad(rw_latch == RW_S_LATCH || + latch_mode == BTR_MODIFY_ROOT_AND_LEAF); + relatch_x: + mtr->rollback_to_savepoint(block_savepoint); + height= ULINT_UNDEFINED; + rw_latch= RW_X_LATCH; + goto search_loop; + case RW_X_LATCH: + if (latch_mode == BTR_MODIFY_TREE) + goto reached_index_root_and_leaf; + goto reached_root_and_leaf; + case RW_NO_LATCH: + ut_ad(0); + } + goto reached_leaf; + } + } + else if (UNIV_UNLIKELY(height != page_level)) + goto corrupted; + else + switch (latch_mode) { + case BTR_MODIFY_TREE: + break; + case BTR_MODIFY_ROOT_AND_LEAF: + ut_ad((mtr->at_savepoint(block_savepoint - 1)->page.id().page_no() == + index()->page) == (tree_height <= height + 2)); + if (tree_height <= height + 2) + /* Retain the root page latch. */ + break; + /* fall through */ + default: + ut_ad(block_savepoint > savepoint); + mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint); + block_savepoint--; + } + + if (!height) + { + reached_leaf: + /* We reached the leaf level. */ + ut_ad(block == mtr->at_savepoint(block_savepoint)); + + if (latch_mode == BTR_MODIFY_ROOT_AND_LEAF) + { + reached_root_and_leaf: + if (!latch_by_caller) + mtr->rollback_to_savepoint(savepoint, savepoint + 1); + reached_index_root_and_leaf: + ut_ad(rw_latch == RW_X_LATCH); +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif + if (page_cur_search_with_match(tuple, mode, &up_match, &low_match, + &page_cur, nullptr)) + goto corrupted; + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + goto func_exit; + } + + switch (latch_mode) { + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, ""); + static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, ""); + ut_ad(!latch_by_caller); + ut_ad(rw_latch == + rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH))); + + /* latch also siblings from left to right */ + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err)) + goto func_exit; + if (page_has_next(block->page.frame) && + !btr_block_get(*index(), btr_page_get_next(block->page.frame), + rw_latch, false, mtr, &err)) + goto func_exit; + goto release_tree; + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + if (!latch_by_caller) + { +release_tree: + /* Release the tree s-latch */ + block_savepoint--; + mtr->rollback_to_savepoint(savepoint, savepoint + 1); + } + /* release upper blocks */ + if (savepoint < block_savepoint) + mtr->rollback_to_savepoint(savepoint, block_savepoint); + break; + default: + ut_ad(latch_mode == BTR_MODIFY_TREE); + ut_ad(rw_latch == RW_X_LATCH); + /* x-latch also siblings from left to right */ + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err)) + goto func_exit; + if (page_has_next(block->page.frame) && + !btr_block_get(*index(), btr_page_get_next(block->page.frame), + RW_X_LATCH, false, mtr, &err)) + goto func_exit; + if (btr_cur_need_opposite_intention(block->page, index()->is_clust(), + lock_intention, + 
node_ptr_max_size, compress_limit,
+ page_cur.rec))
+ goto need_opposite_intention;
+ }
+
+ reached_latched_leaf:
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi_enabled && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG))
+ {
+ if (page_cur_search_with_match_bytes(tuple, mode,
+ &up_match, &up_bytes,
+ &low_match, &low_bytes, &page_cur))
+ goto corrupted;
+ }
+ else
+#endif /* BTR_CUR_HASH_ADAPT */
+ if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We do a dirty read of btr_search_enabled here. We will
+ properly check btr_search_enabled again in
+ btr_search_build_page_hash_index() before building a page hash
+ index, while holding search latch. */
+ if (!btr_search_enabled);
+ else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG)
+ /* This may be a search tuple for btr_pcur_t::restore_position(). */
+ ut_ad(tuple->is_metadata() ||
+ (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT)));
+ else if (index()->table->is_temporary());
+ else if (!rec_is_metadata(page_cur.rec, *index()))
+ btr_search_info_update(index(), this);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ goto func_exit;
+ }
+
+ guess= nullptr;
+ if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+ offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED,
+ &heap);
+
+ ut_ad(block == mtr->at_savepoint(block_savepoint));
+
+ switch (latch_mode) {
+ default:
+ break;
+ case BTR_MODIFY_TREE:
+ if (btr_cur_need_opposite_intention(block->page, index()->is_clust(),
+ lock_intention,
+ node_ptr_max_size, compress_limit,
+ page_cur.rec))
+ /* If the rec is the first or last in the page for pessimistic
+ delete intention, it might cause node_ptr insert for the upper
+ level. We should change the intention and retry. */
+ need_opposite_intention:
+ return pessimistic_search_leaf(tuple, mode, mtr);
+
+ if (detected_same_key_root || lock_intention != BTR_INTENTION_BOTH ||
+ index()->is_unique() ||
+ (up_match <= rec_offs_n_fields(offsets) &&
+ low_match <= rec_offs_n_fields(offsets)))
+ break;
+
+ /* If the cursor is on the first or the last record of the page,
+ or on a record sharing its key value with one of them, another
+ page might be chosen by a BTR_CONT_MODIFY_TREE search. To avoid a
+ deadlock, the parent page must not be released here, so that a
+ concurrent search with the same key value will block.
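+ (Rough example: in a non-unique secondary index an entire page may
+ consist of records with one and the same key value; a concurrent
+ descent for that key could then legitimately land on a neighbouring
+ page, and holding the parent latch makes it wait instead of
+ latching pages in a conflicting order.)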
*/ + const rec_t *first= + page_rec_get_next_const(page_get_infimum_rec(block->page.frame)); + ulint matched_fields; + + if (UNIV_UNLIKELY(!first)) + goto corrupted; + if (page_cur.rec == first || + page_rec_is_last(page_cur.rec, block->page.frame)) + { + same_key_root: + detected_same_key_root= true; + break; + } + + matched_fields= 0; + offsets2= rec_get_offsets(first, index(), offsets2, 0, ULINT_UNDEFINED, + &heap); + cmp_rec_rec(page_cur.rec, first, offsets, offsets2, index(), false, + &matched_fields); + if (matched_fields >= rec_offs_n_fields(offsets) - 1) + goto same_key_root; + if (const rec_t* last= + page_rec_get_prev_const(page_get_supremum_rec(block->page.frame))) + { + matched_fields= 0; + offsets2= rec_get_offsets(last, index(), offsets2, 0, ULINT_UNDEFINED, + &heap); + cmp_rec_rec(page_cur.rec, last, offsets, offsets2, index(), false, + &matched_fields); + if (matched_fields >= rec_offs_n_fields(offsets) - 1) + goto same_key_root; + } + else + goto corrupted; + + /* Release the non-root parent page unless it may need to be modified. */ + if (tree_height > height + 1 && + !btr_cur_will_modify_tree(index(), block->page.frame, lock_intention, + page_cur.rec, node_ptr_max_size, + zip_size, mtr)) + { + mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint); + block_savepoint--; + } + } + + /* Go to the child node */ + page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets)); + + if (!--height) + { + /* We are about to access the leaf level. */ + + switch (latch_mode) { + case BTR_MODIFY_ROOT_AND_LEAF: + rw_latch= RW_X_LATCH; + break; + case BTR_MODIFY_PREV: /* ibuf_insert() or btr_pcur_move_to_prev() */ + case BTR_SEARCH_PREV: /* btr_pcur_move_to_prev() */ + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); + + if (page_has_prev(block->page.frame) && + page_rec_is_first(page_cur.rec, block->page.frame)) + { + ut_ad(block_savepoint + 1 == mtr->get_savepoint()); + + /* Latch the previous page if the node pointer is the leftmost + of the current page. */ + int ret= btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err); + if (!ret) + goto func_exit; + ut_ad(block_savepoint + 2 == mtr->get_savepoint()); + if (ret < 0) + { + /* While our latch on the level-2 page prevents splits or + merges of this level-1 block, other threads may have + modified it due to splitting or merging some level-0 (leaf) + pages underneath it. Thus, we must search again. */ + if (page_cur_search_with_match(tuple, page_mode, + &up_match, &low_match, + &page_cur, nullptr)) + goto corrupted; + offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, + ULINT_UNDEFINED, &heap); + page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, + offsets)); + } + } + rw_latch= rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH)); + break; + case BTR_MODIFY_LEAF: + case BTR_SEARCH_LEAF: + rw_latch= rw_lock_type_t(latch_mode); + if (btr_op != BTR_NO_OP && !index()->is_ibuf() && + ibuf_should_try(index(), btr_op != BTR_INSERT_OP)) + /* Try to buffer the operation if the leaf page + is not in the buffer pool. */ + buf_mode= btr_op == BTR_DELETE_OP + ? 
BUF_GET_IF_IN_POOL_OR_WATCH
+ : BUF_GET_IF_IN_POOL;
+ break;
+ case BTR_MODIFY_TREE:
+ ut_ad(rw_latch == RW_X_LATCH);
+
+ if (lock_intention == BTR_INTENTION_INSERT &&
+ page_has_next(block->page.frame) &&
+ page_rec_is_last(page_cur.rec, block->page.frame))
+ {
+ /* btr_insert_into_right_sibling() might delete a node pointer
+ at the upper level */
+ mtr->rollback_to_savepoint(block_savepoint);
+ goto need_opposite_intention;
+ }
+ break;
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
+ }
+ }
+
+ goto search_loop;
+}
+
+ATTRIBUTE_COLD void mtr_t::index_lock_upgrade()
+{
+ auto &slot= m_memo[get_savepoint() - 1];
+ if (slot.type == MTR_MEMO_X_LOCK)
+ return;
+ ut_ad(slot.type == MTR_MEMO_SX_LOCK);
+ index_lock *lock= static_cast<index_lock*>(slot.object);
+ lock->u_x_upgrade(SRW_LOCK_CALL);
+ slot.type= MTR_MEMO_X_LOCK;
+}
+
+ATTRIBUTE_COLD
+dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
+ page_cur_mode_t mode, mtr_t *mtr)
+{
+ ut_ad(index()->is_btree() || index()->is_ibuf());
+ ut_ad(!index()->is_ibuf() || ibuf_inside(mtr));
+
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets= offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(flag == BTR_CUR_BINARY);
+ ut_ad(dict_index_check_search_tuple(index(), tuple));
+ ut_ad(dtuple_check_typed(tuple));
+ buf_block_t *block= mtr->at_savepoint(1);
+ ut_ad(block->page.id().page_no() == index()->page);
+ block->page.fix();
+ mtr->rollback_to_savepoint(1);
+ mtr->index_lock_upgrade();
+
+ const page_cur_mode_t page_mode{btr_cur_nonleaf_mode(mode)};
+
+ mtr->page_lock(block, RW_X_LATCH);
+
+ up_match= 0;
+ up_bytes= 0;
+ low_match= 0;
+ low_bytes= 0;
+ ulint height= btr_page_get_level(block->page.frame);
+ tree_height= height + 1;
+ mem_heap_t *heap= nullptr;
+
+ search_loop:
+ dberr_t err;
+ page_cur.block= block;
+
+ if (UNIV_UNLIKELY(!height))
+ {
+ if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ corrupted:
+ err= DB_CORRUPTION;
+ else
+ {
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We do a dirty read of btr_search_enabled here. We will
+ properly check btr_search_enabled again in
+ btr_search_build_page_hash_index() before building a page hash
+ index, while holding search latch. */
+ if (!btr_search_enabled);
+ else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG)
+ /* This may be a search tuple for btr_pcur_t::restore_position().
*/ + ut_ad(tuple->is_metadata() || + (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT))); + else if (index()->table->is_temporary()); + else if (!rec_is_metadata(page_cur.rec, *index())) + btr_search_info_update(index(), this); +#endif /* BTR_CUR_HASH_ADAPT */ + err= DB_SUCCESS; + } + + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return err; + } + + if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match, + &page_cur, nullptr)) + goto corrupted; + + page_id_t page_id{block->page.id()}; + + offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED, + &heap); + /* Go to the child node */ + page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets)); + + block= + buf_page_get_gen(page_id, block->zip_size(), RW_X_LATCH, nullptr, BUF_GET, + mtr, &err, !--height && !index()->is_clust()); + + if (!block) + { + if (err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index()); + goto func_exit; + } + + if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index()->id || + fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || + !fil_page_index_page_check(block->page.frame)) + goto corrupted; + + if (height != btr_page_get_level(block->page.frame)) + goto corrupted; + +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t *page_zip= buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, block->page.frame, index())); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, page_id, block->zip_size(), + RW_X_LATCH, mtr, &err)) + goto func_exit; + if (page_has_next(block->page.frame) && + !btr_block_get(*index(), btr_page_get_next(block->page.frame), + RW_X_LATCH, false, mtr, &err)) + goto func_exit; + goto search_loop; +} + +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given non-leaf level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +cursor->up_match and cursor->low_match both will have sensible values. +Cursor is left at the place where an insert of the +search tuple should be performed in the B-tree. InnoDB does an insert +immediately after the cursor. Thus, the cursor may end up on a user record, +or on a page infimum record. +@param level the tree level of search +@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that + it cannot get compared to the node ptr page number field! +@param latch RW_S_LATCH or RW_X_LATCH +@param cursor tree cursor; the cursor page is s- or x-latched, but see also + above! +@param mtr mini-transaction +@return DB_SUCCESS on success or error code otherwise */ +TRANSACTIONAL_TARGET +dberr_t btr_cur_search_to_nth_level(ulint level, + const dtuple_t *tuple, + rw_lock_type_t rw_latch, + btr_cur_t *cursor, mtr_t *mtr) +{ + dict_index_t *const index= cursor->index(); + + ut_ad(index->is_btree() || index->is_ibuf()); + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + rec_offs_init(offsets_); + ut_ad(level); + ut_ad(dict_index_check_search_tuple(index, tuple)); + ut_ad(index->is_ibuf() ? 
ibuf_inside(mtr) : index->is_btree()); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(index->page != FIL_NULL); + + MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes); + MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes); + cursor->up_match= 0; + cursor->low_match= 0; + cursor->flag= BTR_CUR_BINARY; + +#ifndef BTR_CUR_ADAPT + buf_block_t *block= nullptr; +#else + btr_search_t *info= btr_search_get_info(index); + buf_block_t *block= info->root_guess; +#endif /* BTR_CUR_ADAPT */ + + ut_ad(mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + + const ulint zip_size= index->table->space->zip_size(); + + /* Start with the root page. */ + page_id_t page_id(index->table->space_id, index->page); + ulint height= ULINT_UNDEFINED; + +search_loop: + dberr_t err= DB_SUCCESS; + if (buf_block_t *b= + mtr->get_already_latched(page_id, mtr_memo_type_t(rw_latch))) + block= b; + else if (!(block= buf_page_get_gen(page_id, zip_size, rw_latch, + block, BUF_GET, mtr, &err))) + { + if (err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + goto func_exit; + } + +#ifdef UNIV_ZIP_DEBUG + if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block)) + ut_a(page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (!!page_is_comp(block->page.frame) != index->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index->id || + fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || + !fil_page_index_page_check(block->page.frame)) + { + corrupted: + err= DB_CORRUPTION; + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return err; + } + + const uint32_t page_level= btr_page_get_level(block->page.frame); + + if (height == ULINT_UNDEFINED) + { + /* We are in the root node */ + height= page_level; + if (!height) + goto corrupted; + cursor->tree_height= height + 1; + } + else if (height != ulint{page_level}) + goto corrupted; + + cursor->page_cur.block= block; + + /* Search for complete index fields. */ + if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &cursor->up_match, + &cursor->low_match, &cursor->page_cur, + nullptr)) + goto corrupted; + + /* If this is the desired level, leave the loop */ + if (level == height) + goto func_exit; + + ut_ad(height > level); + height--; + + offsets = rec_get_offsets(cursor->page_cur.rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_id.set_page_no(btr_node_ptr_get_child_page_no(cursor->page_cur.rec, + offsets)); + block= nullptr; + goto search_loop; +} + +dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, + btr_latch_mode latch_mode, mtr_t *mtr) +{ + ulint n_blocks= 0; + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + dberr_t err; + + rec_offs_init(offsets_); + + const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; + latch_mode= btr_latch_mode(latch_mode & ~BTR_ALREADY_S_LATCHED); + + btr_intention_t lock_intention= btr_cur_get_and_clear_intention(&latch_mode); + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched the leaf node */ + + auto savepoint= mtr->get_savepoint(); + + rw_lock_type_t upper_rw_latch= RW_X_LATCH; + ulint node_ptr_max_size= 0, compress_limit= 0; + + if (latch_mode == BTR_MODIFY_TREE) + { + node_ptr_max_size= btr_node_ptr_max_size(index); + /* Most of delete-intended operations are purging. 
Free blocks + and read IO bandwidth should be prioritized for them, when the + history list is growing huge. */ + savepoint++; + if (lock_intention == BTR_INTENTION_DELETE) + { + compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index); + + if (os_aio_pending_reads_approx() && + trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) + { + mtr_x_lock_index(index, mtr); + goto index_locked; + } + } + mtr_sx_lock_index(index, mtr); + } + else + { + static_assert(int{BTR_CONT_MODIFY_TREE} == (12 | BTR_MODIFY_LEAF), ""); + ut_ad(!(latch_mode & 8)); + /* This function doesn't need to lock left page of the leaf page */ + static_assert(int{BTR_SEARCH_PREV} == (4 | BTR_SEARCH_LEAF), ""); + static_assert(int{BTR_MODIFY_PREV} == (4 | BTR_MODIFY_LEAF), ""); + latch_mode= btr_latch_mode(latch_mode & ~4); + ut_ad(!latch_by_caller || + mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_SX_LOCK | MTR_MEMO_S_LOCK)); + upper_rw_latch= RW_S_LATCH; + if (!latch_by_caller) + { + savepoint++; + mtr_s_lock_index(index, mtr); + } + } + +index_locked: + ut_ad(savepoint == mtr->get_savepoint()); + + const rw_lock_type_t root_leaf_rw_latch= + rw_lock_type_t(latch_mode & (RW_S_LATCH | RW_X_LATCH)); + + page_cur.index = index; + + uint32_t page= index->page; + const auto zip_size= index->table->space->zip_size(); + + for (ulint height= ULINT_UNDEFINED;;) + { + ut_ad(n_blocks < BTR_MAX_LEVELS); + ut_ad(savepoint + n_blocks == mtr->get_savepoint()); + + buf_block_t* block= + btr_block_get(*index, page, + height ? upper_rw_latch : root_leaf_rw_latch, + !height, mtr, &err); + ut_ad(!block == (err != DB_SUCCESS)); + + if (!block) + { + if (err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + break; + } + + if (first) + page_cur_set_before_first(block, &page_cur); + else + page_cur_set_after_last(block, &page_cur); + + const uint32_t l= btr_page_get_level(block->page.frame); + + if (height == ULINT_UNDEFINED) + { + /* We are in the root node */ + height= l; + if (height); + else if (upper_rw_latch != root_leaf_rw_latch) + { + /* We should retry to get the page, because the root page + is latched with different level as a leaf page. */ + ut_ad(n_blocks == 0); + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + upper_rw_latch= root_leaf_rw_latch; + mtr->rollback_to_savepoint(savepoint); + height= ULINT_UNDEFINED; + continue; + } + else + { + reached_leaf: + const auto leaf_savepoint= mtr->get_savepoint(); + ut_ad(leaf_savepoint); + ut_ad(block == mtr->at_savepoint(leaf_savepoint - 1)); + + if (latch_mode == BTR_MODIFY_TREE) + { + /* x-latch also siblings from left to right */ + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, block->page.id(), zip_size, RW_X_LATCH, + mtr, &err)) + break; + if (page_has_next(block->page.frame) && + !btr_block_get(*index, btr_page_get_next(block->page.frame), + RW_X_LATCH, false, mtr, &err)) + break; + + if (!index->lock.have_x() && + btr_cur_need_opposite_intention(block->page, index->is_clust(), + lock_intention, + node_ptr_max_size, + compress_limit, page_cur.rec)) + goto need_opposite_intention; + } + else + { + if (latch_mode != BTR_CONT_MODIFY_TREE) + { + ut_ad(latch_mode == BTR_MODIFY_LEAF || + latch_mode == BTR_SEARCH_LEAF); + /* Release index->lock if needed, and the non-leaf pages. */ + mtr->rollback_to_savepoint(savepoint - !latch_by_caller, + leaf_savepoint - 1); + } + } + break; + } + } + else if (UNIV_UNLIKELY(height != l)) + { + corrupted: + err= DB_CORRUPTION; + break; + } + + if (!height) + goto reached_leaf; + + height--; + + if (first + ? 
!page_cur_move_to_next(&page_cur) + : !page_cur_move_to_prev(&page_cur)) + goto corrupted; + + offsets= rec_get_offsets(page_cur.rec, index, offsets, 0, ULINT_UNDEFINED, + &heap); + + ut_ad(latch_mode != BTR_MODIFY_TREE || upper_rw_latch == RW_X_LATCH); + + if (latch_mode != BTR_MODIFY_TREE); + else if (btr_cur_need_opposite_intention(block->page, index->is_clust(), + lock_intention, + node_ptr_max_size, compress_limit, + page_cur.rec)) + { + need_opposite_intention: + /* If the rec is the first or last in the page for pessimistic + delete intention, it might cause node_ptr insert for the upper + level. We should change the intention and retry. */ + + mtr->rollback_to_savepoint(savepoint); + mtr->index_lock_upgrade(); + /* X-latch all pages from now on */ + latch_mode= BTR_CONT_MODIFY_TREE; + page= index->page; + height= ULINT_UNDEFINED; + n_blocks= 0; + continue; + } + else + { + if (!btr_cur_will_modify_tree(index, block->page.frame, + lock_intention, page_cur.rec, + node_ptr_max_size, zip_size, mtr)) + { + ut_ad(n_blocks); + /* release buffer-fixes on pages that will not be modified + (except the root) */ + if (n_blocks > 1) + { + mtr->rollback_to_savepoint(savepoint + 1, savepoint + n_blocks - 1); + n_blocks= 1; + } + } + } + + /* Go to the child node */ + page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets); + n_blocks++; + } + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + return err; +} + +/*==================== B-TREE INSERT =========================*/ + +/*************************************************************//** +Inserts a record if there is enough space, or if enough space can +be freed by reorganizing. Differs from btr_cur_optimistic_insert because +no heuristics is applied to whether it pays to use CPU time for +reorganizing the page or not. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to inserted record if succeed, else NULL */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +rec_t* +btr_cur_insert_if_possible( +/*=======================*/ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not + have been stored to tuple */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_cur_t* page_cursor; + rec_t* rec; + + ut_ad(dtuple_check_typed(tuple)); + + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + page_cursor = btr_cur_get_page_cur(cursor); + + /* Now, try the insert */ + rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap, n_ext, + mtr); + + /* If the record did not fit, reorganize. + For compressed pages, page_cur_tuple_insert() + attempted this already. */ + if (!rec && !page_cur_get_page_zip(page_cursor) + && btr_page_reorganize(page_cursor, mtr) == DB_SUCCESS) { + rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap, + n_ext, mtr); + } + + ut_ad(!rec || rec_offs_validate(rec, page_cursor->index, *offsets)); + return(rec); +} + +/*************************************************************//** +For an insert, checks the locks and does the undo logging if desired. 
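+When flags contains both BTR_NO_UNDO_LOG_FLAG and BTR_KEEP_SYS_FLAG,
+the first check in the function body returns DB_SUCCESS immediately,
+since neither lock checking nor undo logging is wanted in that case.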
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
+dberr_t
+btr_cur_ins_lock_and_undo(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if
+ not zero, the parameters index and thr
+ should be specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ bool* inherit)/*!< out: true if the inserted new record maybe
+ should inherit LOCK_GAP type locks from the
+ successor record */
+{
+ if (!(~flags & (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))) {
+ return DB_SUCCESS;
+ }
+
+ /* Check if we have to wait for a lock: enqueue an explicit lock
+ request if yes */
+
+ rec_t* rec = btr_cur_get_rec(cursor);
+ dict_index_t* index = cursor->index();
+
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+ ut_ad((flags & BTR_NO_UNDO_LOG_FLAG)
+ || !index->table->skip_alter_undo);
+
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ /* Check if there is predicate or GAP lock preventing the insertion */
+ if (!(flags & BTR_NO_LOCKING_FLAG)) {
+ const unsigned type = index->type;
+ if (UNIV_UNLIKELY(type & DICT_SPATIAL)) {
+ lock_prdt_t prdt;
+ rtr_mbr_t mbr;
+
+ rtr_get_mbr_from_tuple(entry, &mbr);
+
+ /* Use on stack MBR variable to test if a lock is
+ needed. If so, the predicate (MBR) will be allocated
+ from lock heap in lock_prdt_insert_check_and_lock() */
+ lock_init_prdt_from_mbr(&prdt, &mbr, 0, nullptr);
+
+ if (dberr_t err = lock_prdt_insert_check_and_lock(
+ rec, btr_cur_get_block(cursor),
+ index, thr, mtr, &prdt)) {
+ return err;
+ }
+ *inherit = false;
+ } else {
+ ut_ad(!dict_index_is_online_ddl(index)
+ || index->is_primary()
+ || (flags & BTR_CREATE_FLAG));
+#ifdef WITH_WSREP
+ trx_t* trx= thr_get_trx(thr);
+ /* If a transaction scanning a unique secondary
+ key is a wsrep high priority (brute force)
+ thread, the scan may involve GAP-locking in
+ the index. Because such locking also happens
+ when applying replication events in high
+ priority applier threads, lock conflicts
+ between two wsrep high priority threads are
+ possible. To avoid this GAP-locking, mark
+ here that the transaction is using a unique
+ key scan.
*/
+ if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE
+ && trx->is_wsrep()
+ && wsrep_thd_is_BF(trx->mysql_thd, false)) {
+ trx->wsrep = 3;
+ }
+#endif /* WITH_WSREP */
+ if (dberr_t err = lock_rec_insert_check_and_lock(
+ rec, btr_cur_get_block(cursor),
+ index, thr, mtr, inherit)) {
+ return err;
+ }
+ }
+ }
+
+ if (!index->is_primary() || !page_is_leaf(page_align(rec))) {
+ return DB_SUCCESS;
+ }
+
+ constexpr roll_ptr_t dummy_roll_ptr = roll_ptr_t{1}
+ << ROLL_PTR_INSERT_FLAG_POS;
+ roll_ptr_t roll_ptr = dummy_roll_ptr;
+
+ if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
+ if (dberr_t err = trx_undo_report_row_operation(
+ thr, index, entry, NULL, 0, NULL, NULL,
+ &roll_ptr)) {
+ return err;
+ }
+
+ if (roll_ptr != dummy_roll_ptr) {
+ dfield_t* r = dtuple_get_nth_field(entry,
+ index->db_trx_id());
+ trx_write_trx_id(static_cast<byte*>(r->data),
+ thr_get_trx(thr)->id);
+ }
+ }
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ dfield_t* r = dtuple_get_nth_field(
+ entry, index->db_roll_ptr());
+ ut_ad(r->len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
+ }
+
+ return DB_SUCCESS;
+}
+
+/**
+Prefetch siblings of the leaf for the pessimistic operation.
+@param block leaf page
+@param index index of the page */
+static void btr_cur_prefetch_siblings(const buf_block_t *block,
+ const dict_index_t *index)
+{
+ ut_ad(page_is_leaf(block->page.frame));
+
+ if (index->is_ibuf())
+ return;
+
+ const page_t *page= block->page.frame;
+ uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
+ uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
+
+ fil_space_t *space= index->table->space;
+
+ if (prev == FIL_NULL);
+ else if (space->acquire())
+ buf_read_page_background(space, page_id_t(space->id, prev),
+ block->zip_size());
+ if (next == FIL_NULL);
+ else if (space->acquire())
+ buf_read_page_background(space, page_id_t(space->id, next),
+ block->zip_size());
+}
+
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
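+Roughly, the insert is attempted only when the converted record size
+fits in page_get_max_insert_size_after_reorganize(page, 1); on an
+uncompressed clustered index leaf, dict_index_get_space_reserve()
+bytes are additionally kept free for future updates, so the operation
+may fall back to a page split slightly before the page is full.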
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */ +dberr_t +btr_cur_optimistic_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction; + if this function returns DB_SUCCESS on + a leaf page of a secondary index in a + compressed tablespace, the caller must + mtr_commit(mtr) before latching + any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + dict_index_t* index; + page_cur_t* page_cursor; + buf_block_t* block; + page_t* page; + rec_t* dummy; + bool leaf; + bool reorg __attribute__((unused)); + bool inherit = true; + ulint rec_size; + dberr_t err; + + ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG))); + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = cursor->index(); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(dtuple_check_typed(entry)); + +#ifdef HAVE_valgrind + if (block->page.zip.data) { + MEM_CHECK_DEFINED(page, srv_page_size); + MEM_CHECK_DEFINED(block->page.zip.data, block->zip_size()); + } +#endif /* HAVE_valgrind */ + + leaf = page_is_leaf(page); + + if (UNIV_UNLIKELY(entry->is_alter_metadata())) { + ut_ad(leaf); + goto convert_big_rec; + } + + /* Calculate the record size when entry is converted to a record */ + rec_size = rec_get_converted_size(index, entry, n_ext); + + if (page_zip_rec_needs_ext(rec_size, page_is_comp(page), + dtuple_get_n_fields(entry), + block->zip_size())) { +convert_big_rec: + /* The record is so big that we have to store some fields + externally on separate database pages */ + big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext); + + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(index, entry, n_ext); + } + + if (block->page.zip.data && page_zip_is_too_big(index, entry)) { + if (big_rec_vec != NULL) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + return(DB_TOO_BIG_RECORD); + } + + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), goto fail); + + if (block->page.zip.data && leaf + && (page_get_data_size(page) + rec_size + >= dict_index_zip_pad_optimal_page_size(index))) { + /* If compression padding tells us that insertion will + result in too packed up page i.e.: which is likely to + cause compression failure then don't do an optimistic + insertion. */ +fail: + err = DB_FAIL; + + /* prefetch siblings of the leaf for the pessimistic + operation, if the page is leaf. 
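+ (The pessimistic path may have to split or merge the page and will
+ then latch the siblings as well; btr_cur_prefetch_siblings() posts
+ background reads of FIL_PAGE_PREV and FIL_PAGE_NEXT so that those
+ pages are likely in the buffer pool by then.)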
*/
+ if (leaf) {
+ btr_cur_prefetch_siblings(block, index);
+ }
+fail_err:
+
+ if (big_rec_vec) {
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ return(err);
+ }
+
+ ulint max_size = page_get_max_insert_size_after_reorganize(page, 1);
+ if (max_size < rec_size) {
+ goto fail;
+ }
+
+ const ulint n_recs = page_get_n_recs(page);
+ if (UNIV_UNLIKELY(n_recs >= 8189)) {
+ ut_ad(srv_page_size == 65536);
+ goto fail;
+ }
+
+ if (page_has_garbage(page)) {
+ if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
+ && n_recs > 1
+ && page_get_max_insert_size(page, 1) < rec_size) {
+
+ goto fail;
+ }
+ }
+
+ /* If there have been many consecutive inserts to the
+ clustered index leaf page of an uncompressed table, check if
+ we have to split the page to reserve enough free space for
+ future updates of records. */
+
+ if (leaf && !block->page.zip.data && dict_index_is_clust(index)
+ && page_get_n_recs(page) >= 2
+ && dict_index_get_space_reserve() + rec_size > max_size
+ && (btr_page_get_split_rec_to_right(cursor, &dummy)
+ || btr_page_get_split_rec_to_left(cursor))) {
+ goto fail;
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ DBUG_LOG("ib_cur",
+ "insert " << index->name << " (" << index->id << ") by "
+ << ib::hex(thr ? thr->graph->trx->id : 0)
+ << ' ' << rec_printer(entry).str());
+ DBUG_EXECUTE_IF("do_page_reorganize",
+ ut_a(!n_recs || btr_page_reorganize(page_cursor, mtr)
+ == DB_SUCCESS););
+
+ /* Now, try the insert */
+ {
+ const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
+
+ /* Check locks and write to the undo log,
+ if specified */
+ err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+ thr, mtr, &inherit);
+ if (err != DB_SUCCESS) {
+ goto fail_err;
+ }
+
+#ifdef UNIV_DEBUG
+ if (!(flags & BTR_CREATE_FLAG)
+ && leaf && index->is_primary()) {
+ const dfield_t* trx_id = dtuple_get_nth_field(
+ entry, dict_col_get_clust_pos(
+ dict_table_get_sys_col(index->table,
+ DATA_TRX_ID),
+ index));
+
+ ut_ad(trx_id->len == DATA_TRX_ID_LEN);
+ ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN);
+ ut_ad(*static_cast<const byte*>
+ (trx_id[1].data) & 0x80);
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+ ut_ad(!memcmp(trx_id->data, reset_trx_id,
+ DATA_TRX_ID_LEN));
+ } else {
+ ut_ad(thr->graph->trx->id);
+ ut_ad(thr->graph->trx->bulk_insert
+ || thr->graph->trx->id
+ == trx_read_trx_id(
+ static_cast<const byte*>(
+ trx_id->data))
+ || index->table->is_temporary());
+ }
+ }
+#endif
+
+ *rec = page_cur_tuple_insert(page_cursor, entry, offsets, heap,
+ n_ext, mtr);
+
+ reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
+ }
+
+ if (*rec) {
+ } else if (block->page.zip.data) {
+ ut_ad(!index->table->is_temporary());
+ /* Reset the IBUF_BITMAP_FREE bits, because
+ page_cur_tuple_insert() will have attempted page
+ reorganize before failing.
*/ + if (leaf + && !dict_index_is_clust(index)) { + ibuf_reset_free_bits(block); + } + + goto fail; + } else { + ut_ad(!reorg); + reorg = true; + + /* If the record did not fit, reorganize */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS + || page_get_max_insert_size(page, 1) != max_size + || !(*rec = page_cur_tuple_insert(page_cursor, entry, + offsets, heap, n_ext, + mtr))) { + err = DB_CORRUPTION; + goto fail_err; + } + } + +#ifdef BTR_CUR_HASH_ADAPT + if (!leaf) { + } else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->is_metadata()); + ut_ad(index->is_instant()); + ut_ad(flags == BTR_NO_LOCKING_FLAG); + } else if (index->table->is_temporary()) { + } else { + srw_spin_lock* ahi_latch = btr_search_sys.get_latch(*index); + if (!reorg && cursor->flag == BTR_CUR_HASH) { + btr_search_update_hash_node_on_insert( + cursor, ahi_latch); + } else { + btr_search_update_hash_on_insert(cursor, ahi_latch); + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) { + + lock_update_insert(block, *rec); + } + + if (leaf + && !dict_index_is_clust(index) + && !index->table->is_temporary()) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (block->page.zip.data) { + /* Update the bits in the same mini-transaction. */ + ibuf_update_free_bits_zip(block, mtr); + } else { + /* Decrement the bits in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full( + block, max_size, + rec_size + PAGE_DIR_SLOT_SIZE); + } + } + + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*************************************************************//** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. 
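+Before attempting the split, the function reserves free extents for
+the file segments of the index tree, about tree_height / 16 + 3 of
+them via fsp_reserve_free_extents(), and releases the reservation
+again on exit; see the code below.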
+@return DB_SUCCESS or error number */ +dberr_t +btr_cur_pessimistic_insert( +/*=======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /*!< in: cursor after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index = cursor->index(); + big_rec_t* big_rec_vec = NULL; + bool inherit = false; + uint32_t n_reserved = 0; + + ut_ad(dtuple_check_typed(entry)); + ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG))); + + *big_rec = NULL; + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + + cursor->flag = BTR_CUR_BINARY; + + /* Check locks and write to undo log, if specified */ + + dberr_t err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &inherit); + + if (err != DB_SUCCESS) { + return(err); + } + + /* First reserve enough free space for the file segments of + the index tree, so that the insert will not fail because of + lack of space */ + + if (!index->is_ibuf() + && (err = fsp_reserve_free_extents(&n_reserved, index->table->space, + uint32_t(cursor->tree_height / 16 + + 3), + FSP_NORMAL, mtr)) + != DB_SUCCESS) { + return err; + } + + if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext), + index->table->not_redundant(), + dtuple_get_n_fields(entry), + btr_cur_get_block(cursor)->zip_size()) + || UNIV_UNLIKELY(entry->is_alter_metadata() + && !dfield_is_ext( + dtuple_get_nth_field( + entry, + index->first_user_field())))) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + + if (UNIV_LIKELY_NULL(big_rec_vec)) { + /* This should never happen, but we handle + the situation in a robust manner. */ + ut_ad(0); + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext); + + if (big_rec_vec == NULL) { + + index->table->space->release_free_extents(n_reserved); + return(DB_TOO_BIG_RECORD); + } + } + + *rec = index->page == btr_cur_get_block(cursor)->page.id().page_no() + ? 
btr_root_raise_and_insert(flags, cursor, offsets, heap, + entry, n_ext, mtr, &err) + : btr_page_split_and_insert(flags, cursor, offsets, heap, + entry, n_ext, mtr, &err); + + if (!*rec) { + goto func_exit; + } + + ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec + || dict_index_is_spatial(index)); + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + ut_ad(!index->table->is_temporary()); + if (dict_index_is_spatial(index)) { + /* Do nothing */ + } else { + /* The cursor might be moved to the other page + and the max trx id field should be updated after + the cursor was fixed. */ + if (!dict_index_is_clust(index)) { + page_update_max_trx_id( + btr_cur_get_block(cursor), + btr_cur_get_page_zip(cursor), + thr_get_trx(thr)->id, mtr); + } + + if (!page_rec_is_infimum(btr_cur_get_rec(cursor)) + || !page_has_prev(btr_cur_get_page(cursor))) { + /* split and inserted need to call + lock_update_insert() always. */ + inherit = true; + } + } + } + + if (!page_is_leaf(btr_cur_get_page(cursor))) { + ut_ad(!big_rec_vec); + } else { +#ifdef BTR_CUR_HASH_ADAPT + if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->is_metadata()); + ut_ad(index->is_instant()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(!(flags & BTR_CREATE_FLAG)); + } else if (index->table->is_temporary()) { + } else { + btr_search_update_hash_on_insert( + cursor, btr_search_sys.get_latch(*index)); + } +#endif /* BTR_CUR_HASH_ADAPT */ + if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) { + + lock_update_insert(btr_cur_get_block(cursor), *rec); + } + } + + err = DB_SUCCESS; +func_exit: + index->table->space->release_free_extents(n_reserved); + *big_rec = big_rec_vec; + + return err; +} + +/*==================== B-TREE UPDATE =========================*/ + +/*************************************************************//** +For an update, checks the locks and does the undo logging. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) +dberr_t +btr_cur_upd_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on record to update */ + const rec_offs* offsets,/*!< in: rec_get_offsets() on cursor */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + roll_ptr_t* roll_ptr)/*!< out: roll pointer */ +{ + dict_index_t* index; + const rec_t* rec; + dberr_t err; + + ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG)); + + rec = btr_cur_get_rec(cursor); + index = cursor->index(); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->is_named_space(index->table->space)); + + if (!dict_index_is_clust(index)) { + ut_ad(dict_index_is_online_ddl(index) + == !!(flags & BTR_CREATE_FLAG)); + + /* We do undo logging only when we update a clustered index + record */ + return(lock_sec_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, + index, thr, mtr)); + } + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + err = lock_clust_rec_modify_check_and_lock( + btr_cur_get_block(cursor), rec, index, + offsets, thr); + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Append the info about the update in the undo log */ + + return((flags & BTR_NO_UNDO_LOG_FLAG) + ? 
DB_SUCCESS
+ : trx_undo_report_row_operation(
+ thr, index, NULL, update,
+ cmpl_info, rec, offsets, roll_ptr));
+}
+
+/** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry.
+@param[in,out] entry clustered index entry
+@param[in] index clustered index
+@param[in] trx_id DB_TRX_ID
+@param[in] roll_ptr DB_ROLL_PTR */
+static void btr_cur_write_sys(
+ dtuple_t* entry,
+ const dict_index_t* index,
+ trx_id_t trx_id,
+ roll_ptr_t roll_ptr)
+{
+ dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id());
+ ut_ad(t->len == DATA_TRX_ID_LEN);
+ trx_write_trx_id(static_cast<byte*>(t->data), trx_id);
+ dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr());
+ ut_ad(r->len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
+}
+
+MY_ATTRIBUTE((warn_unused_result))
+/** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record.
+@param[in,out] block clustered index leaf page
+@param[in,out] rec clustered index record
+@param[in] index clustered index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] trx transaction
+@param[in] roll_ptr DB_ROLL_PTR value
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
+ dict_index_t *index, const rec_offs *offsets,
+ const trx_t *trx, roll_ptr_t roll_ptr,
+ mtr_t *mtr)
+{
+ ut_ad(index->is_primary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ {
+ page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(),
+ trx->id, roll_ptr, mtr);
+ return DB_SUCCESS;
+ }
+
+ ulint offset= index->trx_id_offset;
+
+ if (!offset)
+ offset= row_get_trx_id_offset(index, offsets);
+
+ compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+
+ /* During IMPORT the trx id in the record can be in the future, if
+ the .ibd file is being imported from another instance. During IMPORT
+ roll_ptr will be 0. */
+ ut_ad(roll_ptr == 0 ||
+ lock_check_trx_id_sanity(trx_read_trx_id(rec + offset),
+ rec, index, offsets));
+
+ byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+
+ trx_write_trx_id(sys, trx->id);
+ trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr);
+
+ ulint d= 0;
+ const byte *src= nullptr;
+ byte *dest= rec + offset;
+ ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ if (UNIV_LIKELY(index->trx_id_offset))
+ {
+ const rec_t *prev= page_rec_get_prev_const(rec);
+ if (UNIV_UNLIKELY(!prev || prev == rec))
+ return DB_CORRUPTION;
+ else if (page_rec_is_infimum(prev));
+ else
+ for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++)
+ if (src[d] != sys[d])
+ break;
+ if (d > 6 && memcmp(dest, sys, d))
+ {
+ /* We save space by replacing a single record
+
+ WRITE,page_offset(dest),byte[13]
+
+ with two records:
+
+ MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes),
+ WRITE|0x80,0,byte[13-d]
+
+ The single WRITE record would be x+13 bytes long, with x>2.
+ The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the
+ second WRITE would be 1+1+13-d = 15-d bytes.
+
+ The total size is: x+13 versus x+4+15-d = x+19-d bytes.
+ To save space, we must have d>6, that is, the complete DB_TRX_ID and
+ the first byte(s) of DB_ROLL_PTR must match the previous record. */
+ memcpy(dest, src, d);
+ mtr->memmove(*block, page_offset(dest), page_offset(src), d);
+ dest+= d;
+ len-= d;
+ /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when
+ DB_TRX_ID refers to an active transaction.
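+ (A worked example of the trade-off above: with d=7, the MEMMOVE
+ record takes at most x+4 bytes and the remaining WRITE 15-7=8
+ bytes, or x+12 in total, beating the x+13 bytes of the single
+ WRITE; with d<=6 nothing would be saved, hence the d>6 condition.)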
*/ + ut_ad(len); + } + else + d= 0; + } + + if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */ + mtr->memcpy(*block, dest, sys + d, len); + + return DB_SUCCESS; +} + +/*************************************************************//** +See if there is enough place in the page modification log to log +an update-in-place. + +@retval false if out of space; IBUF_BITMAP_FREE will be reset +outside mtr if the page was recompressed +@retval true if enough place; + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is +a secondary index leaf page. This has to be done either within the +same mini-transaction, or by invoking ibuf_reset_free_bits() before +mtr_commit(mtr). */ +bool +btr_cur_update_alloc_zip_func( +/*==========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + page_cur_t* cursor, /*!< in/out: B-tree page cursor */ +#ifdef UNIV_DEBUG + rec_offs* offsets,/*!< in/out: offsets of the cursor record */ +#endif /* UNIV_DEBUG */ + ulint length, /*!< in: size needed */ + bool create, /*!< in: true=delete-and-insert, + false=update-in-place */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index = cursor->index; + + /* Have a local copy of the variables as these can change + dynamically. */ + const page_t* page = page_cur_get_page(cursor); + + ut_ad(page_zip == page_cur_get_page_zip(cursor)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return(true); + } + + if (!page_zip->m_nonempty && !page_has_garbage(page)) { + /* The page has been freshly compressed, so + reorganizing it will not help. */ + return(false); + } + + if (create && page_is_leaf(page) + && (length + page_get_data_size(page) + >= dict_index_zip_pad_optimal_page_size(index))) { + return(false); + } + + if (btr_page_reorganize(cursor, mtr) == DB_SUCCESS) { + rec_offs_make_valid(page_cur_get_rec(cursor), index, + page_is_leaf(page), offsets); + + /* After recompressing a page, we must make sure that the free + bits in the insert buffer bitmap will not exceed the free + space on the page. Because this function will not attempt + recompression unless page_zip_available() fails above, it is + safe to reset the free bits if page_zip_available() fails + again, below. The free bits can safely be reset in a separate + mini-transaction. If page_zip_available() succeeds below, we + can be sure that the btr_page_reorganize() above did not reduce + the free space available on the page. */ + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return true; + } + } + + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(page)) { + ibuf_reset_free_bits(page_cur_get_block(cursor)); + } + + return(false); +} + +/** Apply an update vector to a record. No field size changes are allowed. + +This is usually invoked on a clustered index. The only use case for a +secondary index is row_ins_sec_index_entry_by_modify() or its +counterpart in ibuf_insert_to_index_page(). 
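+For example, flipping the delete-mark flag or overwriting the
+fixed-length DB_TRX_ID,DB_ROLL_PTR columns qualifies, because every
+field keeps its stored length; updates that change a field length are
+performed elsewhere as a delete followed by an insert.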
+@param[in,out] rec index record +@param[in] index the index of the record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] update update vector +@param[in,out] block index page +@param[in,out] mtr mini-transaction */ +void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index, + const rec_offs *offsets, const upd_t *update, + buf_block_t *block, mtr_t *mtr) +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!index->table->skip_alter_undo); + ut_ad(!block->page.zip.data || index->table->not_redundant()); + +#ifdef UNIV_DEBUG + if (rec_offs_comp(offsets)) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_INSTANT: + ut_ad(index->is_instant()); + break; + case REC_STATUS_NODE_PTR: + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad("wrong record status in update" == 0); + } + } +#endif /* UNIV_DEBUG */ + + static_assert(REC_INFO_BITS_SHIFT == 0, "compatibility"); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + ut_ad(rec_offs_comp(offsets)); + byte* info_bits = &rec[-REC_NEW_INFO_BITS]; + const bool flip_del_mark = (*info_bits ^ update->info_bits) + & REC_INFO_DELETED_FLAG; + *info_bits &= byte(~REC_INFO_BITS_MASK); + *info_bits |= update->info_bits; + + if (flip_del_mark) { + page_zip_rec_set_deleted(block, rec, update->info_bits + & REC_INFO_DELETED_FLAG, mtr); + } + } else { + byte* info_bits = &rec[rec_offs_comp(offsets) + ? -REC_NEW_INFO_BITS + : -REC_OLD_INFO_BITS]; + + mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits, + (*info_bits + & ~REC_INFO_BITS_MASK) + | update->info_bits); + } + + for (ulint i = 0; i < update->n_fields; i++) { + const upd_field_t* uf = upd_get_nth_field(update, i); + if (upd_fld_is_virtual_col(uf) && !index->has_virtual()) { + continue; + } + const ulint n = uf->field_no; + + ut_ad(!dfield_is_ext(&uf->new_val) + == !rec_offs_nth_extern(offsets, n)); + ut_ad(!rec_offs_nth_default(offsets, n)); + + if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) { + if (rec_offs_nth_sql_null(offsets, n)) { + ut_ad(index->table->is_instant()); + ut_ad(n >= index->n_core_fields); + continue; + } + + ut_ad(!index->table->not_redundant()); + switch (ulint size = rec_get_nth_field_size(rec, n)) { + case 0: + break; + case 1: + mtr->write<1,mtr_t::MAYBE_NOP>( + *block, + rec_get_field_start_offs(rec, n) + rec, + 0U); + break; + default: + mtr->memset( + block, + page_offset(rec_get_field_start_offs( + rec, n) + rec), + size, 0); + } + ulint l = rec_get_1byte_offs_flag(rec) + ? (n + 1) : (n + 1) * 2; + byte* b = rec - REC_N_OLD_EXTRA_BYTES - l; + compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8 + == REC_2BYTE_SQL_NULL_MASK); + mtr->write<1>(*block, b, + byte(*b | REC_1BYTE_SQL_NULL_MASK)); + continue; + } + + ulint len; + byte* data = rec_get_nth_field(rec, offsets, n, &len); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + ut_ad(len == uf->new_val.len); + memcpy(data, uf->new_val.data, len); + continue; + } + + if (UNIV_UNLIKELY(len != uf->new_val.len)) { + ut_ad(len == UNIV_SQL_NULL); + ut_ad(!rec_offs_comp(offsets)); + len = uf->new_val.len; + ut_ad(len == rec_get_nth_field_size(rec, n)); + ulint l = rec_get_1byte_offs_flag(rec) + ? 
(n + 1) : (n + 1) * 2; + byte* b = rec - REC_N_OLD_EXTRA_BYTES - l; + compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8 + == REC_2BYTE_SQL_NULL_MASK); + mtr->write<1>(*block, b, + byte(*b & ~REC_1BYTE_SQL_NULL_MASK)); + } + + if (len) { + mtr->memcpy(*block, data, + uf->new_val.data, len); + } + } + + if (UNIV_LIKELY(!block->page.zip.data)) { + return; + } + + switch (update->n_fields) { + case 0: + /* We only changed the delete-mark flag. */ + return; + case 1: + if (!index->is_clust() + || update->fields[0].field_no != index->db_roll_ptr()) { + break; + } + goto update_sys; + case 2: + if (!index->is_clust() + || update->fields[0].field_no != index->db_trx_id() + || update->fields[1].field_no != index->db_roll_ptr()) { + break; + } + update_sys: + ulint len; + const byte* sys = rec_get_nth_field(rec, offsets, + index->db_trx_id(), &len); + ut_ad(len == DATA_TRX_ID_LEN); + page_zip_write_trx_id_and_roll_ptr( + block, rec, offsets, index->db_trx_id(), + trx_read_trx_id(sys), + trx_read_roll_ptr(sys + DATA_TRX_ID_LEN), mtr); + return; + } + + page_zip_write_rec(block, rec, index, offsets, 0, mtr); +} + +/** Check if a ROW_FORMAT=COMPRESSED page can be updated in place +@param cur cursor pointing to ROW_FORMAT=COMPRESSED page +@param offsets rec_get_offsets(btr_cur_get_rec(cur)) +@param update index fields being updated +@param mtr mini-transaction +@return the record in the ROW_FORMAT=COMPRESSED page +@retval nullptr if the page cannot be updated in place */ +ATTRIBUTE_COLD static +rec_t *btr_cur_update_in_place_zip_check(btr_cur_t *cur, rec_offs *offsets, + const upd_t& update, mtr_t *mtr) +{ + dict_index_t *index= cur->index(); + ut_ad(!index->table->is_temporary()); + + switch (update.n_fields) { + case 0: + /* We are only changing the delete-mark flag. */ + break; + case 1: + if (!index->is_clust() || + update.fields[0].field_no != index->db_roll_ptr()) + goto check_for_overflow; + /* We are only changing the delete-mark flag and DB_ROLL_PTR. */ + break; + case 2: + if (!index->is_clust() || + update.fields[0].field_no != index->db_trx_id() || + update.fields[1].field_no != index->db_roll_ptr()) + goto check_for_overflow; + /* We are only changing DB_TRX_ID, DB_ROLL_PTR, and the delete-mark. + They can be updated in place in the uncompressed part of the + ROW_FORMAT=COMPRESSED page. */ + break; + check_for_overflow: + default: + if (!btr_cur_update_alloc_zip(btr_cur_get_page_zip(cur), + btr_cur_get_page_cur(cur), + offsets, rec_offs_size(offsets), + false, mtr)) + return nullptr; + } + + return btr_cur_get_rec(cur); +} + +/*************************************************************//** +Updates a record when the update causes no size changes in its fields. +We assume here that the ordering fields of the record do not change. 
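+A typical qualifying update overwrites fixed-length columns, e.g. an
+INT column together with DB_TRX_ID and DB_ROLL_PTR, leaving the
+record size unchanged.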
+@return locking or undo log related error code, or +@retval DB_SUCCESS on success +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_update_in_place( +/*====================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + dberr_t err; + rec_t* rec; + roll_ptr_t roll_ptr = 0; + ulint was_delete_marked; + + ut_ad(page_is_leaf(cursor->page_cur.block->page.frame)); + rec = btr_cur_get_rec(cursor); + index = cursor->index(); + ut_ad(!index->is_ibuf()); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor))); + ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id); + ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG)); + + DBUG_LOG("ib_cur", + "update-in-place " << index->name << " (" << index->id + << ") by " << ib::hex(trx_id) << ": " + << rec_printer(rec, offsets).str()); + + buf_block_t* block = btr_cur_get_block(cursor); + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + /* Check that enough space is available on the compressed page. */ + if (UNIV_LIKELY_NULL(page_zip) + && !(rec = btr_cur_update_in_place_zip_check( + cursor, offsets, *update, mtr))) { + return DB_ZIP_OVERFLOW; + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + err = btr_cur_upd_rec_sys(block, rec, index, offsets, + thr_get_trx(thr), roll_ptr, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto func_exit; + } + } + + was_delete_marked = rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block))); + /* In delete-marked records, DB_TRX_ID must always refer to an + existing undo log record. */ + ut_ad(!was_delete_marked + || !dict_index_is_clust(index) + || row_get_rec_trx_id(rec, index, offsets)); + +#ifdef BTR_CUR_HASH_ADAPT + { + srw_spin_lock* ahi_latch = block->index + ? btr_search_sys.get_latch(*index) : NULL; + if (ahi_latch) { + /* TO DO: Can we skip this if none of the fields + index->search_info->curr_n_fields + are being updated? 
*/
+
+			/* The function row_upd_changes_ord_field_binary
+			does not work on a secondary index. */
+
+			if (!dict_index_is_clust(index)
+			    || row_upd_changes_ord_field_binary(
+				    index, update, thr, NULL, NULL)) {
+				ut_ad(!(update->info_bits
+					& REC_INFO_MIN_REC_FLAG));
+				/* Remove possible hash index pointer
+				to this record */
+				btr_search_update_hash_on_delete(cursor);
+			}
+
+			ahi_latch->wr_lock(SRW_LOCK_CALL);
+		}
+
+		assert_block_ahi_valid(block);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+		btr_cur_upd_rec_in_place(rec, index, offsets, update, block,
+					 mtr);
+
+#ifdef BTR_CUR_HASH_ADAPT
+		if (ahi_latch) {
+			ahi_latch->wr_unlock();
+		}
+	}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	if (was_delete_marked
+	    && !rec_get_deleted_flag(
+		    rec, page_is_comp(buf_block_get_frame(block)))) {
+		/* The new updated record owns its possible externally
+		stored fields */
+
+		btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr);
+	}
+
+	ut_ad(err == DB_SUCCESS);
+
+func_exit:
+	if (page_zip
+	    && !(flags & BTR_KEEP_IBUF_BITMAP)
+	    && !dict_index_is_clust(index)
+	    && page_is_leaf(buf_block_get_frame(block))) {
+		/* Update the free bits in the insert buffer. */
+		ut_ad(!index->table->is_temporary());
+		ibuf_update_free_bits_zip(block, mtr);
+	}
+
+	return(err);
+}
+
+/** Trim a metadata record during the rollback of instant ALTER TABLE.
+@param[in]	entry	metadata tuple
+@param[in]	index	primary key
+@param[in]	update	update vector for the rollback */
+ATTRIBUTE_COLD
+static void btr_cur_trim_alter_metadata(dtuple_t* entry,
+					const dict_index_t* index,
+					const upd_t* update)
+{
+	ut_ad(index->is_instant());
+	ut_ad(update->is_alter_metadata());
+	ut_ad(entry->is_alter_metadata());
+
+	ut_ad(update->fields[0].field_no == index->first_user_field());
+	ut_ad(update->fields[0].new_val.ext);
+	ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE);
+	ut_ad(entry->n_fields - 1 == index->n_fields);
+
+	const byte* ptr = static_cast<const byte*>(
+		update->fields[0].new_val.data);
+	ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN));
+	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4);
+	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA);
+	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
+	      == index->table->space->id);
+
+	ulint n_fields = update->fields[1].field_no;
+	ut_ad(n_fields <= index->n_fields);
+	if (n_fields != index->n_uniq) {
+		ut_ad(n_fields
+		      >= index->n_core_fields);
+		entry->n_fields = n_fields;
+		return;
+	}
+
+	/* This is based on dict_table_t::deserialise_columns()
+	and btr_cur_instant_init_low(). */
+	mtr_t mtr;
+	mtr.start();
+	buf_block_t* block = buf_page_get(
+		page_id_t(index->table->space->id,
+			  mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
+		0, RW_S_LATCH, &mtr);
+	if (!block) {
+		ut_ad("corruption" == 0);
+		mtr.commit();
+		return;
+	}
+	ut_ad(fil_page_get_type(block->page.frame) == FIL_PAGE_TYPE_BLOB);
+	ut_ad(mach_read_from_4(&block->page.frame
+			       [FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO])
+	      == FIL_NULL);
+	ut_ad(mach_read_from_4(&block->page.frame
+			       [FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN])
+	      == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4));
+	n_fields = mach_read_from_4(
+		&block->page.frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE])
+		+ index->first_user_field();
+	/* Rollback should not increase the number of fields. */
+	ut_ad(n_fields <= index->n_fields);
+	ut_ad(n_fields + 1 <= entry->n_fields);
+	/* dict_index_t::clear_instant_alter() cannot be invoked while
+	rollback of an instant ALTER TABLE transaction is in progress
+	for an is_alter_metadata() record.
*/ + ut_ad(n_fields >= index->n_core_fields); + + mtr.commit(); + entry->n_fields = n_fields + 1; +} + +/** Trim an update tuple due to instant ADD COLUMN, if needed. +For normal records, the trailing instantly added fields that match +the initial default values are omitted. + +For the special metadata record on a table on which instant +ADD COLUMN has already been executed, both ADD COLUMN and the +rollback of ADD COLUMN need to be handled specially. + +@param[in,out] entry index entry +@param[in] index index +@param[in] update update vector +@param[in] thr execution thread */ +static inline +void +btr_cur_trim( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + const que_thr_t* thr) +{ + if (!index->is_instant()) { + } else if (UNIV_UNLIKELY(update->is_metadata())) { + /* We are either updating a metadata record + (instant ALTER TABLE on a table where instant ALTER was + already executed) or rolling back such an operation. */ + ut_ad(!upd_get_nth_field(update, 0)->orig_len); + ut_ad(entry->is_metadata()); + + if (thr->graph->trx->in_rollback) { + /* This rollback can occur either as part of + ha_innobase::commit_inplace_alter_table() rolling + back after a failed innobase_add_instant_try(), + or as part of crash recovery. Either way, the + table will be in the data dictionary cache, with + the instantly added columns going to be removed + later in the rollback. */ + ut_ad(index->table->cached); + /* The DB_TRX_ID,DB_ROLL_PTR are always last, + and there should be some change to roll back. + The first field in the update vector is the + first instantly added column logged by + innobase_add_instant_try(). */ + ut_ad(update->n_fields > 2); + if (update->is_alter_metadata()) { + btr_cur_trim_alter_metadata( + entry, index, update); + return; + } + ut_ad(!entry->is_alter_metadata()); + + ulint n_fields = upd_get_nth_field(update, 0) + ->field_no; + ut_ad(n_fields + 1 >= entry->n_fields); + entry->n_fields = n_fields; + } + } else { + entry->trim(*index); + } +} + +/*************************************************************//** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. We assume here that the ordering +fields of the record do not change. 
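+
+A sketch of the typical caller pattern (illustrative only; the real
+caller row_upd_clust_rec() also re-positions the cursor and restarts
+the mini-transaction before retrying pessimistically):
+
+	err = btr_cur_optimistic_update(flags, cursor, &offsets, &heap,
+					update, cmpl_info, thr, trx_id, mtr);
+	switch (err) {
+	case DB_OVERFLOW:
+	case DB_UNDERFLOW:
+	case DB_ZIP_OVERFLOW:
+		err = retry_with_btr_cur_pessimistic_update();
+	default:
+		break;
+	}
+
+where retry_with_btr_cur_pessimistic_update() stands for the fallback
+path; the name is hypothetical.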
+@return error code, including +@retval DB_SUCCESS on success +@retval DB_OVERFLOW if the updated record does not fit +@retval DB_UNDERFLOW if the page would become too empty +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_optimistic_update( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */ + const upd_t* update, /*!< in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + page_cur_t* page_cursor; + dberr_t err; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + rec_t* rec; + ulint max_size; + ulint new_rec_size; + ulint old_rec_size; + ulint max_ins_size = 0; + dtuple_t* new_entry; + roll_ptr_t roll_ptr; + ulint i; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + rec = btr_cur_get_rec(cursor); + index = cursor->index(); + ut_ad(index->has_locking()); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + /* This is intended only for leaf page updates */ + ut_ad(page_is_leaf(page)); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_index_page_check(page)); + ut_ad(btr_page_get_index_id(page) == index->id); + + *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields, + ULINT_UNDEFINED, heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, *offsets) + || thr_get_trx(thr) == trx_roll_crash_recv_trx); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (UNIV_LIKELY(!update->is_metadata()) + && !row_upd_changes_field_size_or_external(index, *offsets, + update)) { + + /* The simplest and the most common case: the update does not + change the size of any field and none of the updated fields is + externally stored in rec or update, and there is enough space + on the compressed page to log the update. */ + + return(btr_cur_update_in_place( + flags, cursor, *offsets, update, + cmpl_info, thr, trx_id, mtr)); + } + + if (rec_offs_any_extern(*offsets)) { +any_extern: + ut_ad(!index->is_ibuf()); + /* Externally stored fields are treated in pessimistic + update */ + + /* prefetch siblings of the leaf for the pessimistic + operation. 
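+	The caller is expected to retry with
+	btr_cur_pessimistic_update(), which may have to merge or split
+	pages; issuing read requests for the siblings now hides part of
+	that I/O latency.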
*/ + btr_cur_prefetch_siblings(block, index); + + return(DB_OVERFLOW); + } + + if (rec_is_metadata(rec, *index) && index->table->instant) { + goto any_extern; + } + + for (i = 0; i < upd_get_n_fields(update); i++) { + if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) { + + goto any_extern; + } + } + + DBUG_LOG("ib_cur", + "update " << index->name << " (" << index->id << ") by " + << ib::hex(trx_id) << ": " + << rec_printer(rec, *offsets).str()); + + page_cursor = btr_cur_get_page_cur(cursor); + + if (!*heap) { + *heap = mem_heap_create( + rec_offs_size(*offsets) + + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets))); + } + + new_entry = row_rec_to_index_entry(rec, index, *offsets, *heap); + ut_ad(!dtuple_get_n_ext(new_entry)); + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. + Thus the following call is safe. */ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + *heap); + btr_cur_trim(new_entry, index, update, thr); + old_rec_size = rec_offs_size(*offsets); + new_rec_size = rec_get_converted_size(index, new_entry, 0); + + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_zip) { + ut_ad(!index->table->is_temporary()); + + if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page), + dict_index_get_n_fields(index), + block->zip_size())) { + goto any_extern; + } + + if (!btr_cur_update_alloc_zip( + page_zip, page_cursor, *offsets, + new_rec_size, true, mtr)) { + return(DB_ZIP_OVERFLOW); + } + + rec = page_cur_get_rec(page_cursor); + } + + /* We limit max record size to 16k even for 64k page size. */ + if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE || + (!dict_table_is_comp(index->table) + && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) { + err = DB_OVERFLOW; + goto func_exit; + } + + if (UNIV_UNLIKELY(new_rec_size + >= (page_get_free_space_of_empty(page_is_comp(page)) + / 2))) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + err = DB_OVERFLOW; + goto func_exit; + } + + if (UNIV_UNLIKELY(page_get_data_size(page) + - old_rec_size + new_rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + + /* The page would become too empty */ + err = DB_UNDERFLOW; + goto func_exit; + } + + /* We do not attempt to reorganize if the page is compressed. + This is because the page may fail to compress after reorganization. */ + max_size = page_zip + ? page_get_max_insert_size(page, 1) + : (old_rec_size + + page_get_max_insert_size_after_reorganize(page, 1)); + + if (!page_zip) { + max_ins_size = page_get_max_insert_size_after_reorganize( + page, 1); + } + + if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT) + && (max_size >= new_rec_size)) + || (page_get_n_recs(page) <= 1))) { + + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). 
*/ + + /* There was not enough space, or it did not pay to + reorganize: for simplicity, we decide what to do assuming a + reorganization is needed, though it might not be necessary */ + + err = DB_OVERFLOW; + goto func_exit; + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + /* Ok, we may do the replacement. Store on the page infimum the + explicit locks on rec, before deleting rec (see the comment in + btr_cur_pessimistic_update). */ + if (index->has_locking()) { + lock_rec_store_on_page_infimum(block, rec); + } + + if (UNIV_UNLIKELY(update->is_metadata())) { + ut_ad(new_entry->is_metadata()); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ADD COLUMN, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); + } + + page_cur_delete_rec(page_cursor, *offsets, mtr); + + if (!page_cur_move_to_prev(page_cursor)) { + return DB_CORRUPTION; + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); + } + + rec = btr_cur_insert_if_possible(cursor, new_entry, offsets, heap, + 0/*n_ext*/, mtr); + if (UNIV_UNLIKELY(!rec)) { + goto corrupted; + } + + if (UNIV_UNLIKELY(update->is_metadata())) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + } else { + /* Restore the old explicit lock state on the record */ + lock_rec_restore_from_page_infimum(*block, rec, + block->page.id()); + } + + ut_ad(err == DB_SUCCESS); + if (!page_cur_move_to_next(page_cursor)) { +corrupted: + err = DB_CORRUPTION; + } + +func_exit: + if (!(flags & BTR_KEEP_IBUF_BITMAP) + && !dict_index_is_clust(index)) { + /* Update the free bits in the insert buffer. */ + if (page_zip) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } else if (!index->table->is_temporary()) { + ibuf_update_free_bits_low(block, max_ins_size, mtr); + } + } + + if (err != DB_SUCCESS) { + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block, index); + } + + return(err); +} + +/*************************************************************//** +If, in a split, a new supremum record was created as the predecessor of the +updated record, the supremum record must inherit exactly the locks on the +updated record. In the split it may have inherited locks from the successor +of the updated record, which is not correct. This function restores the +right locks for the new supremum. 
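+
+An illustrative scenario (a sketch): the updated record r was moved by
+a page split and is now the first user record of page Q, whose left
+sibling is page P:
+
+	P:(infimum, ..., supremum)	Q:(infimum, r, ..., supremum)
+
+P's supremum is now the predecessor of r. During the split it may have
+inherited locks from r's old successor; below, the locks on the
+supremum are reset and re-inherited from r itself.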
*/ +static +dberr_t +btr_cur_pess_upd_restore_supremum( +/*==============================*/ + buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: updated record */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + + page = buf_block_get_frame(block); + + if (page_rec_get_next(page_get_infimum_rec(page)) != rec) { + /* Updated record is not the first user record on its page */ + return DB_SUCCESS; + } + + const uint32_t prev_page_no = btr_page_get_prev(page); + + const page_id_t block_id{block->page.id()}; + const page_id_t prev_id(block_id.space(), prev_page_no); + dberr_t err; + buf_block_t* prev_block + = buf_page_get_gen(prev_id, 0, RW_NO_LATCH, nullptr, + BUF_PEEK_IF_IN_POOL, mtr, &err); + /* Since we already held an x-latch on prev_block, it must + be available and not be corrupted unless the buffer pool got + corrupted somehow. */ + if (UNIV_UNLIKELY(!prev_block)) { + return err; + } + ut_ad(!memcmp_aligned<4>(prev_block->page.frame + FIL_PAGE_NEXT, + block->page.frame + FIL_PAGE_OFFSET, 4)); + + /* We must already have an x-latch on prev_block! */ + ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX)); + + lock_rec_reset_and_inherit_gap_locks(*prev_block, block_id, + PAGE_HEAP_NO_SUPREMUM, + page_rec_get_heap_no(rec)); + return DB_SUCCESS; +} + +/*************************************************************//** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. We assume +here that the ordering fields of the record do not change. +@return DB_SUCCESS or error code */ +dberr_t +btr_cur_pessimistic_update( +/*=======================*/ + ulint flags, /*!< in: undo logging, locking, and rollback + flags */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap + that can be emptied */ + mem_heap_t* entry_heap, + /*!< in/out: memory heap for allocating + big_rec and the index tuple */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + upd_t* update, /*!< in/out: update vector; this is allowed to + also contain trx id and roll ptr fields. + Non-updated columns that are moved offpage will + be appended to this. 
*/ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; must be + committed before latching any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + big_rec_t* dummy_big_rec; + dict_index_t* index; + buf_block_t* block; + page_zip_des_t* page_zip; + rec_t* rec; + page_cur_t* page_cursor; + dberr_t err; + dberr_t optim_err; + roll_ptr_t roll_ptr; + bool was_first; + uint32_t n_reserved = 0; + + *offsets = NULL; + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page_zip = buf_block_get_page_zip(block); + index = cursor->index(); + ut_ad(index->has_locking()); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | + MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + ut_ad(!page_zip || !index->table->is_temporary()); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~BTR_KEEP_POS_FLAG)) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + + err = optim_err = btr_cur_optimistic_update( + flags | BTR_KEEP_IBUF_BITMAP, + cursor, offsets, offsets_heap, update, + cmpl_info, thr, trx_id, mtr); + + switch (err) { + case DB_ZIP_OVERFLOW: + case DB_UNDERFLOW: + case DB_OVERFLOW: + break; + default: + err_exit: + /* We suppressed this with BTR_KEEP_IBUF_BITMAP. + For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were + already reset by btr_cur_update_alloc_zip() if the + page was recompressed. */ + if (page_zip + && optim_err != DB_ZIP_OVERFLOW + && !dict_index_is_clust(index) + && page_is_leaf(block->page.frame)) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } + + if (big_rec_vec != NULL) { + dtuple_big_rec_free(big_rec_vec); + } + + return(err); + } + + rec = btr_cur_get_rec(cursor); + ut_ad(rec_offs_validate(rec, index, *offsets)); + + dtuple_t* new_entry; + + const bool is_metadata = rec_is_metadata(rec, *index); + + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(update->is_metadata()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(index->is_instant()); + new_entry = row_metadata_to_tuple( + rec, index, *offsets, entry_heap, + update->info_bits, !thr_get_trx(thr)->in_rollback); + ut_ad(new_entry->n_fields + == ulint(index->n_fields) + + update->is_alter_metadata()); + } else { + new_entry = row_rec_to_index_entry(rec, index, *offsets, + entry_heap); + } + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. If the + clustered index record is delete-marked, then its externally + stored fields cannot have been purged yet, because then the + purge would also have removed the clustered index record + itself. Thus the following call is safe. 
*/ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + entry_heap); + btr_cur_trim(new_entry, index, update, thr); + + /* We have to set appropriate extern storage bits in the new + record to be inserted: we have to remember which fields were such */ + + ut_ad(!page_is_comp(block->page.frame) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_validate(rec, index, *offsets)); + + if ((flags & BTR_NO_UNDO_LOG_FLAG) + && rec_offs_any_extern(*offsets)) { + /* We are in a transaction rollback undoing a row + update: we must free possible externally stored fields + which got new values in the update, if they are not + inherited values. They can be inherited if we have + updated the primary key to another value, and then + update it back again. */ + + ut_ad(big_rec_vec == NULL); + ut_ad(dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->in_rollback); + + DEBUG_SYNC_C("blob_rollback_middle"); + + btr_rec_free_updated_extern_fields( + index, rec, block, *offsets, update, true, mtr); + } + + ulint n_ext = index->is_primary() ? dtuple_get_n_ext(new_entry) : 0; + + if (page_zip_rec_needs_ext( + rec_get_converted_size(index, new_entry, n_ext), + page_is_comp(block->page.frame), + dict_index_get_n_fields(index), + block->zip_size()) + || (UNIV_UNLIKELY(update->is_alter_metadata()) + && !dfield_is_ext(dtuple_get_nth_field( + new_entry, + index->first_user_field())))) { + big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext); + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + /* We cannot goto return_after_reservations, + because we may need to update the + IBUF_BITMAP_FREE bits, which was suppressed by + BTR_KEEP_IBUF_BITMAP. */ +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, + index)); +#endif /* UNIV_ZIP_DEBUG */ + index->table->space->release_free_extents(n_reserved); + err = DB_TOO_BIG_RECORD; + goto err_exit; + } + + ut_ad(page_is_leaf(block->page.frame)); + ut_ad(dict_index_is_clust(index)); + if (UNIV_UNLIKELY(!(flags & BTR_KEEP_POS_FLAG))) { + ut_ad(page_zip != NULL); + dtuple_convert_back_big_rec(index, new_entry, + big_rec_vec); + big_rec_vec = NULL; + n_ext = dtuple_get_n_ext(new_entry); + } + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + goto err_exit; + } + + if (optim_err == DB_OVERFLOW) { + /* First reserve enough free space for the file segments + of the index tree, so that the update will not fail because + of lack of space */ + + err = fsp_reserve_free_extents( + &n_reserved, index->table->space, + uint32_t(cursor->tree_height / 16 + 3), + flags & BTR_NO_UNDO_LOG_FLAG + ? FSP_CLEANING : FSP_NORMAL, + mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + err = DB_OUT_OF_FILE_SPACE; + goto err_exit; + } + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); + } + + const ulint max_ins_size = page_zip + ? 0 + : page_get_max_insert_size_after_reorganize(block->page.frame, + 1); + + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(new_entry->is_metadata()); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ALTER TABLE, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); + + /* Store state of explicit locks on rec on the page + infimum record, before deleting rec. 
The page infimum + acts as a dummy carrier of the locks, taking care also + of lock releases, before we can move the locks back on + the actual record. There is a special case: if we are + inserting on the root page and the insert causes a + call of btr_root_raise_and_insert. Therefore we cannot + in the lock system delete the lock structs set on the + root page even if the root page carries just node + pointers. */ + lock_rec_store_on_page_infimum(block, rec); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_delete_rec(page_cursor, *offsets, mtr); + + if (!page_cur_move_to_prev(page_cursor)) { + err = DB_CORRUPTION; + goto return_after_reservations; + } + + rec = btr_cur_insert_if_possible(cursor, new_entry, + offsets, offsets_heap, n_ext, mtr); + + if (rec) { + page_cursor->rec = rec; + + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS) { + goto return_after_reservations; + } + rec = page_cursor->rec; + rec_offs_make_valid(rec, index, true, *offsets); + if (page_cursor->block->page.id().page_no() + == index->page) { + btr_set_instant(page_cursor->block, *index, + mtr); + } + } else { + lock_rec_restore_from_page_infimum( + *btr_cur_get_block(cursor), rec, + block->page.id()); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets)) + || rec_is_alter_metadata(rec, *index)) { + /* The new inserted record owns its possible externally + stored fields */ + btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), + rec, index, *offsets, mtr); + } else { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + } + + bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG); + ut_ad(!adjust || page_is_leaf(block->page.frame)); + + if (btr_cur_compress_if_useful(cursor, adjust, mtr)) { + if (adjust) { + rec_offs_make_valid(page_cursor->rec, index, + true, *offsets); + } + } else if (!dict_index_is_clust(index) + && page_is_leaf(block->page.frame)) { + /* Update the free bits in the insert buffer. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. */ + if (page_zip) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } else if (!index->table->is_temporary()) { + ibuf_update_free_bits_low(block, max_ins_size, + mtr); + } + } + +#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled + if (!big_rec_vec + && page_is_leaf(block->page.frame) + && !dict_index_is_online_ddl(index)) { + mtr->release(index->lock); + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } +#endif + + err = DB_SUCCESS; + goto return_after_reservations; + } else { + /* If the page is compressed and it initially + compresses very well, and there is a subsequent insert + of a badly-compressing record, it is possible for + btr_cur_optimistic_update() to return DB_UNDERFLOW and + btr_cur_insert_if_possible() to return FALSE. */ + ut_a(page_zip || optim_err != DB_UNDERFLOW); + + /* Out of space: reset the free bits. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. 
*/ + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(block->page.frame)) { + ibuf_reset_free_bits(block); + } + } + + if (big_rec_vec != NULL) { + ut_ad(page_is_leaf(block->page.frame)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + + /* btr_page_split_and_insert() in + btr_cur_pessimistic_insert() invokes + mtr->release(index->lock). + We must keep the index->lock when we created a + big_rec, so that row_upd_clust_rec() can store the + big_rec in the same mini-transaction. */ + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + mtr_sx_lock_index(index, mtr); + } + + /* Was the record to be updated positioned as the first user + record on its page? */ + was_first = page_cur_is_before_first(page_cursor); + + /* Lock checks and undo logging were already performed by + btr_cur_upd_lock_and_undo(). We do not try + btr_cur_optimistic_insert() because + btr_cur_insert_if_possible() already failed above. */ + + err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG, + cursor, offsets, offsets_heap, + new_entry, &rec, + &dummy_big_rec, n_ext, NULL, mtr); + ut_a(err == DB_SUCCESS); + ut_a(rec); + ut_a(dummy_big_rec == NULL); + ut_ad(rec_offs_validate(rec, cursor->index(), *offsets)); + page_cursor->rec = rec; + + /* Multiple transactions cannot simultaneously operate on the + same temp-table in parallel. + max_trx_id is ignored for temp tables because it not required + for MVCC. */ + if (dict_index_is_sec_or_ibuf(index) + && !index->table->is_temporary()) { + /* Update PAGE_MAX_TRX_ID in the index page header. + It was not updated by btr_cur_pessimistic_insert() + because of BTR_NO_LOCKING_FLAG. */ + page_update_max_trx_id(btr_cur_get_block(cursor), + btr_cur_get_page_zip(cursor), + trx_id, mtr); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* The new inserted record owns its possible externally + stored fields */ +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec, + index, *offsets, mtr); + } else { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + } + + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS) { + goto return_after_reservations; + } + rec = page_cursor->rec; + } else { + lock_rec_restore_from_page_infimum( + *btr_cur_get_block(cursor), rec, block->page.id()); + } + + /* If necessary, restore also the correct lock state for a new, + preceding supremum record created in a page split. While the old + record was nonexistent, the supremum might have inherited its locks + from a wrong record. 
*/
+
+	if (!was_first) {
+		err = btr_cur_pess_upd_restore_supremum(
+			btr_cur_get_block(cursor), rec, mtr);
+	}
+
+return_after_reservations:
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(err ||
+	     !page_zip || page_zip_validate(btr_cur_get_page_zip(cursor),
+					    btr_cur_get_page(cursor), index));
+#endif /* UNIV_ZIP_DEBUG */
+
+	index->table->space->release_free_extents(n_reserved);
+	*big_rec = big_rec_vec;
+	return(err);
+}
+
+/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
+
+/** Modify the delete-mark flag of a record.
+@tparam	flag	the value of the delete-mark flag
+@param[in,out]	block	buffer block
+@param[in,out]	rec	record on a physical index page
+@param[in,out]	mtr	mini-transaction */
+template<bool flag>
+void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
+{
+  if (page_rec_is_comp(rec))
+  {
+    byte *b= &rec[-REC_NEW_INFO_BITS];
+    const byte v= flag
+      ? (*b | REC_INFO_DELETED_FLAG)
+      : (*b & byte(~REC_INFO_DELETED_FLAG));
+    if (*b == v);
+    else if (UNIV_LIKELY_NULL(block->page.zip.data))
+    {
+      *b= v;
+      page_zip_rec_set_deleted(block, rec, flag, mtr);
+    }
+    else
+      mtr->write<1>(*block, b, v);
+  }
+  else
+  {
+    ut_ad(!block->page.zip.data);
+    byte *b= &rec[-REC_OLD_INFO_BITS];
+    const byte v = flag
+      ? (*b | REC_INFO_DELETED_FLAG)
+      : (*b & byte(~REC_INFO_DELETED_FLAG));
+    mtr->write<1,mtr_t::MAYBE_NOP>(*block, b, v);
+  }
+}
+
+template void btr_rec_set_deleted<false>(buf_block_t *, rec_t *, mtr_t *);
+template void btr_rec_set_deleted<true>(buf_block_t *, rec_t *, mtr_t *);
+
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+dberr_t
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+	buf_block_t*	block,	/*!< in/out: buffer block of the record */
+	rec_t*		rec,	/*!< in/out: record */
+	dict_index_t*	index,	/*!< in: clustered index of the record */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec) */
+	que_thr_t*	thr,	/*!< in: query thread */
+	const dtuple_t*	entry,	/*!< in: dtuple for the deleting record, also
+				contains the virtual cols if there are any */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	roll_ptr_t	roll_ptr;
+	dberr_t		err;
+	trx_t*		trx;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+	ut_ad(buf_block_get_frame(block) == page_align(rec));
+	ut_ad(page_rec_is_leaf(rec));
+	ut_ad(mtr->is_named_space(index->table->space));
+
+	if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+		/* We may already have delete-marked this record
+		when executing an ON DELETE CASCADE operation. */
+		ut_ad(row_get_rec_trx_id(rec, index, offsets)
+		      == thr_get_trx(thr)->id);
+		return(DB_SUCCESS);
+	}
+
+	err = trx_undo_report_row_operation(thr, index,
+					    entry, NULL, 0, rec, offsets,
+					    &roll_ptr);
+	if (err != DB_SUCCESS) {
+
+		return(err);
+	}
+
+	/* The search latch is not needed here, because
+	the adaptive hash index does not depend on the delete-mark
+	and the delete-mark is being updated in place.
*/
+
+	btr_rec_set_deleted<true>(block, rec, mtr);
+
+	trx = thr_get_trx(thr);
+
+	DBUG_LOG("ib_cur",
+		 "delete-mark clust " << index->table->name
+		 << " (" << index->id << ") by "
+		 << ib::hex(trx->id) << ": "
+		 << rec_printer(rec, offsets).str());
+
+	return btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr,
+				   mtr);
+}
+
+/*==================== B-TREE RECORD REMOVE =========================*/
+
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return whether compression occurred */
+bool
+btr_cur_compress_if_useful(
+/*=======================*/
+	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to compress;
+				cursor does not stay valid if !adjust and
+				compression occurs */
+	bool		adjust,	/*!< in: whether the cursor position should be
+				adjusted even when compression occurs */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
+					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+					 MTR_MEMO_PAGE_X_FIX));
+
+	if (cursor->index()->is_spatial()) {
+		const trx_t*	trx = cursor->rtr_info->thr
+			? thr_get_trx(cursor->rtr_info->thr)
+			: NULL;
+		const buf_block_t*	block = btr_cur_get_block(cursor);
+
+		/* Check whether page lock prevents the compression */
+		if (!lock_test_prdt_page_lock(trx, block->page.id())) {
+			return(false);
+		}
+	}
+
+	return btr_cur_compress_recommendation(cursor, mtr)
+		&& btr_compress(cursor, adjust, mtr) == DB_SUCCESS;
+}
+
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned on a leaf page.
+It is assumed that the mtr has an x-latch on the page where the cursor is
+positioned, but no latch on the whole tree.
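+
+An illustrative caller sketch (real callers, such as the purge
+subsystem, differ in details):
+
+	if (btr_cur_optimistic_delete(cursor, 0, mtr) == DB_FAIL) {
+		mtr_commit(mtr);
+		fall_back_to_btr_cur_pessimistic_delete();
+	}
+
+where the fallback helper name is hypothetical: the pessimistic path
+must re-latch the tree and re-position the cursor first.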
+@return error code +@retval DB_FAIL if the page would become too empty */ +dberr_t +btr_cur_optimistic_delete( +/*======================*/ + btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to + delete; cursor stays valid: if deletion + succeeds, on function exit it points to the + successor of the deleted record */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + mtr_t* mtr) /*!< in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ +{ + buf_block_t* block; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->is_named_space(cursor->index()->table->space)); + ut_ad(!cursor->index()->is_dummy); + + /* This is intended only for leaf page deletions */ + + block = btr_cur_get_block(cursor); + + ut_ad(block->page.id().space() == cursor->index()->table->space->id); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + ut_ad(!dict_index_is_online_ddl(cursor->index()) + || cursor->index()->is_clust() + || (flags & BTR_CREATE_FLAG)); + + rec = btr_cur_get_rec(cursor); + + offsets = rec_get_offsets(rec, cursor->index(), offsets, + cursor->index()->n_core_fields, + ULINT_UNDEFINED, &heap); + + dberr_t err = DB_SUCCESS; + if (rec_offs_any_extern(offsets) + || !btr_cur_can_delete_without_compress(cursor, + rec_offs_size(offsets), + mtr)) { + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block, cursor->index()); + err = DB_FAIL; + goto func_exit; + } + + if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index()->page + && page_get_n_recs(block->page.frame) == 1 + + (cursor->index()->is_instant() + && !rec_is_metadata(rec, *cursor->index())) + && !cursor->index() + ->must_avoid_clear_instant_add())) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists for instant ADD COLUMN (not generic ALTER TABLE). + If we are deleting the metadata record and the + table becomes empty, clean up the whole page. */ + dict_index_t* index = cursor->index(); + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(block->page.frame)); + if (UNIV_UNLIKELY(!first_rec)) { + err = DB_CORRUPTION; + goto func_exit; + } + ut_ad(!index->is_instant() + || rec_is_metadata(first_rec, *index)); + const bool is_metadata = rec_is_metadata(rec, *index); + /* We can remove the metadata when rolling back an + instant ALTER TABLE operation, or when deleting the + last user record on the page such that only metadata for + instant ADD COLUMN (not generic ALTER TABLE) remains. */ + const bool empty_table = is_metadata + || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index)); + if (UNIV_LIKELY(empty_table)) { + if (UNIV_LIKELY(!is_metadata && !flags)) { + lock_update_delete(block, rec); + } + btr_page_empty(block, buf_block_get_page_zip(block), + index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! 
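+				The page was emptied above, so no
+				record refers to the metadata any
+				more, and the in-memory index can
+				revert to the non-instant format.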
*/ + index->clear_instant_alter(); + } + + page_cur_set_after_last(block, + btr_cur_get_page_cur(cursor)); + goto func_exit; + } + } + + { + page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + + if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG)) { + /* This should be rolling back instant ADD COLUMN. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(cursor->index()->table->supports_instant()); + ut_ad(cursor->index()->is_primary()); + ut_ad(!page_zip); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would have too many fields, and we would be + unable to know the size of the freed record. */ + err = btr_page_reorganize(btr_cur_get_page_cur(cursor), + mtr); + goto func_exit; + } else { + if (!flags) { + lock_update_delete(block, rec); + } + + btr_search_update_hash_on_delete(cursor); + } + + if (page_zip) { +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, + cursor->index())); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, + cursor->index())); +#endif /* UNIV_ZIP_DEBUG */ + + /* On compressed pages, the IBUF_BITMAP_FREE + space is not affected by deleting (purging) + records, because it is defined as the minimum + of space available *without* reorganize, and + space available in the modification log. */ + } else { + const ulint max_ins + = page_get_max_insert_size_after_reorganize( + page, 1); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + + /* The change buffer does not handle inserts + into non-leaf pages, into clustered indexes, + or into the change buffer. */ + if (!cursor->index()->is_clust() + && !cursor->index()->table->is_temporary() + && !dict_index_is_ibuf(cursor->index())) { + ibuf_update_free_bits_low(block, max_ins, mtr); + } + } + } + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return err; +} + +/*************************************************************//** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. +@return TRUE if compression occurred and FALSE if not or something +wrong. */ +ibool +btr_cur_pessimistic_delete( +/*=======================*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + the latter may occur because we may have + to update node pointers on upper levels, + and in the case of variable length keys + these may actually grow in size */ + ibool has_reserved_extents, /*!< in: TRUE if the + caller has already reserved enough free + extents so that he knows that the operation + will succeed */ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + if compression does not occur, the cursor + stays valid: it points to successor of + deleted record on function exit */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + bool rollback,/*!< in: performing rollback? 
*/ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + dict_index_t* index; + rec_t* rec; + uint32_t n_reserved = 0; + ibool ret = FALSE; + mem_heap_t* heap; + rec_offs* offsets; +#ifdef UNIV_DEBUG + bool parent_latched = false; +#endif /* UNIV_DEBUG */ + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = btr_cur_get_index(cursor); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->is_named_space(index->table->space)); + ut_ad(!index->is_dummy); + ut_ad(block->page.id().space() == index->table->space->id); + + if (!has_reserved_extents) { + /* First reserve enough free space for the file segments + of the index tree, so that the node pointer updates will + not fail because of lack of space */ + + uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1); + + *err = fsp_reserve_free_extents(&n_reserved, + index->table->space, + n_extents, + FSP_CLEANING, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return(FALSE); + } + } + + heap = mem_heap_create(1024); + rec = btr_cur_get_rec(cursor); + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + + if (rec_offs_any_extern(offsets)) { + btr_rec_free_externally_stored_fields(index, + rec, offsets, block, + rollback, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + } + + rec_t* next_rec = NULL; + bool min_mark_next_rec = false; + + if (page_is_leaf(page)) { + const bool is_metadata = rec_is_metadata( + rec, page_rec_is_comp(rec)); + if (UNIV_UNLIKELY(is_metadata)) { + /* This should be rolling back instant ALTER TABLE. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(rollback); + ut_ad(index->table->supports_instant()); + ut_ad(index->is_primary()); + } else if (flags == 0) { + lock_update_delete(block, rec); + } + + if (block->page.id().page_no() != index->page) { + if (page_get_n_recs(page) < 2) { + goto discard_page; + } + } else if (page_get_n_recs(page) == 1 + + (index->is_instant() && !is_metadata) + && !index->must_avoid_clear_instant_add()) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists for instant ADD COLUMN + (not generic ALTER TABLE). + If we are deleting the metadata record + (in the rollback of instant ALTER TABLE) and the + table becomes empty, clean up the whole page. */ + + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(page)); + if (UNIV_UNLIKELY(!first_rec)) { + *err = DB_CORRUPTION; + goto err_exit; + } + ut_ad(!index->is_instant() + || rec_is_metadata(first_rec, *index)); + if (is_metadata || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index))) { + btr_page_empty(block, page_zip, index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! 
*/ + index->clear_instant_alter(); + } + + page_cur_set_after_last( + block, + btr_cur_get_page_cur(cursor)); + ret = TRUE; + goto return_after_reservations; + } + } + + if (UNIV_LIKELY(!is_metadata)) { + btr_search_update_hash_on_delete(cursor); + } else { + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would carry too many fields, and we would be + unable to know the size of the freed record. */ + *err = btr_page_reorganize(btr_cur_get_page_cur(cursor), + mtr); + ut_ad(!ret); + goto err_exit; + } + } else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) { + if (page_rec_is_last(rec, page)) { +discard_page: + ut_ad(page_get_n_recs(page) == 1); + /* If there is only one record, drop + the whole page. */ + + btr_discard_page(cursor, mtr); + + ret = TRUE; + goto return_after_reservations; + } + + if (UNIV_UNLIKELY(!(next_rec = page_rec_get_next(rec)))) { + ut_ad(!ret); + *err = DB_CORRUPTION; + goto err_exit; + } + + btr_cur_t cursor; + cursor.page_cur.index = index; + cursor.page_cur.block = block; + + if (!page_has_prev(page)) { + /* If we delete the leftmost node pointer on a + non-leaf level, we must mark the new leftmost node + pointer as the predefined minimum record */ + + min_mark_next_rec = true; + } else if (index->is_spatial()) { + /* For rtree, if delete the leftmost node pointer, + we need to update parent page. */ + rtr_mbr_t father_mbr; + rec_t* father_rec; + rec_offs* offsets; + ulint len; + + rtr_page_get_father_block(NULL, heap, mtr, NULL, + &cursor); + father_rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(father_rec, index, NULL, + 0, ULINT_UNDEFINED, &heap); + + rtr_read_mbr(rec_get_nth_field( + father_rec, offsets, 0, &len), &father_mbr); + + rtr_update_mbr_field(&cursor, offsets, NULL, + page, &father_mbr, next_rec, mtr); + ut_d(parent_latched = true); + } else { + /* Otherwise, if we delete the leftmost node pointer + on a page, we have to change the parent node pointer + so that it is equal to the new leftmost node pointer + on the page */ + ret = btr_page_get_father(mtr, &cursor); + if (!ret) { + *err = DB_CORRUPTION; + goto err_exit; + } + *err = btr_cur_node_ptr_delete(&cursor, mtr); + if (*err != DB_SUCCESS) { +got_err: + ret = FALSE; + goto err_exit; + } + + const ulint level = btr_page_get_level(page); + // FIXME: reuse the node_ptr from above + dtuple_t* node_ptr = dict_index_build_node_ptr( + index, next_rec, block->page.id().page_no(), + heap, level); + + *err = btr_insert_on_non_leaf_level( + flags, index, level + 1, node_ptr, mtr); + if (*err != DB_SUCCESS) { + ret = FALSE; + goto got_err; + } + + ut_d(parent_latched = true); + } + } + + /* SPATIAL INDEX never use U locks; we can allow page merges + while holding X lock on the spatial index tree. + Do not allow merges of non-leaf B-tree pages unless it is + safe to do so. 
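+	Here, "safe" means that btr_cur_will_modify_tree() confirms
+	that a tree-structure change was foreseen when the latches were
+	acquired; if not, the merge is skipped with a warning (see the
+	MDEV-14637 reference below).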
*/ + { + const bool allow_merge = page_is_leaf(page) + || dict_index_is_spatial(index) + || btr_cur_will_modify_tree( + index, page, BTR_INTENTION_DELETE, rec, + btr_node_ptr_max_size(index), + block->zip_size(), mtr); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + + if (min_mark_next_rec) { + btr_set_min_rec_mark(next_rec, *block, mtr); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_ad(!parent_latched + || btr_check_node_ptr(index, block, mtr)); + + if (!ret && btr_cur_compress_recommendation(cursor, mtr)) { + if (UNIV_LIKELY(allow_merge)) { + ret = btr_cur_compress_if_useful( + cursor, FALSE, mtr); + } else { + ib::warn() << "Not merging page " + << block->page.id() + << " in index " << index->name + << " of " << index->table->name; + ut_ad("MDEV-14637" == 0); + } + } + } + +return_after_reservations: + *err = DB_SUCCESS; +err_exit: + mem_heap_free(heap); + +#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled + if (page_is_leaf(page) + && !dict_index_is_online_ddl(index)) { + mtr->release(index->lock); + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } +#endif + + index->table->space->release_free_extents(n_reserved); + return(ret); +} + +/** Delete the node pointer in a parent page. +@param[in,out] parent cursor pointing to parent record +@param[in,out] mtr mini-transaction */ +dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent), + MTR_MEMO_PAGE_X_FIX)); + dberr_t err; + ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent, + BTR_CREATE_FLAG, false, + mtr); + if (err == DB_SUCCESS && !compressed) { + btr_cur_compress_if_useful(parent, FALSE, mtr); + } + + return err; +} + +/** Represents the cursor for the number of rows estimation. The +content is used for level-by-level diving and estimation the number of rows +on each level. */ +class btr_est_cur_t +{ + /* Assume a page like: + records: (inf, a, b, c, d, sup) + index of the record: 0, 1, 2, 3, 4, 5 + */ + + /** Index of the record where the page cursor stopped on this level + (index in alphabetical order). In the above example, if the search stopped on + record 'c', then nth_rec will be 3. */ + ulint m_nth_rec; + + /** Number of the records on the page, not counting inf and sup. + In the above example n_recs will be 4. 
*/ + ulint m_n_recs; + + /** Search tuple */ + const dtuple_t &m_tuple; + /** Cursor search mode */ + page_cur_mode_t m_mode; + /** Page cursor which is used for search */ + page_cur_t m_page_cur; + /** Page id of the page to get on level down, can differ from + m_block->page.id at the moment when the child's page id is already found, but + the child's block has not fetched yet */ + page_id_t m_page_id; + /** Current block */ + buf_block_t *m_block; + /** Page search mode, can differ from m_mode for non-leaf pages, see c-tor + comments for details */ + page_cur_mode_t m_page_mode; + + /** Matched fields and bytes which are used for on-page search, see + btr_cur_t::(up|low)_(match|bytes) comments for details */ + ulint m_up_match= 0; + ulint m_up_bytes= 0; + ulint m_low_match= 0; + ulint m_low_bytes= 0; + +public: + btr_est_cur_t(dict_index_t *index, const dtuple_t &tuple, + page_cur_mode_t mode) + : m_tuple(tuple), m_mode(mode), + m_page_id(index->table->space_id, index->page), m_block(nullptr) + { + + ut_ad(dict_index_check_search_tuple(index, &tuple)); + ut_ad(dtuple_check_typed(&tuple)); + + m_page_cur.index = index; + /* We use these modified search modes on non-leaf levels of the B-tree. + These let us end up in the right B-tree leaf. In that leaf we use the + original search mode. */ + switch (mode) { + case PAGE_CUR_GE: + m_page_mode= PAGE_CUR_L; + break; + case PAGE_CUR_G: + m_page_mode= PAGE_CUR_LE; + break; + default: +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE || + mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + m_page_mode= mode; + break; + } + } + + /** Retrieve block with m_page_id, release the previously gotten block + if necessary. If this is a left border block cursor and both left and right + border blocks have the same parent, don't unlatch the parent, as it must be + latched to get the right block, and will be unlatched after the right block + is fetched. + @param level distance from the leaf page level; ULINT_UNDEFINED when + fetching the root page + @param mtr mtr + @param right_parent right border block parent, nullptr if the function + is called for the right block itself + @return true on success or false otherwise. */ + bool fetch_child(ulint level, mtr_t &mtr, const buf_block_t *right_parent) + { + buf_block_t *parent_block= m_block; + + m_block= btr_block_get(*index(), m_page_id.page_no(), RW_S_LATCH, !level, + &mtr, nullptr); + if (!m_block) + return false; + + if (parent_block && parent_block != right_parent) + { + ut_ad(mtr.get_savepoint() >= 2); + mtr.rollback_to_savepoint(1, 2); + } + + return level == ULINT_UNDEFINED || + btr_page_get_level(m_block->page.frame) == level; + } + + /** Sets page mode for leaves */ + void set_page_mode_for_leaves() { m_page_mode= m_mode; } + + /** Does search on the current page. If there is no border in m_tuple, then + just move the cursor to the most left or right record. + @param level current level on tree. + @param root_height root height + @param left true if this is left border, false otherwise. + @return true on success, false otherwise. 
*/ + bool search_on_page(ulint level, ulint root_height, bool left) + { + if (level != btr_page_get_level(m_block->page.frame)) + return false; + + m_n_recs= page_get_n_recs(m_block->page.frame); + + if (dtuple_get_n_fields(&m_tuple) > 0) + { + m_up_bytes= m_low_bytes= 0; + m_page_cur.block= m_block; + if (page_cur_search_with_match(&m_tuple, m_page_mode, + &m_up_match, &m_low_match, &m_page_cur, + nullptr)) + return false; + m_nth_rec= page_rec_get_n_recs_before(page_cur_get_rec(&m_page_cur)); + } + else if (left) + { + page_cur_set_before_first(m_block, &m_page_cur); + if (level) + { + if (!page_cur_move_to_next(&m_page_cur)) + return false; + m_nth_rec= 1; + } + else + m_nth_rec= 0; + } + else + { + m_nth_rec= m_n_recs; + if (!level) + { + page_cur_set_after_last(m_block, &m_page_cur); + ++m_nth_rec; + } + else + { + m_page_cur.block= m_block; + m_page_cur.rec= page_rec_get_nth(m_block->page.frame, m_nth_rec); + } + } + + return true; + } + + /** Read page id of the current record child. + @param offsets offsets array. + @param heap heap for offsets array */ + void read_child_page_id(rec_offs **offsets, mem_heap_t **heap) + { + const rec_t *node_ptr= page_cur_get_rec(&m_page_cur); + + /* FIXME: get the child page number directly without computing offsets */ + *offsets= rec_get_offsets(node_ptr, index(), *offsets, 0, ULINT_UNDEFINED, + heap); + + /* Go to the child node */ + m_page_id.set_page_no(btr_node_ptr_get_child_page_no(node_ptr, *offsets)); + } + + /** @return true if left border should be counted */ + bool should_count_the_left_border() const + { + if (dtuple_get_n_fields(&m_tuple) > 0) + { + ut_ad(!page_rec_is_infimum(page_cur_get_rec(&m_page_cur))); + return !page_rec_is_supremum(page_cur_get_rec(&m_page_cur)); + } + ut_ad(page_rec_is_infimum(page_cur_get_rec(&m_page_cur))); + return false; + } + + /** @return true if right border should be counted */ + bool should_count_the_right_border() const + { + if (dtuple_get_n_fields(&m_tuple) > 0) + { + const rec_t *rec= page_cur_get_rec(&m_page_cur); + ut_ad(!(m_mode == PAGE_CUR_L && page_rec_is_supremum(rec))); + + return (m_mode == PAGE_CUR_LE /* if the range is '<=' */ + /* and the record was found */ + && m_low_match >= dtuple_get_n_fields(&m_tuple)) || + (m_mode == PAGE_CUR_L /* or if the range is '<' */ + /* and there are any records to match the criteria, i.e. if the + minimum record on the tree is 5 and x < 7 is specified then the + cursor will be positioned at 5 and we should count the border, + but if x < 2 is specified, then the cursor will be positioned at + 'inf' and we should not count the border */ + && !page_rec_is_infimum(rec)); + /* Notice that for "WHERE col <= 'foo'" the server passes to + ha_innobase::records_in_range(): min_key=NULL (left-unbounded) which is + expected max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is + unexpected - one would expect flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In + this case the cursor will be positioned on the first record to the right + of the requested one (can also be positioned on the 'sup') and we should + not count the right border. */ + } + ut_ad(page_rec_is_supremum(page_cur_get_rec(&m_page_cur))); + + /* The range specified is without a right border, just 'x > 123' + or 'x >= 123' and search_on_page() positioned the cursor on the + supremum record on the rightmost page, which must not be counted. 
+    */
+    return false;
+  }
+
+  /** @return index */
+  const dict_index_t *index() const { return m_page_cur.index; }
+
+  /** @return current block */
+  const buf_block_t *block() const { return m_block; }
+
+  /** @return current page id */
+  page_id_t page_id() const { return m_page_id; }
+
+  /** Copies the block pointer from another btr_est_cur_t when both left and
+  right border cursors point to the same block.
+  @param o reference to the other btr_est_cur_t object. */
+  void set_block(const btr_est_cur_t &o) { m_block= o.m_block; }
+
+  /** @return current record number. */
+  ulint nth_rec() const { return m_nth_rec; }
+
+  /** @return number of records in the current page. */
+  ulint n_recs() const { return m_n_recs; }
+};
+
+/** Estimate the number of rows between the left record of the path and the
+right one (non-inclusive) for a certain level of a B-tree. This function
+starts from the page next to the left page and reads a few pages to the
+right, counting their records. If we reach the right page quickly then we
+know exactly how many records there are between the left and right records
+and we set is_n_rows_exact to true. After a page is latched, the previous
+page is unlatched. If we cannot reach the right page quickly then we
+calculate the average number of records in the pages scanned so far and
+assume that all pages that we did not scan up to the right page contain the
+same number of records; then we multiply that average by the number of pages
+between the right and left records (which is n_rows_on_prev_level). In this
+case we set is_n_rows_exact to false.
+@param level current level.
+@param left_cur the cursor of the left page.
+@param right_page_no right page number.
+@param n_rows_on_prev_level number of rows on the previous level.
+@param[out] is_n_rows_exact true if the exact number of rows is returned.
+@param[in,out] mtr mtr.
+@return number of rows, not including the borders (exact or estimated). */
+static ha_rows btr_estimate_n_rows_in_range_on_level(
+    ulint level, btr_est_cur_t &left_cur, uint32_t right_page_no,
+    ha_rows n_rows_on_prev_level, bool &is_n_rows_exact, mtr_t &mtr)
+{
+  ha_rows n_rows= 0;
+  uint n_pages_read= 0;
+  /* Do not read more than this number of pages, so that this code, which is
+  just an estimation, does not hurt performance. If we read this many pages
+  before reaching right_page_no, then we estimate the average from the pages
+  scanned so far. */
+  static constexpr uint n_pages_read_limit= 9;
+  buf_block_t *block= nullptr;
+  const dict_index_t *index= left_cur.index();
+
+  /* Assume by default that we will scan all pages between the left and
+  right (non-inclusive) pages */
+  is_n_rows_exact= true;
+
+  /* Add records from the left page which are to the right of the record
+  which serves as a left border of the range, if any (we don't include the
+  record itself in this count). */
+  if (left_cur.nth_rec() <= left_cur.n_recs())
+  {
+    n_rows+= left_cur.n_recs() - left_cur.nth_rec();
+  }
+
+  /* Count the records in the pages between the left and right
+  (non-inclusive) pages */
+
+  const fil_space_t *space= index->table->space;
+  page_id_t page_id(space->id,
+                    btr_page_get_next(buf_block_get_frame(left_cur.block())));
+
+  if (page_id.page_no() == FIL_NULL)
+    goto inexact;
+
+  do
+  {
+    page_t *page;
+    buf_block_t *prev_block= block;
+
+    /* Fetch the page.
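+    The previous page is released only after the next one has been latched,
+    so the FIL_PAGE_NEXT pointer that was just followed stays protected.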
+    */
+    block= btr_block_get(*index, page_id.page_no(), RW_S_LATCH, !level, &mtr,
+                         nullptr);
+
+    if (prev_block)
+    {
+      ulint savepoint = mtr.get_savepoint();
+      /* Index s-lock, p1, p2 latches; there can also be p1's and p2's
+      parent latches if they have not diverged */
+      ut_ad(savepoint >= 3);
+      mtr.rollback_to_savepoint(savepoint - 2, savepoint - 1);
+    }
+
+    if (!block || btr_page_get_level(buf_block_get_frame(block)) != level)
+      goto inexact;
+
+    page= buf_block_get_frame(block);
+
+    /* It is possible but highly unlikely that the page was originally
+    written by an old version of InnoDB that did not initialize
+    FIL_PAGE_TYPE on other than B-tree pages. For example, this could be an
+    almost-empty BLOB page that happens to contain the magic values in the
+    fields that we checked above. */
+
+    n_pages_read++;
+
+    n_rows+= page_get_n_recs(page);
+
+    page_id.set_page_no(btr_page_get_next(page));
+
+    if (n_pages_read == n_pages_read_limit)
+    {
+      /* We read too many pages or we reached the end of the level
+      without passing through right_page_no. */
+      goto inexact;
+    }
+
+  } while (page_id.page_no() != right_page_no);
+
+  if (block)
+  {
+    ut_ad(block == mtr.at_savepoint(mtr.get_savepoint() - 1));
+    mtr.rollback_to_savepoint(mtr.get_savepoint() - 1);
+  }
+
+  return (n_rows);
+
+inexact:
+
+  if (block)
+  {
+    ut_ad(block == mtr.at_savepoint(mtr.get_savepoint() - 1));
+    mtr.rollback_to_savepoint(mtr.get_savepoint() - 1);
+  }
+
+  is_n_rows_exact= false;
+
+  /* We were interrupted before reaching the right page */
+
+  if (n_pages_read > 0)
+  {
+    /* The number of pages on this level is n_rows_on_prev_level; multiply
+    it by the average number of recs per page so far */
+    n_rows= n_rows_on_prev_level * n_rows / n_pages_read;
+  }
+  else
+  {
+    n_rows= 10;
+  }
+
+  return (n_rows);
+}
+
+/** Estimates the number of rows in a given index range. Searches in the
+left page, then, if there are pages between the left and right ones, reads a
+few pages to the right; if the right page is reached, counts the exact
+number of rows without fetching the right page (the right page will be
+fetched in the caller of this function and the number of its rows will be
+added). If the right page is not reached, counts the estimated number of
+rows (see btr_estimate_n_rows_in_range_on_level() for details), and fetches
+the right page. If leaves are reached, unlatches non-leaf pages except the
+right leaf parent. After the right leaf page is fetched, commits the mtr.
+@param[in] index index
+@param[in] range_start range start
+@param[in] range_end range end
+@return estimated number of rows */
+ha_rows btr_estimate_n_rows_in_range(dict_index_t *index,
+                                     btr_pos_t *range_start,
+                                     btr_pos_t *range_end)
+{
+  DBUG_ENTER("btr_estimate_n_rows_in_range");
+
+  if (UNIV_UNLIKELY(index->page == FIL_NULL || index->is_corrupted()))
+    DBUG_RETURN(0);
+
+  ut_ad(index->is_btree());
+
+  btr_est_cur_t p1(index, *range_start->tuple, range_start->mode);
+  btr_est_cur_t p2(index, *range_end->tuple, range_end->mode);
+  mtr_t mtr;
+
+  ulint height;
+  ulint root_height= 0; /* remove warning */
+
+  mem_heap_t *heap= NULL;
+  rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+  rec_offs *offsets= offsets_;
+  rec_offs_init(offsets_);
+
+  mtr.start();
+
+  ut_ad(mtr.get_savepoint() == 0);
+  mtr_s_lock_index(index, &mtr);
+
+  ha_rows table_n_rows= dict_table_get_n_rows(index->table);
+
+  height= ULINT_UNDEFINED;
+
+  /* This becomes true when the two paths do not pass through the same pages
+  anymore. */
+  bool diverged= false;
+  /* This is the height, i.e. the number of levels from the root, where the
+  paths are no longer the same or adjacent. */
+  ulint divergence_height= ULINT_UNDEFINED;
+  bool should_count_the_left_border= true;
+  bool should_count_the_right_border= true;
+  bool is_n_rows_exact= true;
+  ha_rows n_rows= 0;
+
+  /* Loop and search until we arrive at the desired level. */
+search_loop:
+  if (!p1.fetch_child(height, mtr, p2.block()))
+    goto error;
+
+  if (height == ULINT_UNDEFINED)
+  {
+    /* We are in the root node */
+    height= btr_page_get_level(buf_block_get_frame(p1.block()));
+    root_height= height;
+  }
+
+  if (!height)
+  {
+    p1.set_page_mode_for_leaves();
+    p2.set_page_mode_for_leaves();
+  }
+
+  if (p1.page_id() == p2.page_id())
+    p2.set_block(p1);
+  else
+  {
+    ut_ad(diverged);
+    if (divergence_height != ULINT_UNDEFINED) {
+      /* We need to call p1.search_on_page() here as
+      btr_estimate_n_rows_in_range_on_level() uses p1.m_n_recs and
+      p1.m_nth_rec. */
+      if (!p1.search_on_page(height, root_height, true))
+        goto error;
+      n_rows= btr_estimate_n_rows_in_range_on_level(
+          height, p1, p2.page_id().page_no(), n_rows, is_n_rows_exact, mtr);
+    }
+    if (!p2.fetch_child(height, mtr, nullptr))
+      goto error;
+  }
+
+  if (height == 0)
+    /* There is no need to release non-leaf pages here as they must already
+    be unlatched in btr_est_cur_t::fetch_child(). Try to search on pages
+    after releasing the index latch, to decrease contention. */
+    mtr.rollback_to_savepoint(0, 1);
+
+  /* There is no need to search on the left page if
+  divergence_height != ULINT_UNDEFINED, as it was already searched before
+  the btr_estimate_n_rows_in_range_on_level() call */
+  if (divergence_height == ULINT_UNDEFINED &&
+      !p1.search_on_page(height, root_height, true))
+    goto error;
+
+  if (!p2.search_on_page(height, root_height, false))
+    goto error;
+
+  if (!diverged && (p1.nth_rec() != p2.nth_rec()))
+  {
+    ut_ad(p1.page_id() == p2.page_id());
+    diverged= true;
+    if (p1.nth_rec() < p2.nth_rec())
+    {
+      /* We do not count the borders (neither the left nor the right one),
+      thus "- 1". */
+      n_rows= p2.nth_rec() - p1.nth_rec() - 1;
+
+      if (n_rows > 0)
+      {
+        /* There is at least one row between the two borders pointed to by
+        p1 and p2, so on the level below the slots will point to
+        non-adjacent pages. */
+        divergence_height= root_height - height;
+      }
+    }
+    else
+    {
+      /* It is possible that p1->nth_rec > p2->nth_rec if, for example, we
+      have a single page tree which contains (inf, 5, 6, supr) and we select
+      where x > 20 and x < 30; in this case p1->nth_rec will point to the
+      supr record and p2->nth_rec will point to 6. */
+      n_rows= 0;
+      should_count_the_left_border= false;
+      should_count_the_right_border= false;
+    }
+  }
+  else if (diverged && divergence_height == ULINT_UNDEFINED)
+  {
+    if (p1.nth_rec() < p1.n_recs() || p2.nth_rec() > 1)
+    {
+      ut_ad(p1.page_id() != p2.page_id());
+      divergence_height= root_height - height;
+
+      n_rows= 0;
+
+      if (p1.nth_rec() < p1.n_recs())
+      {
+        n_rows+= p1.n_recs() - p1.nth_rec();
+      }
+
+      if (p2.nth_rec() > 1)
+      {
+        n_rows+= p2.nth_rec() - 1;
+      }
+    }
+  }
+  else if (divergence_height != ULINT_UNDEFINED)
+  {
+    /* All records before the right page were already counted. Add records
+    from p2->page_no which are to the left of the record which serves as a
+    right border of the range, if any (we don't include the record itself
+    in this count).
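+    For example, if p2 is positioned on the 4th record of its page, the
+    three records before it fall inside the range, and n_rows below grows by
+    p2.nth_rec() - 1 = 3.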
*/ + if (p2.nth_rec() > 1) + n_rows+= p2.nth_rec() - 1; + } + + if (height) + { + ut_ad(height > 0); + height--; + ut_ad(mtr.memo_contains(p1.index()->lock, MTR_MEMO_S_LOCK)); + ut_ad(mtr.memo_contains_flagged(p1.block(), MTR_MEMO_PAGE_S_FIX)); + p1.read_child_page_id(&offsets, &heap); + ut_ad(mtr.memo_contains(p2.index()->lock, MTR_MEMO_S_LOCK)); + ut_ad(mtr.memo_contains_flagged(p2.block(), MTR_MEMO_PAGE_S_FIX)); + p2.read_child_page_id(&offsets, &heap); + goto search_loop; + } + + should_count_the_left_border= + should_count_the_left_border && p1.should_count_the_left_border(); + should_count_the_right_border= + should_count_the_right_border && p2.should_count_the_right_border(); + + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + + range_start->page_id= p1.page_id(); + range_end->page_id= p2.page_id(); + + /* Here none of the borders were counted. For example, if on the leaf level + we descended to: + (inf, a, b, c, d, e, f, sup) + ^ ^ + path1 path2 + then n_rows will be 2 (c and d). */ + + if (is_n_rows_exact) + { + /* Only fiddle to adjust this off-by-one if the number is exact, otherwise + we do much grosser adjustments below. */ + + /* If both paths end up on the same record on the leaf level. */ + if (p1.page_id() == p2.page_id() && p1.nth_rec() == p2.nth_rec()) + { + + /* n_rows can be > 0 here if the paths were first different and then + converged to the same record on the leaf level. + For example: + SELECT ... LIKE 'wait/synch/rwlock%' + mode1=PAGE_CUR_GE, + tuple1="wait/synch/rwlock" + path1[0]={nth_rec=58, n_recs=58, + page_no=3, page_level=1} + path1[1]={nth_rec=56, n_recs=55, + page_no=119, page_level=0} + + mode2=PAGE_CUR_G + tuple2="wait/synch/rwlock" + path2[0]={nth_rec=57, n_recs=57, + page_no=3, page_level=1} + path2[1]={nth_rec=56, n_recs=55, + page_no=119, page_level=0} */ + + /* If the range is such that we should count both borders, then avoid + counting that record twice - once as a left border and once as a right + border. Some of the borders should not be counted, e.g. [3,3). */ + n_rows= should_count_the_left_border && should_count_the_right_border; + } + else + n_rows+= should_count_the_left_border + should_count_the_right_border; + } + + if (root_height > divergence_height && !is_n_rows_exact) + /* In trees whose height is > 1 our algorithm tends to underestimate: + multiply the estimate by 2: */ + n_rows*= 2; + + DBUG_EXECUTE_IF("bug14007649", DBUG_RETURN(n_rows);); + + /* Do not estimate the number of rows in the range to over 1 / 2 of the + estimated rows in the whole table */ + + if (n_rows > table_n_rows / 2 && !is_n_rows_exact) + { + + n_rows= table_n_rows / 2; + + /* If there are just 0 or 1 rows in the table, then we estimate all rows + are in the range */ + + if (n_rows == 0) + n_rows= table_n_rows; + } + + DBUG_RETURN(n_rows); + +error: + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + DBUG_RETURN(0); +} + +/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ + +/***********************************************************//** +Gets the offset of the pointer to the externally stored part of a field. 
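+The field reference is stored in the last BTR_EXTERN_FIELD_REF_SIZE (20)
+bytes of the locally stored column prefix: space id, page number, offset
+within the page, and an 8-byte length field that also carries the
+ownership/inheritance flag bits.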
+@return offset of the pointer to the externally stored part */ +static +ulint +btr_rec_get_field_ref_offs( +/*=======================*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: index of the external field */ +{ + ulint field_ref_offs; + ulint local_len; + + ut_a(rec_offs_nth_extern(offsets, n)); + field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len); + ut_a(len_is_stored(local_len)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE); +} + +/** Gets a pointer to the externally stored part of a field. +@param rec record +@param offsets rec_get_offsets(rec) +@param n index of the externally stored field +@return pointer to the externally stored part */ +#define btr_rec_get_field_ref(rec, offsets, n) \ + ((rec) + btr_rec_get_field_ref_offs(offsets, n)) + +/** Gets the externally stored size of a record, in units of a database page. +@param[in] rec record +@param[in] offsets array returned by rec_get_offsets() +@return externally stored part, in units of a database page */ +ulint +btr_rec_get_externally_stored_len( + const rec_t* rec, + const rec_offs* offsets) +{ + ulint n_fields; + ulint total_extern_len = 0; + ulint i; + + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + ulint extern_len = mach_read_from_4( + btr_rec_get_field_ref(rec, offsets, i) + + BTR_EXTERN_LEN + 4); + + total_extern_len += ut_calc_align( + extern_len, ulint(srv_page_size)); + } + } + + return total_extern_len >> srv_page_size_shift; +} + +/*******************************************************************//** +Sets the ownership bit of an externally stored field in a record. */ +static +void +btr_cur_set_ownership_of_extern_field( +/*==================================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: clustered index record */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint i, /*!< in: field number */ + bool val, /*!< in: value to set */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + byte* data; + ulint local_len; + ulint byte_val; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + ut_ad(rec_offs_nth_extern(offsets, i)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN); + + if (val) { + byte_val &= ~BTR_EXTERN_OWNER_FLAG; + } else { +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + byte_val |= BTR_EXTERN_OWNER_FLAG; + } + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + page_zip_write_blob_ptr(block, rec, index, offsets, i, mtr); + } else { + mtr->write<1,mtr_t::MAYBE_NOP>(*block, data + local_len + + BTR_EXTERN_LEN, byte_val); + } +} + +/*******************************************************************//** +Marks non-updated off-page fields as disowned by this record. The ownership +must be transferred to the updated record which is inserted elsewhere in the +index tree. In purge only the owner of externally stored field is allowed +to free the field. 
*/ +void +btr_cur_disown_inherited_fields( +/*============================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_any_extern(offsets)); + + for (uint16_t i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i) + && !upd_get_field_by_field_no(update, i, false)) { + btr_cur_set_ownership_of_extern_field( + block, rec, index, offsets, i, false, mtr); + } + } +} + +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + if (!rec_offs_any_extern(offsets)) { + return; + } + + const ulint n = rec_offs_n_fields(offsets); + + for (ulint i = 0; i < n; i++) { + if (rec_offs_nth_extern(offsets, i)) { + btr_cur_set_ownership_of_extern_field( + block, rec, index, offsets, i, true, mtr); + } + } +} + +/*******************************************************************//** +Returns the length of a BLOB part stored on the header page. +@return part length */ +static +uint32_t +btr_blob_get_part_len( +/*==================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN)); +} + +/*******************************************************************//** +Returns the page number where the next BLOB part is stored. +@return page number or FIL_NULL if no more pages */ +static +uint32_t +btr_blob_get_next_page_no( +/*======================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO)); +} + +/** Deallocate a buffer block that was reserved for a BLOB part. +@param block buffer block +@param all flag whether to remove a ROW_FORMAT=COMPRESSED page +@param mtr mini-transaction to commit */ +static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr) +{ + const page_id_t page_id(block->page.id()); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + mtr->commit(); + + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); + mysql_mutex_lock(&buf_pool.mutex); + + if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain)) + if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data) + /* Attempt to deallocate the redundant copy of the uncompressed page + if the whole ROW_FORMAT=COMPRESSED block cannot be deallocted. */ + buf_LRU_free_page(bpage, false); + + mysql_mutex_unlock(&buf_pool.mutex); +} + +/** Helper class used while writing blob pages, during insert or update. 
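+It commits and restarts the mini-transaction every few BLOB pages (see
+check()) so that log_free_check() can run between the pages, and afterwards
+restores the cursor position and the latches.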
+*/
+struct btr_blob_log_check_t {
+	/** Persistent cursor on a clustered index record with blobs. */
+	btr_pcur_t*	m_pcur;
+	/** Mini-transaction holding the latches for m_pcur */
+	mtr_t*		m_mtr;
+	/** rec_get_offsets(rec, index); offsets of clust_rec */
+	const rec_offs*	m_offsets;
+	/** The block containing clustered record */
+	buf_block_t**	m_block;
+	/** The clustered record pointer */
+	rec_t**		m_rec;
+	/** The blob operation code */
+	enum blob_op	m_op;
+
+	/** Constructor
+	@param[in]	pcur		persistent cursor on a clustered
+					index record with blobs.
+	@param[in]	mtr		mini-transaction holding latches for
+					pcur.
+	@param[in]	offsets		offsets of the clust_rec
+	@param[in,out]	block		record block containing pcur record
+	@param[in,out]	rec		the clustered record pointer
+	@param[in]	op		the blob operation code */
+	btr_blob_log_check_t(
+		btr_pcur_t*	pcur,
+		mtr_t*		mtr,
+		const rec_offs*	offsets,
+		buf_block_t**	block,
+		rec_t**		rec,
+		enum blob_op	op)
+		: m_pcur(pcur),
+		  m_mtr(mtr),
+		  m_offsets(offsets),
+		  m_block(block),
+		  m_rec(rec),
+		  m_op(op)
+	{
+		ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
+		ut_ad((*m_block)->page.frame == page_align(*m_rec));
+		ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
+	}
+
+	/** Check if there is enough space in the redo log file. Commit and
+	re-start the mini-transaction. */
+	void check()
+	{
+		dict_index_t*	index = m_pcur->index();
+		ulint		offs = 0;
+		uint32_t	page_no = FIL_NULL;
+
+		if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) {
+			offs = page_offset(*m_rec);
+			page_no = (*m_block)->page.id().page_no();
+			(*m_block)->page.fix();
+			ut_ad(page_no != FIL_NULL);
+		} else {
+			btr_pcur_store_position(m_pcur, m_mtr);
+		}
+		m_mtr->commit();
+
+		DEBUG_SYNC_C("blob_write_middle");
+
+		const mtr_log_t log_mode = m_mtr->get_log_mode();
+		m_mtr->start();
+		m_mtr->set_log_mode(log_mode);
+		index->set_modified(*m_mtr);
+
+		log_free_check();
+
+		DEBUG_SYNC_C("blob_write_middle_after_check");
+
+		if (UNIV_UNLIKELY(page_no != FIL_NULL)) {
+			dberr_t err;
+			if (UNIV_LIKELY(index->page != page_no)) {
+				ut_a(btr_root_block_get(index, RW_SX_LATCH,
+							m_mtr, &err));
+			}
+			m_pcur->btr_cur.page_cur.block = btr_block_get(
+				*index, page_no, RW_X_LATCH, false, m_mtr);
+			/* The page should not be evicted or corrupted while
+			we are holding a buffer-fix on it. */
+			m_pcur->btr_cur.page_cur.block->page.unfix();
+			m_pcur->btr_cur.page_cur.rec
+				= m_pcur->btr_cur.page_cur.block->page.frame
+				+ offs;
+		} else {
+			ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
+			mtr_sx_lock_index(index, m_mtr);
+			ut_a(m_pcur->restore_position(
+				BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED,
+				m_mtr) == btr_pcur_t::SAME_ALL);
+		}
+
+		*m_block	= btr_pcur_get_block(m_pcur);
+		*m_rec		= btr_pcur_get_rec(m_pcur);
+
+		rec_offs_make_valid(*m_rec, index, true,
+				    const_cast<rec_offs*>(m_offsets));
+
+		ut_ad(m_mtr->memo_contains_page_flagged(
+			      *m_rec,
+			      MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));
+
+		ut_ad((m_op == BTR_STORE_INSERT_BULK)
+		      == !m_mtr->memo_contains_flagged(&index->lock,
+						       MTR_MEMO_SX_LOCK
+						       | MTR_MEMO_X_LOCK));
+	}
+};
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from the leaf node
+file segment of the index tree.
+
+TODO: If the allocation extends the tablespace, it will not be redo logged,
+in any mini-transaction.
Tablespace extension should be redo-logged, so that +recovery will not fail when the big_rec was written to the extended portion of +the file, in case the file was somehow truncated in the crash. + +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +btr_store_big_rec_extern_fields( +/*============================*/ + btr_pcur_t* pcur, /*!< in: a persistent cursor */ + rec_offs* offsets, /*!< in/out: rec_get_offsets() on + pcur. the "external storage" flags + in offsets will correctly correspond + to rec when this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ + mtr_t* btr_mtr, /*!< in/out: mtr containing the + latches to the clustered index. can be + committed and restarted. */ + enum blob_op op) /*! in: operation code */ +{ + byte* field_ref; + ulint extern_len; + ulint store_len; + ulint i; + mtr_t mtr; + mem_heap_t* heap = NULL; + page_zip_des_t* page_zip; + z_stream c_stream; + dberr_t error = DB_SUCCESS; + dict_index_t* index = pcur->index(); + buf_block_t* rec_block = btr_pcur_get_block(pcur); + rec_t* rec = btr_pcur_get_rec(pcur); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(op == BTR_STORE_INSERT_BULK + || btr_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(btr_mtr->memo_contains_flagged(rec_block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(buf_block_get_frame(rec_block) == page_align(rec)); + ut_a(dict_index_is_clust(index)); + + if (!fil_page_index_page_check(page_align(rec))) { + if (op != BTR_STORE_INSERT_BULK) { + return DB_PAGE_CORRUPTED; + } + } + + btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block, + &rec, op); + page_zip = buf_block_get_page_zip(rec_block); + + if (page_zip) { + int err; + + /* Zlib deflate needs 128 kilobytes for the default + window size, plus 512 << memLevel, plus a few + kilobytes for small objects. We use reduced memLevel + to limit the memory consumption, and preallocate the + heap, hoping to avoid memory fragmentation. */ + heap = mem_heap_create(250000); + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, int(page_zip_level), + Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must either be zero or they must be pointers to inherited + columns, owned by this record or an earlier record version. */ + for (i = 0; i < big_rec_vec->n_fields; i++) { + field_ref = btr_rec_get_field_ref( + rec, offsets, big_rec_vec->fields[i].field_no); + + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + /* Either this must be an update in place, + or the BLOB must be inherited, or the BLOB pointer + must be zero (will be written in this function). 
+		*/
+		ut_a(op == BTR_STORE_UPDATE
+		     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
+		     || !memcmp(field_ref, field_ref_zero,
+				BTR_EXTERN_FIELD_REF_SIZE));
+	}
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+	/* Space available in compressed page to carry blob data */
+	const ulint	payload_size_zip = rec_block->physical_size()
+		- FIL_PAGE_DATA;
+
+	/* Space available in uncompressed page to carry blob data */
+	const ulint	payload_size = payload_size_zip
+		- (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END);
+
+	/* We have to create a file segment to the tablespace
+	for each field and put the pointer to the field in rec */
+
+	for (i = 0; i < big_rec_vec->n_fields; i++) {
+		const ulint field_no = big_rec_vec->fields[i].field_no;
+
+		field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+		/* A zero BLOB pointer should have been initially inserted. */
+		ut_a(!memcmp(field_ref, field_ref_zero,
+			     BTR_EXTERN_FIELD_REF_SIZE));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+		extern_len = big_rec_vec->fields[i].len;
+		MEM_CHECK_DEFINED(big_rec_vec->fields[i].data, extern_len);
+		ut_a(extern_len > 0);
+
+		uint32_t prev_page_no = FIL_NULL;
+
+		if (page_zip) {
+			int	err = deflateReset(&c_stream);
+			ut_a(err == Z_OK);
+
+			c_stream.next_in = (Bytef*)
+				big_rec_vec->fields[i].data;
+			c_stream.avail_in = static_cast<uInt>(extern_len);
+		}
+
+		for (ulint blob_npages = 0;; ++blob_npages) {
+			buf_block_t*	block;
+			const ulint	commit_freq = 4;
+			uint32_t	r_extents;
+
+			ut_ad(page_align(field_ref) == page_align(rec));
+
+			if (!(blob_npages % commit_freq)) {
+
+				redo_log.check();
+
+				field_ref = btr_rec_get_field_ref(
+					rec, offsets, field_no);
+
+				page_zip = buf_block_get_page_zip(rec_block);
+			}
+
+			ut_ad(btr_mtr->get_already_latched(
+				page_id_t{index->table->space_id, index->page},
+				MTR_MEMO_PAGE_SX_FIX));
+
+			mtr.start();
+			index->set_modified(mtr);
+			mtr.set_log_mode_sub(*btr_mtr);
+
+			rec_block->page.fix();
+			rec_block->page.lock.x_lock();
+
+			mtr.memo_push(rec_block, MTR_MEMO_PAGE_X_FIX);
+#ifdef BTR_CUR_HASH_ADAPT
+			ut_ad(!btr_search_check_marked_free_index(rec_block));
+#endif
+
+			uint32_t hint_prev = prev_page_no;
+			if (hint_prev == FIL_NULL) {
+				hint_prev = rec_block->page.id().page_no();
+			}
+
+			error = fsp_reserve_free_extents(
+				&r_extents, index->table->space, 1,
+				FSP_BLOB, &mtr, 1);
+			if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+alloc_fail:
+				mtr.commit();
+				goto func_exit;
+			}
+
+			block = btr_page_alloc(index, hint_prev + 1,
+					       FSP_NO_DIR, 0, &mtr, &mtr,
+					       &error);
+
+			index->table->space->release_free_extents(r_extents);
+			if (!block) {
+				goto alloc_fail;
+			}
+
+			const uint32_t space_id = block->page.id().space();
+			const uint32_t page_no = block->page.id().page_no();
+
+			if (prev_page_no == FIL_NULL) {
+			} else if (buf_block_t* prev_block =
+				   buf_page_get_gen(page_id_t(space_id,
+							      prev_page_no),
+						    rec_block->zip_size(),
+						    RW_X_LATCH, nullptr,
+						    BUF_GET, &mtr, &error)) {
+				if (page_zip) {
+					mtr.write<4>(*prev_block,
+						     prev_block->page.frame
+						     + FIL_PAGE_NEXT,
+						     page_no);
+					memcpy_aligned<4>(
+						buf_block_get_page_zip(
+							prev_block)
+						->data + FIL_PAGE_NEXT,
+						prev_block->page.frame
+						+ FIL_PAGE_NEXT, 4);
+				} else {
+					mtr.write<4>(*prev_block,
+						     BTR_BLOB_HDR_NEXT_PAGE_NO
+						     + FIL_PAGE_DATA
+						     + prev_block->page.frame,
+						     page_no);
+				}
+			} else {
+				goto alloc_fail;
+			}
+
+			ut_ad(!page_has_siblings(block->page.frame));
+			ut_ad(!fil_page_get_type(block->page.frame));
+
+			if (page_zip) {
+				int		err;
+				page_zip_des_t*	blob_page_zip;
+
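+				/* The first page of a compressed BLOB is
+				tagged FIL_PAGE_TYPE_ZBLOB and any
+				continuation pages FIL_PAGE_TYPE_ZBLOB2;
+				the deflate stream is stored from
+				FIL_PAGE_DATA onwards. */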
+				mtr.write<1>(*block,
+					     FIL_PAGE_TYPE + 1
+					     + block->page.frame,
+					     prev_page_no == FIL_NULL
+					     ? FIL_PAGE_TYPE_ZBLOB
+					     : FIL_PAGE_TYPE_ZBLOB2);
+				block->page.zip.data[FIL_PAGE_TYPE + 1]
+					= block->page.frame[FIL_PAGE_TYPE + 1];
+
+				c_stream.next_out = block->page.frame
+					+ FIL_PAGE_DATA;
+				c_stream.avail_out = static_cast<uInt>(
+					payload_size_zip);
+
+				err = deflate(&c_stream, Z_FINISH);
+				ut_a(err == Z_OK || err == Z_STREAM_END);
+				ut_a(err == Z_STREAM_END
+				     || c_stream.avail_out == 0);
+
+				mtr.memcpy(*block,
+					   FIL_PAGE_DATA,
+					   page_zip_get_size(page_zip)
+					   - FIL_PAGE_DATA
+					   - c_stream.avail_out);
+				/* Copy the page to compressed storage,
+				because it will be flushed to disk
+				from there. */
+				blob_page_zip = buf_block_get_page_zip(block);
+				ut_ad(blob_page_zip);
+				ut_ad(page_zip_get_size(blob_page_zip)
+				      == page_zip_get_size(page_zip));
+				memcpy(blob_page_zip->data, block->page.frame,
+				       page_zip_get_size(page_zip));
+
+				if (err == Z_OK && prev_page_no != FIL_NULL) {
+
+					goto next_zip_page;
+				}
+
+				if (err == Z_STREAM_END) {
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_LEN, 0);
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_LEN + 4,
+							c_stream.total_in);
+				} else {
+					memset(field_ref + BTR_EXTERN_LEN,
+					       0, 8);
+				}
+
+				if (prev_page_no == FIL_NULL) {
+					ut_ad(blob_npages == 0);
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_SPACE_ID,
+							space_id);
+
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_PAGE_NO,
+							page_no);
+
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_OFFSET,
+							FIL_PAGE_NEXT);
+				}
+
+				/* We compress a page when finishing
+				a bulk insert. */
+				if (UNIV_LIKELY(op != BTR_STORE_INSERT_BULK)) {
+					page_zip_write_blob_ptr(
+						rec_block, rec, index, offsets,
+						field_no, &mtr);
+				}
+
+next_zip_page:
+				prev_page_no = page_no;
+
+				/* Commit mtr and release the
+				uncompressed page frame to save memory. */
+				btr_blob_free(block, FALSE, &mtr);
+
+				if (err == Z_STREAM_END) {
+					break;
+				}
+			} else {
+				mtr.write<1>(*block, FIL_PAGE_TYPE + 1
+					     + block->page.frame,
+					     FIL_PAGE_TYPE_BLOB);
+
+				if (extern_len > payload_size) {
+					store_len = payload_size;
+				} else {
+					store_len = extern_len;
+				}
+
+				mtr.memcpy(
+					*block,
+					FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE
+					+ block->page.frame,
+					static_cast<const byte*>
+					(big_rec_vec->fields[i].data)
+					+ big_rec_vec->fields[i].len
+					- extern_len, store_len);
+				mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN
+					     + FIL_PAGE_DATA
+					     + block->page.frame,
+					     store_len);
+				compile_time_assert(FIL_NULL == 0xffffffff);
+				mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO
+					   + FIL_PAGE_DATA, 4, 0xff);
+
+				extern_len -= store_len;
+
+				ut_ad(!mach_read_from_4(BTR_EXTERN_LEN
+							+ field_ref));
+				mtr.write<4>(*rec_block,
+					     BTR_EXTERN_LEN + 4 + field_ref,
+					     big_rec_vec->fields[i].len
+					     - extern_len);
+
+				if (prev_page_no == FIL_NULL) {
+					ut_ad(blob_npages == 0);
+					mtr.write<4,mtr_t::MAYBE_NOP>(
+						*rec_block,
+						field_ref + BTR_EXTERN_SPACE_ID,
+						space_id);
+
+					mtr.write<4>(*rec_block, field_ref
+						     + BTR_EXTERN_PAGE_NO,
+						     page_no);
+
+					mtr.write<4>(*rec_block, field_ref
+						     + BTR_EXTERN_OFFSET,
+						     FIL_PAGE_DATA);
+				}
+
+				prev_page_no = page_no;
+
+				mtr.commit();
+
+				if (extern_len == 0) {
+					break;
+				}
+			}
+		}
+
+		DBUG_EXECUTE_IF("btr_store_big_rec_extern",
+				error = DB_OUT_OF_FILE_SPACE;
+				goto func_exit;);
+
+		rec_offs_make_nth_extern(offsets, field_no);
+	}
+
+func_exit:
+	if (page_zip) {
+		deflateEnd(&c_stream);
+	}
+
+	if (heap != NULL) {
+		mem_heap_free(heap);
+	}
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+	/* All pointers to externally stored columns in the record
+	must be valid.
*/ + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + + field_ref = btr_rec_get_field_ref(rec, offsets, i); + + /* The pointer must not be zero if the operation + succeeded. */ + ut_a(0 != memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE) + || error != DB_SUCCESS); + /* The column must not be disowned by this record. */ + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + return(error); +} + +/** Check the FIL_PAGE_TYPE on an uncompressed BLOB page. +@param block uncompressed BLOB page +@param op operation +@return whether the type is invalid */ +static bool btr_check_blob_fil_page_type(const buf_block_t& block, + const char *op) +{ + uint16_t type= fil_page_get_type(block.page.frame); + + if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB)); + else if (fil_space_t *space= fil_space_t::get(block.page.id().space())) + { + /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB + pages. Do not print anything about the type mismatch when reading + a BLOB page that may be from old versions. */ + bool fail= space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags); + if (fail) + sql_print_error("InnoDB: FIL_PAGE_TYPE=%u on BLOB %s file %s page %u", + type, op, space->chain.start->name, + block.page.id().page_no()); + space->release(); + return fail; + } + return false; +} + +/*******************************************************************//** +Frees the space in an externally stored field to the file space +management if the field in data is owned by the externally stored field, +in a rollback we may have the additional condition that the field must +not be inherited. */ +void +btr_free_externally_stored_field( +/*=============================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched; if the tree + height is 1, then also the root page + must be X-latched! (this is relevant + in the case this function is called + from purge where 'data' is located on + an undo log page, not an index + page) */ + byte* field_ref, /*!< in/out: field reference */ + const rec_t* rec, /*!< in: record containing field_ref, for + page_zip_write_blob_ptr(), or NULL */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index), + or NULL */ + buf_block_t* block, /*!< in/out: page of field_ref */ + ulint i, /*!< in: field number of field_ref; + ignored if rec == NULL */ + bool rollback, /*!< in: performing rollback? */ + mtr_t* local_mtr) /*!< in: mtr + containing the latch to data an an + X-latch to the index tree */ +{ + const uint32_t space_id = mach_read_from_4( + field_ref + BTR_EXTERN_SPACE_ID); + + ut_ad(index->is_primary()); + ut_ad(block->page.lock.have_x()); + ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(local_mtr->memo_contains_page_flagged(field_ref, + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); + ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i)); + ut_ad(index->table->space_id == index->table->space->id); + ut_ad(local_mtr->is_named_space(index->table->space)); + + if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE))) { + /* In the rollback, we may encounter a clustered index + record with some unwritten off-page columns. There is + nothing to free then. 
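+Such an all-zero BLOB pointer can only be observed during the rollback of
+an interrupted operation; this is asserted below.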
*/ + ut_a(rollback); + return; + } + + ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN) + & ~((BTR_EXTERN_OWNER_FLAG + | BTR_EXTERN_INHERITED_FLAG) << 24))); + ut_ad(space_id == index->table->space_id); + + const ulint ext_zip_size = index->table->space->zip_size(); + /* !rec holds in a call from purge when field_ref is in an undo page */ + ut_ad(rec || !block->page.zip.data); + + for (;;) { + mtr_t mtr; + + mtr.start(); + mtr.set_spaces(*local_mtr); + mtr.set_log_mode_sub(*local_mtr); + + ut_ad(!index->table->is_temporary() + || local_mtr->get_log_mode() == MTR_LOG_NO_REDO); + + const uint32_t page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + buf_block_t* ext_block; + + if (/* There is no external storage data */ + page_no == FIL_NULL + /* This field does not own the externally stored field */ + || (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_OWNER_FLAG) + /* Rollback and inherited field */ + || (rollback + && (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_INHERITED_FLAG))) { +skip_free: + /* Do not free */ + mtr.commit(); + + return; + } + + ext_block = buf_page_get(page_id_t(space_id, page_no), + ext_zip_size, RW_X_LATCH, &mtr); + + if (!ext_block) { + goto skip_free; + } + + /* The buffer pool block containing the BLOB pointer is + exclusively latched by local_mtr. To satisfy some design + constraints, we must recursively latch it in mtr as well. */ + block->fix(); + block->page.lock.x_lock(); + + mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!btr_search_check_marked_free_index(block)); +#endif + + const page_t* page = buf_block_get_frame(ext_block); + + if (ext_zip_size) { + /* Note that page_zip will be NULL + in row_purge_upd_exist_or_extern(). */ + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + default: + MY_ASSERT_UNREACHABLE(); + } + const uint32_t next_page_no = mach_read_from_4( + page + FIL_PAGE_NEXT); + + btr_page_free(index, ext_block, &mtr, true, + local_mtr->memo_contains( + *index->table->space)); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO, + next_page_no); + memset(field_ref + BTR_EXTERN_LEN + 4, 0, 4); + page_zip_write_blob_ptr(block, rec, index, + offsets, i, &mtr); + } else { + mtr.write<4>(*block, + BTR_EXTERN_PAGE_NO + field_ref, + next_page_no); + mtr.write<4,mtr_t::MAYBE_NOP>(*block, + BTR_EXTERN_LEN + + 4 + field_ref, + 0U); + } + } else { + ut_ad(!block->page.zip.data); + btr_check_blob_fil_page_type(*ext_block, "purge"); + + const uint32_t next_page_no = mach_read_from_4( + page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO); + btr_page_free(index, ext_block, &mtr, true, + local_mtr->memo_contains( + *index->table->space)); + + mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref, + next_page_no); + /* Zero out the BLOB length. If the server + crashes during the execution of this function, + trx_rollback_all_recovered() could + dereference the half-deleted BLOB, fetching a + wrong prefix for the BLOB. */ + mtr.write<4,mtr_t::MAYBE_NOP>(*block, + BTR_EXTERN_LEN + 4 + + field_ref, 0U); + } + + /* Commit mtr and release the BLOB block to save memory. */ + btr_blob_free(ext_block, TRUE, &mtr); + } +} + +/***********************************************************//** +Frees the externally stored fields for a record. 
*/ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in/out: record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + buf_block_t* block, /*!< in: index page of rec */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + ut_ad(index->is_primary()); + ut_ad(page_rec_is_leaf(rec)); + /* Free possible externally stored fields in the record */ + + ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets)); + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + btr_free_externally_stored_field( + index, btr_rec_get_field_ref(rec, offsets, i), + rec, offsets, block, i, rollback, mtr); + } + } +} + +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in/out: record */ + buf_block_t* block, /*!< in: index page of rec */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + + /* Free possible externally stored fields in the record */ + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + const upd_field_t* ufield = upd_get_nth_field(update, i); + + if (rec_offs_nth_extern(offsets, ufield->field_no)) { + ulint len; + byte* data = rec_get_nth_field( + rec, offsets, ufield->field_no, &len); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + + btr_free_externally_stored_field( + index, data + len - BTR_EXTERN_FIELD_REF_SIZE, + rec, offsets, block, + ufield->field_no, rollback, mtr); + } + } +} + +/*******************************************************************//** +Copies the prefix of an uncompressed BLOB. The clustered index record +that points to this BLOB must be protected by a lock or a page latch. 
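+Each BLOB page carries an 8-byte header at FIL_PAGE_DATA: the length of the
+part stored on that page (BTR_BLOB_HDR_PART_LEN) and the page number of the
+next part (BTR_BLOB_HDR_NEXT_PAGE_NO); the chain is terminated by FIL_NULL.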
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_blob_prefix(
+/*=================*/
+	byte*		buf,	/*!< out: the externally stored part of
+				the field, or a prefix of it */
+	uint32_t	len,	/*!< in: length of buf, in bytes */
+	page_id_t	id,	/*!< in: page identifier of the first
+				BLOB page */
+	uint32_t	offset)	/*!< in: offset on the first BLOB page */
+{
+	ulint	copied_len	= 0;
+
+	for (;;) {
+		mtr_t		mtr;
+		buf_block_t*	block;
+		const page_t*	page;
+		const byte*	blob_header;
+		ulint		part_len;
+		ulint		copy_len;
+
+		mtr_start(&mtr);
+
+		block = buf_page_get(id, 0, RW_S_LATCH, &mtr);
+		if (!block || btr_check_blob_fil_page_type(*block, "read")) {
+			mtr.commit();
+			return copied_len;
+		}
+		page = buf_block_get_frame(block);
+
+		blob_header = page + offset;
+		part_len = btr_blob_get_part_len(blob_header);
+		copy_len = ut_min(part_len, len - copied_len);
+
+		memcpy(buf + copied_len,
+		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
+		copied_len += copy_len;
+
+		id.set_page_no(btr_blob_get_next_page_no(blob_header));
+
+		mtr_commit(&mtr);
+
+		if (id.page_no() == FIL_NULL || copy_len != part_len) {
+			MEM_CHECK_DEFINED(buf, copied_len);
+			return(copied_len);
+		}
+
+		/* On BLOB pages other than the first, the BLOB header
+		always is at the start of the page data: */
+
+		offset = FIL_PAGE_DATA;
+
+		ut_ad(copied_len <= len);
+	}
+}
+
+/** Copies the prefix of a compressed BLOB.
+The clustered index record that points to this BLOB must be protected
+by a lock or a page latch.
+@param[out]	buf		the externally stored part of the field,
+or a prefix of it
+@param[in]	len		length of buf, in bytes
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size
+@param[in]	id		page identifier of the BLOB pages
+@param[in]	offset		offset of the BLOB part on the first page
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_zblob_prefix(
+	byte*		buf,
+	uint32_t	len,
+	ulint		zip_size,
+	page_id_t	id,
+	uint32_t	offset)
+{
+	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
+	mem_heap_t*	heap;
+	int		err;
+	z_stream	d_stream;
+
+	d_stream.next_out = buf;
+	d_stream.avail_out = static_cast<uInt>(len);
+	d_stream.next_in = Z_NULL;
+	d_stream.avail_in = 0;
+
+	/* Zlib inflate needs 32 kilobytes for the default
+	window size, plus a few kilobytes for small objects. */
+	heap = mem_heap_create(40000);
+	page_zip_set_alloc(&d_stream, heap);
+
+	ut_ad(zip_size);
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(id.space());
+
+	err = inflateInit(&d_stream);
+	ut_a(err == Z_OK);
+
+	for (;;) {
+		buf_page_t*	bpage;
+		uint32_t	next_page_no;
+
+		/* There is no latch on bpage directly. Instead,
+		bpage is protected by the B-tree page latch that
+		is being held on the clustered index record, or,
+		in row_merge_copy_blobs(), by an exclusive table lock. */
+		bpage = buf_page_get_zip(id, zip_size);
+
+		if (UNIV_UNLIKELY(!bpage)) {
+			ib::error() << "Cannot load compressed BLOB " << id;
+			goto func_exit;
+		}
+
+		if (UNIV_UNLIKELY
+		    (fil_page_get_type(bpage->zip.data) != page_type)) {
+
+			ib::error() << "Unexpected type "
+				<< fil_page_get_type(bpage->zip.data)
+				<< " of compressed BLOB page " << id;
+
+			ut_ad(0);
+			goto end_of_blob;
+		}
+
+		next_page_no = mach_read_from_4(bpage->zip.data + offset);
+
+		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
+			/* When the BLOB begins at the page header,
+			the compressed data payload does not
+			immediately follow the next page pointer.
*/ + offset = FIL_PAGE_DATA; + } else { + offset += 4; + } + + d_stream.next_in = bpage->zip.data + offset; + d_stream.avail_in = uInt(zip_size - offset); + + err = inflate(&d_stream, Z_NO_FLUSH); + switch (err) { + case Z_OK: + if (!d_stream.avail_out) { + goto end_of_blob; + } + break; + case Z_STREAM_END: + if (next_page_no == FIL_NULL) { + goto end_of_blob; + } + /* fall through */ + default: +inflate_error: + ib::error() << "inflate() of compressed BLOB page " + << id + << " returned " << err + << " (" << d_stream.msg << ")"; + + case Z_BUF_ERROR: + goto end_of_blob; + } + + if (next_page_no == FIL_NULL) { + if (!d_stream.avail_in) { + ib::error() + << "Unexpected end of compressed " + << "BLOB page " << id; + } else { + err = inflate(&d_stream, Z_FINISH); + switch (err) { + case Z_STREAM_END: + case Z_BUF_ERROR: + break; + default: + goto inflate_error; + } + } + +end_of_blob: + bpage->lock.s_unlock(); + bpage->unfix(); + goto func_exit; + } + + bpage->lock.s_unlock(); + bpage->unfix(); + + /* On other BLOB pages except the first + the BLOB header always is at the page header: */ + + id.set_page_no(next_page_no); + offset = FIL_PAGE_NEXT; + page_type = FIL_PAGE_TYPE_ZBLOB2; + } + +func_exit: + inflateEnd(&d_stream); + mem_heap_free(heap); + MEM_CHECK_DEFINED(buf, d_stream.total_out); + return(d_stream.total_out); +} + +/** Copies the prefix of an externally stored field of a record. +The clustered index record that points to this BLOB must be protected +by a lock or a page latch. +@param[out] buf the externally stored part of the +field, or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] id page identifier of the first BLOB page +@param[in] offset offset on the first BLOB page +@return number of bytes written to buf */ +static +ulint +btr_copy_externally_stored_field_prefix_low( + byte* buf, + uint32_t len, + ulint zip_size, + page_id_t id, + uint32_t offset) +{ + if (len == 0) + return 0; + + return zip_size + ? btr_copy_zblob_prefix(buf, len, zip_size, id, offset) + : btr_copy_blob_prefix(buf, len, id, offset); +} + +/** Copies the prefix of an externally stored field of a record. +The clustered index record must be protected by a lock or a page latch. +@param[out] buf the field, or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] local_len length of data, in bytes +@return the length of the copied field, or 0 if the column was being +or has been deleted */ +ulint +btr_copy_externally_stored_field_prefix( + byte* buf, + ulint len, + ulint zip_size, + const byte* data, + ulint local_len) +{ + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(local_len >= len)) { + memcpy(buf, data, len); + return(len); + } + + memcpy(buf, data, local_len); + data += local_len; + + ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) { + /* The externally stored part of the column has been + (partially) deleted. Signal the half-deleted BLOB + to the caller. 
*/ + + return(0); + } + + uint32_t space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID); + uint32_t page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO); + uint32_t offset = mach_read_from_4(data + BTR_EXTERN_OFFSET); + len -= local_len; + + return(local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + uint32_t(len), + zip_size, + page_id_t( + space_id, + page_no), + offset)); +} + +/** Copies an externally stored field of a record to mem heap. +The clustered index record must be protected by a lock or a page latch. +@param[out] len length of the whole field +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] local_len length of data +@param[in,out] heap mem heap +@return the whole field copied to heap */ +byte* +btr_copy_externally_stored_field( + ulint* len, + const byte* data, + ulint zip_size, + ulint local_len, + mem_heap_t* heap) +{ + byte* buf; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + uint32_t space_id = mach_read_from_4(data + local_len + + BTR_EXTERN_SPACE_ID); + uint32_t page_no = mach_read_from_4(data + local_len + + BTR_EXTERN_PAGE_NO); + uint32_t offset = mach_read_from_4(data + local_len + + BTR_EXTERN_OFFSET); + + /* Currently a BLOB cannot be bigger than 4 GB; we + leave the 4 upper bytes in the length field unused */ + + uint32_t extern_len = mach_read_from_4(data + local_len + + BTR_EXTERN_LEN + 4); + + buf = (byte*) mem_heap_alloc(heap, local_len + extern_len); + + memcpy(buf, data, local_len); + *len = local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + extern_len, + zip_size, + page_id_t( + space_id, + page_no), + offset); + + return(buf); +} + +/** Copies an externally stored field of a record to mem heap. +@param[in] rec record in a clustered index; must be +protected by a lock or a page latch +@param[in] offset array returned by rec_get_offsets() +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] no field number +@param[out] len length of the field +@param[in,out] heap mem heap +@return the field copied to heap, or NULL if the field is incomplete */ +byte* +btr_rec_copy_externally_stored_field( + const rec_t* rec, + const rec_offs* offsets, + ulint zip_size, + ulint no, + ulint* len, + mem_heap_t* heap) +{ + ulint local_len; + const byte* data; + + ut_a(rec_offs_nth_extern(offsets, no)); + + /* An externally stored field can contain some initial + data from the field, and in the last 20 bytes it has the + space id, page number, and offset where the rest of the + field data is stored, and the data length in addition to + the data stored locally. We may need to store some data + locally to get the local record length above the 128 byte + limit so that field offsets are stored in two bytes, and + the extern bit is available in those two bytes. */ + + data = rec_get_nth_field(rec, offsets, no, &local_len); + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + if (UNIV_UNLIKELY + (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) { + /* The externally stored field was not written yet. + This record should only be seen by + trx_rollback_recovered() or any + TRX_ISO_READ_UNCOMMITTED transactions. 
+	*/
+		return(NULL);
+	}
+
+	return(btr_copy_externally_stored_field(len, data,
+						zip_size, local_len, heap));
+}
diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc
new file mode 100644
index 00000000..642db0e9
--- /dev/null
+++ b/storage/innobase/btr/btr0defragment.cc
@@ -0,0 +1,820 @@
+/*****************************************************************************
+
+Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file btr/btr0defragment.cc
+Index defragmentation.
+
+Created 05/29/2014 Rongrong Zhong
+Modified 16/07/2014 Sunguck Lee
+Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com
+*******************************************************/
+
+#include "btr0defragment.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "dict0defrag_bg.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "srv0start.h"
+#include "mysqld.h"
+
+#include <list>
+
+/* When there's no work, either because defragment is disabled, or because no
+query is submitted, the thread checks the state every
+BTR_DEFRAGMENT_SLEEP_IN_USECS. */
+#define BTR_DEFRAGMENT_SLEEP_IN_USECS		1000000
+/* Reduce the target page size by this amount when a compression failure
+happens during defragmentation. 512 is chosen because it is a power of 2 and
+it is about 3% of the page size. When there are compression failures in
+defragmentation, our goal is to get a decent defrag ratio with as few
+compression failures as possible. From experimentation it seems that
+reducing the target size by 512 each time will make sure the page is
+compressible within a couple of iterations. */
+#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE	512
+
+/** Item in the work queue for btr_defragment_thread. */
+struct btr_defragment_item_t
+{
+  /** persistent cursor where btr_defragment_n_pages should start */
+  btr_pcur_t * const pcur;
+  /** completion signal */
+  pthread_cond_t *cond;
+  /** timestamp of the last time this index was processed by the defragment
+  thread */
+  ulonglong last_processed= 0;
+
+  btr_defragment_item_t(btr_pcur_t *pcur, pthread_cond_t *cond)
+    : pcur(pcur), cond(cond) {}
+};
+
+/* Work queue for defragmentation. */
+typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
+static btr_defragment_wq_t btr_defragment_wq;
+
+/* Mutex protecting the defragmentation work queue. */
+static mysql_mutex_t btr_defragment_mutex;
+#ifdef UNIV_PFS_MUTEX
+mysql_pfs_key_t btr_defragment_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/* Number of compression failures caused by defragmentation since server
+start. */
+Atomic_counter<ulint> btr_defragment_compression_failures;
+/* Number of btr_defragment_n_pages calls that altered a page but did not
+manage to release any page. */
+Atomic_counter<ulint> btr_defragment_failures;
+/* Total number of btr_defragment_n_pages calls that altered a page.
+The difference between btr_defragment_count and btr_defragment_failures
+shows the amount of effort wasted. */
+Atomic_counter<ulint> btr_defragment_count;
+
+bool btr_defragment_active;
+static void btr_defragment_chunk(void*);
+
+static tpool::timer* btr_defragment_timer;
+static tpool::task_group task_group(1);
+static tpool::task btr_defragment_task(btr_defragment_chunk, 0, &task_group);
+static void btr_defragment_start();
+
+static void submit_defragment_task(void*arg=0)
+{
+	srv_thread_pool->submit_task(&btr_defragment_task);
+}
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init()
+{
+	srv_defragment_interval = 1000000000ULL / srv_defragment_frequency;
+	mysql_mutex_init(btr_defragment_mutex_key, &btr_defragment_mutex,
+			 nullptr);
+	btr_defragment_timer = srv_thread_pool->create_timer(submit_defragment_task);
+	btr_defragment_active = true;
+}
+
+/******************************************************************//**
+Shutdown defragmentation. Release all resources. */
+void
+btr_defragment_shutdown()
+{
+	if (!btr_defragment_timer)
+		return;
+	delete btr_defragment_timer;
+	btr_defragment_timer = 0;
+	task_group.cancel_pending(&btr_defragment_task);
+	mysql_mutex_lock(&btr_defragment_mutex);
+	std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+	while (iter != btr_defragment_wq.end()) {
+		btr_defragment_item_t* item = *iter;
+		iter = btr_defragment_wq.erase(iter);
+		if (item->cond) {
+			pthread_cond_signal(item->cond);
+		}
+	}
+	mysql_mutex_unlock(&btr_defragment_mutex);
+	mysql_mutex_destroy(&btr_defragment_mutex);
+	btr_defragment_active = false;
+}
+
+
+/******************************************************************//**
+Functions used by the query threads: btr_defragment_xxx_index
+Query threads find/add/remove index. */
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. We use index->id
+to identify indices. */
+bool
+btr_defragment_find_index(
+	dict_index_t*	index)	/*!< Index to find. */
+{
+	mysql_mutex_lock(&btr_defragment_mutex);
+	for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+	     iter != btr_defragment_wq.end();
+	     ++iter) {
+		btr_defragment_item_t* item = *iter;
+		btr_pcur_t* pcur = item->pcur;
+		btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+		dict_index_t* idx = btr_cur_get_index(cursor);
+		if (index->id == idx->id) {
+			mysql_mutex_unlock(&btr_defragment_mutex);
+			return true;
+		}
+	}
+	mysql_mutex_unlock(&btr_defragment_mutex);
+	return false;
+}
+
+/** Defragment an index.
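+The caller enqueues a work item and then blocks on a condition variable,
+waking once a second to check thd_killed(), until the defragmentation task
+signals completion.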
+@param pcur persistent cursor
+@param thd current session, for checking thd_killed()
+@return whether the operation was interrupted */
+bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd)
+{
+  dict_stats_empty_defrag_summary(pcur->index());
+  pthread_cond_t cond;
+  pthread_cond_init(&cond, nullptr);
+  btr_defragment_item_t item(pcur, &cond);
+  mysql_mutex_lock(&btr_defragment_mutex);
+  btr_defragment_wq.push_back(&item);
+  if (btr_defragment_wq.size() == 1)
+    /* Kick off defragmentation work */
+    btr_defragment_start();
+  bool interrupted= false;
+  for (;;)
+  {
+    timespec abstime;
+    set_timespec(abstime, 1);
+    if (!my_cond_timedwait(&cond, &btr_defragment_mutex.m_mutex, &abstime))
+      break;
+    if (thd_killed(thd))
+    {
+      item.cond= nullptr;
+      interrupted= true;
+      break;
+    }
+  }
+
+  pthread_cond_destroy(&cond);
+  mysql_mutex_unlock(&btr_defragment_mutex);
+  return interrupted;
+}
+
+/******************************************************************//**
+When a table is dropped, this function is called to mark the table's entries
+as removed in btr_defragment_wq: the waiting session is signalled and the
+completion condition is cleared, so that the defragmentation thread will
+discard the items when it next sees them. */
+void
+btr_defragment_remove_table(
+	dict_table_t*	table)	/*!< Table to be removed. */
+{
+  mysql_mutex_lock(&btr_defragment_mutex);
+  for (auto item : btr_defragment_wq)
+  {
+    if (item->cond && table == item->pcur->index()->table)
+    {
+      pthread_cond_signal(item->cond);
+      item->cond= nullptr;
+    }
+  }
+  mysql_mutex_unlock(&btr_defragment_mutex);
+}
+
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.
+Currently we save the stats to persistent storage once every
+srv_defragment_stats_accuracy updates. */
+void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index)
+{
+	if (srv_defragment_stats_accuracy != 0 // 0 disables stats tracking
+	    && index->table->space_id != 0 // do not track system tables
+	    && !index->table->is_temporary()
+	    && index->stat_defrag_modified_counter
+	       >= srv_defragment_stats_accuracy) {
+		dict_stats_defrag_pool_add(index);
+		index->stat_defrag_modified_counter = 0;
+	}
+}
+
+/*********************************************************************//**
+Main defragmentation functionality used by the defragment thread. */
+/*************************************************************//**
+Calculate the number of records from the beginning of the block that can
+fit into size_limit
+@return number of records */
+static
+ulint
+btr_defragment_calc_n_recs_for_size(
+	buf_block_t* block,	/*!< in: B-tree page */
+	dict_index_t* index,	/*!< in: index of the page */
+	ulint size_limit,	/*!< in: size limit to fit records in */
+	ulint* n_recs_size)	/*!< out: actual size of the records that fit
+				in size_limit. */
+{
+	page_t* page = buf_block_get_frame(block);
+	ulint n_recs = 0;
+	rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+	rec_offs* offsets = offsets_;
+	rec_offs_init(offsets_);
+	mem_heap_t* heap = NULL;
+	ulint size = 0;
+	page_cur_t cur;
+
+	const ulint n_core = page_is_leaf(page) ?
index->n_core_fields : 0;
+	page_cur_set_before_first(block, &cur);
+	while (rec_t* cur_rec = page_cur_move_to_next(&cur)) {
+		if (page_rec_is_supremum(cur_rec)) {
+			break;
+		}
+		offsets = rec_get_offsets(cur_rec, index, offsets, n_core,
+					  ULINT_UNDEFINED, &heap);
+		ulint rec_size = rec_offs_size(offsets);
+		size += rec_size;
+		if (size > size_limit) {
+			size = size - rec_size;
+			break;
+		}
+		n_recs++;
+	}
+	*n_recs_size = size;
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return n_recs;
+}
+
+MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result))
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+an sx-latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+static
+rec_offs*
+btr_page_search_father_node_ptr(
+	rec_offs*	offsets,/*!< in: work area for the return value */
+	mem_heap_t*	heap,	/*!< in: memory heap to use */
+	btr_cur_t*	cursor,	/*!< in: cursor pointing to user record,
+				out: cursor on node pointer record,
+				its page x-latched */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no();
+	dict_index_t* index = btr_cur_get_index(cursor);
+	ut_ad(!index->is_spatial());
+
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
+	ut_ad(dict_index_get_page(index) != page_no);
+
+	const auto level = btr_page_get_level(btr_cur_get_page(cursor));
+
+	const rec_t* user_rec = btr_cur_get_rec(cursor);
+	ut_a(page_rec_is_user_rec(user_rec));
+
+	if (btr_cur_search_to_nth_level(level + 1,
+					dict_index_build_node_ptr(index,
+								  user_rec, 0,
+								  heap, level),
+					RW_X_LATCH,
+					cursor, mtr) != DB_SUCCESS) {
+		return nullptr;
+	}
+
+	const rec_t* node_ptr = btr_cur_get_rec(cursor);
+	ut_ad(!btr_cur_get_block(cursor)->page.lock.not_recursive()
+	      || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK));
+
+	offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+				  ULINT_UNDEFINED, &heap);
+
+	if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
+		offsets = nullptr;
+	}
+
+	return(offsets);
+}
+
+static bool btr_page_search_father(mtr_t *mtr, btr_cur_t *cursor)
+{
+  rec_t *rec=
+    page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
+  if (UNIV_UNLIKELY(!rec))
+    return false;
+  cursor->page_cur.rec= rec;
+  mem_heap_t *heap= mem_heap_create(100);
+  const bool got= btr_page_search_father_node_ptr(nullptr, heap, cursor, mtr);
+  mem_heap_free(heap);
+  return got;
+}
+
+/*************************************************************//**
+Merge as many records as possible from the from_block to the to_block.
+Delete the from_block if all records are successfully merged to to_block.
+@return the to_block to target for the next merge operation.
+@retval nullptr if corruption was noticed */
+static
+buf_block_t*
+btr_defragment_merge_pages(
+	dict_index_t*	index,		/*!< in: index tree */
+	buf_block_t*	from_block,	/*!< in: origin of merge */
+	buf_block_t*	to_block,	/*!< in: destination of merge */
+	ulint		zip_size,	/*!< in: ROW_FORMAT=COMPRESSED size */
+	ulint		reserved_space,	/*!< in: space reserved for future
+					insert to avoid immediate page split */
+	ulint*		max_data_size,	/*!< in/out: max data size to
+					fit in a single compressed page.
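+					This estimate is lowered whenever
+					page compression fails, so that
+					subsequent merges aim at a smaller
+					target; see the adjustment near the
+					end of this function.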
*/
+	mem_heap_t*	heap,		/*!< in/out: pointer to memory heap */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+{
+	page_t* from_page = buf_block_get_frame(from_block);
+	page_t* to_page = buf_block_get_frame(to_block);
+	ulint level = btr_page_get_level(from_page);
+	ulint n_recs = page_get_n_recs(from_page);
+	ulint new_data_size = page_get_data_size(to_page);
+	ulint max_ins_size =
+		page_get_max_insert_size(to_page, n_recs);
+	ulint max_ins_size_reorg =
+		page_get_max_insert_size_after_reorganize(
+			to_page, n_recs);
+	ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space
+		? max_ins_size_reorg - reserved_space : 0;
+	ulint move_size = 0;
+	ulint n_recs_to_move = 0;
+	rec_t* rec = NULL;
+	ulint target_n_recs = 0;
+	rec_t* orig_pred;
+
+	// Estimate how many records can be moved from the from_page to
+	// the to_page.
+	if (zip_size) {
+		ulint page_diff = srv_page_size - *max_data_size;
+		max_ins_size_to_use = (max_ins_size_to_use > page_diff)
+			? max_ins_size_to_use - page_diff : 0;
+	}
+	n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+		from_block, index, max_ins_size_to_use, &move_size);
+
+	// If max_ins_size >= move_size, we can move the records without
+	// reorganizing the page, otherwise we need to reorganize the page
+	// first to release more space.
+	if (move_size > max_ins_size) {
+		dberr_t err = btr_page_reorganize_block(page_zip_level,
+							to_block, index, mtr);
+		if (err != DB_SUCCESS) {
+			if (!dict_index_is_clust(index)
+			    && page_is_leaf(to_page)) {
+				ibuf_reset_free_bits(to_block);
+			}
+			// If the reorganization fails, the page is not
+			// compressible. There is no point in trying to
+			// merge into this page. Continue to the next page.
+			return err == DB_FAIL ? from_block : nullptr;
+		}
+		ut_ad(page_validate(to_page, index));
+		max_ins_size = page_get_max_insert_size(to_page, n_recs);
+		if (max_ins_size < move_size) {
+			return nullptr;
+		}
+	}
+
+	// Move records to pack the to_page as full as possible.
+	orig_pred = NULL;
+	target_n_recs = n_recs_to_move;
+	dberr_t err;
+	while (n_recs_to_move > 0) {
+		if (!(rec = page_rec_get_nth(from_page, n_recs_to_move + 1))) {
+			return nullptr;
+		}
+		orig_pred = page_copy_rec_list_start(
+			to_block, from_block, rec, index, mtr, &err);
+		if (orig_pred)
+			break;
+		if (err != DB_FAIL) {
+			return nullptr;
+		}
+
+		// If we reach here, compression failed after packing
+		// n_recs_to_move records into to_page. We try to reduce
+		// the targeted data size on the to_page by
+		// BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
+		btr_defragment_compression_failures++;
+		max_ins_size_to_use =
+			move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+			? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+			: 0;
+		if (max_ins_size_to_use == 0) {
+			n_recs_to_move = 0;
+			move_size = 0;
+			break;
+		}
+		n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+			from_block, index, max_ins_size_to_use, &move_size);
+	}
+	// If fewer than target_n_recs records are moved, there were
+	// compression failures during page_copy_rec_list_start. Adjust
+	// the max_data_size estimate to reduce compression failures
+	// in the following runs.
+	if (target_n_recs > n_recs_to_move
+	    && *max_data_size > new_data_size + move_size) {
+		*max_data_size = new_data_size + move_size;
+	}
+	// Set ibuf free bits if necessary.
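+	// (Only leaf pages of secondary indexes carry change buffer bitmap
+	// bits. For ROW_FORMAT=COMPRESSED pages the bits are merely reset,
+	// because the remaining compressed free space cannot be derived
+	// from the uncompressed page frame alone.)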
+ if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + if (zip_size) { + ibuf_reset_free_bits(to_block); + } else { + ibuf_update_free_bits_if_full( + to_block, + srv_page_size, + ULINT_UNDEFINED); + } + } + btr_cur_t parent; + parent.page_cur.index = index; + parent.page_cur.block = from_block; + + if (!btr_page_search_father(mtr, &parent)) { + to_block = nullptr; + } else if (n_recs_to_move == n_recs) { + /* The whole page is merged with the previous page, + free it. */ + lock_update_merge_left(*to_block, orig_pred, + from_block->page.id()); + btr_search_drop_page_hash_index(from_block, false); + if (btr_level_list_remove(*from_block, *index, mtr) + != DB_SUCCESS + || btr_cur_node_ptr_delete(&parent, mtr) != DB_SUCCESS + || btr_page_free(index, from_block, mtr) != DB_SUCCESS) { + return nullptr; + } + } else { + // There are still records left on the page, so + // increment n_defragmented. Node pointer will be changed + // so remove the old node pointer. + if (n_recs_to_move > 0) { + // Part of the page is merged to left, remove + // the merged records, update record locks and + // node pointer. + dtuple_t* node_ptr; + page_delete_rec_list_start(rec, from_block, + index, mtr); + lock_update_split_and_merge(to_block, + orig_pred, + from_block); + // FIXME: reuse the node_ptr! + if (btr_cur_node_ptr_delete(&parent, mtr) + != DB_SUCCESS) { + return nullptr; + } + rec = page_rec_get_next( + page_get_infimum_rec(from_page)); + if (!rec) { + return nullptr; + } + node_ptr = dict_index_build_node_ptr( + index, rec, page_get_page_no(from_page), + heap, level); + if (btr_insert_on_non_leaf_level(0, index, level+1, + node_ptr, mtr) + != DB_SUCCESS) { + return nullptr; + } + } + to_block = from_block; + } + return to_block; +} + +/*************************************************************//** +Tries to merge N consecutive pages, starting from the page pointed by the +cursor. Skip space 0. Only consider leaf pages. +This function first loads all N pages into memory, then for each of +the pages other than the first page, it tries to move as many records +as possible to the left sibling to keep the left sibling full. During +the process, if any page becomes empty, that page will be removed from +the level list. Record locks, hash, and node pointers are updated after +page reorganization. +@return pointer to the last block processed, or NULL if reaching end of index */ +static +buf_block_t* +btr_defragment_n_pages( + buf_block_t* block, /*!< in: starting block for defragmentation */ + dict_index_t* index, /*!< in: index tree */ + uint n_pages,/*!< in: number of pages to defragment */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + /* We will need to load the n+1 block because if the last page is freed + and we need to modify the prev_page_no of that block. */ + buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; + page_t* first_page; + buf_block_t* current_block; + ulint total_data_size = 0; + ulint total_n_recs = 0; + ulint data_size_per_rec; + ulint optimal_page_size; + ulint reserved_space; + ulint max_data_size = 0; + uint n_defragmented = 0; + uint n_new_slots; + mem_heap_t* heap; + ibool end_of_index = FALSE; + + /* It doesn't make sense to call this function with n_pages = 1. */ + ut_ad(n_pages > 1); + + if (!page_is_leaf(block->page.frame)) { + return NULL; + } + + if (!index->table->space || !index->table->space_id) { + /* Ignore space 0. 
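+	   The system tablespace holds shared internal structures, so only
+	   per-table tablespaces are defragmented.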
*/
+		return NULL;
+	}
+
+	if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
+		n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
+	}
+
+	first_page = buf_block_get_frame(block);
+	const ulint zip_size = index->table->space->zip_size();
+
+	/* 1. Load the pages and calculate the total data size. */
+	blocks[0] = block;
+	for (uint i = 1; i <= n_pages; i++) {
+		page_t* page = buf_block_get_frame(blocks[i-1]);
+		uint32_t page_no = btr_page_get_next(page);
+		total_data_size += page_get_data_size(page);
+		total_n_recs += page_get_n_recs(page);
+		if (page_no == FIL_NULL) {
+			n_pages = i;
+			end_of_index = TRUE;
+			break;
+		}
+
+		blocks[i] = btr_block_get(*index, page_no, RW_X_LATCH, true,
+					  mtr);
+		if (!blocks[i]) {
+			return nullptr;
+		}
+	}
+
+	if (n_pages == 1) {
+		if (!page_has_prev(first_page)) {
+			/* last page in the index */
+			if (dict_index_get_page(index)
+			    == page_get_page_no(first_page))
+				return NULL;
+			/* The given page is the last page.
+			Lift the records to the father page. */
+			dberr_t err;
+			btr_lift_page_up(index, block, mtr, &err);
+		}
+		return NULL;
+	}
+
+	/* 2. Calculate how many pages the data can fit in. If the pages
+	are not compressible, return early. */
+	ut_a(total_n_recs != 0);
+	data_size_per_rec = total_data_size / total_n_recs;
+	// For uncompressed pages, the optimal data size is the free space
+	// of an empty page.
+	optimal_page_size = page_get_free_space_of_empty(
+		page_is_comp(first_page));
+	// For compressed pages, we take compression failures into account.
+	if (zip_size) {
+		ulint size = 0;
+		uint i = 0;
+		// We estimate the optimal data size of the index using
+		// samples of the data size. These samples are taken when
+		// pages fail to compress due to insertion on the page.
+		// We use the average of all the samples we have as the
+		// estimate. Different pages of the same index vary in
+		// compressibility. The average gives a good enough estimate.
+		for (; i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
+			if (index->stat_defrag_data_size_sample[i] == 0) {
+				break;
+			}
+			size += index->stat_defrag_data_size_sample[i];
+		}
+		if (i != 0) {
+			size /= i;
+			optimal_page_size = ut_min(optimal_page_size, size);
+		}
+		max_data_size = optimal_page_size;
+	}
+
+	reserved_space = ut_min(static_cast<ulint>(
+					static_cast<double>(optimal_page_size)
+					* (1 - srv_defragment_fill_factor)),
+				(data_size_per_rec
+				 * srv_defragment_fill_factor_n_recs));
+	optimal_page_size -= reserved_space;
+	n_new_slots = uint((total_data_size + optimal_page_size - 1)
+			   / optimal_page_size);
+	if (n_new_slots >= n_pages) {
+		/* Can't defragment. */
+		if (end_of_index)
+			return NULL;
+		return blocks[n_pages-1];
+	}
+
+	/* 3. Defragment pages. */
+	heap = mem_heap_create(256);
+	// The first defragmented page will be the first page.
+	current_block = blocks[0];
+	// Start from the second page.
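+	// Loop invariant: current_block is the left-most page that still
+	// has room; each iteration tries to empty blocks[i] into it. If a
+	// page cannot be emptied completely, it becomes the new
+	// current_block (see btr_defragment_merge_pages() above).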
+	for (uint i = 1; i < n_pages; i++) {
+		buf_block_t* new_block = btr_defragment_merge_pages(
+			index, blocks[i], current_block, zip_size,
+			reserved_space, &max_data_size, heap, mtr);
+		if (new_block != current_block) {
+			n_defragmented++;
+			current_block = new_block;
+			if (!new_block) {
+				break;
+			}
+		}
+	}
+	mem_heap_free(heap);
+	n_defragmented++;
+	btr_defragment_count++;
+	if (n_pages == n_defragmented) {
+		btr_defragment_failures++;
+	} else {
+		index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
+	}
+	if (end_of_index)
+		return NULL;
+	return current_block;
+}
+
+
+
+void btr_defragment_start() {
+	if (!srv_defragment)
+		return;
+	ut_ad(!btr_defragment_wq.empty());
+	submit_defragment_task();
+}
+
+
+/**
+Callback used by the defragment timer.
+
+The throttling "sleep" is implemented by rescheduling the
+threadpool timer, which, when fired, will resume the work
+where it was left off.
+
+The state (the current work item) is kept in the work queue.
+*/
+static void btr_defragment_chunk(void*)
+{
+	THD *thd = innobase_create_background_thd("InnoDB defragment");
+	set_current_thd(thd);
+
+	btr_defragment_item_t* item = nullptr;
+	mtr_t mtr;
+
+	mysql_mutex_lock(&btr_defragment_mutex);
+
+	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+		if (!item) {
+			if (btr_defragment_wq.empty()) {
+release_and_exit:
+				mysql_mutex_unlock(&btr_defragment_mutex);
+func_exit:
+				set_current_thd(nullptr);
+				destroy_background_thd(thd);
+				return;
+			}
+			item = *btr_defragment_wq.begin();
+			ut_ad(item);
+		}
+
+		if (!item->cond) {
+processed:
+			btr_defragment_wq.remove(item);
+			item = nullptr;
+			continue;
+		}
+
+		mysql_mutex_unlock(&btr_defragment_mutex);
+
+		ulonglong now = my_interval_timer();
+		ulonglong elapsed = now - item->last_processed;
+
+		if (elapsed < srv_defragment_interval) {
+			/* If we see an index again before the interval
+			determined by the configured frequency is reached,
+			we just sleep until the interval passes. Since
+			the defragmentation of all indices queues up on a
+			single thread, it is likely that the indices that
+			follow this one do not need to sleep again. */
+			int sleep_ms = (int)((srv_defragment_interval - elapsed) / 1000 / 1000);
+			if (sleep_ms) {
+				btr_defragment_timer->set_time(sleep_ms, 0);
+				goto func_exit;
+			}
+		}
+		log_free_check();
+		mtr_start(&mtr);
+		dict_index_t *index = item->pcur->index();
+		index->set_modified(mtr);
+		/* To follow the latching order defined in WL#6326,
+		acquire index->lock X-latch. This entitles us to
+		acquire page latches in any order for the index. */
+		mtr_x_lock_index(index, &mtr);
+		if (buf_block_t *last_block =
+		    item->pcur->restore_position(
+			BTR_PURGE_TREE_ALREADY_LATCHED, &mtr)
+		    == btr_pcur_t::CORRUPTED
+		    ? nullptr
+		    : btr_defragment_n_pages(btr_pcur_get_block(item->pcur),
+					     index, srv_defragment_n_pages,
+					     &mtr)) {
+			/* If we haven't reached the end of the index,
+			place the cursor on the last record of the last page,
+			store the cursor position, and put the item back in
+			the queue. */
+			page_t* last_page = buf_block_get_frame(last_block);
+			rec_t* rec = page_rec_get_prev(
+				page_get_supremum_rec(last_page));
+			if (rec && page_rec_is_user_rec(rec)) {
+				page_cur_position(rec, last_block,
+						  btr_pcur_get_page_cur(
+							item->pcur));
+			}
+			btr_pcur_store_position(item->pcur, &mtr);
+			mtr_commit(&mtr);
+			/* Update the last_processed time of this index. */
+			item->last_processed = now;
+			mysql_mutex_lock(&btr_defragment_mutex);
+		} else {
+			mtr_commit(&mtr);
+			/* Reaching the end of the index.
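+			Persist the defragmentation statistics and wake up
+			the waiting session before dropping the work item.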
*/
+			dict_stats_empty_defrag_stats(index);
+			if (dberr_t err= dict_stats_save_defrag_stats(index)) {
+				ib::error() << "Saving defragmentation stats for table "
+					    << index->table->name
+					    << " index " << index->name()
+					    << " failed with error " << err;
+			} else {
+				err = dict_stats_save_defrag_summary(index,
+								     thd);
+
+				if (err != DB_SUCCESS) {
+					ib::error() << "Saving defragmentation summary for table "
+						    << index->table->name
+						    << " index " << index->name()
+						    << " failed with error " << err;
+				}
+			}
+
+			mysql_mutex_lock(&btr_defragment_mutex);
+			if (item->cond) {
+				pthread_cond_signal(item->cond);
+			}
+			goto processed;
+		}
+	}
+
+	goto release_and_exit;
+}
diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc
new file mode 100644
index 00000000..54dd15ac
--- /dev/null
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -0,0 +1,667 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0pcur.cc
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#include "btr0pcur.h"
+#include "ut0byte.h"
+#include "rem0cmp.h"
+#include "trx0trx.h"
+
+/**************************************************************//**
+Resets a persistent cursor object, freeing ::old_rec_buf if it is
+allocated and resetting the other members to their initial values. */
+void
+btr_pcur_reset(
+/*===========*/
+	btr_pcur_t*	cursor)	/*!< in, out: persistent cursor */
+{
+	ut_free(cursor->old_rec_buf);
+	memset(&cursor->btr_cur.page_cur, 0, sizeof(page_cur_t));
+	cursor->old_rec_buf = NULL;
+	cursor->old_rec = NULL;
+	cursor->old_n_core_fields = 0;
+	cursor->old_n_fields = 0;
+
+	cursor->latch_mode = BTR_NO_LATCHES;
+	cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+}
+
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty!
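+
+For example, if the cursor is on the page supremum, the preceding user
+record is copied and rel_pos is set to BTR_PCUR_AFTER; a later restore
+then searches with PAGE_CUR_G for the first record greater than the
+stored one.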
*/
+void
+btr_pcur_store_position(
+/*====================*/
+	btr_pcur_t*	cursor, /*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_cur_t*	page_cursor;
+	buf_block_t*	block;
+	rec_t*		rec;
+	dict_index_t*	index;
+	ulint		offs;
+
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	block = btr_pcur_get_block(cursor);
+	index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+
+	page_cursor = btr_pcur_get_page_cur(cursor);
+
+	rec = page_cur_get_rec(page_cursor);
+	offs = rec - block->page.frame;
+	ut_ad(block->page.id().page_no()
+	      == page_get_page_no(block->page.frame));
+	ut_ad(block->page.buf_fix_count());
+	/* For a spatial index, when we position on the parent
+	buffer if necessary, it might not hold latches, but the
+	tree must be locked to prevent changes to the page. */
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_S_FIX
+					 | MTR_MEMO_PAGE_X_FIX)
+	      || (index->is_spatial()
+		  && mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+						| MTR_MEMO_SX_LOCK)));
+
+	if (page_is_empty(block->page.frame)) {
+		/* It must be an empty index tree; NOTE that in this case
+		we do not store the modify_clock, but always do a search
+		if we restore the cursor position */
+
+		ut_a(!page_has_siblings(block->page.frame));
+		ut_ad(page_is_leaf(block->page.frame));
+		ut_ad(block->page.id().page_no() == index->page);
+
+		if (page_rec_is_supremum_low(offs)) {
+			cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+		} else {
+before_first:
+			cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE;
+		}
+
+		return;
+	}
+
+	if (page_rec_is_supremum_low(offs)) {
+		rec = page_rec_get_prev(rec);
+		if (UNIV_UNLIKELY(!rec || page_rec_is_infimum(rec))) {
+			ut_ad("corrupted index" == 0);
+			cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+			return;
+		}
+
+		ut_ad(!page_rec_is_infimum(rec));
+		if (UNIV_UNLIKELY(rec_is_metadata(rec, *index))) {
+#if 0 /* MDEV-22867 had to relax this */
+			/* If the table is emptied during an ALGORITHM=NOCOPY
+			DROP COLUMN ... that is not ALGORITHM=INSTANT,
+			then we must preserve any instant ADD metadata. */
+			ut_ad(index->table->instant
+			      || block->page.id().page_no() != index->page);
+#endif
+			ut_ad(index->is_instant()
+			      || block->page.id().page_no() != index->page);
+			ut_ad(page_get_n_recs(block->page.frame) == 1);
+			ut_ad(page_is_leaf(block->page.frame));
+			ut_ad(!page_has_prev(block->page.frame));
+			cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+			return;
+		}
+
+		cursor->rel_pos = BTR_PCUR_AFTER;
+	} else if (page_rec_is_infimum_low(offs)) {
+		rec = page_rec_get_next(rec);
+
+		if (UNIV_UNLIKELY(!rec)) {
+			ut_ad("corrupted page" == 0);
+			goto before_first;
+		}
+
+		if (rec_is_metadata(rec, *index)) {
+			ut_ad(!page_has_prev(block->page.frame));
+			rec = page_rec_get_next(rec);
+			ut_ad(rec);
+			if (!rec || page_rec_is_supremum(rec)) {
+				goto before_first;
+			}
+		}
+
+		cursor->rel_pos = BTR_PCUR_BEFORE;
+	} else {
+		cursor->rel_pos = BTR_PCUR_ON;
+	}
+
+	if (index->is_ibuf()) {
+		ut_ad(!index->table->not_redundant());
+		cursor->old_n_fields = uint16_t(rec_get_n_fields_old(rec));
+	} else {
+		cursor->old_n_fields = static_cast<uint16_t>(
+			dict_index_get_n_unique_in_tree(index));
+		if (index->is_spatial() && !page_rec_is_leaf(rec)) {
+			ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index)
+			      == DICT_INDEX_SPATIAL_NODEPTR_SIZE);
+			/* For the R-tree, we have to compare
+			the child page numbers as well.
*/
+			cursor->old_n_fields
+				= DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
+		}
+	}
+
+	cursor->old_n_core_fields = index->n_core_fields;
+	cursor->old_rec = rec_copy_prefix_to_buf(rec, index,
+						 cursor->old_n_fields,
+						 &cursor->old_rec_buf,
+						 &cursor->buf_size);
+	cursor->block_when_stored.store(block);
+
+	/* buf_block_get_modify_clock() checks that the block is
+	S- or X-latched. */
+	cursor->modify_clock = buf_block_get_modify_clock(block);
+}
+
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+	btr_pcur_t*	pcur_receive,	/*!< in: pcur which will receive the
+					position info */
+	btr_pcur_t*	pcur_donate)	/*!< in: pcur from which the info is
+					copied */
+{
+	ut_free(pcur_receive->old_rec_buf);
+	memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t));
+
+	if (pcur_donate->old_rec_buf) {
+
+		pcur_receive->old_rec_buf = (byte*)
+			ut_malloc_nokey(pcur_donate->buf_size);
+
+		memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf,
+		       pcur_donate->buf_size);
+		pcur_receive->old_rec = pcur_receive->old_rec_buf
+			+ (pcur_donate->old_rec - pcur_donate->old_rec_buf);
+	}
+
+	pcur_receive->old_n_core_fields = pcur_donate->old_n_core_fields;
+	pcur_receive->old_n_fields = pcur_donate->old_n_fields;
+}
+
+/** Optimistically latches the leaf page or pages requested.
+@param[in]	block		guessed buffer block
+@param[in,out]	pcur		cursor
+@param[in,out]	latch_mode	BTR_SEARCH_LEAF, ...
+@param[in,out]	mtr		mini-transaction
+@return true if success */
+TRANSACTIONAL_TARGET
+static bool btr_pcur_optimistic_latch_leaves(buf_block_t *block,
+                                             btr_pcur_t *pcur,
+                                             btr_latch_mode *latch_mode,
+                                             mtr_t *mtr)
+{
+  ut_ad(block->page.buf_fix_count());
+  ut_ad(block->page.in_file());
+  ut_ad(block->page.frame);
+
+  static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, "");
+  static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, "");
+  static_assert((BTR_SEARCH_PREV ^ BTR_MODIFY_PREV) ==
+                (RW_S_LATCH ^ RW_X_LATCH), "");
+
+  const rw_lock_type_t mode=
+    rw_lock_type_t(*latch_mode & (RW_X_LATCH | RW_S_LATCH));
+
+  switch (*latch_mode) {
+  default:
+    ut_ad(*latch_mode == BTR_SEARCH_LEAF || *latch_mode == BTR_MODIFY_LEAF);
+    return buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr);
+  case BTR_SEARCH_PREV:
+  case BTR_MODIFY_PREV:
+    page_id_t id{0};
+    uint32_t left_page_no;
+    ulint zip_size;
+    buf_block_t *left_block= nullptr;
+    {
+      transactional_shared_lock_guard<block_lock> g{block->page.lock};
+      if (block->modify_clock != pcur->modify_clock)
+        return false;
+      id= block->page.id();
+      zip_size= block->zip_size();
+      left_page_no= btr_page_get_prev(block->page.frame);
+    }
+
+    if (left_page_no != FIL_NULL)
+    {
+      left_block=
+        buf_page_get_gen(page_id_t(id.space(), left_page_no), zip_size,
+                         mode, nullptr, BUF_GET_POSSIBLY_FREED, mtr);
+
+      if (left_block &&
+          btr_page_get_next(left_block->page.frame) != id.page_no())
+      {
+release_left_block:
+        mtr->release_last_page();
+        return false;
+      }
+    }
+
+    if (buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr))
+    {
+      if (btr_page_get_prev(block->page.frame) == left_page_no)
+      {
+        /* The block was already buffer-fixed while entering the function
+        and buf_page_optimistic_get() buffer-fixes it again.
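+        Hence the assertion just below expects a buffer-fix count of
+        at least two.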
*/
+        ut_ad(2 <= block->page.buf_fix_count());
+        *latch_mode= btr_latch_mode(mode);
+        return true;
+      }
+
+      mtr->release_last_page();
+    }
+
+    ut_ad(block->page.buf_fix_count());
+    if (left_block)
+      goto release_left_block;
+    return false;
+  }
+}
+
+/** A structure that acts as a functor to do the latching of leaf pages.
+It returns true if the latching of the leaf pages succeeded and false
+otherwise. */
+struct optimistic_latch_leaves
+{
+  btr_pcur_t *const cursor;
+  btr_latch_mode *const latch_mode;
+  mtr_t *const mtr;
+
+  bool operator()(buf_block_t *hint) const
+  {
+    return hint &&
+      btr_pcur_optimistic_latch_leaves(hint, cursor, latch_mode, mtr);
+  }
+};
+
+/** Restores the stored position of a persistent cursor, buffer-fixing
+the page and obtaining the specified latches. If the cursor position
+was saved when the
+(1) cursor was positioned on a user record: this function restores the
+position to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the
+position to the last record LESS than the user record which was the
+successor of the page infimum;
+(3) cursor was positioned on the page supremum: restores to the first
+record GREATER than the user record which was the predecessor of the
+supremum.
+(4) cursor was positioned before the first or after the last in an
+empty tree: restores to before first or after the last in the tree.
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param mtr mini-transaction
+@return btr_pcur_t::SAME_ALL cursor position is on a user rec and points
+to the record with the same field values as in the stored record,
+btr_pcur_t::SAME_UNIQ cursor position is on a user rec and points to the
+record with the same unique field values as in the stored record,
+btr_pcur_t::NOT_SAME cursor position is not on a user rec or points to
+the record with different unique field values from the stored record */
+btr_pcur_t::restore_status
+btr_pcur_t::restore_position(btr_latch_mode restore_latch_mode, mtr_t *mtr)
+{
+	dict_index_t*	index;
+	dtuple_t*	tuple;
+	page_cur_mode_t	mode;
+	page_cur_mode_t	old_mode;
+	mem_heap_t*	heap;
+
+	ut_ad(mtr->is_active());
+	ut_ad(pos_state == BTR_PCUR_WAS_POSITIONED
+	      || pos_state == BTR_PCUR_IS_POSITIONED);
+
+	index = btr_cur_get_index(&btr_cur);
+
+	if (UNIV_UNLIKELY
+	    (rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
+	     || rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) {
+		/* In these cases we do not try an optimistic restoration,
+		but always do a search */
+
+		if (btr_cur.open_leaf(rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
+				      index, restore_latch_mode, mtr)
+		    != DB_SUCCESS) {
+			return restore_status::CORRUPTED;
+		}
+
+		latch_mode =
+			BTR_LATCH_MODE_WITHOUT_INTENTION(restore_latch_mode);
+		pos_state = BTR_PCUR_IS_POSITIONED;
+		block_when_stored.clear();
+
+		return restore_status::NOT_SAME;
+	}
+
+	ut_a(old_rec);
+	ut_a(old_n_core_fields);
+	ut_a(old_n_core_fields <= index->n_core_fields);
+	ut_a(old_n_fields);
+
+	static_assert(BTR_SEARCH_PREV == (4 | BTR_SEARCH_LEAF), "");
+	static_assert(BTR_MODIFY_PREV == (4 | BTR_MODIFY_LEAF), "");
+
+	switch (restore_latch_mode | 4) {
+	case BTR_SEARCH_PREV:
+	case BTR_MODIFY_PREV:
+		/* Try optimistic restoration.
*/ + if (block_when_stored.run_with_hint( + optimistic_latch_leaves{this, &restore_latch_mode, + mtr})) { + pos_state = BTR_PCUR_IS_POSITIONED; + latch_mode = restore_latch_mode; + + if (rel_pos == BTR_PCUR_ON) { +#ifdef UNIV_DEBUG + const rec_t* rec; + rec_offs offsets1_[REC_OFFS_NORMAL_SIZE]; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets1 = offsets1_; + rec_offs* offsets2 = offsets2_; + rec = btr_pcur_get_rec(this); + + rec_offs_init(offsets1_); + rec_offs_init(offsets2_); + + heap = mem_heap_create(256); + ut_ad(old_n_core_fields + == index->n_core_fields); + + offsets1 = rec_get_offsets( + old_rec, index, offsets1, + old_n_core_fields, + old_n_fields, &heap); + offsets2 = rec_get_offsets( + rec, index, offsets2, + index->n_core_fields, + old_n_fields, &heap); + + ut_ad(!cmp_rec_rec(old_rec, + rec, offsets1, offsets2, + index)); + mem_heap_free(heap); +#endif /* UNIV_DEBUG */ + return restore_status::SAME_ALL; + } + /* This is the same record as stored, + may need to be adjusted for BTR_PCUR_BEFORE/AFTER, + depending on search mode and direction. */ + if (btr_pcur_is_on_user_rec(this)) { + pos_state + = BTR_PCUR_IS_POSITIONED_OPTIMISTIC; + } + return restore_status::NOT_SAME; + } + } + + /* If optimistic restoration did not succeed, open the cursor anew */ + + heap = mem_heap_create(256); + + tuple = dtuple_create(heap, old_n_fields); + + dict_index_copy_types(tuple, index, old_n_fields); + + rec_copy_prefix_to_dtuple(tuple, old_rec, index, + old_n_core_fields, + old_n_fields, heap); + ut_ad(dtuple_check_typed(tuple)); + + /* Save the old search mode of the cursor */ + old_mode = search_mode; + + switch (rel_pos) { + case BTR_PCUR_ON: + mode = PAGE_CUR_LE; + break; + case BTR_PCUR_AFTER: + mode = PAGE_CUR_G; + break; + case BTR_PCUR_BEFORE: + mode = PAGE_CUR_L; + break; + default: + MY_ASSERT_UNREACHABLE(); + mode = PAGE_CUR_UNSUPP; + } + + if (btr_pcur_open_with_no_init(tuple, mode, restore_latch_mode, + this, mtr) != DB_SUCCESS) { + mem_heap_free(heap); + return restore_status::CORRUPTED; + } + + /* Restore the old search mode */ + search_mode = old_mode; + + ut_ad(rel_pos == BTR_PCUR_ON + || rel_pos == BTR_PCUR_BEFORE + || rel_pos == BTR_PCUR_AFTER); + rec_offs offsets[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets); + restore_status ret_val= restore_status::NOT_SAME; + if (rel_pos == BTR_PCUR_ON && btr_pcur_is_on_user_rec(this)) { + ulint n_matched_fields= 0; + if (!cmp_dtuple_rec_with_match( + tuple, btr_pcur_get_rec(this), index, + rec_get_offsets(btr_pcur_get_rec(this), index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &heap), + &n_matched_fields)) { + + /* We have to store the NEW value for the modify clock, + since the cursor can now be on a different page! + But we can retain the value of old_rec */ + + block_when_stored.store(btr_pcur_get_block(this)); + modify_clock= buf_block_get_modify_clock( + block_when_stored.block()); + + mem_heap_free(heap); + + return restore_status::SAME_ALL; + } + if (n_matched_fields >= index->n_uniq) + ret_val= restore_status::SAME_UNIQ; + } + + mem_heap_free(heap); + + /* We have to store new position information, modify_clock etc., + to the cursor because it can now be on a different page, the record + under it may have been removed, etc. */ + + btr_pcur_store_position(this, mtr); + + return ret_val; +} + +/*********************************************************//** +Moves the persistent cursor to the first record on the next page. Releases the +latch on the current page, and bufferunfixes it. 
Note that there must not be +modifications on the current page, as then the x-latch can be released only in +mtr_commit. */ +dberr_t +btr_pcur_move_to_next_page( +/*=======================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the + last record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_after_last_on_page(cursor)); + + cursor->old_rec = nullptr; + + const page_t* page = btr_pcur_get_page(cursor); + const uint32_t next_page_no = btr_page_get_next(page); + + switch (next_page_no) { + case 0: + case 1: + case FIL_NULL: + return DB_CORRUPTION; + } + + if (UNIV_UNLIKELY(next_page_no == btr_pcur_get_block(cursor) + ->page.id().page_no())) { + return DB_CORRUPTION; + } + + dberr_t err; + buf_block_t* next_block = btr_block_get( + *cursor->index(), next_page_no, + rw_lock_type_t(cursor->latch_mode & (RW_X_LATCH | RW_S_LATCH)), + page_is_leaf(page), mtr, &err); + + if (UNIV_UNLIKELY(!next_block)) { + return err; + } + + const page_t* next_page = buf_block_get_frame(next_block); + + if (UNIV_UNLIKELY(memcmp_aligned<4>(next_page + FIL_PAGE_PREV, + page + FIL_PAGE_OFFSET, 4))) { + return DB_CORRUPTION; + } + + page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor)); + + ut_d(page_check_dir(next_page)); + + const auto s = mtr->get_savepoint(); + mtr->rollback_to_savepoint(s - 2, s - 1); + return DB_SUCCESS; +} + +MY_ATTRIBUTE((nonnull,warn_unused_result)) +/*********************************************************//** +Moves the persistent cursor backward if it is on the first record of the page. +Commits mtr. Note that to prevent a possible deadlock, the operation +first stores the position of the cursor, commits mtr, acquires the necessary +latches and restores the cursor position again before returning. The +alphabetical position of the cursor is guaranteed to be sensible on +return, but it may happen that the cursor is not positioned on the last +record of any page, because the structure of the tree may have changed +during the time when the cursor had no latches. */ +static +bool +btr_pcur_move_backward_from_page( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the first + record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(btr_pcur_is_before_first_on_page(cursor)); + ut_ad(!btr_pcur_is_before_first_in_tree(cursor)); + + const auto latch_mode = cursor->latch_mode; + ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF); + + btr_pcur_store_position(cursor, mtr); + + mtr_commit(mtr); + + mtr_start(mtr); + + static_assert(BTR_SEARCH_PREV == (4 | BTR_SEARCH_LEAF), ""); + static_assert(BTR_MODIFY_PREV == (4 | BTR_MODIFY_LEAF), ""); + + if (UNIV_UNLIKELY(cursor->restore_position( + btr_latch_mode(4 | latch_mode), mtr) + == btr_pcur_t::CORRUPTED)) { + return true; + } + + buf_block_t* block = btr_pcur_get_block(cursor); + + if (page_has_prev(block->page.frame)) { + buf_block_t* left_block + = mtr->at_savepoint(mtr->get_savepoint() - 1); + const page_t* const left = left_block->page.frame; + if (memcmp_aligned<4>(left + FIL_PAGE_NEXT, + block->page.frame + + FIL_PAGE_OFFSET, 4)) { + /* This should be the right sibling page, or + if there is none, the current block. */ + ut_ad(left_block == block + || !memcmp_aligned<4>(left + FIL_PAGE_PREV, + block->page.frame + + FIL_PAGE_OFFSET, 4)); + /* The previous one must be the left sibling. 
*/ + left_block + = mtr->at_savepoint(mtr->get_savepoint() - 2); + ut_ad(!memcmp_aligned<4>(left_block->page.frame + + FIL_PAGE_NEXT, + block->page.frame + + FIL_PAGE_OFFSET, 4)); + } + if (btr_pcur_is_before_first_on_page(cursor)) { + page_cur_set_after_last(left_block, + &cursor->btr_cur.page_cur); + /* Release the right sibling. */ + } else { + /* Release the left sibling. */ + block = left_block; + } + mtr->release(*block); + } + + cursor->latch_mode = latch_mode; + cursor->old_rec = nullptr; + return false; +} + +/*********************************************************//** +Moves the persistent cursor to the previous record in the tree. If no records +are left, the cursor stays 'before first in tree'. +@return TRUE if the cursor was not before first in tree */ +bool +btr_pcur_move_to_prev( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + cursor->old_rec = nullptr; + + if (btr_pcur_is_before_first_on_page(cursor)) { + return (!btr_pcur_is_before_first_in_tree(cursor) + && !btr_pcur_move_backward_from_page(cursor, mtr)); + } + + return btr_pcur_move_to_prev_on_page(cursor) != nullptr; +} diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc new file mode 100644 index 00000000..8435047c --- /dev/null +++ b/storage/innobase/btr/btr0sea.cc @@ -0,0 +1,2328 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file btr/btr0sea.cc +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#include "btr0sea.h" +#ifdef BTR_CUR_HASH_ADAPT +#include "buf0buf.h" +#include "page0page.h" +#include "page0cur.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "btr0btr.h" +#include "srv0mon.h" + +/** Is search system enabled. +Search system is protected by array of latches. */ +char btr_search_enabled; + +/** Number of adaptive hash index partition. 
*/ +ulong btr_ahi_parts; + +#ifdef UNIV_SEARCH_PERF_STAT +/** Number of successful adaptive hash index lookups */ +ulint btr_search_n_succ = 0; +/** Number of failed adaptive hash index lookups */ +ulint btr_search_n_hash_fail = 0; +#endif /* UNIV_SEARCH_PERF_STAT */ + +#ifdef UNIV_PFS_RWLOCK +mysql_pfs_key_t btr_search_latch_key; +#endif /* UNIV_PFS_RWLOCK */ + +/** The adaptive hash index */ +btr_search_sys_t btr_search_sys; + +/** If the number of records on the page divided by this parameter +would have been successfully accessed using a hash index, the index +is then built on the page, assuming the global limit has been reached */ +#define BTR_SEARCH_PAGE_BUILD_LIMIT 16U + +/** The global limit for consecutive potentially successful hash searches, +before hash index building is started */ +#define BTR_SEARCH_BUILD_LIMIT 100U + +/** Compute a hash value of a record in a page. +@param[in] rec index record +@param[in] offsets return value of rec_get_offsets() +@param[in] n_fields number of complete fields to fold +@param[in] n_bytes number of bytes to fold in the last field +@param[in] index_id index tree ID +@return the hash value */ +static inline +ulint +rec_fold( + const rec_t* rec, + const rec_offs* offsets, + ulint n_fields, + ulint n_bytes, + index_id_t tree_id) +{ + ulint i; + const byte* data; + ulint len; + ulint fold; + ulint n_fields_rec; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!page_rec_is_metadata(rec)); + ut_ad(n_fields > 0 || n_bytes > 0); + + n_fields_rec = rec_offs_n_fields(offsets); + ut_ad(n_fields <= n_fields_rec); + ut_ad(n_fields < n_fields_rec || n_bytes == 0); + + if (n_fields > n_fields_rec) { + n_fields = n_fields_rec; + } + + if (n_fields == n_fields_rec) { + n_bytes = 0; + } + + fold = ut_fold_ull(tree_id); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} + +/** Determine the number of accessed key fields. +@param[in] n_fields number of complete fields +@param[in] n_bytes number of bytes in an incomplete last field +@return number of complete or incomplete fields */ +inline MY_ATTRIBUTE((warn_unused_result)) +ulint +btr_search_get_n_fields( + ulint n_fields, + ulint n_bytes) +{ + return(n_fields + (n_bytes > 0 ? 1 : 0)); +} + +/** Determine the number of accessed key fields. +@param[in] cursor b-tree cursor +@return number of complete or incomplete fields */ +inline MY_ATTRIBUTE((warn_unused_result)) +ulint +btr_search_get_n_fields( + const btr_cur_t* cursor) +{ + return(btr_search_get_n_fields(cursor->n_fields, cursor->n_bytes)); +} + +/** This function should be called before reserving any btr search mutex, if +the intended operation might add nodes to the search system hash table. +Because of the latching order, once we have reserved the btr search system +latch, we cannot allocate a free frame from the buffer pool. Checks that +there is a free buffer frame allocated for hash table heap in the btr search +system. If not, allocates a free frames for the heap. 
This check makes it +probable that, when have reserved the btr search system latch and we need to +allocate a new node to the hash table, it will succeed. However, the check +will not guarantee success. +@param[in] index index handler */ +static void btr_search_check_free_space_in_heap(const dict_index_t *index) +{ + /* Note that we peek the value of heap->free_block without reserving + the latch: this is ok, because we will not guarantee that there will + be enough free space in the hash table. */ + + buf_block_t *block= buf_block_alloc(); + auto part= btr_search_sys.get_part(*index); + + part->latch.wr_lock(SRW_LOCK_CALL); + + if (!btr_search_enabled || part->heap->free_block) + buf_block_free(block); + else + part->heap->free_block= block; + + part->latch.wr_unlock(); +} + +/** Set index->ref_count = 0 on all indexes of a table. +@param[in,out] table table handler */ +static void btr_search_disable_ref_count(dict_table_t *table) +{ + for (dict_index_t *index= dict_table_get_first_index(table); index; + index= dict_table_get_next_index(index)) + index->search_info->ref_count= 0; +} + +/** Lazily free detached metadata when removing the last reference. */ +ATTRIBUTE_COLD static void btr_search_lazy_free(dict_index_t *index) +{ + ut_ad(index->freed()); + dict_table_t *table= index->table; + table->autoinc_mutex.wr_lock(); + + /* Perform the skipped steps of dict_index_remove_from_cache_low(). */ + UT_LIST_REMOVE(table->freed_indexes, index); + index->lock.free(); + dict_mem_index_free(index); + + if (!UT_LIST_GET_LEN(table->freed_indexes) && + !UT_LIST_GET_LEN(table->indexes)) + { + ut_ad(!table->id); + table->autoinc_mutex.wr_unlock(); + table->autoinc_mutex.destroy(); + dict_mem_table_free(table); + return; + } + + table->autoinc_mutex.wr_unlock(); +} + +/** Disable the adaptive hash search system and empty the index. */ +void btr_search_disable() +{ + dict_table_t* table; + + dict_sys.freeze(SRW_LOCK_CALL); + + btr_search_x_lock_all(); + + if (!btr_search_enabled) { + dict_sys.unfreeze(); + btr_search_x_unlock_all(); + return; + } + + btr_search_enabled = false; + + /* Clear the index->search_info->ref_count of every index in + the data dictionary cache. */ + for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU); table; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + btr_search_disable_ref_count(table); + } + + for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); table; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + btr_search_disable_ref_count(table); + } + + dict_sys.unfreeze(); + + /* Set all block->index = NULL. */ + buf_pool.clear_hash_index(); + + /* Clear the adaptive hash index. */ + btr_search_sys.clear(); + + btr_search_x_unlock_all(); +} + +/** Enable the adaptive hash search system. +@param resize whether buf_pool_t::resize() is the caller */ +void btr_search_enable(bool resize) +{ + if (!resize) { + mysql_mutex_lock(&buf_pool.mutex); + bool changed = srv_buf_pool_old_size != srv_buf_pool_size; + mysql_mutex_unlock(&buf_pool.mutex); + if (changed) { + return; + } + } + + btr_search_x_lock_all(); + ulint hash_size = buf_pool_get_curr_size() / sizeof(void *) / 64; + + if (btr_search_sys.parts[0].heap) { + ut_ad(btr_search_enabled); + btr_search_x_unlock_all(); + return; + } + + btr_search_sys.alloc(hash_size); + + btr_search_enabled = true; + btr_search_x_unlock_all(); +} + +/** Updates the search info of an index about hash successes. NOTE that info +is NOT protected by any semaphore, to save CPU time! Do not assume its fields +are consistent. 
+@param[in,out]	info	search info
+@param[in]	cursor	cursor which was just positioned */
+static void btr_search_info_update_hash(btr_search_t *info, btr_cur_t *cursor)
+{
+	dict_index_t*	index = cursor->index();
+	int		cmp;
+
+	if (dict_index_is_ibuf(index)) {
+		/* So many deletes are performed on an insert buffer tree
+		that we do not consider a hash index useful on it: */
+
+		return;
+	}
+
+	uint16_t n_unique = dict_index_get_n_unique_in_tree(index);
+
+	if (info->n_hash_potential == 0) {
+
+		goto set_new_recomm;
+	}
+
+	/* Test if the search would have succeeded using the recommended
+	hash prefix */
+
+	if (info->n_fields >= n_unique && cursor->up_match >= n_unique) {
+increment_potential:
+		info->n_hash_potential++;
+
+		return;
+	}
+
+	cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+			  cursor->low_match, cursor->low_bytes);
+
+	if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+		goto set_new_recomm;
+	}
+
+	cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+			  cursor->up_match, cursor->up_bytes);
+
+	if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+		goto increment_potential;
+	}
+
+set_new_recomm:
+	/* We have to set a new recommendation; skip the hash analysis
+	for a while to avoid unnecessary CPU time usage when there is no
+	chance for success */
+
+	info->hash_analysis = 0;
+
+	cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes,
+			  cursor->low_match, cursor->low_bytes);
+	info->left_side = cmp >= 0;
+	info->n_hash_potential = cmp != 0;
+
+	if (cmp == 0) {
+		/* For extra safety, we set some sensible values here */
+		info->n_fields = 1;
+		info->n_bytes = 0;
+	} else if (cmp > 0) {
+		info->n_hash_potential = 1;
+
+		if (cursor->up_match >= n_unique) {
+
+			info->n_fields = n_unique;
+			info->n_bytes = 0;
+
+		} else if (cursor->low_match < cursor->up_match) {
+
+			info->n_fields = static_cast<uint16_t>(
+				cursor->low_match + 1);
+			info->n_bytes = 0;
+		} else {
+			info->n_fields = static_cast<uint16_t>(
+				cursor->low_match);
+			info->n_bytes = static_cast<uint16_t>(
+				cursor->low_bytes + 1);
+		}
+	} else {
+		if (cursor->low_match >= n_unique) {
+
+			info->n_fields = n_unique;
+			info->n_bytes = 0;
+		} else if (cursor->low_match > cursor->up_match) {
+
+			info->n_fields = static_cast<uint16_t>(
+				cursor->up_match + 1);
+			info->n_bytes = 0;
+		} else {
+			info->n_fields = static_cast<uint16_t>(
+				cursor->up_match);
+			info->n_bytes = static_cast<uint16_t>(
+				cursor->up_bytes + 1);
+		}
+	}
+}
+
+/** Update the block search info on hash successes. NOTE that info and
+block->n_hash_helps, n_fields, n_bytes, left_side are NOT protected by any
+semaphore, to save CPU time! Do not assume the fields are consistent.
+@return TRUE if building a (new) hash index on the block is recommended +@param[in,out] info search info +@param[in,out] block buffer block */ +static +bool +btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block) +{ + ut_ad(block->page.lock.have_x() || block->page.lock.have_s()); + + info->last_hash_succ = FALSE; + ut_ad(block->page.frame); + ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N); + + if ((block->n_hash_helps > 0) + && (info->n_hash_potential > 0) + && (block->n_fields == info->n_fields) + && (block->n_bytes == info->n_bytes) + && (block->left_side == info->left_side)) { + + if ((block->index) + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side)) { + + /* The search would presumably have succeeded using + the hash index */ + + info->last_hash_succ = TRUE; + } + + block->n_hash_helps++; + } else { + block->n_hash_helps = 1; + block->n_fields = info->n_fields; + block->n_bytes = info->n_bytes; + block->left_side = info->left_side; + } + + if ((block->n_hash_helps > page_get_n_recs(block->page.frame) + / BTR_SEARCH_PAGE_BUILD_LIMIT) + && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) { + + if ((!block->index) + || (block->n_hash_helps + > 2U * page_get_n_recs(block->page.frame)) + || (block->n_fields != block->curr_n_fields) + || (block->n_bytes != block->curr_n_bytes) + || (block->left_side != block->curr_left_side)) { + + /* Build a new hash index on the page */ + + return(true); + } + } + + return(false); +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** Maximum number of records in a page */ +constexpr ulint MAX_N_POINTERS = UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +__attribute__((nonnull)) +/** +Insert an entry into the hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. 
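+Consequently a cell's chain contains at most one node per fold value that
+was inserted through this function; an update never grows the table.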
+@param table hash table
+@param heap memory heap
+@param fold folded value of the record
+@param block buffer block containing the record
+@param data the record
+@retval true on success
+@retval false if no more memory could be allocated */
+static bool ha_insert_for_fold(hash_table_t *table, mem_heap_t* heap,
+                               ulint fold,
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+                               buf_block_t *block, /*!< buffer block of data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+                               const rec_t *data)
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  ut_a(block->page.frame == page_align(data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+  ut_ad(btr_search_enabled);
+
+  hash_cell_t *cell= &table->array[table->calc_hash(fold)];
+
+  for (ha_node_t *prev= static_cast<ha_node_t*>(cell->node); prev;
+       prev= prev->next)
+  {
+    if (prev->fold == fold)
+    {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+      buf_block_t *prev_block= prev->block;
+      ut_a(prev_block->page.frame == page_align(prev->data));
+      ut_a(prev_block->n_pointers-- < MAX_N_POINTERS);
+      ut_a(block->n_pointers++ < MAX_N_POINTERS);
+
+      prev->block= block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+      prev->data= data;
+      return true;
+    }
+  }
+
+  /* We have to allocate a new chain node */
+  ha_node_t *node=
+    static_cast<ha_node_t*>(mem_heap_alloc(heap, sizeof *node));
+
+  if (!node)
+    return false;
+
+  ha_node_set_data(node, block, data);
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  ut_a(block->n_pointers++ < MAX_N_POINTERS);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+  node->fold= fold;
+  node->next= nullptr;
+
+  ha_node_t *prev= static_cast<ha_node_t*>(cell->node);
+  if (!prev)
+    cell->node= node;
+  else
+  {
+    while (prev->next)
+      prev= prev->next;
+    prev->next= node;
+  }
+  return true;
+}
+
+__attribute__((nonnull))
+/** Delete a record.
+@param table hash table
+@param heap memory heap
+@param del_node record to be deleted */
+static void ha_delete_hash_node(hash_table_t *table, mem_heap_t *heap,
+                                ha_node_t *del_node)
+{
+  ut_ad(btr_search_enabled);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  ut_a(del_node->block->page.frame == page_align(del_node->data));
+  ut_a(del_node->block->n_pointers-- < MAX_N_POINTERS);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+  const ulint fold= del_node->fold;
+
+  HASH_DELETE(ha_node_t, next, table, fold, del_node);
+
+  ha_node_t *top=
+    static_cast<ha_node_t*>(mem_heap_get_top(heap, sizeof *top));
+
+  if (del_node != top)
+  {
+    /* Compact the heap of nodes by moving the top in the place of del_node. */
+    *del_node= *top;
+    hash_cell_t *cell= &table->array[table->calc_hash(top->fold)];
+
+    /* Look for the pointer to the top node, to update it */
+    if (cell->node == top)
+      /* The top node is the first in the chain */
+      cell->node= del_node;
+    else
+    {
+      /* We have to look for the predecessor */
+      ha_node_t *node= static_cast<ha_node_t*>(cell->node);
+
+      while (top != HASH_GET_NEXT(next, node))
+        node= static_cast<ha_node_t*>(HASH_GET_NEXT(next, node));
+
+      /* Now we have the predecessor node */
+      node->next= del_node;
+    }
+  }
+
+  /* Free the occupied space */
+  mem_heap_free_top(heap, sizeof *top);
+}
+
+__attribute__((nonnull))
+/** Delete all pointers to a page.
+__attribute__((nonnull))
+/** Delete all pointers to a page.
+@param table hash table
+@param heap memory heap
+@param fold fold value
+@param page page whose hash nodes are to be deleted */
+static void ha_remove_all_nodes_to_page(hash_table_t *table, mem_heap_t *heap,
+ ulint fold, const page_t *page)
+{
+ for (ha_node_t *node= ha_chain_get_first(table, fold); node; )
+ {
+ if (page_align(ha_node_get_data(node)) == page)
+ {
+ ha_delete_hash_node(table, heap, node);
+ /* The deletion may compact the heap of nodes and move other nodes! */
+ node= ha_chain_get_first(table, fold);
+ }
+ else
+ node= ha_chain_get_next(node);
+ }
+#ifdef UNIV_DEBUG
+ /* Check that all nodes really got deleted */
+ for (ha_node_t *node= ha_chain_get_first(table, fold); node;
+ node= ha_chain_get_next(node))
+ ut_ad(page_align(ha_node_get_data(node)) != page);
+#endif /* UNIV_DEBUG */
+}
+
+/** Delete a record if found.
+@param table hash table
+@param heap memory heap for the hash bucket chain
+@param fold folded value of the searched data
+@param data pointer to the record
+@return whether the record was found */
+static bool ha_search_and_delete_if_found(hash_table_t *table,
+ mem_heap_t *heap,
+ ulint fold, const rec_t *data)
+{
+ if (ha_node_t *node= ha_search_with_data(table, fold, data))
+ {
+ ha_delete_hash_node(table, heap, node);
+ return true;
+ }
+
+ return false;
+}
+
+__attribute__((nonnull))
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table hash table
+@param fold folded value of the searched data
+@param data pointer to the data
+@param new_data new pointer to the data
+@return whether the element was found */
+static bool ha_search_and_update_if_found(hash_table_t *table, ulint fold,
+ const rec_t *data,
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ /** block containing new_data */
+ buf_block_t *new_block,
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t *new_data)
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(new_block->page.frame == page_align(new_data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ if (!btr_search_enabled)
+ return false;
+
+ if (ha_node_t *node= ha_search_with_data(table, fold, data))
+ {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(node->block->n_pointers-- < MAX_N_POINTERS);
+ ut_a(new_block->n_pointers++ < MAX_N_POINTERS);
+ node->block= new_block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ node->data= new_data;
+
+ return true;
+ }
+
+ return false;
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+#else
+# define ha_insert_for_fold(t,h,f,b,d) ha_insert_for_fold(t,h,f,d)
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+ ha_search_and_update_if_found(table,fold,data,new_data)
+#endif
+
+/** Updates a hash node reference when it has been unsuccessfully used in a
+search which could have succeeded with the used hash parameters. This can
+happen because when building a hash index for a page, we do not check
+what happens at page boundaries, and therefore there can be misleading
+hash nodes. Also, collisions in the fold value can lead to misleading
+references. This function lazily fixes these imperfections in the hash
+index.
+@param[in] info search info +@param[in] block buffer block where cursor positioned +@param[in] cursor cursor */ +static +void +btr_search_update_hash_ref( + const btr_search_t* info, + buf_block_t* block, + const btr_cur_t* cursor) +{ + ut_ad(cursor->flag == BTR_CUR_HASH_FAIL); + + ut_ad(block->page.lock.have_x() || block->page.lock.have_s()); + ut_ad(page_align(btr_cur_get_rec(cursor)) == block->page.frame); + ut_ad(page_is_leaf(block->page.frame)); + assert_block_ahi_valid(block); + + dict_index_t* index = block->index; + + if (!index || !info->n_hash_potential) { + return; + } + + if (index != cursor->index()) { + ut_ad(index->id == cursor->index()->id); + btr_search_drop_page_hash_index(block, false); + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + ut_ad(index == cursor->index()); + ut_ad(!dict_index_is_ibuf(index)); + auto part = btr_search_sys.get_part(*index); + part->latch.wr_lock(SRW_LOCK_CALL); + ut_ad(!block->index || block->index == index); + + if (block->index + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side) + && btr_search_enabled) { + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + const rec_t* rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_user_rec(rec)) { + goto func_exit; + } + + ulint fold = rec_fold( + rec, + rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, + block->curr_n_bytes, index->id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + ha_insert_for_fold(&part->table, part->heap, fold, block, rec); + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + +func_exit: + part->latch.wr_unlock(); +} + +/** Checks if a guessed position for a tree cursor is right. Note that if +mode is PAGE_CUR_LE, which is used in inserts, and the function returns +TRUE, then cursor->up_match and cursor->low_match both have sensible values. +@param[in,out] cursor guess cursor position +@param[in] can_only_compare_to_cursor_rec + if we do not have a latch on the page of cursor, + but a latch corresponding search system, then + ONLY the columns of the record UNDER the cursor + are protected, not the next or previous record + in the chain: we cannot look at the next or + previous record to check our guess! 
+@param[in] tuple data tuple +@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, PAGE_CUR_GE +@return whether a match was found */ +static +bool +btr_search_check_guess( + btr_cur_t* cursor, + bool can_only_compare_to_cursor_rec, + const dtuple_t* tuple, + ulint mode) +{ + rec_t* rec; + ulint n_unique; + ulint match; + int cmp; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + bool success = false; + rec_offs_init(offsets_); + + n_unique = dict_index_get_n_unique_in_tree(cursor->index()); + + rec = btr_cur_get_rec(cursor); + + if (UNIV_UNLIKELY(!page_rec_is_user_rec(rec) + || !page_rec_is_leaf(rec))) { + ut_ad("corrupted index" == 0); + return false; + } else if (cursor->index()->table->not_redundant()) { + switch (rec_get_status(rec)) { + case REC_STATUS_INSTANT: + case REC_STATUS_ORDINARY: + break; + default: + ut_ad("corrupted index" == 0); + return false; + } + } + + match = 0; + + offsets = rec_get_offsets(rec, cursor->index(), offsets, + cursor->index()->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match(tuple, rec, cursor->index(), offsets, + &match); + + if (mode == PAGE_CUR_GE) { + if (cmp > 0) { + goto exit_func; + } + + cursor->up_match = match; + + if (match >= n_unique) { + success = true; + goto exit_func; + } + } else if (mode == PAGE_CUR_LE) { + if (cmp < 0) { + goto exit_func; + } + + cursor->low_match = match; + + } else if (mode == PAGE_CUR_G) { + if (cmp >= 0) { + goto exit_func; + } + } else if (mode == PAGE_CUR_L) { + if (cmp <= 0) { + goto exit_func; + } + } + + if (can_only_compare_to_cursor_rec) { + /* Since we could not determine if our guess is right just by + looking at the record under the cursor, return FALSE */ + goto exit_func; + } + + match = 0; + + if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) { + const rec_t* prev_rec = page_rec_get_prev(rec); + + if (UNIV_UNLIKELY(!prev_rec)) { + ut_ad("corrupted index" == 0); + goto exit_func; + } + + if (page_rec_is_infimum(prev_rec)) { + success = !page_has_prev(page_align(prev_rec)); + goto exit_func; + } + + if (cursor->index()->table->not_redundant()) { + switch (rec_get_status(prev_rec)) { + case REC_STATUS_INSTANT: + case REC_STATUS_ORDINARY: + break; + default: + ut_ad("corrupted index" == 0); + goto exit_func; + } + } + + offsets = rec_get_offsets(prev_rec, cursor->index(), offsets, + cursor->index()->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match(tuple, prev_rec, + cursor->index(), offsets, + &match); + if (mode == PAGE_CUR_GE) { + success = cmp > 0; + } else { + success = cmp >= 0; + } + } else { + ut_ad(!page_rec_is_supremum(rec)); + + const rec_t* next_rec = page_rec_get_next(rec); + + if (UNIV_UNLIKELY(!next_rec)) { + ut_ad("corrupted index" == 0); + goto exit_func; + } + + if (page_rec_is_supremum(next_rec)) { + if (!page_has_next(page_align(next_rec))) { + cursor->up_match = 0; + success = true; + } + + goto exit_func; + } + + if (cursor->index()->table->not_redundant()) { + switch (rec_get_status(next_rec)) { + case REC_STATUS_INSTANT: + case REC_STATUS_ORDINARY: + break; + default: + ut_ad("corrupted index" == 0); + goto exit_func; + } + } + + offsets = rec_get_offsets(next_rec, cursor->index(), offsets, + cursor->index()->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match( + tuple, next_rec, cursor->index(), offsets, &match); + if (mode == PAGE_CUR_LE) { + success = cmp < 0; + cursor->up_match = match; + } else { + success = cmp <= 0; + } + } +exit_func: + if (UNIV_LIKELY_NULL(heap)) 
{
+ mem_heap_free(heap);
+ }
+ return(success);
+}
+
+/** Note a failed adaptive hash index lookup in the search info.
+@param[in,out] info search info
+@param[in,out] cursor cursor for which the lookup failed */
+static
+void
+btr_search_failure(btr_search_t* info, btr_cur_t* cursor)
+{
+ cursor->flag = BTR_CUR_HASH_FAIL;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ ++info->n_hash_fail;
+
+ if (info->n_hash_succ > 0) {
+ --info->n_hash_succ;
+ }
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+ info->last_hash_succ = FALSE;
+}
+
+/** Clear the adaptive hash index on all pages in the buffer pool. */
+inline void buf_pool_t::clear_hash_index()
+{
+ ut_ad(!resizing);
+ ut_ad(!btr_search_enabled);
+
+ std::set<dict_index_t*> garbage;
+
+ for (chunk_t *chunk= chunks + n_chunks; chunk-- != chunks; )
+ {
+ for (buf_block_t *block= chunk->blocks, * const end= block + chunk->size;
+ block != end; block++)
+ {
+ dict_index_t *index= block->index;
+ assert_block_ahi_valid(block);
+
+ /* We can clear block->index and block->n_pointers when
+ holding all AHI latches exclusively; see the comments in buf0buf.h */
+
+ if (!index)
+ {
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(!block->n_pointers);
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ continue;
+ }
+
+ ut_d(const auto s= block->page.state());
+ /* Another thread may have set the state to
+ REMOVE_HASH in buf_LRU_block_remove_hashed().
+
+ The state change in buf_pool_t::realloc() is not observable
+ here, because in that case we would have !block->index.
+
+ In the end, the entire adaptive hash index will be removed. */
+ ut_ad(s >= buf_page_t::UNFIXED || s == buf_page_t::REMOVE_HASH);
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ block->n_pointers= 0;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ if (index->freed())
+ garbage.insert(index);
+ block->index= nullptr;
+ }
+ }
+
+ for (dict_index_t *index : garbage)
+ btr_search_lazy_free(index);
+}
+
+/** Get a buffer block from an adaptive hash index pointer.
+This function does not return if the block is not identified.
+@param ptr pointer to within a page frame
+@return pointer to block, never NULL */
+inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const
+{
+ chunk_t::map *chunk_map = chunk_t::map_ref;
+ ut_ad(chunk_t::map_ref == chunk_t::map_reg);
+ ut_ad(!resizing);
+
+ chunk_t::map::const_iterator it= chunk_map->upper_bound(ptr);
+ ut_a(it != chunk_map->begin());
+
+ chunk_t *chunk= it == chunk_map->end()
+ ? chunk_map->rbegin()->second
+ : (--it)->second;
+
+ const size_t offs= size_t(ptr - chunk->blocks->page.frame) >>
+ srv_page_size_shift;
+ ut_a(offs < chunk->size);
+
+ buf_block_t *block= &chunk->blocks[offs];
+ /* buf_pool_t::chunk_t::init() invokes buf_block_init() so that
+ block[n].frame == block->page.frame + n * srv_page_size. Check it. */
+ ut_ad(block->page.frame == page_align(ptr));
+ /* Read the state of the block without holding hash_lock.
+ A state transition to REMOVE_HASH is possible during
+ this execution. */
+ ut_ad(block->page.state() >= buf_page_t::REMOVE_HASH);
+
+ return block;
+}
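+
+/* Illustration, not from the upstream source: the chunk lookup in
+block_from_ahi() above can be modelled with a std::map keyed by each
+chunk's base address. upper_bound() finds the first chunk starting after
+the pointer, so the entry before it is the only chunk that can contain the
+pointer. Chunk and the 4096-byte page size are hypothetical
+simplifications; assumes <map>, <cassert> and <cstddef>.
+
+  struct Chunk { const char *base; size_t n_pages; };
+
+  Chunk *chunk_of(const std::map<const char*, Chunk*> &m, const char *ptr)
+  {
+    auto it= m.upper_bound(ptr); // first chunk whose base is > ptr
+    assert(it != m.begin());     // otherwise ptr precedes every chunk
+    --it;                        // last chunk whose base is <= ptr
+    assert(size_t(ptr - it->second->base) / 4096 < it->second->n_pages);
+    return it->second;
+  }
+*/
+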
+/** Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@param[in,out] index index
+@param[in,out] info index search info
+@param[in] tuple logical record
+@param[in] mode PAGE_CUR_L, ....
+@param[in] latch_mode BTR_SEARCH_LEAF, ...
+@param[out] cursor tree cursor
+@param[in] mtr mini-transaction
+@return whether the search succeeded */
+TRANSACTIONAL_TARGET
+bool
+btr_search_guess_on_hash(
+ dict_index_t* index,
+ btr_search_t* info,
+ const dtuple_t* tuple,
+ ulint mode,
+ ulint latch_mode,
+ btr_cur_t* cursor,
+ mtr_t* mtr)
+{
+ ulint fold;
+ index_id_t index_id;
+
+ ut_ad(mtr->is_active());
+ ut_ad(index->is_btree() || index->is_ibuf());
+
+ /* Note that, for efficiency, the struct info may not be protected by
+ any latch here! */
+
+ if (latch_mode > BTR_MODIFY_LEAF
+ || !info->last_hash_succ || !info->n_hash_potential
+ || (tuple->info_bits & REC_INFO_MIN_REC_FLAG)) {
+ return false;
+ }
+
+ ut_ad(index->is_btree());
+ ut_ad(!index->table->is_temporary());
+
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+ compile_time_assert(ulint{BTR_SEARCH_LEAF} == ulint{RW_S_LATCH});
+ compile_time_assert(ulint{BTR_MODIFY_LEAF} == ulint{RW_X_LATCH});
+
+ cursor->n_fields = info->n_fields;
+ cursor->n_bytes = info->n_bytes;
+
+ if (dtuple_get_n_fields(tuple) < btr_search_get_n_fields(cursor)) {
+ return false;
+ }
+
+ index_id = index->id;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ info->n_hash_succ++;
+#endif
+ fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id);
+
+ cursor->fold = fold;
+ cursor->flag = BTR_CUR_HASH;
+
+ auto part = btr_search_sys.get_part(*index);
+ const rec_t* rec;
+
+ part->latch.rd_lock(SRW_LOCK_CALL);
+
+ if (!btr_search_enabled) {
+ goto ahi_release_and_fail;
+ }
+
+ rec = static_cast<const rec_t*>(
+ ha_search_and_get_data(&part->table, fold));
+
+ if (!rec) {
+ahi_release_and_fail:
+ part->latch.rd_unlock();
+fail:
+ btr_search_failure(info, cursor);
+ return false;
+ }
+
+ buf_block_t* block = buf_pool.block_from_ahi(rec);
+
+ buf_pool_t::hash_chain& chain = buf_pool.page_hash.cell_get(
+ block->page.id().fold());
+ bool got_latch;
+ {
+ transactional_shared_lock_guard<page_hash_latch> g{
+ buf_pool.page_hash.lock_get(chain)};
+ got_latch = (latch_mode == BTR_SEARCH_LEAF)
+ ? block->page.lock.s_lock_try()
+ : block->page.lock.x_lock_try();
+ }
+
+ if (!got_latch) {
+ goto ahi_release_and_fail;
+ }
+
+ const auto state = block->page.state();
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
+ ut_ad(state == buf_page_t::REMOVE_HASH);
+block_and_ahi_release_and_fail:
+ if (latch_mode == BTR_SEARCH_LEAF) {
+ block->page.lock.s_unlock();
+ } else {
+ block->page.lock.x_unlock();
+ }
+ goto ahi_release_and_fail;
+ }
+
+ ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX);
+ ut_ad(state < buf_page_t::READ_FIX || latch_mode == BTR_SEARCH_LEAF);
+
+ if (index != block->index && index_id == block->index->id) {
+ ut_a(block->index->freed());
+ goto block_and_ahi_release_and_fail;
+ }
+
+ block->page.fix();
+ block->page.set_accessed();
+ buf_page_make_young_if_needed(&block->page);
+ static_assert(ulint{MTR_MEMO_PAGE_S_FIX} == ulint{BTR_SEARCH_LEAF},
+ "");
+ static_assert(ulint{MTR_MEMO_PAGE_X_FIX} == ulint{BTR_MODIFY_LEAF},
+ "");
+
+ part->latch.rd_unlock();
+
+ ++buf_pool.stat.n_page_gets;
+
+ mtr->memo_push(block, mtr_memo_type_t(latch_mode));
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ btr_cur_position(index, (rec_t*) rec, block, cursor);
+
+ /* Check the validity of the guess within the page */
+
+ /* If we only have the latch on search system, not on the
+ page, it only protects the columns of the record the cursor
+ is positioned on. We cannot look at the next or the previous
+ record to determine if our guess for the cursor position is
+ right.
*/ + if (index_id != btr_page_get_index_id(block->page.frame) + || !btr_search_check_guess(cursor, false, tuple, mode)) { + mtr->release_last_page(); + goto fail; + } + + if (info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5) { + + info->n_hash_potential++; + } + + info->last_hash_succ = TRUE; + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_succ++; +#endif + return true; +} + +/** Drop any adaptive hash index entries that point to an index page. +@param[in,out] block block containing index page, s- or x-latched, or an + index page for which we know that + block->buf_fix_count == 0 or it is an index page which + has already been removed from the buf_pool.page_hash + i.e.: it is in state BUF_BLOCK_REMOVE_HASH +@param[in] garbage_collect drop ahi only if the index is marked + as freed */ +void btr_search_drop_page_hash_index(buf_block_t* block, + bool garbage_collect) +{ + ulint n_fields; + ulint n_bytes; + const rec_t* rec; + mem_heap_t* heap; + rec_offs* offsets; + +retry: + if (!block->index) { + return; + } + + ut_d(const auto state = block->page.state()); + ut_ad(state == buf_page_t::REMOVE_HASH + || state >= buf_page_t::UNFIXED); + ut_ad(state == buf_page_t::REMOVE_HASH + || !(~buf_page_t::LRU_MASK & state) + || block->page.lock.have_any()); + ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX); + ut_ad(page_is_leaf(block->page.frame)); + + /* We must not dereference block->index here, because it could be freed + if (!index->table->get_ref_count() && !dict_sys.frozen()). + Determine the ahi_slot based on the block contents. */ + + const index_id_t index_id + = btr_page_get_index_id(block->page.frame); + + auto part = btr_search_sys.get_part(index_id, + block->page.id().space()); + + part->latch.rd_lock(SRW_LOCK_CALL); + + dict_index_t* index = block->index; + bool is_freed = index && index->freed(); + + if (is_freed) { + part->latch.rd_unlock(); + part->latch.wr_lock(SRW_LOCK_CALL); + if (index != block->index) { + part->latch.wr_unlock(); + goto retry; + } + } else if (garbage_collect) { + part->latch.rd_unlock(); + return; + } + + assert_block_ahi_valid(block); + + if (!index || !btr_search_enabled) { + if (is_freed) { + part->latch.wr_unlock(); + } else { + part->latch.rd_unlock(); + } + return; + } + + ut_ad(!index->table->is_temporary()); + ut_ad(btr_search_enabled); + + ut_ad(block->page.id().space() == index->table->space_id); + ut_a(index_id == index->id); + ut_ad(!dict_index_is_ibuf(index)); + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + + /* NOTE: The AHI fields of block must not be accessed after + releasing search latch, as the index page might only be s-latched! */ + + if (!is_freed) { + part->latch.rd_unlock(); + } + + ut_a(n_fields > 0 || n_bytes > 0); + + const page_t* const page = block->page.frame; + ulint n_recs = page_get_n_recs(page); + if (!n_recs) { + ut_ad("corrupted adaptive hash index" == 0); + return; + } + + /* Calculate and cache fold values into an array for fast deletion + from the hash index */ + + rec = page_get_infimum_rec(page); + rec = page_rec_get_next_low(rec, page_is_comp(page)); + + ulint* folds; + ulint n_cached = 0; + ulint prev_fold = 0; + + if (rec && rec_is_metadata(rec, *index)) { + rec = page_rec_get_next_low(rec, page_is_comp(page)); + if (!--n_recs) { + /* The page only contains the hidden metadata record + for instant ALTER TABLE that the adaptive hash index + never points to. 
*/ + folds = nullptr; + goto all_deleted; + } + } + + folds = (ulint*) ut_malloc_nokey(n_recs * sizeof(ulint)); + heap = nullptr; + offsets = nullptr; + + while (rec) { + if (n_cached >= n_recs) { + ut_ad(page_rec_is_supremum(rec)); + break; + } + ut_ad(page_rec_is_user_rec(rec)); + offsets = rec_get_offsets( + rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), + &heap); + const ulint fold = rec_fold(rec, offsets, n_fields, n_bytes, + index_id); + + if (fold == prev_fold && prev_fold != 0) { + + goto next_rec; + } + + /* Remove all hash nodes pointing to this page from the + hash chain */ + folds[n_cached++] = fold; + +next_rec: + rec = page_rec_get_next_low(rec, page_rec_is_comp(rec)); + if (!rec || page_rec_is_supremum(rec)) { + break; + } + prev_fold = fold; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + +all_deleted: + if (!is_freed) { + part->latch.wr_lock(SRW_LOCK_CALL); + + if (UNIV_UNLIKELY(!block->index)) { + /* Someone else has meanwhile dropped the + hash index */ + goto cleanup; + } + + ut_a(block->index == index); + } + + if (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes) { + + /* Someone else has meanwhile built a new hash index on the + page, with different parameters */ + + part->latch.wr_unlock(); + + ut_free(folds); + goto retry; + } + + for (ulint i = 0; i < n_cached; i++) { + ha_remove_all_nodes_to_page(&part->table, part->heap, + folds[i], page); + } + + switch (index->search_info->ref_count--) { + case 0: + ut_error; + case 1: + if (index->freed()) { + btr_search_lazy_free(index); + } + } + + block->index = nullptr; + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED); + MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_cached); + +cleanup: + assert_block_ahi_valid(block); + part->latch.wr_unlock(); + + ut_free(folds); +} + +/** Drop possible adaptive hash index entries when a page is evicted +from the buffer pool or freed in a file, or the index is being dropped. +@param[in] page_id page id */ +void btr_search_drop_page_hash_when_freed(const page_id_t page_id) +{ + buf_block_t* block; + mtr_t mtr; + + mtr_start(&mtr); + + /* If the caller has a latch on the page, then the caller must + have a x-latch on the page and it must have already dropped + the hash index for the page. Because of the x-latch that we + are possibly holding, we cannot s-latch the page, but must + (recursively) x-latch it, even though we are only reading. */ + + block = buf_page_get_gen(page_id, 0, RW_X_LATCH, NULL, + BUF_PEEK_IF_IN_POOL, &mtr); + + if (block && block->index) { + /* In all our callers, the table handle should + be open, or we should be in the process of + dropping the table (preventing eviction). */ + DBUG_ASSERT(block->index->table->get_ref_count() + || dict_sys.locked()); + btr_search_drop_page_hash_index(block, false); + } + + mtr_commit(&mtr); +} + +/** Build a hash index on a page with the given parameters. If the page already +has a hash index with different parameters, the old hash index is removed. +If index is non-NULL, this function checks if n_fields and n_bytes are +sensible, and does not build a hash index if not. +@param[in,out] index index for which to build. +@param[in,out] block index page, s-/x- latched. 
+@param[in,out] ahi_latch the adaptive search latch
+@param[in] n_fields hash this many full fields
+@param[in] n_bytes hash this many bytes of the next field
+@param[in] left_side hash for searches from left side */
+static
+void
+btr_search_build_page_hash_index(
+ dict_index_t* index,
+ buf_block_t* block,
+ srw_spin_lock* ahi_latch,
+ uint16_t n_fields,
+ uint16_t n_bytes,
+ bool left_side)
+{
+ const rec_t* rec;
+ ulint fold;
+ ulint next_fold;
+ ulint n_cached;
+ ulint n_recs;
+ ulint* folds;
+ const rec_t** recs;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ ut_ad(!index->table->is_temporary());
+
+ if (!btr_search_enabled) {
+ return;
+ }
+
+ rec_offs_init(offsets_);
+ ut_ad(ahi_latch == &btr_search_sys.get_part(*index)->latch);
+ ut_ad(index);
+ ut_ad(block->page.id().space() == index->table->space_id);
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(page_is_leaf(block->page.frame));
+
+ ut_ad(block->page.lock.have_x() || block->page.lock.have_s());
+ ut_ad(block->page.id().page_no() >= 3);
+
+ ahi_latch->rd_lock(SRW_LOCK_CALL);
+
+ const bool enabled = btr_search_enabled;
+ const bool rebuild = enabled && block->index
+ && (block->curr_n_fields != n_fields
+ || block->curr_n_bytes != n_bytes
+ || block->curr_left_side != left_side);
+
+ ahi_latch->rd_unlock();
+
+ if (!enabled) {
+ return;
+ }
+
+ if (rebuild) {
+ btr_search_drop_page_hash_index(block, false);
+ }
+
+ /* Check that the values for hash index build are sensible */
+
+ if (n_fields == 0 && n_bytes == 0) {
+
+ return;
+ }
+
+ if (dict_index_get_n_unique_in_tree(index)
+ < btr_search_get_n_fields(n_fields, n_bytes)) {
+ return;
+ }
+
+ page_t* page = buf_block_get_frame(block);
+ n_recs = page_get_n_recs(page);
+
+ if (n_recs == 0) {
+
+ return;
+ }
+
+ rec = page_rec_get_next_const(page_get_infimum_rec(page));
+ if (!rec) return;
+
+ if (rec_is_metadata(rec, *index)) {
+ rec = page_rec_get_next_const(rec);
+ if (!rec || !--n_recs) return;
+ }
+
+ /* Calculate and cache fold values and corresponding records into
+ an array for fast insertion to the hash index */
+
+ folds = static_cast<ulint*>(ut_malloc_nokey(n_recs * sizeof *folds));
+ recs = static_cast<const rec_t**>(
+ ut_malloc_nokey(n_recs * sizeof *recs));
+
+ n_cached = 0;
+
+ ut_a(index->id == btr_page_get_index_id(page));
+
+ offsets = rec_get_offsets(
+ rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes),
+ &heap);
+ ut_ad(page_rec_is_supremum(rec)
+ || n_fields == rec_offs_n_fields(offsets) - (n_bytes > 0));
+
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id);
+
+ if (left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ while (const rec_t* next_rec = page_rec_get_next_const(rec)) {
+ if (page_rec_is_supremum(next_rec)) {
+
+ if (!left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ break;
+ }
+
+ offsets = rec_get_offsets(
+ next_rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes), &heap);
+ next_fold = rec_fold(next_rec, offsets, n_fields,
+ n_bytes, index->id);
+
+ if (fold != next_fold) {
+ /* Insert an entry into the hash index */
+
+ if (left_side) {
+
+ folds[n_cached] = next_fold;
+ recs[n_cached] = next_rec;
+ n_cached++;
+ } else {
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+ }
+
+ rec = next_rec;
+ fold = next_fold;
+ }
+
+ btr_search_check_free_space_in_heap(index);
+
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
+
+ if (!btr_search_enabled) {
+
goto exit_func; + } + + /* This counter is decremented every time we drop page + hash index entries and is incremented here. Since we can + rebuild hash index for a page that is already hashed, we + have to take care not to increment the counter in that + case. */ + if (!block->index) { + assert_block_ahi_empty(block); + index->search_info->ref_count++; + } else if (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes + || block->curr_left_side != left_side) { + goto exit_func; + } + + block->n_hash_helps = 0; + + block->curr_n_fields = n_fields & dict_index_t::MAX_N_FIELDS; + block->curr_n_bytes = n_bytes & ((1U << 15) - 1); + block->curr_left_side = left_side; + block->index = index; + + { + auto part = btr_search_sys.get_part(*index); + for (ulint i = 0; i < n_cached; i++) { + ha_insert_for_fold(&part->table, part->heap, + folds[i], block, recs[i]); + } + } + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED); + MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached); +exit_func: + assert_block_ahi_valid(block); + ahi_latch->wr_unlock(); + + ut_free(folds); + ut_free(recs); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/** Updates the search info. +@param[in,out] info search info +@param[in,out] cursor cursor which was just positioned */ +void btr_search_info_update_slow(btr_search_t *info, btr_cur_t *cursor) +{ + srw_spin_lock* ahi_latch = &btr_search_sys.get_part(*cursor->index()) + ->latch; + buf_block_t* block = btr_cur_get_block(cursor); + + /* NOTE that the following two function calls do NOT protect + info or block->n_fields etc. with any semaphore, to save CPU time! + We cannot assume the fields are consistent when we return from + those functions! */ + + btr_search_info_update_hash(info, cursor); + + bool build_index = btr_search_update_block_hash_info(info, block); + + if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) { + + btr_search_check_free_space_in_heap(cursor->index()); + } + + if (cursor->flag == BTR_CUR_HASH_FAIL) { + /* Update the hash node reference, if appropriate */ + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_hash_fail++; +#endif /* UNIV_SEARCH_PERF_STAT */ + + btr_search_update_hash_ref(info, block, cursor); + } + + if (build_index) { + /* Note that since we did not protect block->n_fields etc. + with any semaphore, the values can be inconsistent. We have + to check inside the function call that they make sense. */ + btr_search_build_page_hash_index(cursor->index(), block, + ahi_latch, + block->n_fields, + block->n_bytes, + block->left_side); + } +} + +/** Move or delete hash entries for moved records, usually in a page split. +If new_block is already hashed, then any hash index for block is dropped. +If new_block is not hashed, and block is hashed, then a new hash index is +built to new_block with the same parameters as block. +@param[in,out] new_block destination page +@param[in,out] block source page (subject to deletion later) */ +void +btr_search_move_or_delete_hash_entries( + buf_block_t* new_block, + buf_block_t* block) +{ + ut_ad(block->page.lock.have_x()); + ut_ad(new_block->page.lock.have_x()); + + if (!btr_search_enabled) { + return; + } + + dict_index_t* index = block->index; + if (!index) { + index = new_block->index; + } else { + ut_ad(!new_block->index || index == new_block->index); + } + assert_block_ahi_valid(block); + assert_block_ahi_valid(new_block); + + srw_spin_lock* ahi_latch = index + ? 
&btr_search_sys.get_part(*index)->latch + : nullptr; + + if (new_block->index) { +drop_exit: + btr_search_drop_page_hash_index(block, false); + return; + } + + if (!index) { + return; + } + + ahi_latch->rd_lock(SRW_LOCK_CALL); + + if (index->freed()) { + ahi_latch->rd_unlock(); + goto drop_exit; + } + + if (block->index) { + uint16_t n_fields = block->curr_n_fields; + uint16_t n_bytes = block->curr_n_bytes; + bool left_side = block->curr_left_side; + + new_block->n_fields = block->curr_n_fields; + new_block->n_bytes = block->curr_n_bytes; + new_block->left_side = left_side; + + ahi_latch->rd_unlock(); + + ut_a(n_fields > 0 || n_bytes > 0); + + btr_search_build_page_hash_index( + index, new_block, ahi_latch, + n_fields, n_bytes, left_side); + ut_ad(n_fields == block->curr_n_fields); + ut_ad(n_bytes == block->curr_n_bytes); + ut_ad(left_side == block->curr_left_side); + return; + } + + ahi_latch->rd_unlock(); +} + +/** Updates the page hash index when a single record is deleted from a page. +@param[in] cursor cursor which was positioned on the record to delete + using btr_cur_search_, the record is not yet deleted.*/ +void btr_search_update_hash_on_delete(btr_cur_t *cursor) +{ + buf_block_t* block; + const rec_t* rec; + ulint fold; + dict_index_t* index; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + mem_heap_t* heap = NULL; + rec_offs_init(offsets_); + + ut_ad(page_is_leaf(btr_cur_get_page(cursor))); + + if (!btr_search_enabled) { + return; + } + + block = btr_cur_get_block(cursor); + + ut_ad(block->page.lock.have_x()); + + assert_block_ahi_valid(block); + index = block->index; + + if (!index) { + + return; + } + + ut_ad(!cursor->index()->table->is_temporary()); + + if (index != cursor->index()) { + btr_search_drop_page_hash_index(block, false); + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + ut_a(index == cursor->index()); + ut_a(block->curr_n_fields > 0 || block->curr_n_bytes > 0); + ut_ad(!dict_index_is_ibuf(index)); + + rec = btr_cur_get_rec(cursor); + + fold = rec_fold(rec, rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, block->curr_n_bytes, index->id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + auto part = btr_search_sys.get_part(*index); + + part->latch.wr_lock(SRW_LOCK_CALL); + assert_block_ahi_valid(block); + + if (block->index && btr_search_enabled) { + ut_a(block->index == index); + + if (ha_search_and_delete_if_found(&part->table, part->heap, + fold, rec)) { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED); + } else { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND); + } + + assert_block_ahi_valid(block); + } + + part->latch.wr_unlock(); +} + +/** Updates the page hash index when a single record is inserted on a page. +@param[in] cursor cursor which was positioned to the place to insert + using btr_cur_search_, and the new record has been + inserted next to the cursor. 
+@param[in] ahi_latch the adaptive hash index latch */ +void btr_search_update_hash_node_on_insert(btr_cur_t *cursor, + srw_spin_lock *ahi_latch) +{ + buf_block_t* block; + dict_index_t* index; + rec_t* rec; + + ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index())->latch); + + if (!btr_search_enabled) { + return; + } + + rec = btr_cur_get_rec(cursor); + + block = btr_cur_get_block(cursor); + + ut_ad(block->page.lock.have_x()); + + index = block->index; + + if (!index) { + + return; + } + + ut_ad(!cursor->index()->table->is_temporary()); + + if (index != cursor->index()) { + ut_ad(index->id == cursor->index()->id); + btr_search_drop_page_hash_index(block, false); + return; + } + + ut_a(cursor->index() == index); + ut_ad(!dict_index_is_ibuf(index)); + ahi_latch->wr_lock(SRW_LOCK_CALL); + + if (!block->index || !btr_search_enabled) { + + goto func_exit; + } + + ut_a(block->index == index); + + if ((cursor->flag == BTR_CUR_HASH) + && (cursor->n_fields == block->curr_n_fields) + && (cursor->n_bytes == block->curr_n_bytes) + && !block->curr_left_side) { + if (const rec_t *new_rec = page_rec_get_next_const(rec)) { + if (ha_search_and_update_if_found( + &btr_search_sys.get_part(*cursor->index()) + ->table, + cursor->fold, rec, block, new_rec)) { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED); + } + } else { + ut_ad("corrupted page" == 0); + } + +func_exit: + assert_block_ahi_valid(block); + ahi_latch->wr_unlock(); + } else { + ahi_latch->wr_unlock(); + + btr_search_update_hash_on_insert(cursor, ahi_latch); + } +} + +/** Updates the page hash index when a single record is inserted on a page. +@param[in,out] cursor cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor +@param[in] ahi_latch the adaptive hash index latch */ +void btr_search_update_hash_on_insert(btr_cur_t *cursor, + srw_spin_lock *ahi_latch) +{ + buf_block_t* block; + dict_index_t* index; + const rec_t* rec; + const rec_t* ins_rec; + const rec_t* next_rec; + ulint fold; + ulint ins_fold; + ulint next_fold = 0; /* remove warning (??? bug ???) 
*/ + ulint n_fields; + ulint n_bytes; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index())->latch); + ut_ad(page_is_leaf(btr_cur_get_page(cursor))); + + if (!btr_search_enabled) { + return; + } + + block = btr_cur_get_block(cursor); + + ut_ad(block->page.lock.have_x()); + assert_block_ahi_valid(block); + + index = block->index; + + if (!index) { + + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + btr_search_check_free_space_in_heap(index); + + rec = btr_cur_get_rec(cursor); + + ut_ad(!cursor->index()->table->is_temporary()); + + if (index != cursor->index()) { + ut_ad(index->id == cursor->index()->id); +drop: + btr_search_drop_page_hash_index(block, false); + return; + } + + ut_a(index == cursor->index()); + ut_ad(!dict_index_is_ibuf(index)); + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + const bool left_side = block->curr_left_side; + + ins_rec = page_rec_get_next_const(rec); + if (UNIV_UNLIKELY(!ins_rec)) goto drop; + next_rec = page_rec_get_next_const(ins_rec); + if (UNIV_UNLIKELY(!next_rec)) goto drop; + + offsets = rec_get_offsets(ins_rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index->id); + + if (!page_rec_is_supremum(next_rec)) { + offsets = rec_get_offsets( + next_rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index->id); + } + + /* We must not look up "part" before acquiring ahi_latch. */ + btr_search_sys_t::partition* part= nullptr; + bool locked = false; + + if (!page_rec_is_infimum(rec) && !rec_is_metadata(rec, *index)) { + offsets = rec_get_offsets( + rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id); + } else { + if (left_side) { + locked = true; + ahi_latch->wr_lock(SRW_LOCK_CALL); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + + goto check_next_rec; + } + + if (fold != ins_fold) { + + if (!locked) { + locked = true; + ahi_latch->wr_lock(SRW_LOCK_CALL); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + if (!left_side) { + ha_insert_for_fold(&part->table, part->heap, + fold, block, rec); + } else { + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + } + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + +check_next_rec: + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + if (!locked) { + locked = true; + ahi_latch->wr_lock(SRW_LOCK_CALL); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + + goto function_exit; + } + + if (ins_fold != next_fold) { + if (!locked) { + locked = true; + ahi_latch->wr_lock(SRW_LOCK_CALL); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + if (!left_side) { + 
ha_insert_for_fold(&part->table, part->heap,
+ ins_fold, block, ins_rec);
+ } else {
+ ha_insert_for_fold(&part->table, part->heap,
+ next_fold, block, next_rec);
+ }
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+function_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ if (locked) {
+ ahi_latch->wr_unlock();
+ }
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+__attribute__((nonnull))
+/** @return whether a range of the cells is valid */
+static bool ha_validate(const hash_table_t *table,
+ ulint start_index, ulint end_index)
+{
+ ut_a(start_index <= end_index);
+ ut_a(end_index < table->n_cells);
+
+ bool ok= true;
+
+ for (ulint i= start_index; i <= end_index; i++)
+ {
+ for (auto node= static_cast<const ha_node_t*>(table->array[i].node); node;
+ node= node->next)
+ {
+ if (table->calc_hash(node->fold) != i) {
+ ib::error() << "Hash table node fold value " << node->fold
+ << " does not match the cell number " << i;
+ ok= false;
+ }
+ }
+ }
+
+ return ok;
+}
+
+/** Validates the search system for given hash table.
+@param thd connection, for checking if CHECK TABLE has been killed
+@param hash_table_id hash table to validate
+@return true if ok */
+static bool btr_search_hash_table_validate(THD *thd, ulint hash_table_id)
+{
+ ha_node_t* node;
+ bool ok = true;
+ ulint i;
+ ulint cell_count;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ btr_search_x_lock_all();
+ if (!btr_search_enabled || (thd && thd_kill_level(thd))) {
+func_exit:
+ btr_search_x_unlock_all();
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return ok;
+ }
+
+ /* How many cells to check before temporarily releasing
+ search latches. */
+ ulint chunk_size = 10000;
+
+ rec_offs_init(offsets_);
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ auto &part = btr_search_sys.parts[hash_table_id];
+
+ cell_count = part.table.n_cells;
+
+ for (i = 0; i < cell_count; i++) {
+ /* We release search latches every once in a while to
+ give other queries a chance to run. */
+ if ((i != 0) && ((i % chunk_size) == 0)) {
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ btr_search_x_unlock_all();
+
+ std::this_thread::yield();
+
+ btr_search_x_lock_all();
+
+ if (!btr_search_enabled
+ || (thd && thd_kill_level(thd))) {
+ goto func_exit;
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ ulint curr_cell_count = part.table.n_cells;
+
+ if (cell_count != curr_cell_count) {
+
+ cell_count = curr_cell_count;
+
+ if (i >= cell_count) {
+ break;
+ }
+ }
+ }
+
+ node = static_cast<ha_node_t*>(part.table.array[i].node);
+
+ for (; node != NULL; node = node->next) {
+ const buf_block_t* block
+ = buf_pool.block_from_ahi((byte*) node->data);
+ index_id_t page_index_id;
+
+ if (UNIV_LIKELY(block->page.in_file())) {
+ /* The space and offset are only valid
+ for file blocks. It is possible that
+ the block is being freed
+ (BUF_BLOCK_REMOVE_HASH, see the
+ assertion and the comment below) */
+ const page_id_t id(block->page.id());
+ if (const buf_page_t* hash_page
+ = buf_pool.page_hash.get(
+ id, buf_pool.page_hash.cell_get(
+ id.fold()))) {
+ ut_ad(hash_page == &block->page);
+ goto state_ok;
+ }
+ }
+
+ /* When a block is being freed,
+ buf_LRU_search_and_free_block() first removes
+ the block from buf_pool.page_hash by calling
+ buf_LRU_block_remove_hashed_page(). Then it
+ invokes btr_search_drop_page_hash_index().
*/
+ ut_a(block->page.state() == buf_page_t::REMOVE_HASH);
+state_ok:
+ ut_ad(!dict_index_is_ibuf(block->index));
+ ut_ad(block->page.id().space()
+ == block->index->table->space_id);
+
+ const page_t* page = block->page.frame;
+
+ page_index_id = btr_page_get_index_id(page);
+
+ offsets = rec_get_offsets(
+ node->data, block->index, offsets,
+ block->index->n_core_fields,
+ btr_search_get_n_fields(block->curr_n_fields,
+ block->curr_n_bytes),
+ &heap);
+
+ const ulint fold = rec_fold(
+ node->data, offsets,
+ block->curr_n_fields,
+ block->curr_n_bytes,
+ page_index_id);
+
+ if (node->fold != fold) {
+ ok = FALSE;
+
+ ib::error() << "Error in an adaptive hash"
+ << " index pointer to page "
+ << block->page.id()
+ << ", ptr mem address "
+ << reinterpret_cast<const void*>(
+ node->data)
+ << ", index id " << page_index_id
+ << ", node fold " << node->fold
+ << ", rec fold " << fold;
+
+ fputs("InnoDB: Record ", stderr);
+ rec_print_new(stderr, node->data, offsets);
+ fprintf(stderr, "\nInnoDB: on that page."
+ " Page mem address %p, is hashed %p,"
+ " n fields %lu\n"
+ "InnoDB: side %lu\n",
+ (void*) page, (void*) block->index,
+ (ulong) block->curr_n_fields,
+ (ulong) block->curr_left_side);
+ ut_ad(0);
+ }
+ }
+ }
+
+ for (i = 0; i < cell_count; i += chunk_size) {
+ /* We release search latches every once in a while to
+ give other queries a chance to run. */
+ if (i != 0) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ btr_search_x_unlock_all();
+
+ std::this_thread::yield();
+
+ btr_search_x_lock_all();
+
+ if (!btr_search_enabled
+ || (thd && thd_kill_level(thd))) {
+ goto func_exit;
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ ulint curr_cell_count = part.table.n_cells;
+
+ if (cell_count != curr_cell_count) {
+
+ cell_count = curr_cell_count;
+
+ if (i >= cell_count) {
+ break;
+ }
+ }
+ }
+
+ ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1);
+
+ if (!ha_validate(&part.table, i, end_index)) {
+ ok = false;
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto func_exit;
+}
+
+/** Validates the search system.
+@param thd connection, for checking if CHECK TABLE has been killed
+@return true if ok */
+bool btr_search_validate(THD *thd)
+{
+ for (ulint i= 0; i < btr_ahi_parts; ++i)
+ if (!btr_search_hash_table_validate(thd, i))
+ return(false);
+ return true;
+}
+
+#ifdef UNIV_DEBUG
+bool btr_search_check_marked_free_index(const buf_block_t *block)
+{
+ const index_id_t index_id= btr_page_get_index_id(block->page.frame);
+ auto part= btr_search_sys.get_part(index_id, block->page.id().space());
+
+ part->latch.rd_lock(SRW_LOCK_CALL);
+
+ bool is_freed= block->index && block->index->freed();
+
+ part->latch.rd_unlock();
+
+ return is_freed;
+}
+#endif /* UNIV_DEBUG */
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+#endif /* BTR_CUR_HASH_ADAPT */
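+
+/* Illustration, not from the upstream source: btr_search_hash_table_validate()
+above scans the hash table in chunks of 10000 cells, dropping all latches
+between chunks so that other threads can run, and re-reading the cell count
+afterwards because the table may have been resized in the meantime. A
+self-contained model of that pattern; the per-cell check is a stand-in, and
+the sketch assumes <vector>, <mutex>, <thread>, <algorithm> and <cassert>.
+
+  void validate_in_chunks(std::vector<int> &cells, std::mutex &latch)
+  {
+    const size_t chunk_size= 10000;
+    for (size_t i= 0; ; i+= chunk_size)
+    {
+      latch.lock();
+      const size_t n= cells.size();    // re-read: may have changed meanwhile
+      if (i >= n) { latch.unlock(); break; }
+      const size_t end= std::min(i + chunk_size, n);
+      for (size_t j= i; j < end; j++)
+        assert(cells[j] >= 0);         // stand-in for the per-cell check
+      latch.unlock();
+      std::this_thread::yield();       // let waiting threads make progress
+    }
+  }
+*/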
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc
new file mode 100644
index 00000000..6bd01faa
--- /dev/null
+++ b/storage/innobase/buf/buf0block_hint.cc
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License, version 2.0, as published by the
+Free Software Foundation.
+
+This program is also distributed with certain software (including but not
+limited to OpenSSL) that is licensed under separate terms, as designated in a
+particular file or component or in included license documentation. The authors
+of MySQL hereby grant you an additional permission to link the program and
+your derivative works with the separately licensed software that they have
+included with MySQL.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
+for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+#include "buf0block_hint.h"
+namespace buf {
+
+TRANSACTIONAL_TARGET
+void Block_hint::buffer_fix_block_if_still_valid()
+{
+ /* To check if m_block belongs to the current buf_pool, we must
+ prevent freeing memory while we check, and until we buffer-fix the
+ block. For this purpose it is enough to latch any of the many
+ latches taken by buf_pool_t::resize().
+
+ Similar to buf_page_optimistic_get(), we must validate
+ m_block->page.id() after acquiring the hash_lock, because the object
+ may have been freed and not actually attached to buf_pool.page_hash
+ at the moment. (The block could have been reused to store a
+ different page, and that slice of buf_pool.page_hash could be protected
+ by another hash_lock that we are not holding.)
+
+ Finally, we must ensure that the block is not being freed. */
+ if (m_block)
+ {
+ auto &cell= buf_pool.page_hash.cell_get(m_page_id.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(cell)};
+ if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() &&
+ m_block->page.frame && m_block->page.in_file())
+ m_block->page.fix();
+ else
+ clear();
+ }
+}
+} // namespace buf
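+
+/* Illustration, not from the upstream source: buffer_fix_block_if_still_valid()
+above follows a common optimistic-caching pattern: a cached pointer may have
+gone stale, so it is re-validated under the latch that protects the current
+mapping before the object is pinned. Entry, the map and pin_count are
+hypothetical simplifications; assumes <map>, <mutex> and <cstdint>.
+
+  struct Entry { uint64_t id; int pin_count; };
+
+  // cache maps id -> Entry*; mtx protects the map.
+  Entry *fix_if_still_valid(std::map<uint64_t, Entry*> &cache,
+                            std::mutex &mtx, Entry *hint, uint64_t id)
+  {
+    std::lock_guard<std::mutex> g(mtx);
+    // The hinted entry may have been evicted and reused for another id;
+    // only the mapping read under the latch can confirm it is current.
+    auto it= cache.find(id);
+    if (hint && it != cache.end() && it->second == hint)
+    {
+      hint->pin_count++;      // pin before the latch is released
+      return hint;
+    }
+    return nullptr;           // stale hint: caller falls back to a slow path
+  }
+*/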
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
new file mode 100644
index 00000000..85a698bc
--- /dev/null
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -0,0 +1,769 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buddy.cc
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "page0zip.h"
+#include "srv0start.h"
+
+/** When freeing a block we attempt to coalesce by looking at its buddy
+and deciding whether it is free or not. To ascertain if the buddy is
+free we look for BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET
+within the buddy. The question is how we can be sure that it is
+safe to look at BUF_BUDDY_STAMP_OFFSET.
+The answer lies in the following invariants:
+* All blocks allocated by the buddy allocator are used for compressed
+page frames.
+* A compressed table always has space_id < SRV_SPACE_ID_UPPER_BOUND
+* BUF_BUDDY_STAMP_OFFSET always points to the space_id field in
+a frame.
+ -- The above is true because we look at these fields when the
+ corresponding buddy block is free which implies that:
+ * The block we are looking at must have an address aligned at
+ the same size that its free buddy has. For example, if we have
+ a free block of 8K then its buddy's address must be aligned at
+ 8K as well.
+ * It is possible that the block we are looking at may have been
+ further divided into smaller sized blocks but its starting
+ address must still remain the start of a page frame i.e.: it
+ cannot be the middle of a block. For example, if we have a free
+ block of size 8K then its buddy may be divided into blocks
+ of, say, 1K, 1K, 2K, 4K but the buddy's address will still be
+ the starting address of the first 1K compressed page.
+ * What is important to note is that for any given block, the
+ buddy's address cannot be in the middle of a larger block i.e.:
+ in the above example, our 8K block cannot have a buddy whose address
+ is aligned on 8K but is part of a larger 16K block.
+*/
+
+/** Offset within buf_buddy_free_t where free or non_free stamps
+are written.*/
+#define BUF_BUDDY_STAMP_OFFSET FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+/** Value that we stamp on all buffers that are currently on the zip_free
+list. This value is stamped at BUF_BUDDY_STAMP_OFFSET offset */
+#define BUF_BUDDY_STAMP_FREE SRV_SPACE_ID_UPPER_BOUND
+
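+/* Illustration, not from the upstream source: the stamps defined here make
+every free buddy block self-describing: BUF_BUDDY_STAMP_FREE is written at
+BUF_BUDDY_STAMP_OFFSET, a position where (by the invariants listed above) a
+compressed page can never store that value. A minimal model with
+hypothetical offset and sentinel values; assumes <cstring>, <cstdint> and
+<cstddef>.
+
+  const uint32_t STAMP_FREE= 0xFFFFFFF0u; // no valid payload uses this
+  const size_t STAMP_OFFSET= 34;          // fixed offset inside each block
+
+  void stamp_free(unsigned char *block)
+  { std::memcpy(block + STAMP_OFFSET, &STAMP_FREE, 4); }
+
+  bool looks_free(const unsigned char *block)
+  {
+    uint32_t v;
+    std::memcpy(&v, block + STAMP_OFFSET, 4);
+    return v == STAMP_FREE;
+  }
+*/
+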
+/** Stamp value for non-free buffers. Will be overwritten by a non-zero
+value by the consumer of the block */
+#define BUF_BUDDY_STAMP_NONFREE 0XFFFFFFFFUL
+
+/** Return type of buf_buddy_is_free() */
+enum buf_buddy_state_t {
+ BUF_BUDDY_STATE_FREE, /*!< The buddy is completely free */
+ BUF_BUDDY_STATE_USED, /*!< The buddy is currently in use */
+ BUF_BUDDY_STATE_PARTIALLY_USED/*!< Some sub-blocks in the buddy
+ are in use */
+};
+
+/**********************************************************************//**
+Invalidate memory area that we won't access while page is free */
+UNIV_INLINE
+void
+buf_buddy_mem_invalid(
+/*==================*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of zip_free[] */
+{
+ ut_ad(i <= BUF_BUDDY_SIZES);
+
+ MEM_CHECK_ADDRESSABLE(buf, BUF_BUDDY_LOW << i);
+ MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i);
+}
+
+/**********************************************************************//**
+Check if a buddy is stamped free.
+@return whether the buddy is free */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
+bool
+buf_buddy_stamp_is_free(
+/*====================*/
+ const buf_buddy_free_t* buf) /*!< in: block to check */
+{
+ compile_time_assert(BUF_BUDDY_STAMP_FREE < BUF_BUDDY_STAMP_NONFREE);
+ return(mach_read_from_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_FREE);
+}
+
+/**********************************************************************//**
+Stamps a buddy free. */
+UNIV_INLINE
+void
+buf_buddy_stamp_free(
+/*=================*/
+ buf_buddy_free_t* buf, /*!< in/out: block to stamp */
+ ulint i) /*!< in: block size */
+{
+ ut_d(memset(&buf->stamp.bytes, int(i), BUF_BUDDY_LOW << i));
+ buf_buddy_mem_invalid(buf, i);
+ mach_write_to_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET,
+ BUF_BUDDY_STAMP_FREE);
+ buf->stamp.size = i;
+}
+
+/**********************************************************************//**
+Stamps a buddy nonfree.
+@param[in,out] buf block to stamp
+@param[in] i block size */
+static inline void buf_buddy_stamp_nonfree(buf_buddy_free_t* buf, ulint i)
+{
+ buf_buddy_mem_invalid(buf, i);
+ compile_time_assert(BUF_BUDDY_STAMP_NONFREE == 0xffffffffU);
+ memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4);
+}
+
+/**********************************************************************//**
+Get the buddy of a compressed page frame.
+@return the buddy of the given page */
+UNIV_INLINE
+void*
+buf_buddy_get(
+/*==========*/
+ byte* page, /*!< in: compressed page */
+ ulint size) /*!< in: page size in bytes */
+{
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= BUF_BUDDY_LOW);
+ ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size < BUF_BUDDY_HIGH);
+ ut_ad(BUF_BUDDY_HIGH == srv_page_size);
+ ut_ad(!ut_align_offset(page, size));
+
+ if (((ulint) page) & size) {
+ return(page - size);
+ } else {
+ return(page + size);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/** Validate a given zip_free list. */
+struct CheckZipFree {
+ CheckZipFree(ulint i) : m_i(i) {}
+
+ void operator()(const buf_buddy_free_t* elem) const
+ {
+ ut_ad(buf_buddy_stamp_is_free(elem));
+ ut_ad(elem->stamp.size <= m_i);
+ }
+
+ const ulint m_i;
+};
+
+/** Validate a buddy list.
+@param[in] i buddy size to validate */
+static void buf_buddy_list_validate(ulint i)
+{
+ ut_list_validate(buf_pool.zip_free[i], CheckZipFree(i));
+}
+
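+/* Illustration, not from the upstream source: buf_buddy_get() above locates
+a block's buddy by testing the size bit of the block's address. Both halves
+of a power-of-two pair are aligned to their own size, so that single bit
+tells whether the block is the low or the high half. A self-contained
+model; assumes <cassert>, <cstdint> and <cstddef>.
+
+  unsigned char *buddy_of(unsigned char *block, size_t size)
+  {
+    assert(size && !(size & (size - 1)));                       // power of two
+    assert(!(reinterpret_cast<uintptr_t>(block) & (size - 1))); // aligned
+    return (reinterpret_cast<uintptr_t>(block) & size)
+      ? block - size  // high half: the buddy lies below
+      : block + size; // low half: the buddy lies above
+  }
+*/
+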
+/**********************************************************************//**
+Debug function to validate that a buffer is indeed free i.e.: in the
+zip_free[].
+@param[in] buf block to check
+@param[in] i index of buf_pool.zip_free[]
+@return true if free */
+static bool buf_buddy_check_free(const buf_buddy_free_t* buf, ulint i)
+{
+ const ulint size = BUF_BUDDY_LOW << i;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ buf_buddy_free_t* itr;
+
+ for (itr = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+ itr && itr != buf;
+ itr = UT_LIST_GET_NEXT(list, itr)) {
+ }
+
+ return(itr == buf);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Checks if a buf is free i.e.: in the zip_free[].
+@retval BUF_BUDDY_STATE_FREE if fully free
+@retval BUF_BUDDY_STATE_USED if currently in use
+@retval BUF_BUDDY_STATE_PARTIALLY_USED if partially in use. */
+static MY_ATTRIBUTE((warn_unused_result))
+buf_buddy_state_t
+buf_buddy_is_free(
+/*==============*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of
+ buf_pool.zip_free[] */
+{
+#ifdef UNIV_DEBUG
+ const ulint size = BUF_BUDDY_LOW << i;
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+#endif /* UNIV_DEBUG */
+
+ /* We assume that all memory from buf_buddy_alloc()
+ is used for compressed page frames. */
+
+ /* We look inside the allocated objects returned by
+ buf_buddy_alloc() and assume that each block is a compressed
+ page that contains one of the following in space_id.
+ * BUF_BUDDY_STAMP_FREE if the block is in a zip_free list or
+ * BUF_BUDDY_STAMP_NONFREE if the block has been allocated but
+ not initialized yet or
+ * A valid space_id of a compressed tablespace
+
+ The call below attempts to read from free memory. The memory
+ is "owned" by the buddy allocator (and it has been allocated
+ from the buffer pool), so there is nothing wrong about this. */
+ if (!buf_buddy_stamp_is_free(buf)) {
+ return(BUF_BUDDY_STATE_USED);
+ }
+
+ /* A block may be free but a fragment of it may still be in use.
+ To guard against that we write the free block size in terms of
+ zip_free index at start of stamped block. Note that we can
+ safely rely on this value only if the buf is free. */
+ ut_ad(buf->stamp.size <= i);
+ return(buf->stamp.size == i
+ ? BUF_BUDDY_STATE_FREE
+ : BUF_BUDDY_STATE_PARTIALLY_USED);
+}
+
+/** Add a block to the head of the appropriate buddy free list.
+@param[in,out] buf block to be freed
+@param[in] i index of buf_pool.zip_free[] */
+UNIV_INLINE
+void
+buf_buddy_add_to_free(buf_buddy_free_t* buf, ulint i)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.zip_free[i].start != buf);
+
+ buf_buddy_stamp_free(buf, i);
+ UT_LIST_ADD_FIRST(buf_pool.zip_free[i], buf);
+ ut_d(buf_buddy_list_validate(i));
+}
+
+/** Remove a block from the appropriate buddy free list.
+@param[in,out] buf block to be freed
+@param[in] i index of buf_pool.zip_free[] */
+UNIV_INLINE
+void
+buf_buddy_remove_from_free(buf_buddy_free_t* buf, ulint i)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_buddy_check_free(buf, i));
+
+ UT_LIST_REMOVE(buf_pool.zip_free[i], buf);
+ buf_buddy_stamp_nonfree(buf, i);
+}
+
+/** Try to allocate a block from buf_pool.zip_free[].
+@param[in] i index of buf_pool.zip_free[]
+@return allocated block, or NULL if buf_pool.zip_free[] was empty */
+static buf_buddy_free_t* buf_buddy_alloc_zip(ulint i)
+{
+ buf_buddy_free_t* buf;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(i < BUF_BUDDY_SIZES);
+ ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ ut_d(buf_buddy_list_validate(i));
+
+ buf = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+
+ if (buf_pool.is_shrinking()
+ && UT_LIST_GET_LEN(buf_pool.withdraw)
+ < buf_pool.withdraw_target) {
+
+ while (buf != NULL
+ && buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(buf))) {
+ /* This should be withdrawn, not to be allocated */
+ buf = UT_LIST_GET_NEXT(list, buf);
+ }
+ }
+
+ if (buf) {
+ buf_buddy_remove_from_free(buf, i);
+ } else if (i + 1 < BUF_BUDDY_SIZES) {
+ /* Attempt to split. */
+ buf = buf_buddy_alloc_zip(i + 1);
+
+ if (buf) {
+ buf_buddy_free_t* buddy =
+ reinterpret_cast<buf_buddy_free_t*>(
+ reinterpret_cast<byte*>(buf)
+ + (BUF_BUDDY_LOW << i));
+ ut_ad(!buf_pool.contains_zip(buddy));
+ buf_buddy_add_to_free(buddy, i);
+ }
+ }
+
+ if (buf) {
+ /* Trash the page other than the BUF_BUDDY_STAMP_NONFREE. */
+ MEM_UNDEFINED(buf, BUF_BUDDY_STAMP_OFFSET);
+ MEM_UNDEFINED(BUF_BUDDY_STAMP_OFFSET + 4 + buf->stamp.bytes,
+ (BUF_BUDDY_LOW << i)
+ - (BUF_BUDDY_STAMP_OFFSET + 4));
+ ut_ad(mach_read_from_4(buf->stamp.bytes
+ + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_NONFREE);
+ }
+
+ return(buf);
+}
+
+/** Deallocate a buffer frame of srv_page_size.
+@param[in] buf buffer frame to deallocate */
+static
+void
+buf_buddy_block_free(void* buf)
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
+ buf_page_t* bpage;
+ buf_block_t* block;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(!ut_align_offset(buf, srv_page_size));
+
+ HASH_SEARCH(hash, &buf_pool.zip_hash, fold, buf_page_t*, bpage,
+ ut_ad(bpage->state() == buf_page_t::MEMORY
+ && bpage->in_zip_hash),
+ bpage->frame == buf);
+ ut_a(bpage);
+ ut_a(bpage->state() == buf_page_t::MEMORY);
+ ut_ad(bpage->in_zip_hash);
+ ut_d(bpage->in_zip_hash = false);
+ HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage);
+ bpage->hash = nullptr;
+
+ ut_d(memset(buf, 0, srv_page_size));
+ MEM_UNDEFINED(buf, srv_page_size);
+
+ block = (buf_block_t*) bpage;
+ buf_LRU_block_free_non_file_page(block);
+
+ ut_ad(buf_pool.buddy_n_frames > 0);
+ ut_d(buf_pool.buddy_n_frames--);
+}
+
+/**********************************************************************//**
+Allocate a buffer block to the buddy allocator. */
+static
+void
+buf_buddy_block_register(
+/*=====================*/
+ buf_block_t* block) /*!< in: buffer frame to allocate */
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD(block);
+ ut_ad(block->page.state() == buf_page_t::MEMORY);
+
+ ut_a(block->page.frame);
+ ut_a(!ut_align_offset(block->page.frame, srv_page_size));
+
+ ut_ad(!block->page.in_zip_hash);
+ ut_d(block->page.in_zip_hash = true);
+ HASH_INSERT(buf_page_t, hash, &buf_pool.zip_hash, fold, &block->page);
+
+ ut_d(buf_pool.buddy_n_frames++);
+}
+
+/** Allocate a block from a bigger object.
+@param[in] buf a block that is free to use
+@param[in] i index of buf_pool.zip_free[]
+@param[in] j size of buf as an index of buf_pool.zip_free[]
+@return allocated block */
+static
+void*
+buf_buddy_alloc_from(void* buf, ulint i, ulint j)
+{
+ ulint offs = BUF_BUDDY_LOW << j;
+ ut_ad(j <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ ut_ad(j >= i);
+ ut_ad(!ut_align_offset(buf, offs));
+
+ /* Add the unused parts of the block to the free lists.
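+For example (an illustrative walk-through, taking BUF_BUDDY_LOW to be
+1024 bytes, the smallest buddy block size): splitting a j = 3 block of
+8192 bytes down to i = 1 (2048 bytes) first adds the upper 4096-byte
+half at offset 4096 to zip_free[2], then the 2048-byte quarter at
+offset 2048 to zip_free[1], and hands the lowest 2048 bytes to the
+caller.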
*/
+ while (j > i) {
+ buf_buddy_free_t* zip_buf;
+
+ offs >>= 1;
+ j--;
+
+ zip_buf = reinterpret_cast<buf_buddy_free_t*>(
+ reinterpret_cast<byte*>(buf) + offs);
+ buf_buddy_add_to_free(zip_buf, j);
+ }
+
+ buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+ return(buf);
+}
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+byte *buf_buddy_alloc_low(ulint i, bool *lru)
+{
+ buf_block_t* block;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ if (i < BUF_BUDDY_SIZES) {
+ /* Try to allocate from the buddy system. */
+ block = (buf_block_t*) buf_buddy_alloc_zip(i);
+
+ if (block) {
+ goto func_exit;
+ }
+ }
+
+ /* Try allocating from the buf_pool.free list. */
+ block = buf_LRU_get_free_only();
+
+ if (block) {
+ goto alloc_big;
+ }
+
+ /* Try replacing an uncompressed page in the buffer pool. */
+ block = buf_LRU_get_free_block(true);
+ if (lru) {
+ *lru = true;
+ }
+
+alloc_big:
+ buf_buddy_block_register(block);
+
+ block = reinterpret_cast<buf_block_t*>(
+ buf_buddy_alloc_from(block->page.frame, i, BUF_BUDDY_SIZES));
+
+func_exit:
+ buf_pool.buddy_stat[i].used++;
+ return reinterpret_cast<byte*>(block);
+}
+
+/** Try to relocate a block. The caller must hold buf_pool.mutex.
+@param[in] src block to relocate
+@param[in] dst free block to relocate to
+@param[in] i index of buf_pool.zip_free[]
+@param[in] force true if we must always relocate
+@return true if relocated */
+static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
+{
+ buf_page_t* bpage;
+ const ulint size = BUF_BUDDY_LOW << i;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(!ut_align_offset(src, size));
+ ut_ad(!ut_align_offset(dst, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ MEM_CHECK_ADDRESSABLE(dst, size);
+
+ uint32_t space = mach_read_from_4(static_cast<const byte*>(src)
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ uint32_t offset = mach_read_from_4(static_cast<const byte*>(src)
+ + FIL_PAGE_OFFSET);
+
+ /* Suppress Valgrind or MSAN warnings. */
+ MEM_MAKE_DEFINED(&space, sizeof space);
+ MEM_MAKE_DEFINED(&offset, sizeof offset);
+
+ ut_ad(space != BUF_BUDDY_STAMP_FREE);
+
+ const page_id_t page_id(space, offset);
+ /* FIXME: we are computing this while holding buf_pool.mutex */
+ auto &cell= buf_pool.page_hash.cell_get(page_id.fold());
+
+ bpage = buf_pool.page_hash.get(page_id, cell);
+
+ if (!bpage || bpage->zip.data != src) {
+ /* The block has probably been freshly
+ allocated by buf_LRU_get_free_block() but not
+ added to buf_pool.page_hash yet. Obviously,
+ it cannot be relocated. */
+
+ if (!force || space != 0 || offset != 0) {
+ return(false);
+ }
+
+ /* It might be just an uninitialized page.
+ We should also search the LRU list. */
+
+ bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ while (bpage != NULL) {
+ if (bpage->zip.data == src) {
+ ut_ad(bpage->id() == page_id);
+ break;
+ }
+ bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ }
+
+ if (bpage == NULL) {
+ return(false);
+ }
+ }
+
+ if (page_zip_get_size(&bpage->zip) != size) {
+ /* The block is of different size. We would
+ have to relocate all blocks covered by src.
+ For the sake of simplicity, give up. */
+ ut_ad(page_zip_get_size(&bpage->zip) < size);
+ return(false);
+ }
+
+ /* The block must have been allocated, but it may
+ contain uninitialized data.
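+ Only the page identifier that was read above can be trusted;
+ the rest of the frame may be garbage, so it is merely checked
+ to be addressable before it is copied.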
*/
+ MEM_CHECK_ADDRESSABLE(src, size);
+
+ if (!bpage->can_relocate()) {
+ return false;
+ }
+
+ page_hash_latch &hash_lock = buf_pool.page_hash.lock_get(cell);
+ /* It does not make sense to use transactional_lock_guard here,
+ because the memcpy() of 1024 to 16384 bytes would likely make the
+ memory transaction too large. */
+ hash_lock.lock();
+
+ if (bpage->can_relocate()) {
+ /* Relocate the compressed page. */
+ const ulonglong ns = my_interval_timer();
+
+ ut_a(bpage->zip.data == src);
+
+ memcpy(dst, src, size);
+ bpage->zip.data = reinterpret_cast<page_zip_t*>(dst);
+
+ hash_lock.unlock();
+
+ buf_buddy_mem_invalid(
+ reinterpret_cast<buf_buddy_free_t*>(src), i);
+
+ buf_buddy_stat_t* buddy_stat = &buf_pool.buddy_stat[i];
+ buddy_stat->relocated++;
+ buddy_stat->relocated_usec+= (my_interval_timer() - ns) / 1000;
+ return(true);
+ }
+
+ hash_lock.unlock();
+
+ return(false);
+}
+
+/** Deallocate a block.
+@param[in] buf block to be freed, must not be pointed to
+ by the buffer pool
+@param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i)
+{
+ buf_buddy_free_t* buddy;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ ut_ad(buf_pool.buddy_stat[i].used > 0);
+
+ buf_pool.buddy_stat[i].used--;
+recombine:
+ MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i);
+
+ if (i == BUF_BUDDY_SIZES) {
+ buf_buddy_block_free(buf);
+ return;
+ }
+
+ ut_ad(i < BUF_BUDDY_SIZES);
+ ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
+ ut_ad(!buf_pool.contains_zip(buf));
+
+ /* Do not recombine blocks if there are few free blocks.
+ We may waste up to 15360*max_len bytes to free blocks
+ (1024 + 2048 + 4096 + 8192 = 15360) */
+ if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16
+ && !buf_pool.is_shrinking()) {
+ goto func_exit;
+ }
+
+ /* Try to combine adjacent blocks. */
+ buddy = reinterpret_cast<buf_buddy_free_t*>(
+ buf_buddy_get(reinterpret_cast<byte*>(buf),
+ BUF_BUDDY_LOW << i));
+
+ switch (buf_buddy_is_free(buddy, i)) {
+ case BUF_BUDDY_STATE_FREE:
+ /* The buddy is free: recombine */
+ buf_buddy_remove_from_free(buddy, i);
+buddy_is_free:
+ ut_ad(!buf_pool.contains_zip(buddy));
+ i++;
+ buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
+
+ goto recombine;
+
+ case BUF_BUDDY_STATE_USED:
+ ut_d(buf_buddy_list_validate(i));
+
+ /* The buddy is not free. Is there a free block of
+ this size? */
+ if (buf_buddy_free_t* zip_buf =
+ UT_LIST_GET_FIRST(buf_pool.zip_free[i])) {
+
+ /* Remove the block from the free list, because
+ a successful buf_buddy_relocate() will overwrite
+ zip_free->list. */
+ buf_buddy_remove_from_free(zip_buf, i);
+
+ /* Try to relocate the buddy of buf to the free
+ block. */
+ if (buf_buddy_relocate(buddy, zip_buf, i, false)) {
+ goto buddy_is_free;
+ }
+
+ buf_buddy_add_to_free(zip_buf, i);
+ }
+
+ break;
+ case BUF_BUDDY_STATE_PARTIALLY_USED:
+ /* Some sub-blocks in the buddy are still in use.
+ Relocation will fail. No need to try. */
+ break;
+ }
+
+func_exit:
+ /* Free the block to the buddy list. */
+ buf_buddy_add_to_free(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+}
+
+/** Try to reallocate a block.
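+A replacement block of the same size is allocated from zip_free[] or,
+failing that, carved out of a frame from buf_pool.free; the contents of
+buf are then moved to it with buf_buddy_relocate(force=true), and
+whichever block ends up unused is released via buf_buddy_free_low().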
+@param[in] buf buf_pool block to be reallocated
+@param[in] size block size, up to srv_page_size
+@return whether the reallocation succeeded */
+bool
+buf_buddy_realloc(void* buf, ulint size)
+{
+ buf_block_t* block = NULL;
+ ulint i = buf_buddy_get_slot(size);
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ if (i < BUF_BUDDY_SIZES) {
+ /* Try to allocate from the buddy system. */
+ block = reinterpret_cast<buf_block_t*>(buf_buddy_alloc_zip(i));
+ }
+
+ if (block == NULL) {
+ /* Try allocating from the buf_pool.free list. */
+ block = buf_LRU_get_free_only();
+
+ if (block == NULL) {
+ return(false); /* free_list was not enough */
+ }
+
+ buf_buddy_block_register(block);
+
+ block = reinterpret_cast<buf_block_t*>(
+ buf_buddy_alloc_from(
+ block->page.frame, i, BUF_BUDDY_SIZES));
+ }
+
+ buf_pool.buddy_stat[i].used++;
+
+ /* Try to relocate the buddy of buf to the free block. */
+ if (buf_buddy_relocate(buf, block, i, true)) {
+ /* succeeded */
+ buf_buddy_free_low(buf, i);
+ } else {
+ /* failed */
+ buf_buddy_free_low(block, i);
+ }
+
+ return(true); /* free_list was enough */
+}
+
+/** Combine all pairs of free buddies. */
+void buf_buddy_condense_free()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.is_shrinking());
+
+ for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) {
+ buf_buddy_free_t* buf =
+ UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+
+ /* seek to withdraw target */
+ while (buf != NULL
+ && !buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(buf))) {
+ buf = UT_LIST_GET_NEXT(list, buf);
+ }
+
+ while (buf != NULL) {
+ buf_buddy_free_t* next =
+ UT_LIST_GET_NEXT(list, buf);
+
+ buf_buddy_free_t* buddy =
+ reinterpret_cast<buf_buddy_free_t*>(
+ buf_buddy_get(
+ reinterpret_cast<byte*>(buf),
+ BUF_BUDDY_LOW << i));
+
+ /* seek to the next withdraw target */
+ while (true) {
+ while (next != NULL
+ && !buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(next))) {
+ next = UT_LIST_GET_NEXT(list, next);
+ }
+
+ if (buddy != next) {
+ break;
+ }
+
+ next = UT_LIST_GET_NEXT(list, next);
+ }
+
+ if (buf_buddy_is_free(buddy, i)
+ == BUF_BUDDY_STATE_FREE) {
+ /* Both buf and buddy are free.
+ Try to combine them. */
+ buf_buddy_remove_from_free(buf, i);
+ buf_pool.buddy_stat[i].used++;
+
+ buf_buddy_free_low(buf, i);
+ }
+
+ buf = next;
+ }
+ }
+}
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
new file mode 100644
index 00000000..8ef18ee0
--- /dev/null
+++ b/storage/innobase/buf/buf0buf.cc
@@ -0,0 +1,4180 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0buf.cc +The database buffer buf_pool + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "assume_aligned.h" +#include "mtr0types.h" +#include "mach0data.h" +#include "buf0checksum.h" +#include "mariadb_stats.h" +#include + +#ifdef UNIV_INNOCHECKSUM +# include "my_sys.h" +# include "buf0buf.h" +#else +#include "my_cpu.h" +#include "mem0mem.h" +#include "btr0btr.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "buf0rea.h" +#include "buf0flu.h" +#include "buf0buddy.h" +#include "buf0dblwr.h" +#include "lock0lock.h" +#include "btr0sea.h" +#include "ibuf0ibuf.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "log0log.h" +#include "dict0stats_bg.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "dict0dict.h" +#include "log0recv.h" +#include "srv0mon.h" +#include "log0crypt.h" +#include "fil0pagecompress.h" +#endif /* !UNIV_INNOCHECKSUM */ +#include "page0zip.h" +#include "buf0dump.h" +#include +#include +#include "log.h" + +using st_::span; + +#ifdef HAVE_LIBNUMA +#include +#include +struct set_numa_interleave_t +{ + set_numa_interleave_t() + { + if (srv_numa_interleave) { + + struct bitmask *numa_mems_allowed = numa_get_mems_allowed(); + ib::info() << "Setting NUMA memory policy to" + " MPOL_INTERLEAVE"; + if (set_mempolicy(MPOL_INTERLEAVE, + numa_mems_allowed->maskp, + numa_mems_allowed->size) != 0) { + + ib::warn() << "Failed to set NUMA memory" + " policy to MPOL_INTERLEAVE: " + << strerror(errno); + } + numa_bitmask_free(numa_mems_allowed); + } + } + + ~set_numa_interleave_t() + { + if (srv_numa_interleave) { + + ib::info() << "Setting NUMA memory policy to" + " MPOL_DEFAULT"; + if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) { + ib::warn() << "Failed to set NUMA memory" + " policy to MPOL_DEFAULT: " + << strerror(errno); + } + } + } +}; + +#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa +#else +#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE +#endif /* HAVE_LIBNUMA */ + +/* + IMPLEMENTATION OF THE BUFFER POOL + ================================= + + Buffer frames and blocks + ------------------------ +Following the terminology of Gray and Reuter, we call the memory +blocks where file pages are loaded buffer frames. For each buffer +frame there is a control block, or shortly, a block, in the buffer +control array. The control info which does not need to be stored +in the file along with the file page, resides in the control block. + + Buffer pool struct + ------------------ +The buffer buf_pool contains a single mutex which protects all the +control data structures of the buf_pool. The content of a buffer frame is +protected by a separate read-write lock in its control block, though. +These locks can be locked and unlocked without owning the buf_pool.mutex. +The OS events in the buf_pool struct can be waited for without owning the +buf_pool.mutex. + +The buf_pool.mutex is a hot-spot in main memory, causing a lot of +memory bus traffic on multiprocessor systems when processors +alternately access the mutex. On our Pentium, the mutex is accessed +maybe every 10 microseconds. 
We gave up the solution to have mutexes +for each control block, for instance, because it seemed to be +complicated. + +A solution to reduce mutex contention of the buf_pool.mutex is to +create a separate mutex for the page hash table. On Pentium, +accessing the hash table takes 2 microseconds, about half +of the total buf_pool.mutex hold time. + + Control blocks + -------------- + +The control block contains, for instance, the bufferfix count +which is incremented when a thread wants a file page to be fixed +in a buffer frame. The bufferfix operation does not lock the +contents of the frame, however. For this purpose, the control +block contains a read-write lock. + +The buffer frames have to be aligned so that the start memory +address of a frame is divisible by the universal page size, which +is a power of two. + +The control blocks containing file pages are put to a hash table +according to the file address of the page. +We could speed up the access to an individual page by using +"pointer swizzling": we could replace the page references on +non-leaf index pages by direct pointers to the page, if it exists +in the buf_pool. We could make a separate hash table where we could +chain all the page references in non-leaf pages residing in the buf_pool, +using the page reference as the hash key, +and at the time of reading of a page update the pointers accordingly. +Drawbacks of this solution are added complexity and, +possibly, extra space required on non-leaf pages for memory pointers. +A simpler solution is just to speed up the hash table mechanism +in the database, using tables whose size is a power of 2. + + Lists of blocks + --------------- + +There are several lists of control blocks. + +The free list (buf_pool.free) contains blocks which are currently not +used. + +The common LRU list contains all the blocks holding a file page +except those for which the bufferfix count is non-zero. +The pages are in the LRU list roughly in the order of the last +access to the page, so that the oldest pages are at the end of the +list. We also keep a pointer to near the end of the LRU list, +which we can use when we want to artificially age a page in the +buf_pool. This is used if we know that some page is not needed +again for some time: we insert the block right after the pointer, +causing it to be replaced sooner than would normally be the case. +Currently this aging mechanism is used for read-ahead mechanism +of pages, and it can also be used when there is a scan of a full +table which cannot fit in the memory. Putting the pages near the +end of the LRU list, we make sure that most of the buf_pool stays +in the main memory, undisturbed. + +The unzip_LRU list contains a subset of the common LRU list. The +blocks on the unzip_LRU list hold a compressed file page and the +corresponding uncompressed page frame. A block is in unzip_LRU if and +only if the predicate block->page.belongs_to_unzip_LRU() +holds. The blocks in unzip_LRU will be in same order as they are in +the common LRU list. That is, each manipulation of the common LRU +list will result in the same manipulation of the unzip_LRU list. + +The chain of modified blocks (buf_pool.flush_list) contains the blocks +holding persistent file pages that have been modified in the memory +but not written to disk yet. The block with the oldest modification +which has not yet been written to disk is at the end of the chain. +The access to this list is protected by buf_pool.flush_list_mutex. 
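+
+To summarize the locking conventions described above: buf_pool.mutex
+protects the free, LRU, unzip_LRU and zip_free[] lists, while
+buf_pool.flush_list_mutex protects only the flush_list.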
+ +The control blocks for uncompressed pages are accessible via +buf_block_t objects that are reachable via buf_pool.chunks[]. +The control blocks (buf_page_t) of those ROW_FORMAT=COMPRESSED pages +that are not in buf_pool.flush_list and for which no uncompressed +page has been allocated in buf_pool are only accessible via +buf_pool.LRU. + +The chains of free memory blocks (buf_pool.zip_free[]) are used by +the buddy allocator (buf0buddy.cc) to keep track of currently unused +memory blocks of size 1024..innodb_page_size / 2. These +blocks are inside the memory blocks of size innodb_page_size and type +BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer +pool. The buddy allocator is solely used for allocating +ROW_FORMAT=COMPRESSED page frames. + + Loading a file page + ------------------- + +First, a victim block for replacement has to be found in the +buf_pool. It is taken from the free list or searched for from the +end of the LRU-list. An exclusive lock is reserved for the frame, +the io_fix is set in the block fixing the block in buf_pool, +and the io-operation for loading the page is queued. The io-handler thread +releases the X-lock on the frame and releases the io_fix +when the io operation completes. + +A thread may request the above operation using the function +buf_page_get(). It may then continue to request a lock on the frame. +The lock is granted when the io-handler releases the x-lock. + + Read-ahead + ---------- + +The read-ahead mechanism is intended to be intelligent and +isolated from the semantically higher levels of the database +index management. From the higher level we only need the +information if a file page has a natural successor or +predecessor page. On the leaf level of a B-tree index, +these are the next and previous pages in the natural +order of the pages. + +Let us first explain the read-ahead mechanism when the leafs +of a B-tree are scanned in an ascending or descending order. +When a read page is the first time referenced in the buf_pool, +the buffer manager checks if it is at the border of a so-called +linear read-ahead area. The tablespace is divided into these +areas of size 64 blocks, for example. So if the page is at the +border of such an area, the read-ahead mechanism checks if +all the other blocks in the area have been accessed in an +ascending or descending order. If this is the case, the system +looks at the natural successor or predecessor of the page, +checks if that is at the border of another area, and in this case +issues read-requests for all the pages in that area. Maybe +we could relax the condition that all the pages in the area +have to be accessed: if data is deleted from a table, there may +appear holes of unused pages in the area. + +A different read-ahead mechanism is used when there appears +to be a random access pattern to a file. +If a new page is referenced in the buf_pool, and several pages +of its random access area (for instance, 32 consecutive pages +in a tablespace) have recently been referenced, we may predict +that the whole area may be needed in the near future, and issue +the read requests for the whole area. +*/ + +#ifndef UNIV_INNOCHECKSUM +# ifdef SUX_LOCK_GENERIC +void page_hash_latch::read_lock_wait() +{ + /* First, try busy spinning for a while. */ + for (auto spin= srv_n_spin_wait_rounds; spin--; ) + { + LF_BACKOFF(); + if (read_trylock()) + return; + } + /* Fall back to yielding to other threads. 
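+After srv_n_spin_wait_rounds fruitless spin rounds it is cheaper to let
+the scheduler run the lock holder than to keep burning CPU cycles.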
*/ + do + std::this_thread::yield(); + while (!read_trylock()); +} + +void page_hash_latch::write_lock_wait() +{ + write_lock_wait_start(); + + /* First, try busy spinning for a while. */ + for (auto spin= srv_n_spin_wait_rounds; spin--; ) + { + if (write_lock_poll()) + return; + LF_BACKOFF(); + } + + /* Fall back to yielding to other threads. */ + do + std::this_thread::yield(); + while (!write_lock_poll()); +} +# endif + +/** Number of attempts made to read in a page in the buffer pool */ +constexpr ulint BUF_PAGE_READ_MAX_RETRIES= 100; +/** The maximum portion of the buffer pool that can be used for the +read-ahead buffer. (Divide buf_pool size by this amount) */ +constexpr uint32_t BUF_READ_AHEAD_PORTION= 32; + +/** A 64KiB buffer of NUL bytes, for use in assertions and checks, +and dummy default values of instantly dropped columns. +Initially, BLOB field references are set to NUL bytes, in +dtuple_convert_big_rec(). */ +const byte *field_ref_zero; + +/** The InnoDB buffer pool */ +buf_pool_t buf_pool; +buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_reg; +buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref; + +#ifdef UNIV_DEBUG +/** This is used to insert validation operations in execution +in the debug version */ +static Atomic_counter buf_dbg_counter; +#endif /* UNIV_DEBUG */ + +/** Macro to determine whether the read of write counter is used depending +on the io_type */ +#define MONITOR_RW_COUNTER(read, counter) \ + (read ? (counter##_READ) : (counter##_WRITTEN)) + +/** Decrypt a page for temporary tablespace. +@param[in,out] tmp_frame Temporary buffer +@param[in] src_frame Page to decrypt +@return true if temporary tablespace decrypted, false if not */ +static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame) +{ + if (buf_is_zeroes(span(src_frame, srv_page_size))) { + return true; + } + + /* read space & lsn */ + uint header_len = FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + + /* Copy FIL page header, it is not encrypted */ + memcpy(tmp_frame, src_frame, header_len); + + /* Calculate the offset where decryption starts */ + const byte* src = src_frame + header_len; + byte* dst = tmp_frame + header_len; + uint srclen = uint(srv_page_size) + - (header_len + FIL_PAGE_FCRC32_CHECKSUM); + ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET); + + if (!log_tmp_block_decrypt(src, srclen, dst, + (offset * srv_page_size))) { + return false; + } + + static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); + memcpy_aligned<4>(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + FIL_PAGE_FCRC32_CHECKSUM); + + memcpy_aligned(src_frame, tmp_frame, + srv_page_size); + srv_stats.pages_decrypted.inc(); + srv_stats.n_temp_blocks_decrypted.inc(); + + return true; /* page was decrypted */ +} + +/** Decrypt a page. +@param[in,out] bpage Page control block +@param[in] node data file +@return whether the operation was successful */ +static bool buf_page_decrypt_after_read(buf_page_t *bpage, + const fil_node_t &node) +{ + ut_ad(node.space->referenced()); + ut_ad(node.space->id == bpage->id().space()); + const auto flags = node.space->flags; + + byte* dst_frame = bpage->zip.data ? 
bpage->zip.data : bpage->frame; + bool page_compressed = node.space->is_compressed() + && buf_page_is_compressed(dst_frame, flags); + const page_id_t id(bpage->id()); + + if (id.page_no() == 0) { + /* File header pages are not encrypted/compressed */ + return (true); + } + + buf_tmp_buffer_t* slot; + + if (id.space() == SRV_TMP_SPACE_ID + && innodb_encrypt_temporary_tables) { + slot = buf_pool.io_buf_reserve(); + slot->allocate(); + bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame); + slot->release(); + return ok; + } + + /* Page is encrypted if encryption information is found from + tablespace and page contains used key_version. This is true + also for pages first compressed and then encrypted. */ + + uint key_version = buf_page_get_key_version(dst_frame, flags); + + if (page_compressed && !key_version) { + /* the page we read is unencrypted */ + /* Find free slot from temporary memory array */ +decompress: + if (fil_space_t::full_crc32(flags) + && buf_page_is_corrupted(true, dst_frame, flags)) { + return false; + } + + slot = buf_pool.io_buf_reserve(); + slot->allocate(); + +decompress_with_slot: + ulint write_size = fil_page_decompress( + slot->crypt_buf, dst_frame, flags); + slot->release(); + ut_ad(node.space->referenced()); + return write_size != 0; + } + + if (key_version && node.space->crypt_data) { + /* Verify encryption checksum before we even try to + decrypt. */ + if (!buf_page_verify_crypt_checksum(dst_frame, flags)) { +decrypt_failed: + ib::error() << "Encrypted page " << id + << " in file " << node.name + << " looks corrupted; key_version=" + << key_version; + return false; + } + + slot = buf_pool.io_buf_reserve(); + slot->allocate(); + + /* decrypt using crypt_buf to dst_frame */ + if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) { + slot->release(); + goto decrypt_failed; + } + + if ((fil_space_t::full_crc32(flags) && page_compressed) + || fil_page_get_type(dst_frame) + == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { + goto decompress_with_slot; + } + + slot->release(); + } else if (fil_page_get_type(dst_frame) + == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { + goto decompress; + } + + ut_ad(node.space->referenced()); + return true; +} +#endif /* !UNIV_INNOCHECKSUM */ + +/** Checks if the page is in crc32 checksum format. +@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in crc32 checksum format. */ +static +bool +buf_page_is_checksum_valid_crc32( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) +{ + const uint32_t crc32 = buf_calc_page_crc32(read_buf); + +#ifdef UNIV_INNOCHECKSUM + extern FILE* log_file; + extern uint32_t cur_page_num; + if (log_file) { + fprintf(log_file, "page::" UINT32PF ";" + " crc32 calculated = " UINT32PF ";" + " recorded checksum field1 = " ULINTPF " recorded" + " checksum field2 =" ULINTPF "\n", cur_page_num, + crc32, checksum_field1, checksum_field2); + } +#endif /* UNIV_INNOCHECKSUM */ + + if (checksum_field1 != checksum_field2) { + return false; + } + + return checksum_field1 == crc32; +} + +/** Checks whether the lsn present in the page is lesser than the +peek current lsn. +@param[in] check_lsn lsn to check +@param[in] read_buf page. 
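+The check is performed only when check_lsn is true and
+recv_lsn_checks_on is set; in UNIV_INNOCHECKSUM builds it is a no-op.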
*/ +static void buf_page_check_lsn(bool check_lsn, const byte* read_buf) +{ +#ifndef UNIV_INNOCHECKSUM + if (check_lsn && recv_lsn_checks_on) { + const lsn_t current_lsn = log_sys.get_lsn(); + const lsn_t page_lsn + = mach_read_from_8(read_buf + FIL_PAGE_LSN); + + /* Since we are going to reset the page LSN during the import + phase it makes no sense to spam the log with error messages. */ + if (current_lsn < page_lsn) { + + const uint32_t space_id = mach_read_from_4( + read_buf + FIL_PAGE_SPACE_ID); + const uint32_t page_no = mach_read_from_4( + read_buf + FIL_PAGE_OFFSET); + + ib::error() << "Page " << page_id_t(space_id, page_no) + << " log sequence number " << page_lsn + << " is in the future! Current system" + << " log sequence number " + << current_lsn << "."; + + ib::error() << "Your database may be corrupt or" + " you may have copied the InnoDB" + " tablespace but not the InnoDB" + " log files. " + << FORCE_RECOVERY_MSG; + + } + } +#endif /* !UNIV_INNOCHECKSUM */ +} + + +/** Check if a buffer is all zeroes. +@param[in] buf data to check +@return whether the buffer is all zeroes */ +bool buf_is_zeroes(span buf) +{ + ut_ad(buf.size() <= UNIV_PAGE_SIZE_MAX); + return memcmp(buf.data(), field_ref_zero, buf.size()) == 0; +} + +/** Check if a page is corrupt. +@param check_lsn whether FIL_PAGE_LSN should be checked +@param read_buf database page +@param fsp_flags contents of FIL_SPACE_FLAGS +@return whether the page is corrupted */ +bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf, + uint32_t fsp_flags) +{ + if (fil_space_t::full_crc32(fsp_flags)) { + bool compressed = false, corrupted = false; + const uint size = buf_page_full_crc32_size( + read_buf, &compressed, &corrupted); + if (corrupted) { + return true; + } + const byte* end = read_buf + (size - FIL_PAGE_FCRC32_CHECKSUM); + uint crc32 = mach_read_from_4(end); + + if (!crc32 && size == srv_page_size + && buf_is_zeroes(span(read_buf, size))) { + return false; + } + + DBUG_EXECUTE_IF( + "page_intermittent_checksum_mismatch", { + static int page_counter; + if (page_counter++ == 3) { + crc32++; + } + }); + + if (crc32 != my_crc32c(0, read_buf, + size - FIL_PAGE_FCRC32_CHECKSUM)) { + return true; + } + static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "alignment"); + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + if (!compressed + && !mach_read_from_4(FIL_PAGE_FCRC32_KEY_VERSION + + read_buf) + && memcmp_aligned<4>(read_buf + (FIL_PAGE_LSN + 4), + end - (FIL_PAGE_FCRC32_END_LSN + - FIL_PAGE_FCRC32_CHECKSUM), + 4)) { + return true; + } + + buf_page_check_lsn(check_lsn, read_buf); + return false; + } + + const ulint zip_size = fil_space_t::zip_size(fsp_flags); + const uint16_t page_type = fil_page_get_type(read_buf); + + /* We can trust page type if page compression is set on tablespace + flags because page compression flag means file must have been + created with 10.1 (later than 5.5 code base). In 10.1 page + compressed tables do not contain post compression checksum and + FIL_PAGE_END_LSN_OLD_CHKSUM field stored. Note that space can + be null if we are in fil_check_first_page() and first page + is not compressed or encrypted. Page checksum is verified + after decompression (i.e. normally pages are already + decompressed at this stage). 
*/ + if ((page_type == FIL_PAGE_PAGE_COMPRESSED || + page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) +#ifndef UNIV_INNOCHECKSUM + && FSP_FLAGS_HAS_PAGE_COMPRESSION(fsp_flags) +#endif + ) { + return(false); + } + + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 4 == 0, "alignment"); + + if (!zip_size + && memcmp_aligned<4>(read_buf + FIL_PAGE_LSN + 4, + read_buf + srv_page_size + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { + /* Stored log sequence numbers at the start and the end + of page do not match */ + + return(true); + } + + buf_page_check_lsn(check_lsn, read_buf); + + /* Check whether the checksum fields have correct values */ + + if (zip_size) { + return !page_zip_verify_checksum(read_buf, zip_size); + } + + const uint32_t checksum_field1 = mach_read_from_4( + read_buf + FIL_PAGE_SPACE_OR_CHKSUM); + + const uint32_t checksum_field2 = mach_read_from_4( + read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM); + + static_assert(FIL_PAGE_LSN % 8 == 0, "alignment"); + + /* A page filled with NUL bytes is considered not corrupted. + Before MariaDB Server 10.1.25 (MDEV-12113) or 10.2.2 (or MySQL 5.7), + the FIL_PAGE_FILE_FLUSH_LSN field may have been written nonzero + for the first page of each file of the system tablespace. + We want to ignore it for the system tablespace, but because + we do not know the expected tablespace here, we ignore the + field for all data files, except for + innodb_checksum_algorithm=full_crc32 which we handled above. */ + if (!checksum_field1 && !checksum_field2) { + /* Checksum fields can have valid value as zero. + If the page is not empty then do the checksum + calculation for the page. */ + bool all_zeroes = true; + for (size_t i = 0; i < srv_page_size; i++) { +#ifndef UNIV_INNOCHECKSUM + if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) { + i += 8; + } +#endif + if (read_buf[i]) { + all_zeroes = false; + break; + } + } + + if (all_zeroes) { + return false; + } + } + +#ifndef UNIV_INNOCHECKSUM + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: +#endif /* !UNIV_INNOCHECKSUM */ + return !buf_page_is_checksum_valid_crc32( + read_buf, checksum_field1, checksum_field2); +#ifndef UNIV_INNOCHECKSUM + default: + if (checksum_field1 == BUF_NO_CHECKSUM_MAGIC + && checksum_field2 == BUF_NO_CHECKSUM_MAGIC) { + return false; + } + + const uint32_t crc32 = buf_calc_page_crc32(read_buf); + + /* Very old versions of InnoDB only stored 8 byte lsn to the + start and the end of the page. 
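+ For such pages the old checksum field at the end of the page may
+ hold the most significant four bytes of FIL_PAGE_LSN rather than
+ a checksum, which the comparison against
+ mach_read_from_4(read_buf + FIL_PAGE_LSN) below accepts.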
*/ + + /* Since innodb_checksum_algorithm is not strict_* allow + any of the algos to match for the old field */ + + if (checksum_field2 + != mach_read_from_4(read_buf + FIL_PAGE_LSN) + && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) { + + DBUG_EXECUTE_IF( + "page_intermittent_checksum_mismatch", { + static int page_counter; + if (page_counter++ == 3) return true; + }); + + if ((checksum_field1 != crc32 + || checksum_field2 != crc32) + && checksum_field2 + != buf_calc_page_old_checksum(read_buf)) { + return true; + } + } + + switch (checksum_field1) { + case 0: + case BUF_NO_CHECKSUM_MAGIC: + return false; + } + return (checksum_field1 != crc32 || checksum_field2 != crc32) + && checksum_field1 + != buf_calc_page_new_checksum(read_buf); + } +#endif /* !UNIV_INNOCHECKSUM */ +} + +#ifndef UNIV_INNOCHECKSUM + +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) +/** Enable buffers to be dumped to core files + +A convience function, not called anyhwere directly however +it is left available for gdb or any debugger to call +in the event that you want all of the memory to be dumped +to a core file. + +Returns number of errors found in madvise calls. */ +MY_ATTRIBUTE((used)) +int +buf_madvise_do_dump() +{ + int ret= 0; + + /* mirrors allocation in log_t::create() */ + if (log_sys.buf) { + ret += madvise(log_sys.buf, log_sys.buf_size, MADV_DODUMP); + ret += madvise(log_sys.flush_buf, log_sys.buf_size, + MADV_DODUMP); + } + + mysql_mutex_lock(&buf_pool.mutex); + auto chunk = buf_pool.chunks; + + for (ulint n = buf_pool.n_chunks; n--; chunk++) { + ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP); + } + + mysql_mutex_unlock(&buf_pool.mutex); + return ret; +} +#endif + +#ifndef UNIV_DEBUG +static inline byte hex_to_ascii(byte hex_digit) +{ + const int offset= hex_digit <= 9 ? '0' : 'a' - 10; + return byte(hex_digit + offset); +} +#endif + +/** Dump a page to stderr. +@param[in] read_buf database page +@param[in] zip_size compressed page size, or 0 */ +ATTRIBUTE_COLD +void buf_page_print(const byte *read_buf, ulint zip_size) +{ +#ifndef UNIV_DEBUG + const size_t size = zip_size ? zip_size : srv_page_size; + const byte * const end= read_buf + size; + sql_print_information("InnoDB: Page dump (%zu bytes):", size); + + do + { + byte row[64]; + + for (byte *r= row; r != &row[64]; r+= 2, read_buf++) + { + r[0]= hex_to_ascii(byte(*read_buf >> 4)); + r[1]= hex_to_ascii(*read_buf & 15); + } + + sql_print_information("InnoDB: %.*s", 64, row); + } + while (read_buf != end); + + sql_print_information("InnoDB: End of page dump"); +#endif +} + +/** Initialize a buffer page descriptor. +@param[in,out] block buffer page descriptor +@param[in] frame buffer page frame */ +static +void +buf_block_init(buf_block_t* block, byte* frame) +{ + /* This function should only be executed at database startup or by + buf_pool.resize(). Either way, adaptive hash index must not exist. 
*/ + assert_block_ahi_empty_on_init(block); + + block->page.frame = frame; + + MEM_MAKE_DEFINED(&block->modify_clock, sizeof block->modify_clock); + ut_ad(!block->modify_clock); + MEM_MAKE_DEFINED(&block->page.lock, sizeof block->page.lock); + block->page.init(buf_page_t::NOT_USED, page_id_t(~0ULL)); +#ifdef BTR_CUR_HASH_ADAPT + MEM_MAKE_DEFINED(&block->index, sizeof block->index); + ut_ad(!block->index); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(block->in_unzip_LRU_list = false); + ut_d(block->in_withdraw_list = false); + + page_zip_des_init(&block->page.zip); + + MEM_MAKE_DEFINED(&block->page.hash, sizeof block->page.hash); + ut_ad(!block->page.hash); +} + +/** Allocate a chunk of buffer frames. +@param bytes requested size +@return whether the allocation succeeded */ +inline bool buf_pool_t::chunk_t::create(size_t bytes) +{ + DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;); + /* Round down to a multiple of page size, although it already should be. */ + bytes= ut_2pow_round(bytes, srv_page_size); + + mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx); + + if (UNIV_UNLIKELY(!mem)) + return false; + + MEM_UNDEFINED(mem, mem_size()); + +#ifdef HAVE_LIBNUMA + if (srv_numa_interleave) + { + struct bitmask *numa_mems_allowed= numa_get_mems_allowed(); + if (mbind(mem, mem_size(), MPOL_INTERLEAVE, + numa_mems_allowed->maskp, numa_mems_allowed->size, + MPOL_MF_MOVE)) + { + ib::warn() << "Failed to set NUMA memory policy of" + " buffer pool page frames to MPOL_INTERLEAVE" + " (error: " << strerror(errno) << ")."; + } + numa_bitmask_free(numa_mems_allowed); + } +#endif /* HAVE_LIBNUMA */ + + + /* Allocate the block descriptors from + the start of the memory block. */ + blocks= reinterpret_cast(mem); + + /* Align a pointer to the first frame. Note that when + opt_large_page_size is smaller than srv_page_size, + (with max srv_page_size at 64k don't think any hardware + makes this true), + we may allocate one fewer block than requested. When + it is bigger, we may allocate more blocks than requested. */ + static_assert(sizeof(byte*) == sizeof(ulint), "pointer size"); + + byte *frame= reinterpret_cast((reinterpret_cast(mem) + + srv_page_size - 1) & + ~ulint{srv_page_size - 1}); + size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem); + + /* Subtract the space needed for block descriptors. */ + { + ulint s= size; + + while (frame < reinterpret_cast(blocks + s)) + { + frame+= srv_page_size; + s--; + } + + size= s; + } + + /* Init block structs and assign frames for them. Then we assign the + frames to the first blocks (we already mapped the memory above). */ + + buf_block_t *block= blocks; + + for (auto i= size; i--; ) { + buf_block_init(block, frame); + MEM_UNDEFINED(block->page.frame, srv_page_size); + /* Add the block to the free list */ + UT_LIST_ADD_LAST(buf_pool.free, &block->page); + + ut_d(block->page.in_free_list = TRUE); + block++; + frame+= srv_page_size; + } + + reg(); + + return true; +} + +#ifdef UNIV_DEBUG +/** Check that all file pages in the buffer chunk are in a replaceable state. +@return address of a non-free block +@retval nullptr if all freed */ +inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const +{ + buf_block_t *block= blocks; + for (auto i= size; i--; block++) + { + if (block->page.in_file()) + { + /* The uncompressed buffer pool should never + contain ROW_FORMAT=COMPRESSED block descriptors. 
*/ + ut_ad(block->page.frame); + const lsn_t lsn= block->page.oldest_modification(); + + if (srv_read_only_mode) + { + /* The page cleaner is disabled in read-only mode. No pages + can be dirtied, so all of them must be clean. */ + ut_ad(lsn == 0 || lsn == recv_sys.lsn || + srv_force_recovery == SRV_FORCE_NO_LOG_REDO); + break; + } + + if (fsp_is_system_temporary(block->page.id().space())) + { + ut_ad(lsn == 0 || lsn == 2); + break; + } + + if (lsn > 1 || !block->page.can_relocate()) + return block; + + break; + } + } + + return nullptr; +} +#endif /* UNIV_DEBUG */ + +/** Create the hash table. +@param n the lower bound of n_cells */ +void buf_pool_t::page_hash_table::create(ulint n) +{ + n_cells= ut_find_prime(n); + const size_t size= MY_ALIGN(pad(n_cells) * sizeof *array, + CPU_LEVEL1_DCACHE_LINESIZE); + void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE); + memset_aligned(v, 0, size); + array= static_cast(v); +} + +/** Create the buffer pool. +@return whether the creation failed */ +bool buf_pool_t::create() +{ + ut_ad(this == &buf_pool); + ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0); + ut_ad(!is_initialised()); + ut_ad(srv_buf_pool_size > 0); + ut_ad(!resizing); + ut_ad(!chunks_old); + /* mariabackup loads tablespaces, and it requires field_ref_zero to be + allocated before innodb initialization */ + ut_ad(srv_operation >= SRV_OPERATION_RESTORE || !field_ref_zero); + + NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; + + if (!field_ref_zero) { + if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096)) + field_ref_zero= static_cast + (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX)); + else + return true; + } + + chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map()); + + new(&allocator) ut_allocator(mem_key_buf_buf_pool); + + n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit; + const size_t chunk_size= srv_buf_pool_chunk_unit; + + chunks= static_cast(ut_zalloc_nokey(n_chunks * sizeof *chunks)); + UT_LIST_INIT(free, &buf_page_t::list); + curr_size= 0; + auto chunk= chunks; + + do + { + if (!chunk->create(chunk_size)) + { + while (--chunk >= chunks) + { + buf_block_t* block= chunk->blocks; + + for (auto i= chunk->size; i--; block++) + block->page.lock.free(); + + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + } + ut_free(chunks); + chunks= nullptr; + UT_DELETE(chunk_t::map_reg); + chunk_t::map_reg= nullptr; + aligned_free(const_cast(field_ref_zero)); + field_ref_zero= nullptr; + ut_ad(!is_initialised()); + return true; + } + + curr_size+= chunk->size; + } + while (++chunk < chunks + n_chunks); + + ut_ad(is_initialised()); +#if defined(__aarch64__) + mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST); +#else + mysql_mutex_init(buf_pool_mutex_key, &mutex, nullptr); +#endif + + UT_LIST_INIT(LRU, &buf_page_t::LRU); + UT_LIST_INIT(withdraw, &buf_page_t::list); + withdraw_target= 0; + UT_LIST_INIT(flush_list, &buf_page_t::list); + UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU); + + for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i) + UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list); + ulint s= curr_size; + s/= BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(static_cast(s)); + curr_pool_size= srv_buf_pool_size; + + n_chunks_new= n_chunks; + + page_hash.create(2 * curr_size); + zip_hash.create(2 * curr_size); + last_printout_time= time(NULL); + + mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex, + MY_MUTEX_INIT_FAST); + + pthread_cond_init(&done_flush_LRU, nullptr); + pthread_cond_init(&done_flush_list, nullptr); + pthread_cond_init(&do_flush_list, nullptr); + pthread_cond_init(&done_free, nullptr); + + try_LRU_scan= true; + + ut_d(flush_hp.m_mutex= &flush_list_mutex;); + ut_d(lru_hp.m_mutex= &mutex); + ut_d(lru_scan_itr.m_mutex= &mutex); + + io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) * + OS_AIO_N_PENDING_IOS_PER_THREAD); + + /* FIXME: remove some of these variables */ + srv_buf_pool_curr_size= curr_pool_size; + srv_buf_pool_old_size= srv_buf_pool_size; + srv_buf_pool_base_size= srv_buf_pool_size; + + last_activity_count= srv_get_activity_count(); + + chunk_t::map_ref= chunk_t::map_reg; + buf_LRU_old_ratio_update(100 * 3 / 8, false); + btr_search_sys_create(); + ut_ad(is_initialised()); + return false; +} + +/** Clean up after successful create() */ +void buf_pool_t::close() +{ + ut_ad(this == &buf_pool); + if (!is_initialised()) + return; + + mysql_mutex_destroy(&mutex); + mysql_mutex_destroy(&flush_list_mutex); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage; + bpage= prev_bpage) + { + prev_bpage= UT_LIST_GET_PREV(LRU, bpage); + ut_ad(bpage->in_file()); + ut_ad(bpage->in_LRU_list); + /* The buffer pool must be clean during normal shutdown. + Only on aborted startup (with recovery) or with innodb_fast_shutdown=2 + we may discard changes. */ + ut_d(const lsn_t oldest= bpage->oldest_modification();) + ut_ad(fsp_is_system_temporary(bpage->id().space()) + ? (oldest == 0 || oldest == 2) + : oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2); + + if (UNIV_UNLIKELY(!bpage->frame)) + { + bpage->lock.free(); + ut_free(bpage); + } + } + + for (auto chunk= chunks + n_chunks; --chunk >= chunks; ) + { + buf_block_t *block= chunk->blocks; + + for (auto i= chunk->size; i--; block++) + block->page.lock.free(); + + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + } + + pthread_cond_destroy(&done_flush_LRU); + pthread_cond_destroy(&done_flush_list); + pthread_cond_destroy(&do_flush_list); + pthread_cond_destroy(&done_free); + + ut_free(chunks); + chunks= nullptr; + page_hash.free(); + zip_hash.free(); + + io_buf.close(); + UT_DELETE(chunk_t::map_reg); + chunk_t::map_reg= chunk_t::map_ref= nullptr; + aligned_free(const_cast(field_ref_zero)); + field_ref_zero= nullptr; +} + +/** Try to reallocate a control block. 
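+On success the page is copied into a block taken from buf_pool.free and
+the LRU, unzip_LRU, flush_list and page_hash references are moved to the
+new block; the old block is then freed as a non-file page.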
+@param block control block to reallocate +@return whether the reallocation succeeded */ +inline bool buf_pool_t::realloc(buf_block_t *block) +{ + buf_block_t* new_block; + + mysql_mutex_assert_owner(&mutex); + ut_ad(block->page.in_file()); + ut_ad(block->page.frame); + + new_block = buf_LRU_get_free_only(); + + if (new_block == NULL) { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + page_cleaner_wakeup(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + return(false); /* free list was not enough */ + } + + const page_id_t id{block->page.id()}; + hash_chain& chain = page_hash.cell_get(id.fold()); + page_hash_latch& hash_lock = page_hash.lock_get(chain); + /* It does not make sense to use transactional_lock_guard + here, because copying innodb_page_size (4096 to 65536) bytes + as well as other changes would likely make the memory + transaction too large. */ + hash_lock.lock(); + + if (block->page.can_relocate()) { + memcpy_aligned( + new_block->page.frame, block->page.frame, + srv_page_size); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const auto frame = new_block->page.frame; + new_block->page.lock.free(); + new (&new_block->page) buf_page_t(block->page); + new_block->page.frame = frame; + + /* relocate LRU list */ + if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) { + UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page); + } else { + UT_LIST_ADD_FIRST(LRU, &new_block->page); + } + + if (LRU_old == &block->page) { + LRU_old = &new_block->page; + } + + ut_ad(new_block->page.in_LRU_list); + + /* relocate unzip_LRU list */ + if (block->page.zip.data != NULL) { + ut_ad(block->in_unzip_LRU_list); + ut_d(new_block->in_unzip_LRU_list = true); + + buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); + UT_LIST_REMOVE(unzip_LRU, block); + + ut_d(block->in_unzip_LRU_list = false); + block->page.zip.data = NULL; + page_zip_set_size(&block->page.zip, 0); + + if (prev_block != NULL) { + UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block); + } else { + UT_LIST_ADD_FIRST(unzip_LRU, new_block); + } + } else { + ut_ad(!block->in_unzip_LRU_list); + ut_d(new_block->in_unzip_LRU_list = false); + } + + /* relocate page_hash */ + hash_chain& chain = page_hash.cell_get(id.fold()); + ut_ad(&block->page == page_hash.get(id, chain)); + buf_pool.page_hash.replace(chain, &block->page, + &new_block->page); + buf_block_modify_clock_inc(block); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + memset_aligned<4>(block->page.frame + + FIL_PAGE_OFFSET, 0xff, 4); + static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, + "not perfect alignment"); + memset_aligned<2>(block->page.frame + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); + MEM_UNDEFINED(block->page.frame, srv_page_size); + block->page.set_state(buf_page_t::REMOVE_HASH); + if (!fsp_is_system_temporary(id.space())) { + buf_flush_relocate_on_flush_list(&block->page, + &new_block->page); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + block->page.set_corrupt_id(); + + /* set other flags of buf_block_t */ + +#ifdef BTR_CUR_HASH_ADAPT + /* This code should only be executed by resize(), + while the adaptive hash index is disabled. 
*/ + assert_block_ahi_empty(block); + assert_block_ahi_empty_on_init(new_block); + ut_ad(!block->index); + new_block->index = NULL; + new_block->n_hash_helps = 0; + new_block->n_fields = 1; + new_block->left_side = TRUE; +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(block->page.set_state(buf_page_t::MEMORY)); + /* free block */ + new_block = block; + } + + hash_lock.unlock(); + buf_LRU_block_free_non_file_page(new_block); + return(true); /* free_list was enough */ +} + +void buf_pool_t::io_buf_t::create(ulint n_slots) +{ + this->n_slots= n_slots; + slots= static_cast + (ut_malloc_nokey(n_slots * sizeof *slots)); + memset((void*) slots, 0, n_slots * sizeof *slots); +} + +void buf_pool_t::io_buf_t::close() +{ + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + { + aligned_free(s->crypt_buf); + aligned_free(s->comp_buf); + } + ut_free(slots); + slots= nullptr; + n_slots= 0; +} + +buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve() +{ + for (;;) + { + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + if (s->acquire()) + return s; + os_aio_wait_until_no_pending_writes(true); + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + if (s->acquire()) + return s; + os_aio_wait_until_no_pending_reads(true); + } +} + +/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status +to the specified string. The format and the following parameters are the +same as the ones used for printf(3). +@param[in] fmt format +@param[in] ... extra parameters according to fmt */ +static +void +buf_resize_status( + const char* fmt, + ...) +{ + va_list ap; + + va_start(ap, fmt); + + vsnprintf( + export_vars.innodb_buffer_pool_resize_status, + sizeof(export_vars.innodb_buffer_pool_resize_status), + fmt, ap); + + va_end(ap); + + ib::info() << export_vars.innodb_buffer_pool_resize_status; +} + +/** Withdraw blocks from the buffer pool until meeting withdraw_target. 
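+Each loop iteration moves blocks of the withdrawn area from buf_pool.free
+to the withdraw list, issues an LRU flush to refill the free list, and
+relocates frames and compressed-page buddies that still reside in the
+area; after 10 iterations the function gives up and asks to be retried.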
+@return whether retry is needed */ +inline bool buf_pool_t::withdraw_blocks() +{ + buf_block_t* block; + ulint loop_count = 0; + + ib::info() << "Start to withdraw the last " + << withdraw_target << " blocks."; + + while (UT_LIST_GET_LEN(withdraw) < withdraw_target) { + + /* try to withdraw from free_list */ + ulint count1 = 0; + + mysql_mutex_lock(&mutex); + buf_buddy_condense_free(); + block = reinterpret_cast( + UT_LIST_GET_FIRST(free)); + while (block != NULL + && UT_LIST_GET_LEN(withdraw) < withdraw_target) { + ut_ad(block->page.in_free_list); + ut_ad(!block->page.oldest_modification()); + ut_ad(!block->page.in_LRU_list); + ut_a(!block->page.in_file()); + + buf_block_t* next_block; + next_block = reinterpret_cast( + UT_LIST_GET_NEXT( + list, &block->page)); + + if (will_be_withdrawn(block->page)) { + /* This should be withdrawn */ + UT_LIST_REMOVE(free, &block->page); + UT_LIST_ADD_LAST(withdraw, &block->page); + ut_d(block->in_withdraw_list = true); + count1++; + } + + block = next_block; + } + + /* reserve free_list length */ + if (UT_LIST_GET_LEN(withdraw) < withdraw_target) { + buf_flush_LRU( + std::max(withdraw_target + - UT_LIST_GET_LEN(withdraw), + srv_LRU_scan_depth), + true); + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait_LRU_batch_end(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_lock(&buf_pool.mutex); + } + + /* relocate blocks/buddies in withdrawn area */ + ulint count2 = 0; + + buf_pool_mutex_exit_forbid(); + for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage; + bpage; bpage = next_bpage) { + ut_ad(bpage->in_file()); + next_bpage = UT_LIST_GET_NEXT(LRU, bpage); + if (UNIV_LIKELY_NULL(bpage->zip.data) + && will_be_withdrawn(bpage->zip.data) + && bpage->can_relocate()) { + if (!buf_buddy_realloc( + bpage->zip.data, + page_zip_get_size(&bpage->zip))) { + /* failed to allocate block */ + break; + } + count2++; + if (bpage->frame) { + goto realloc_frame; + } + } + + if (bpage->frame && will_be_withdrawn(*bpage) + && bpage->can_relocate()) { +realloc_frame: + if (!realloc(reinterpret_cast( + bpage))) { + /* failed to allocate block */ + break; + } + count2++; + } + } + buf_pool_mutex_exit_allow(); + mysql_mutex_unlock(&mutex); + + buf_resize_status( + "Withdrawing blocks. (" ULINTPF "/" ULINTPF ").", + UT_LIST_GET_LEN(withdraw), + withdraw_target); + + ib::info() << "Withdrew " + << count1 << " blocks from free list." + << " Tried to relocate " << count2 << " blocks (" + << UT_LIST_GET_LEN(withdraw) << "/" + << withdraw_target << ")."; + + if (++loop_count >= 10) { + /* give up for now. + retried after user threads paused. 
*/ + + ib::info() << "will retry to withdraw later"; + + /* need retry later */ + return(true); + } + } + + /* confirm withdrawn enough */ + for (const chunk_t* chunk = chunks + n_chunks_new, + * const echunk = chunks + n_chunks; chunk != echunk; chunk++) { + block = chunk->blocks; + for (ulint j = chunk->size; j--; block++) { + ut_a(block->page.state() == buf_page_t::NOT_USED); + ut_ad(block->in_withdraw_list); + } + } + + ib::info() << "Withdrawn target: " << UT_LIST_GET_LEN(withdraw) + << " blocks."; + + return(false); +} + + + +inline void buf_pool_t::page_hash_table::write_lock_all() +{ + for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + { + reinterpret_cast(array[n]).lock(); + if (!n) + break; + } +} + + +inline void buf_pool_t::page_hash_table::write_unlock_all() +{ + for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + { + reinterpret_cast(array[n]).unlock(); + if (!n) + break; + } +} + + +namespace +{ + +struct find_interesting_trx +{ + void operator()(const trx_t &trx) + { + if (trx.state == TRX_STATE_NOT_STARTED) + return; + if (trx.mysql_thd == nullptr) + return; + if (withdraw_started <= trx.start_time_micro) + return; + + if (!found) + { + ib::warn() << "The following trx might hold " + "the blocks in buffer pool to " + "be withdrawn. Buffer pool " + "resizing can complete only " + "after all the transactions " + "below release the blocks."; + found= true; + } + + lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time); + } + + bool &found; + /** microsecond_interval_timer() */ + const ulonglong withdraw_started; + const my_hrtime_t current_time; +}; + +} // namespace + +/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */ +inline void buf_pool_t::resize() +{ + ut_ad(this == &buf_pool); + + bool warning = false; + + NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; + + ut_ad(!resize_in_progress()); + ut_ad(srv_buf_pool_chunk_unit > 0); + + ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift; + std::ostringstream str_old_size, str_new_size, str_chunk_size; + str_old_size << ib::bytes_iec{srv_buf_pool_old_size}; + str_new_size << ib::bytes_iec{srv_buf_pool_size}; + str_chunk_size << ib::bytes_iec{srv_buf_pool_chunk_unit}; + + buf_resize_status("Resizing buffer pool from %s to %s (unit = %s).", + str_old_size.str().c_str(), + str_new_size.str().c_str(), + str_chunk_size.str().c_str()); + +#ifdef BTR_CUR_HASH_ADAPT + /* disable AHI if needed */ + buf_resize_status("Disabling adaptive hash index."); + + btr_search_s_lock_all(); + const bool btr_search_disabled = btr_search_enabled; + btr_search_s_unlock_all(); + + btr_search_disable(); + + if (btr_search_disabled) { + ib::info() << "disabled adaptive hash index."; + } +#endif /* BTR_CUR_HASH_ADAPT */ + + mysql_mutex_lock(&mutex); + ut_ad(n_chunks_new == n_chunks); + ut_ad(UT_LIST_GET_LEN(withdraw) == 0); + + n_chunks_new = (new_instance_size << srv_page_size_shift) + / srv_buf_pool_chunk_unit; + curr_size = n_chunks_new * chunks->size; + mysql_mutex_unlock(&mutex); + + if (is_shrinking()) { + /* set withdraw target */ + size_t w = 0; + + for (const chunk_t* chunk = chunks + n_chunks_new, + * const echunk = chunks + n_chunks; + chunk != echunk; chunk++) + w += chunk->size; + + ut_ad(withdraw_target == 0); + withdraw_target = w; + } + + buf_resize_status("Withdrawing blocks to be shrunken."); + + ulonglong withdraw_started = microsecond_interval_timer(); + ulonglong message_interval = 60ULL * 1000 * 1000; + ulint retry_interval = 1; + +withdraw_retry: + /* 
wait for the number of blocks fit to the new size (if needed)*/ + bool should_retry_withdraw = is_shrinking() + && withdraw_blocks(); + + if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { + /* abort to resize for shutdown. */ + return; + } + + /* abort buffer pool load */ + buf_load_abort(); + + const ulonglong current_time = microsecond_interval_timer(); + + if (should_retry_withdraw + && current_time - withdraw_started >= message_interval) { + + if (message_interval > 900000000) { + message_interval = 1800000000; + } else { + message_interval *= 2; + } + + bool found= false; + find_interesting_trx f + {found, withdraw_started, my_hrtime_coarse()}; + withdraw_started = current_time; + + /* This is going to exceed the maximum size of a + memory transaction. */ + LockMutexGuard g{SRW_LOCK_CALL}; + trx_sys.trx_list.for_each(f); + } + + if (should_retry_withdraw) { + ib::info() << "Will retry to withdraw " << retry_interval + << " seconds later."; + std::this_thread::sleep_for( + std::chrono::seconds(retry_interval)); + + if (retry_interval > 5) { + retry_interval = 10; + } else { + retry_interval *= 2; + } + + goto withdraw_retry; + } + + buf_resize_status("Latching entire buffer pool."); + +#ifndef DBUG_OFF + { + bool should_wait = true; + + while (should_wait) { + should_wait = false; + DBUG_EXECUTE_IF( + "ib_buf_pool_resize_wait_before_resize", + should_wait = true; + std::this_thread::sleep_for( + std::chrono::milliseconds(10));); + } + } +#endif /* !DBUG_OFF */ + + if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { + return; + } + + /* Indicate critical path */ + resizing.store(true, std::memory_order_relaxed); + + mysql_mutex_lock(&mutex); + page_hash.write_lock_all(); + + chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map()); + + /* add/delete chunks */ + + buf_resize_status("Resizing buffer pool from " + ULINTPF " chunks to " ULINTPF " chunks.", + n_chunks, n_chunks_new); + + if (is_shrinking()) { + /* delete chunks */ + chunk_t* chunk = chunks + n_chunks_new; + const chunk_t* const echunk = chunks + n_chunks; + + ulint sum_freed = 0; + + while (chunk < echunk) { + /* buf_LRU_block_free_non_file_page() invokes + MEM_NOACCESS() on any buf_pool.free blocks. + We must cancel the effect of that. In + MemorySanitizer, MEM_NOACCESS() is no-op, so + we must not do anything special for it here. 
*/
+#ifdef HAVE_valgrind
+# if !__has_feature(memory_sanitizer)
+			MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size());
+# endif
+#else
+			MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->size);
+#endif
+
+			buf_block_t*	block = chunk->blocks;
+
+			for (ulint j = chunk->size; j--; block++) {
+				block->page.lock.free();
+			}
+
+			allocator.deallocate_large_dodump(
+				chunk->mem, &chunk->mem_pfx);
+			sum_freed += chunk->size;
+			++chunk;
+		}
+
+		/* discard withdraw list */
+		UT_LIST_INIT(withdraw, &buf_page_t::list);
+		withdraw_target = 0;
+
+		ib::info() << n_chunks - n_chunks_new
+			<< " Chunks (" << sum_freed
+			<< " blocks) were freed.";
+
+		n_chunks = n_chunks_new;
+	}
+
+	{
+		/* reallocate chunks */
+		const size_t	new_chunks_size
+			= n_chunks_new * sizeof(chunk_t);
+
+		chunk_t*	new_chunks = static_cast<chunk_t*>(
+			ut_zalloc_nokey_nofatal(new_chunks_size));
+
+		DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
+				ut_free(new_chunks); new_chunks= nullptr; );
+
+		if (!new_chunks) {
+			ib::error() << "failed to allocate"
+				" the chunk array.";
+			n_chunks_new = n_chunks;
+			warning = true;
+			chunks_old = NULL;
+			goto calc_buf_pool_size;
+		}
+
+		ulint	n_chunks_copy = ut_min(n_chunks_new, n_chunks);
+
+		memcpy(new_chunks, chunks,
+		       n_chunks_copy * sizeof *new_chunks);
+
+		for (ulint j = 0; j < n_chunks_copy; j++) {
+			new_chunks[j].reg();
+		}
+
+		chunks_old = chunks;
+		chunks = new_chunks;
+	}
+
+	if (n_chunks_new > n_chunks) {
+		/* add chunks */
+		ulint	sum_added = 0;
+		ulint	n = n_chunks;
+		const size_t unit = srv_buf_pool_chunk_unit;
+
+		for (chunk_t* chunk = chunks + n_chunks,
+		     * const echunk = chunks + n_chunks_new;
+		     chunk != echunk; chunk++) {
+			if (!chunk->create(unit)) {
+				ib::error() << "failed to allocate"
+					" memory for buffer pool chunk";
+
+				warning = true;
+				n_chunks_new = n_chunks;
+				break;
+			}
+
+			sum_added += chunk->size;
+			++n;
+		}
+
+		ib::info() << n_chunks_new - n_chunks
+			<< " chunks (" << sum_added
+			<< " blocks) were added.";
+
+		n_chunks = n;
+	}
+calc_buf_pool_size:
+	/* recalc curr_size */
+	ulint	new_size = 0;
+
+	{
+		chunk_t* chunk = chunks;
+		const chunk_t* const echunk = chunk + n_chunks;
+		do {
+			new_size += chunk->size;
+		} while (++chunk != echunk);
+	}
+
+	curr_size = new_size;
+	n_chunks_new = n_chunks;
+
+	if (chunks_old) {
+		ut_free(chunks_old);
+		chunks_old = NULL;
+	}
+
+	chunk_t::map* chunk_map_old = chunk_t::map_ref;
+	chunk_t::map_ref = chunk_t::map_reg;
+
+	/* set size */
+	ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+	ulint s= curr_size;
+	s/= BUF_READ_AHEAD_PORTION;
+	read_ahead_area= s >= READ_AHEAD_PAGES
+		? READ_AHEAD_PAGES
+		: my_round_up_to_next_power(static_cast<uint32>(s));
+	curr_pool_size= n_chunks * srv_buf_pool_chunk_unit;
+	srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/
+	extern ulonglong innobase_buffer_pool_size;
+	innobase_buffer_pool_size= buf_pool_size_align(srv_buf_pool_curr_size);
+
+	const bool new_size_too_diff
+		= srv_buf_pool_base_size > srv_buf_pool_size * 2
+		|| srv_buf_pool_base_size * 2 < srv_buf_pool_size;
+
+	mysql_mutex_unlock(&mutex);
+	page_hash.write_unlock_all();
+
+	UT_DELETE(chunk_map_old);
+
+	resizing.store(false, std::memory_order_relaxed);
+
+	/* Normalize other components, if the new size is too different */
+	if (!warning && new_size_too_diff) {
+		srv_buf_pool_base_size = srv_buf_pool_size;
+
+		buf_resize_status("Resizing other hash tables.");
+
+		srv_lock_table_size = 5
+			* (srv_buf_pool_size >> srv_page_size_shift);
+		lock_sys.resize(srv_lock_table_size);
+		dict_sys.resize();
+
+		ib::info() << "Resized hash tables: lock_sys,"
+#ifdef BTR_CUR_HASH_ADAPT
+			" adaptive hash index,"
+#endif /* BTR_CUR_HASH_ADAPT */
+			" and dictionary.";
+	}
+
+	/* normalize ibuf.max_size */
+	ibuf_max_size_update(srv_change_buffer_max_size);
+
+	if (srv_buf_pool_old_size != srv_buf_pool_size) {
+		buf_resize_status("Completed resizing buffer pool from %zu"
+				  " to %zu bytes.",
+				  srv_buf_pool_old_size, srv_buf_pool_size);
+		srv_buf_pool_old_size = srv_buf_pool_size;
+	}
+
+#ifdef BTR_CUR_HASH_ADAPT
+	/* enable AHI if needed */
+	if (btr_search_disabled) {
+		btr_search_enable(true);
+		ib::info() << "Re-enabled adaptive hash index.";
+	}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	if (warning)
+		buf_resize_status("Resizing buffer pool failed");
+
+	ut_d(validate());
+
+	return;
+}
+
+/** Thread pool task invoked by innodb_buffer_pool_size changes. */
+static void buf_resize_callback(void *)
+{
+  DBUG_ENTER("buf_resize_callback");
+  ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
+  mysql_mutex_lock(&buf_pool.mutex);
+  const auto size= srv_buf_pool_size;
+  const bool work= srv_buf_pool_old_size != size;
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  if (work)
+    buf_pool.resize();
+  else
+  {
+    std::ostringstream sout;
+    sout << "Size did not change: old size = new size = " << size;
+    buf_resize_status(sout.str().c_str());
+  }
+  DBUG_VOID_RETURN;
+}
+
+/* Ensure that the task does not run in parallel, by setting
+max_concurrency to 1 for the thread group */
+static tpool::task_group single_threaded_group(1);
+static tpool::waitable_task buf_resize_task(buf_resize_callback,
+                                            nullptr, &single_threaded_group);
+
+void buf_resize_start()
+{
+  srv_thread_pool->submit_task(&buf_resize_task);
+}
+
+void buf_resize_shutdown()
+{
+  buf_resize_task.wait();
+}
+
+/** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and
+buf_pool.page_hash.
+The caller must relocate bpage->list.
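+A minimal sketch of the caller's obligations (assuming buf_pool.mutex and
+the page_hash latch for the page are already held; this mirrors how the
+function is invoked later in this file):
+@code
+  // relocate a compressed-only page and fix up the flush list
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  buf_relocate(bpage, &new_block->page);
+  buf_flush_relocate_on_flush_list(bpage, &new_block->page);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+@endcode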
+@param bpage ROW_FORMAT=COMPRESSED only block +@param dpage destination control block */ +static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage) +{ + const page_id_t id{bpage->id()}; + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); + ut_ad(!bpage->frame); + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked()); + ut_ad(bpage == buf_pool.page_hash.get(id, chain)); + ut_ad(!buf_pool.watch_is_sentinel(*bpage)); + ut_d(const auto state= bpage->state()); + ut_ad(state >= buf_page_t::FREED); + ut_ad(state <= buf_page_t::READ_FIX); + ut_ad(bpage->lock.is_write_locked()); + const auto frame= dpage->frame; + + dpage->lock.free(); + new (dpage) buf_page_t(*bpage); + + dpage->frame= frame; + + /* Important that we adjust the hazard pointer before + removing bpage from LRU list. */ + if (buf_page_t *b= buf_pool.LRU_remove(bpage)) + UT_LIST_INSERT_AFTER(buf_pool.LRU, b, dpage); + else + UT_LIST_ADD_FIRST(buf_pool.LRU, dpage); + + if (UNIV_UNLIKELY(buf_pool.LRU_old == bpage)) + { + buf_pool.LRU_old= dpage; +#ifdef UNIV_LRU_DEBUG + /* buf_pool.LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool.LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) || + !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) || + UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old); + } + else + { + /* Check that the "old" flag is consistent in + the block and its neighbours. */ + dpage->set_old(dpage->is_old()); +#endif /* UNIV_LRU_DEBUG */ + } + + ut_d(CheckInLRUList::validate()); + + buf_pool.page_hash.replace(chain, bpage, dpage); +} + +buf_page_t *buf_pool_t::watch_set(const page_id_t id, + buf_pool_t::hash_chain &chain) +{ + ut_ad(&chain == &page_hash.cell_get(id.fold())); + page_hash.lock_get(chain).lock(); + + buf_page_t *bpage= page_hash.get(id, chain); + + if (bpage) + { +got_block: + bpage->fix(); + if (watch_is_sentinel(*bpage)) + bpage= nullptr; + page_hash.lock_get(chain).unlock(); + return bpage; + } + + page_hash.lock_get(chain).unlock(); + /* Allocate a watch[] and then try to insert it into the page_hash. */ + mysql_mutex_lock(&mutex); + + /* The maximum number of purge tasks should never exceed + the UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a + watch when setting another watch. */ + for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- >= watch; ) + { + ut_ad(w->access_time == 0); + ut_ad(!w->oldest_modification()); + ut_ad(!w->zip.data); + ut_ad(!w->in_zip_hash); + static_assert(buf_page_t::NOT_USED == 0, "efficiency"); + if (ut_d(auto s=) w->state()) + { + /* This watch may be in use for some other page. */ + ut_ad(s >= buf_page_t::UNFIXED); + continue; + } + /* w is pointing to watch[], which is protected by mutex. + Normally, buf_page_t::id for objects that are reachable by + page_hash.get(id, chain) are protected by hash_lock. */ + w->set_state(buf_page_t::UNFIXED + 1); + w->id_= id; + + page_hash.lock_get(chain).lock(); + bpage= page_hash.get(id, chain); + if (UNIV_LIKELY_NULL(bpage)) + { + w->set_state(buf_page_t::NOT_USED); + mysql_mutex_unlock(&mutex); + goto got_block; + } + + ut_ad(w->state() == buf_page_t::UNFIXED + 1); + buf_pool.page_hash.append(chain, w); + mysql_mutex_unlock(&mutex); + page_hash.lock_get(chain).unlock(); + return nullptr; + } + + ut_error; +} + +/** Stop watching whether a page has been read in. +watch_set(id) must have returned nullptr before. 
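+A sketch of the typical pairing with watch_set() (assuming the caller
+computed chain as buf_pool.page_hash.cell_get(id.fold())):
+@code
+  if (!buf_pool.watch_set(id, chain))
+  {
+    /* the page was not yet read in; a watch sentinel is now armed */
+    buf_pool.watch_unset(id, chain); /* stop watching, free the sentinel */
+  }
+@endcode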
+@param id	page identifier
+@param chain	unlocked hash table chain */
+TRANSACTIONAL_TARGET
+void buf_pool_t::watch_unset(const page_id_t id, buf_pool_t::hash_chain &chain)
+{
+  mysql_mutex_assert_not_owner(&mutex);
+  buf_page_t *w;
+  {
+    transactional_lock_guard<page_hash_latch> g{page_hash.lock_get(chain)};
+    /* The page must exist because watch_set() did fix(). */
+    w= page_hash.get(id, chain);
+    ut_ad(w->in_page_hash);
+    if (!watch_is_sentinel(*w))
+    {
+no_watch:
+      w->unfix();
+      w= nullptr;
+    }
+    else
+    {
+      const auto state= w->state();
+      ut_ad(~buf_page_t::LRU_MASK & state);
+      ut_ad(state >= buf_page_t::UNFIXED + 1);
+      if (state != buf_page_t::UNFIXED + 1)
+        goto no_watch;
+    }
+  }
+
+  if (!w)
+    return;
+
+  const auto old= w;
+  /* The following is based on buf_pool_t::watch_remove(). */
+  mysql_mutex_lock(&mutex);
+  w= page_hash.get(id, chain);
+
+  {
+    transactional_lock_guard<page_hash_latch> g
+      {buf_pool.page_hash.lock_get(chain)};
+    auto f= w->unfix();
+    ut_ad(f < buf_page_t::READ_FIX || w != old);
+
+    if (f == buf_page_t::UNFIXED && w == old)
+    {
+      page_hash.remove(chain, w);
+      // Now that w is detached from page_hash, release it to watch[].
+      ut_ad(w->id_ == id);
+      ut_ad(!w->frame);
+      ut_ad(!w->zip.data);
+      w->set_state(buf_page_t::NOT_USED);
+    }
+  }
+
+  mysql_mutex_unlock(&mutex);
+}
+
+/** Mark the page status as FREED for the given tablespace and page number.
+@param[in,out]	space	tablespace
+@param[in]	page	page number
+@param[in,out]	mtr	mini-transaction */
+TRANSACTIONAL_TARGET
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr)
+{
+  ut_ad(mtr);
+  ut_ad(mtr->is_active());
+
+  if (srv_immediate_scrub_data_uncompressed
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+      || space->is_compressed()
+#endif
+      )
+    mtr->add_freed_offset(space, page);
+
+  ++buf_pool.stat.n_page_gets;
+  const page_id_t page_id(space->id, page);
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+  uint32_t fix;
+  buf_block_t *block;
+  {
+    transactional_shared_lock_guard<page_hash_latch> g
+      {buf_pool.page_hash.lock_get(chain)};
+    block= reinterpret_cast<buf_block_t*>
+      (buf_pool.page_hash.get(page_id, chain));
+    if (!block || !block->page.frame)
+      /* FIXME: convert ROW_FORMAT=COMPRESSED, without buf_zip_decompress() */
+      return;
+    /* To avoid a deadlock with buf_LRU_free_page() of some other page
+    and buf_page_write_complete() of this page, we must not wait for a
+    page latch while holding a page_hash latch. */
+    fix= block->page.fix();
+  }
+
+  if (UNIV_UNLIKELY(fix < buf_page_t::UNFIXED))
+  {
+    block->page.unfix();
+    return;
+  }
+
+  block->page.lock.x_lock();
+  if (block->page.is_ibuf_exist())
+    ibuf_merge_or_delete_for_page(nullptr, page_id, block->page.zip_size());
+#ifdef BTR_CUR_HASH_ADAPT
+  if (block->index)
+    btr_search_drop_page_hash_index(block, false);
+#endif /* BTR_CUR_HASH_ADAPT */
+  block->page.set_freed(block->page.state());
+  mtr->memo_push(block, MTR_MEMO_PAGE_X_MODIFY);
+}
+
+/** Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with unfix().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
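+A sketch of a typical call (the higher-level mutual exclusion mentioned
+above is the caller's duty):
+@code
+  if (buf_page_t *bpage= buf_page_get_zip(page_id, zip_size))
+  {
+    /* bpage->zip.data is S-latched and buffer-fixed here */
+    bpage->lock.s_unlock();
+    bpage->unfix();
+  }
+@endcode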
+@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size in bytes +@return pointer to the block, s-latched */ +TRANSACTIONAL_TARGET +buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size) +{ + ut_ad(zip_size); + ut_ad(ut_is_2pow(zip_size)); + ++buf_pool.stat.n_page_gets; + mariadb_increment_pages_accessed(); + + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + buf_page_t *bpage; + +lookup: + for (bool discard_attempted= false;;) + { +#ifndef NO_ELISION + if (xbegin()) + { + if (hash_lock.is_locked()) + xabort(); + bpage= buf_pool.page_hash.get(page_id, chain); + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) + { + xend(); + goto must_read_page; + } + if (!bpage->zip.data) + { + /* There is no ROW_FORMAT=COMPRESSED page. */ + xend(); + return nullptr; + } + if (discard_attempted || !bpage->frame) + { + if (!bpage->lock.s_lock_try()) + xabort(); + xend(); + break; + } + xend(); + } + else +#endif + { + hash_lock.lock_shared(); + bpage= buf_pool.page_hash.get(page_id, chain); + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) + { + hash_lock.unlock_shared(); + goto must_read_page; + } + + ut_ad(bpage->in_file()); + ut_ad(page_id == bpage->id()); + + if (!bpage->zip.data) + { + /* There is no ROW_FORMAT=COMPRESSED page. */ + hash_lock.unlock_shared(); + return nullptr; + } + + if (discard_attempted || !bpage->frame) + { + /* Even when we are holding a hash_lock, it should be + acceptable to wait for a page S-latch here, because + buf_page_t::read_complete() will not wait for buf_pool.mutex, + and because S-latch would not conflict with a U-latch + that would be protecting buf_page_t::write_complete(). */ + bpage->lock.s_lock(); + hash_lock.unlock_shared(); + break; + } + + hash_lock.unlock_shared(); + } + + discard_attempted= true; + mysql_mutex_lock(&buf_pool.mutex); + if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain)) + buf_LRU_free_page(bpage, false); + mysql_mutex_unlock(&buf_pool.mutex); + } + + { + ut_d(const auto s=) bpage->fix(); + ut_ad(s >= buf_page_t::UNFIXED); + ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX); + } + + bpage->set_accessed(); + buf_page_make_young_if_needed(bpage); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + return bpage; + +must_read_page: + switch (dberr_t err= buf_read_page(page_id, zip_size)) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + mariadb_increment_pages_read(); + goto lookup; + default: + ib::error() << "Reading compressed page " << page_id + << " failed with error: " << err; + return nullptr; + } +} + +/********************************************************************//** +Initialize some fields of a control block. */ +UNIV_INLINE +void +buf_block_init_low( +/*===============*/ + buf_block_t* block) /*!< in: block to init */ +{ +#ifdef BTR_CUR_HASH_ADAPT + /* No adaptive hash index entries may point to a previously + unused (and now freshly allocated) block. */ + assert_block_ahi_empty_on_init(block); + block->index = NULL; + + block->n_hash_helps = 0; + block->n_fields = 1; + block->n_bytes = 0; + block->left_side = TRUE; +#endif /* BTR_CUR_HASH_ADAPT */ +} + +/********************************************************************//** +Decompress a block. 
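+A sketch of the typical call, as in buf_page_get_low() below
+(check=false skips the checksum verification):
+@code
+  const bool ok= buf_zip_decompress(block, false);
+  if (!ok)
+    err= DB_PAGE_CORRUPTED; /* the caller evicts the page or fails the read */
+@endcode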
+@return TRUE if successful */ +ibool +buf_zip_decompress( +/*===============*/ + buf_block_t* block, /*!< in/out: block */ + ibool check) /*!< in: TRUE=verify the page checksum */ +{ + const byte* frame = block->page.zip.data; + ulint size = page_zip_get_size(&block->page.zip); + /* The tablespace will not be found if this function is called + during IMPORT. */ + fil_space_t* space= fil_space_t::get(block->page.id().space()); + const unsigned key_version = mach_read_from_4( + frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL; + const bool encrypted = crypt_data + && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED + && (!crypt_data->is_default_encryption() + || srv_encrypt_tables); + + ut_ad(block->zip_size()); + ut_a(block->page.id().space() != 0); + + if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) { + + ib::error() << "Compressed page checksum mismatch for " + << (space ? space->chain.start->name : "") + << block->page.id() << ": stored: " + << mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM) + << ", crc32: " + << page_zip_calc_checksum(frame, size, false) + << " adler32: " + << page_zip_calc_checksum(frame, size, true); + goto err_exit; + } + + switch (fil_page_get_type(frame)) { + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + if (page_zip_decompress(&block->page.zip, + block->page.frame, TRUE)) { +func_exit: + if (space) { + space->release(); + } + return(TRUE); + } + + ib::error() << "Unable to decompress " + << (space ? space->chain.start->name : "") + << block->page.id(); + goto err_exit; + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + /* Copy to uncompressed storage. */ + memcpy(block->page.frame, frame, block->zip_size()); + goto func_exit; + } + + ib::error() << "Unknown compressed page type " + << fil_page_get_type(frame) + << " in " << (space ? space->chain.start->name : "") + << block->page.id(); + +err_exit: + if (encrypted) { + ib::info() << "Row compressed page could be encrypted" + " with key_version " << key_version; + } + + if (space) { + space->release(); + } + + return(FALSE); +} + +/** Low level function used to get access to a database page. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in] mtr mini-transaction +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge to happen +while reading the page from file +then it makes sure that it does merging of change buffer changes while +reading the page from file. 
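+Most callers go through buf_page_get_gen(); a sketch of a plain latched
+read follows (the local variable names are illustrative):
+@code
+  dberr_t err;
+  if (buf_block_t *block= buf_page_get_gen(page_id, zip_size, RW_S_LATCH,
+                                           nullptr, BUF_GET, &mtr,
+                                           &err, false))
+  {
+    /* block->page.frame is S-latched until the mtr commits */
+  }
+@endcode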
+@return pointer to the block or NULL */
+TRANSACTIONAL_TARGET
+buf_block_t*
+buf_page_get_low(
+	const page_id_t		page_id,
+	ulint			zip_size,
+	ulint			rw_latch,
+	buf_block_t*		guess,
+	ulint			mode,
+	mtr_t*			mtr,
+	dberr_t*		err,
+	bool			allow_ibuf_merge)
+{
+	unsigned	access_time;
+	ulint		retries = 0;
+
+	ut_ad(!mtr || mtr->is_active());
+	ut_ad(mtr || mode == BUF_PEEK_IF_IN_POOL);
+	ut_ad((rw_latch == RW_S_LATCH)
+	      || (rw_latch == RW_X_LATCH)
+	      || (rw_latch == RW_SX_LATCH)
+	      || (rw_latch == RW_NO_LATCH));
+
+	if (err) {
+		*err = DB_SUCCESS;
+	}
+
+#ifdef UNIV_DEBUG
+	switch (mode) {
+	default:
+		ut_ad(!allow_ibuf_merge);
+		ut_ad(mode == BUF_PEEK_IF_IN_POOL);
+		break;
+	case BUF_GET_POSSIBLY_FREED:
+	case BUF_GET_IF_IN_POOL:
+		/* The caller may pass a dummy page size,
+		because it does not really matter. */
+		break;
+	case BUF_GET:
+	case BUF_GET_IF_IN_POOL_OR_WATCH:
+		ut_ad(!mtr->is_freeing_tree());
+		fil_space_t* s = fil_space_get(page_id.space());
+		ut_ad(s);
+		ut_ad(s->zip_size() == zip_size);
+	}
+#endif /* UNIV_DEBUG */
+
+	ut_ad(!mtr || !ibuf_inside(mtr)
+	      || ibuf_page_low(page_id, zip_size, FALSE, NULL));
+
+	++buf_pool.stat.n_page_gets;
+	mariadb_increment_pages_accessed();
+
+	auto& chain= buf_pool.page_hash.cell_get(page_id.fold());
+	page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+loop:
+	buf_block_t* block = guess;
+	uint32_t state;
+
+	if (block) {
+		transactional_shared_lock_guard<page_hash_latch> g{hash_lock};
+		if (buf_pool.is_uncompressed(block)
+		    && page_id == block->page.id()) {
+			ut_ad(!block->page.in_zip_hash);
+			state = block->page.state();
+			/* Ignore guesses that point to read-fixed blocks.
+			We can only avoid a race condition by
+			looking up the block via buf_pool.page_hash. */
+			if ((state >= buf_page_t::FREED
+			     && state < buf_page_t::READ_FIX)
+			    || state >= buf_page_t::WRITE_FIX) {
+				state = block->page.fix();
+				goto got_block;
+			}
+		}
+	}
+
+	guess = nullptr;
+
+	/* A memory transaction would frequently be aborted here. */
+	hash_lock.lock_shared();
+	block = reinterpret_cast<buf_block_t*>(
+		buf_pool.page_hash.get(page_id, chain));
+	if (UNIV_LIKELY(block
+			&& !buf_pool.watch_is_sentinel(block->page))) {
+		state = block->page.fix();
+		hash_lock.unlock_shared();
+		goto got_block;
+	}
+	hash_lock.unlock_shared();
+
+	/* Page not in buf_pool: needs to be read from file */
+	switch (mode) {
+	case BUF_GET_IF_IN_POOL:
+	case BUF_PEEK_IF_IN_POOL:
+		return nullptr;
+	case BUF_GET_IF_IN_POOL_OR_WATCH:
+		/* Buffer-fixing inside watch_set() will prevent eviction */
+		block = reinterpret_cast<buf_block_t*>
+			(buf_pool.watch_set(page_id, chain));
+
+		if (block) {
+			state = block->page.state();
+			goto got_block_fixed;
+		}
+
+		return nullptr;
+	}
+
+	/* The call path is buf_read_page() ->
+	buf_read_page_low() (fil_space_t::io()) ->
+	buf_page_t::read_complete() ->
+	buf_decrypt_after_read(). Here fil_space_t* is used
+	and we decrypt -> buf_page_check_corrupt() where page
+	checksums are compared. Decryption, decompression as
+	well as error handling takes place at a lower level.
+	Here we only need to know whether the page really is
+	corrupted, or if an encrypted page with a valid
+	checksum cannot be decrypted.
*/ + + switch (dberr_t local_err = buf_read_page(page_id, zip_size)) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + mariadb_increment_pages_read(); + buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr)); + break; + default: + if (mode != BUF_GET_POSSIBLY_FREED + && retries++ < BUF_PAGE_READ_MAX_RETRIES) { + DBUG_EXECUTE_IF("intermittent_read_failure", + retries = BUF_PAGE_READ_MAX_RETRIES;); + } + /* fall through */ + case DB_PAGE_CORRUPTED: + if (err) { + *err = local_err; + } + return nullptr; + } + + ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate()); + goto loop; + +got_block: + ut_ad(!block->page.in_zip_hash); + state++; +got_block_fixed: + ut_ad(state > buf_page_t::FREED); + + if (state > buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) { + if (mode == BUF_PEEK_IF_IN_POOL) { +ignore_block: + ut_ad(mode == BUF_GET_POSSIBLY_FREED + || mode == BUF_PEEK_IF_IN_POOL); + block->unfix(); + if (err) { + *err = DB_CORRUPTION; + } + return nullptr; + } + + if (UNIV_UNLIKELY(!block->page.frame)) { + goto wait_for_unzip; + } + /* A read-fix is released after block->page.lock + in buf_page_t::read_complete() or + buf_pool_t::corrupted_evict(), or + after buf_zip_decompress() in this function. */ + block->page.lock.s_lock(); + state = block->page.state(); + ut_ad(state < buf_page_t::READ_FIX + || state >= buf_page_t::WRITE_FIX); + const page_id_t id{block->page.id()}; + block->page.lock.s_unlock(); + + if (UNIV_UNLIKELY(id != page_id)) { + ut_ad(id == page_id_t{~0ULL}); + block->page.unfix(); + if (++retries < BUF_PAGE_READ_MAX_RETRIES) { + goto loop; + } + + if (err) { + *err = DB_PAGE_CORRUPTED; + } + + return nullptr; + } + } else if (mode != BUF_PEEK_IF_IN_POOL) { + } else if (!mtr) { + ut_ad(!block->page.oldest_modification()); + mysql_mutex_lock(&buf_pool.mutex); + block->unfix(); + +free_unfixed_block: + if (!buf_LRU_free_page(&block->page, true)) { + ut_ad(0); + } + + mysql_mutex_unlock(&buf_pool.mutex); + return nullptr; + } else if (UNIV_UNLIKELY(!block->page.frame)) { + /* The BUF_PEEK_IF_IN_POOL mode is mainly used for dropping an + adaptive hash index. There cannot be an + adaptive hash index for a compressed-only page. */ + goto ignore_block; + } + + ut_ad(mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL + || block->zip_size() == zip_size); + + if (UNIV_UNLIKELY(!block->page.frame)) { + if (!block->page.lock.x_lock_try()) { +wait_for_unzip: + /* The page is being read or written, or + another thread is executing buf_zip_decompress() + in buf_page_get_low() on it. */ + block->page.unfix(); + std::this_thread::sleep_for( + std::chrono::microseconds(100)); + goto loop; + } + + buf_block_t *new_block = buf_LRU_get_free_block(false); + buf_block_init_low(new_block); + +wait_for_unfix: + mysql_mutex_lock(&buf_pool.mutex); + page_hash_latch& hash_lock=buf_pool.page_hash.lock_get(chain); + + /* It does not make sense to use + transactional_lock_guard here, because buf_relocate() + would likely make a memory transaction too large. */ + hash_lock.lock(); + + /* block->page.lock implies !block->page.can_relocate() */ + ut_ad(&block->page == buf_pool.page_hash.get(page_id, chain)); + + /* Wait for any other threads to release their buffer-fix + on the compressed-only block descriptor. + FIXME: Never fix() before acquiring the lock. + Only in buf_page_get_gen(), buf_page_get_low(), buf_page_free() + we are violating that principle. 
*/
+	state = block->page.state();
+
+	switch (state) {
+	case buf_page_t::UNFIXED + 1:
+	case buf_page_t::IBUF_EXIST + 1:
+	case buf_page_t::REINIT + 1:
+		break;
+	default:
+		ut_ad(state < buf_page_t::READ_FIX);
+
+		if (state < buf_page_t::UNFIXED + 1) {
+			ut_ad(state > buf_page_t::FREED);
+			block->page.lock.x_unlock();
+			hash_lock.unlock();
+			buf_LRU_block_free_non_file_page(new_block);
+			mysql_mutex_unlock(&buf_pool.mutex);
+			goto ignore_block;
+		}
+
+		mysql_mutex_unlock(&buf_pool.mutex);
+		hash_lock.unlock();
+		std::this_thread::sleep_for(
+			std::chrono::microseconds(100));
+		goto wait_for_unfix;
+	}
+
+	/* Ensure that another buf_page_get_low() will wait for
+	new_block->page.lock.x_unlock(). */
+	block->page.set_state(buf_page_t::READ_FIX);
+
+	/* Move the compressed page from block->page to new_block,
+	and uncompress it. */
+
+	mysql_mutex_lock(&buf_pool.flush_list_mutex);
+	buf_relocate(&block->page, &new_block->page);
+
+	/* X-latch the block for the duration of the decompression. */
+	new_block->page.lock.x_lock();
+	ut_d(block->page.lock.x_unlock());
+
+	buf_flush_relocate_on_flush_list(&block->page,
+					 &new_block->page);
+	mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+	/* Insert at the front of unzip_LRU list */
+	buf_unzip_LRU_add_block(new_block, FALSE);
+
+	mysql_mutex_unlock(&buf_pool.mutex);
+	hash_lock.unlock();
+
+#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG
+	block->page.lock.free();
+#endif
+	ut_free(reinterpret_cast<buf_page_t*>(block));
+	block = new_block;
+
+	buf_pool.n_pend_unzip++;
+
+	access_time = block->page.is_accessed();
+
+	if (!access_time && !recv_no_ibuf_operations
+	    && ibuf_page_exists(block->page.id(), block->zip_size())) {
+		state = buf_page_t::IBUF_EXIST + 1;
+	}
+
+	/* Decompress the page while not holding
+	buf_pool.mutex. */
+	const auto ok = buf_zip_decompress(block, false);
+	--buf_pool.n_pend_unzip;
+	if (!ok) {
+		if (err) {
+			*err = DB_PAGE_CORRUPTED;
+		}
+		mysql_mutex_lock(&buf_pool.mutex);
+	}
+	state = block->page.read_unfix(state);
+	block->page.lock.x_unlock();
+
+	if (!ok) {
+		goto free_unfixed_block;
+	}
+	}
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+re_evict:
+	if (mode != BUF_GET_IF_IN_POOL
+	    && mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
+	} else if (!ibuf_debug || recv_recovery_is_on()) {
+	} else if (fil_space_t* space = fil_space_t::get(page_id.space())) {
+		for (ulint i = 0; i < mtr->get_savepoint(); i++) {
+			if (buf_block_t* b = mtr->block_at_savepoint(i)) {
+				if (b->page.oldest_modification() > 2
+				    && b->page.lock.have_any()) {
+					/* We are holding a dirty page latch
+					that would hang buf_flush_sync(). */
+					space->release();
+					goto re_evict_fail;
+				}
+			}
+		}
+
+		/* Try to evict the block from the buffer pool, to use the
+		insert buffer (change buffer) as much as possible. */
+
+		mysql_mutex_lock(&buf_pool.mutex);
+
+		block->unfix();
+
+		/* Blocks cannot be relocated or enter or exit the
+		buf_pool while we are holding the buf_pool.mutex.
*/ + const bool evicted = buf_LRU_free_page(&block->page, true); + space->release(); + + if (!evicted) { + block->fix(); + } + + mysql_mutex_unlock(&buf_pool.mutex); + + if (evicted) { + if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { + buf_pool.watch_set(page_id, chain); + } + return(NULL); + } + + buf_flush_sync(); + + state = block->page.state(); + + if (state == buf_page_t::UNFIXED + 1 + && !block->page.oldest_modification()) { + goto re_evict; + } + + /* Failed to evict the page; change it directly */ + } +re_evict_fail: +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) { + goto ignore_block; + } + ut_ad((~buf_page_t::LRU_MASK) & state); + ut_ad(state > buf_page_t::WRITE_FIX || state < buf_page_t::READ_FIX); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + ut_ad(block->page.frame); + + if (state >= buf_page_t::UNFIXED + && allow_ibuf_merge + && fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX + && page_is_leaf(block->page.frame)) { + block->page.lock.x_lock(); + ut_ad(block->page.id() == page_id + || (state >= buf_page_t::READ_FIX + && state < buf_page_t::WRITE_FIX)); + +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ + + dberr_t e; + + if (UNIV_UNLIKELY(block->page.id() != page_id)) { +page_id_mismatch: + state = block->page.state(); + e = DB_CORRUPTION; +ibuf_merge_corrupted: + if (err) { + *err = e; + } + + if (block->page.id().is_corrupted()) { + buf_pool.corrupted_evict(&block->page, state); + } + return nullptr; + } + + state = block->page.state(); + ut_ad(state < buf_page_t::READ_FIX); + + if (state >= buf_page_t::IBUF_EXIST + && state < buf_page_t::REINIT) { + block->page.clear_ibuf_exist(); + e = ibuf_merge_or_delete_for_page(block, page_id, + block->zip_size()); + if (UNIV_UNLIKELY(e != DB_SUCCESS)) { + goto ibuf_merge_corrupted; + } + } + + if (rw_latch == RW_X_LATCH) { + goto get_latch_valid; + } else { + block->page.lock.x_unlock(); + goto get_latch; + } + } else { +get_latch: + switch (rw_latch) { + case RW_NO_LATCH: + mtr->memo_push(block, MTR_MEMO_BUF_FIX); + return block; + case RW_S_LATCH: + block->page.lock.s_lock(); + ut_ad(!block->page.is_read_fixed()); + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + block->page.lock.s_unlock(); + block->page.lock.x_lock(); + goto page_id_mismatch; + } +get_latch_valid: + mtr->memo_push(block, mtr_memo_type_t(rw_latch)); +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ + break; + case RW_SX_LATCH: + block->page.lock.u_lock(); + ut_ad(!block->page.is_io_fixed()); + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + block->page.lock.u_x_upgrade(); + goto page_id_mismatch; + } + goto get_latch_valid; + default: + ut_ad(rw_latch == RW_X_LATCH); + if (block->page.lock.x_lock_upgraded()) { + ut_ad(block->page.id() == page_id); + block->unfix(); + mtr->page_lock_upgrade(*block); + return block; + } + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + goto page_id_mismatch; + } + goto get_latch_valid; + } + + ut_ad(page_id_t(page_get_space_id(block->page.frame), + page_get_page_no(block->page.frame)) + == page_id); + + if (mode == BUF_GET_POSSIBLY_FREED + || mode == BUF_PEEK_IF_IN_POOL) { + return block; + } + + const bool not_first_access{block->page.set_accessed()}; + buf_page_make_young_if_needed(&block->page); + if (!not_first_access) { + buf_read_ahead_linear(page_id, block->zip_size(), + ibuf_inside(mtr)); + 
}
+	}
+
+	return block;
+}
+
+/** Get access to a database page. Buffered redo log may be applied.
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess			guessed block or NULL
+@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out]	mtr			mini-transaction, or NULL
+@param[out]	err			DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge while
+reading the pages from file.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_gen(
+	const page_id_t		page_id,
+	ulint			zip_size,
+	ulint			rw_latch,
+	buf_block_t*		guess,
+	ulint			mode,
+	mtr_t*			mtr,
+	dberr_t*		err,
+	bool			allow_ibuf_merge)
+{
+  buf_block_t *block= recv_sys.recover(page_id);
+  if (UNIV_LIKELY(!block))
+    return buf_page_get_low(page_id, zip_size, rw_latch,
+                            guess, mode, mtr, err, allow_ibuf_merge);
+  else if (UNIV_UNLIKELY(block == reinterpret_cast<buf_block_t*>(-1)))
+  {
+  corrupted:
+    if (err)
+      *err= DB_CORRUPTION;
+    return nullptr;
+  }
+  /* Recovery is a special case; we fix() before acquiring lock. */
+  auto s= block->page.fix();
+  ut_ad(s >= buf_page_t::FREED);
+  /* The block may be write-fixed at this point because we are not
+  holding a lock, but it must not be read-fixed. */
+  ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+  if (err)
+    *err= DB_SUCCESS;
+  const bool must_merge= allow_ibuf_merge &&
+    ibuf_page_exists(page_id, block->zip_size());
+  if (s < buf_page_t::UNFIXED)
+  {
+  got_freed_page:
+    ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL);
+    mysql_mutex_lock(&buf_pool.mutex);
+    block->page.unfix();
+    buf_LRU_free_page(&block->page, true);
+    mysql_mutex_unlock(&buf_pool.mutex);
+    goto corrupted;
+  }
+  else if (must_merge &&
+           fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX &&
+           page_is_leaf(block->page.frame))
+  {
+    block->page.lock.x_lock();
+    s= block->page.state();
+    ut_ad(s > buf_page_t::FREED);
+    ut_ad(s < buf_page_t::READ_FIX);
+    if (s < buf_page_t::UNFIXED)
+    {
+      block->page.lock.x_unlock();
+      goto got_freed_page;
+    }
+    else
+    {
+      if (block->page.is_ibuf_exist())
+        block->page.clear_ibuf_exist();
+      if (dberr_t e=
+          ibuf_merge_or_delete_for_page(block, page_id, block->zip_size()))
+      {
+        if (err)
+          *err= e;
+        buf_pool.corrupted_evict(&block->page, s);
+        return nullptr;
+      }
+    }
+
+    if (rw_latch == RW_X_LATCH)
+    {
+      mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+      return block;
+    }
+    block->page.lock.x_unlock();
+  }
+  mtr->page_lock(block, rw_latch);
+  return block;
+}
+
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
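+The optimistic pattern, sketched (the caller remembered block and
+block->modify_clock while it previously held a latch on the page):
+@code
+  if (buf_page_optimistic_get(RW_S_LATCH, block, saved_modify_clock, &mtr))
+  {
+    /* the guess was still valid; the page is latched again */
+  }
+  else
+  {
+    /* restart with a pessimistic buf_page_get_gen() lookup */
+  }
+@endcode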
+@return TRUE if success */
+TRANSACTIONAL_TARGET
+bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
+                             uint64_t modify_clock, mtr_t *mtr)
+{
+  ut_ad(block);
+  ut_ad(mtr);
+  ut_ad(mtr->is_active());
+  ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
+
+  if (have_transactional_memory);
+  else if (UNIV_UNLIKELY(!block->page.frame))
+    return false;
+  else
+  {
+    const auto state= block->page.state();
+    if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED ||
+                      state >= buf_page_t::READ_FIX))
+      return false;
+  }
+
+  bool success;
+  const page_id_t id{block->page.id()};
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+  bool have_u_not_x= false;
+
+  {
+    transactional_shared_lock_guard<page_hash_latch> g
+      {buf_pool.page_hash.lock_get(chain)};
+    if (UNIV_UNLIKELY(id != block->page.id() || !block->page.frame))
+      return false;
+    const auto state= block->page.state();
+    if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED ||
+                      state >= buf_page_t::READ_FIX))
+      return false;
+
+    if (rw_latch == RW_S_LATCH)
+      success= block->page.lock.s_lock_try();
+    else
+    {
+      have_u_not_x= block->page.lock.have_u_not_x();
+      success= have_u_not_x || block->page.lock.x_lock_try();
+    }
+  }
+
+  if (!success)
+    return false;
+
+  if (have_u_not_x)
+  {
+    block->page.lock.u_x_upgrade();
+    mtr->page_lock_upgrade(*block);
+    ut_ad(id == block->page.id());
+    ut_ad(modify_clock == block->modify_clock);
+  }
+  else
+  {
+    ut_ad(rw_latch == RW_S_LATCH || !block->page.is_io_fixed());
+    ut_ad(id == block->page.id());
+    ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), nullptr));
+
+    if (modify_clock != block->modify_clock || block->page.is_freed())
+    {
+      if (rw_latch == RW_S_LATCH)
+        block->page.lock.s_unlock();
+      else
+        block->page.lock.x_unlock();
+      return false;
+    }
+
+    block->page.fix();
+    ut_ad(!block->page.is_read_fixed());
+    block->page.set_accessed();
+    buf_page_make_young_if_needed(&block->page);
+    mtr->memo_push(block, mtr_memo_type_t(rw_latch));
+  }
+
+  ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate());
+  ut_d(const auto state = block->page.state());
+  ut_ad(state > buf_page_t::UNFIXED);
+  ut_ad(state < buf_page_t::READ_FIX || state > buf_page_t::WRITE_FIX);
+  ut_ad(~buf_page_t::LRU_MASK & state);
+  ut_ad(block->page.frame);
+
+  return true;
+}
+
+/** Try to S-latch a page.
+Suitable for using when holding the lock_sys latches (as it avoids deadlock).
+@param[in]	page_id	page identifier
+@param[in,out]	mtr	mini-transaction
+@return the block
+@retval nullptr if an S-latch cannot be granted immediately */
+TRANSACTIONAL_TARGET
+buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr)
+{
+  ut_ad(mtr);
+  ut_ad(mtr->is_active());
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+  buf_block_t *block;
+
+  {
+    transactional_shared_lock_guard<page_hash_latch> g
+      {buf_pool.page_hash.lock_get(chain)};
+    block= reinterpret_cast<buf_block_t*>
+      (buf_pool.page_hash.get(page_id, chain));
+    if (!block || !block->page.frame || !block->page.lock.s_lock_try())
+      return nullptr;
+  }
+
+  block->page.fix();
+  ut_ad(!block->page.is_read_fixed());
+  mtr->memo_push(block, MTR_MEMO_PAGE_S_FIX);
+
+#ifdef UNIV_DEBUG
+  if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  ut_ad(block->page.buf_fix_count());
+  ut_ad(block->page.id() == page_id);
+
+  ++buf_pool.stat.n_page_gets;
+  mariadb_increment_pages_accessed();
+  return block;
+}
+
+/** Initialize the block.
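+Sketch of the call that is made before a block enters the page hash
+(this mirrors buf_page_create_low() below):
+@code
+  free_block->initialise(page_id, zip_size, buf_page_t::MEMORY);
+@endcode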
+@param page_id	page identifier
+@param zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param fix	initial buf_fix_count() */
+void buf_block_t::initialise(const page_id_t page_id, ulint zip_size,
+                             uint32_t fix)
+{
+  ut_ad(!page.in_file());
+  buf_block_init_low(this);
+  page.init(fix, page_id);
+  page.set_os_used();
+  page_zip_set_size(&page.zip, zip_size);
+}
+
+TRANSACTIONAL_TARGET
+static buf_block_t *buf_page_create_low(page_id_t page_id, ulint zip_size,
+                                        mtr_t *mtr, buf_block_t *free_block)
+{
+  ut_ad(mtr->is_active());
+  ut_ad(page_id.space() != 0 || !zip_size);
+
+  free_block->initialise(page_id, zip_size, buf_page_t::MEMORY);
+
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+retry:
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain);
+
+  if (bpage && !buf_pool.watch_is_sentinel(*bpage))
+  {
+#ifdef BTR_CUR_HASH_ADAPT
+    const dict_index_t *drop_hash_entry= nullptr;
+#endif
+    bool ibuf_exist= false;
+
+    if (!mtr->have_x_latch(reinterpret_cast<buf_block_t&>(*bpage)))
+    {
+      const bool got= bpage->lock.x_lock_try();
+      if (!got)
+      {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        bpage->lock.x_lock();
+        const page_id_t id{bpage->id()};
+        if (UNIV_UNLIKELY(id != page_id))
+        {
+          ut_ad(id.is_corrupted());
+          bpage->lock.x_unlock();
+          goto retry;
+        }
+        mysql_mutex_lock(&buf_pool.mutex);
+      }
+
+      auto state= bpage->fix();
+      ut_ad(state >= buf_page_t::FREED);
+      ut_ad(state < buf_page_t::READ_FIX);
+
+      if (state < buf_page_t::UNFIXED)
+        bpage->set_reinit(buf_page_t::FREED);
+      else
+      {
+        bpage->set_reinit(state & buf_page_t::LRU_MASK);
+        ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST;
+      }
+
+      if (UNIV_LIKELY(bpage->frame != nullptr))
+      {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
+        mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+#ifdef BTR_CUR_HASH_ADAPT
+        drop_hash_entry= block->index;
+#endif
+      }
+      else
+      {
+        auto state= bpage->state();
+        ut_ad(state >= buf_page_t::FREED);
+        ut_ad(state < buf_page_t::READ_FIX);
+
+        page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+        /* It does not make sense to use transactional_lock_guard here,
+        because buf_relocate() would likely make the memory transaction
+        too large. */
+        hash_lock.lock();
+
+        if (state < buf_page_t::UNFIXED)
+          bpage->set_reinit(buf_page_t::FREED);
+        else
+        {
+          bpage->set_reinit(state & buf_page_t::LRU_MASK);
+          ibuf_exist= (state & buf_page_t::LRU_MASK)
+            == buf_page_t::IBUF_EXIST;
+        }
+
+        mysql_mutex_lock(&buf_pool.flush_list_mutex);
+        buf_relocate(bpage, &free_block->page);
+        free_block->page.lock.x_lock();
+        buf_flush_relocate_on_flush_list(bpage, &free_block->page);
+        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+        buf_unzip_LRU_add_block(free_block, FALSE);
+
+        mysql_mutex_unlock(&buf_pool.mutex);
+        hash_lock.unlock();
+#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG
+        bpage->lock.x_unlock();
+        bpage->lock.free();
+#endif
+        ut_free(bpage);
+        mtr->memo_push(free_block, MTR_MEMO_PAGE_X_FIX);
+        bpage= &free_block->page;
+      }
+    }
+    else
+    {
+      mysql_mutex_unlock(&buf_pool.mutex);
+      ut_ad(bpage->frame);
+#ifdef BTR_CUR_HASH_ADAPT
+      ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index);
+#endif
+      const auto state= bpage->state();
+      ut_ad(state >= buf_page_t::FREED);
+      bpage->set_reinit(state < buf_page_t::UNFIXED
+                        ? buf_page_t::FREED
+                        : state & buf_page_t::LRU_MASK);
+    }
+
+#ifdef BTR_CUR_HASH_ADAPT
+    if (drop_hash_entry)
+      btr_search_drop_page_hash_index(reinterpret_cast<buf_block_t*>(bpage),
+                                      false);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+    if (ibuf_exist && !recv_recovery_is_on())
+      ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+
+    return reinterpret_cast<buf_block_t*>(bpage);
+  }
+
+  /* If we get here, the page was not in buf_pool: init it there */
+
+  DBUG_PRINT("ib_buf", ("create page %u:%u",
+                        page_id.space(), page_id.page_no()));
+
+  bpage= &free_block->page;
+
+  ut_ad(bpage->state() == buf_page_t::MEMORY);
+  bpage->lock.x_lock();
+
+  /* The block must be put to the LRU list */
+  buf_LRU_add_block(bpage, false);
+  {
+    transactional_lock_guard<page_hash_latch> g
+      {buf_pool.page_hash.lock_get(chain)};
+    bpage->set_state(buf_page_t::REINIT + 1);
+    buf_pool.page_hash.append(chain, bpage);
+  }
+
+  if (UNIV_UNLIKELY(zip_size))
+  {
+    bpage->zip.data= buf_buddy_alloc(zip_size);
+
+    /* To maintain the invariant block->in_unzip_LRU_list ==
+    block->page.belongs_to_unzip_LRU() we have to add this
+    block to unzip_LRU after block->page.zip.data is set. */
+    ut_ad(bpage->belongs_to_unzip_LRU());
+    buf_unzip_LRU_add_block(reinterpret_cast<buf_block_t*>(bpage), FALSE);
+  }
+
+  buf_pool.stat.n_pages_created++;
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  mtr->memo_push(reinterpret_cast<buf_block_t*>(bpage), MTR_MEMO_PAGE_X_FIX);
+
+  bpage->set_accessed();
+
+  /* Delete possible entries for the page from the insert buffer:
+  such can exist if the page belonged to an index which was dropped */
+  if (page_id < page_id_t{SRV_SPACE_ID_UPPER_BOUND, 0} &&
+      !srv_is_undo_tablespace(page_id.space()) &&
+      !recv_recovery_is_on())
+    ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+
+  static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent");
+  memset_aligned<8>(bpage->frame + FIL_PAGE_PREV, 0xff, 8);
+  mach_write_to_2(bpage->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+
+  /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
+  following pages:
+  (1) The first page of the InnoDB system tablespace (page 0:0)
+  (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
+  (3) key_version on encrypted pages (not page 0:0) */
+
+  memset(bpage->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+  memset_aligned<8>(bpage->frame + FIL_PAGE_LSN, 0, 8);
+
+#ifdef UNIV_DEBUG
+  if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  return reinterpret_cast<buf_block_t*>(bpage);
+}
+
+/** Initialize a page in the buffer pool. The page is usually not read
+from a file even if it cannot be found in the buffer buf_pool. This is one
+of the functions which perform to a block a state transition NOT_USED =>
+FILE_PAGE (the other is buf_page_get_gen).
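+Sketch of a typical allocation of a fresh page within a mini-transaction
+(free_block must come from buf_LRU_get_free_block(), as elsewhere in this
+file):
+@code
+  buf_block_t *free_block= buf_LRU_get_free_block(false);
+  buf_block_t *block= buf_page_create(space, offset, space->zip_size(),
+                                      &mtr, free_block);
+@endcode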
+@param[in,out] space space object +@param[in] offset offset of the tablespace + or deferred space id if space + object is null +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction +@param[in,out] free_block pre-allocated buffer block +@return pointer to the block, page bufferfixed */ +buf_block_t* +buf_page_create(fil_space_t *space, uint32_t offset, + ulint zip_size, mtr_t *mtr, buf_block_t *free_block) +{ + space->free_page(offset, false); + return buf_page_create_low({space->id, offset}, zip_size, mtr, free_block); +} + +/** Initialize a page in buffer pool while initializing the +deferred tablespace +@param space_id space identfier +@param zip_size ROW_FORMAT=COMPRESSED page size or 0 +@param mtr mini-transaction +@param free_block pre-allocated buffer block +@return pointer to the block, page bufferfixed */ +buf_block_t* buf_page_create_deferred(uint32_t space_id, ulint zip_size, + mtr_t *mtr, buf_block_t *free_block) +{ + return buf_page_create_low({space_id, 0}, zip_size, mtr, free_block); +} + +/** Monitor the buffer page read/write activity, and increment corresponding +counter value in MONITOR_MODULE_BUF_PAGE. +@param bpage buffer page whose read or write was completed +@param read true=read, false=write */ +ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read) +{ + monitor_id_t counter; + + const byte* frame = bpage.zip.data ? bpage.zip.data : bpage.frame; + + switch (fil_page_get_type(frame)) { + ulint level; + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + level = btr_page_get_level(frame); + + /* Check if it is an index page for insert buffer */ + if (fil_page_get_type(frame) == FIL_PAGE_INDEX + && btr_page_get_index_id(frame) + == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) { + if (level == 0) { + counter = MONITOR_RW_COUNTER( + read, MONITOR_INDEX_IBUF_LEAF_PAGE); + } else { + counter = MONITOR_RW_COUNTER( + read, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE); + } + } else { + if (level == 0) { + counter = MONITOR_RW_COUNTER( + read, MONITOR_INDEX_LEAF_PAGE); + } else { + counter = MONITOR_RW_COUNTER( + read, MONITOR_INDEX_NON_LEAF_PAGE); + } + } + break; + + case FIL_PAGE_UNDO_LOG: + counter = MONITOR_RW_COUNTER(read, MONITOR_UNDO_LOG_PAGE); + break; + + case FIL_PAGE_INODE: + counter = MONITOR_RW_COUNTER(read, MONITOR_INODE_PAGE); + break; + + case FIL_PAGE_IBUF_FREE_LIST: + counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_FREELIST_PAGE); + break; + + case FIL_PAGE_IBUF_BITMAP: + counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_BITMAP_PAGE); + break; + + case FIL_PAGE_TYPE_SYS: + counter = MONITOR_RW_COUNTER(read, MONITOR_SYSTEM_PAGE); + break; + + case FIL_PAGE_TYPE_TRX_SYS: + counter = MONITOR_RW_COUNTER(read, MONITOR_TRX_SYSTEM_PAGE); + break; + + case FIL_PAGE_TYPE_FSP_HDR: + counter = MONITOR_RW_COUNTER(read, MONITOR_FSP_HDR_PAGE); + break; + + case FIL_PAGE_TYPE_XDES: + counter = MONITOR_RW_COUNTER(read, MONITOR_XDES_PAGE); + break; + + case FIL_PAGE_TYPE_BLOB: + counter = MONITOR_RW_COUNTER(read, MONITOR_BLOB_PAGE); + break; + + case FIL_PAGE_TYPE_ZBLOB: + counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB_PAGE); + break; + + case FIL_PAGE_TYPE_ZBLOB2: + counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB2_PAGE); + break; + + default: + counter = MONITOR_RW_COUNTER(read, MONITOR_OTHER_PAGE); + } + + MONITOR_INC_NOCHECK(counter); +} + +/** Check if the encrypted page is corrupted for the full crc32 format. 
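+In other words, a full_crc32 page is flagged as corrupted when the stored
+space id does not match, or (for uncompressed pages only) when the low 32
+bits of FIL_PAGE_LSN are not mirrored at the end of the page; a sketch of
+the same check:
+@code
+  corrupted= space_id != mach_read_from_4(d + FIL_PAGE_SPACE_ID)
+    || (!is_compressed
+        && memcmp(d + FIL_PAGE_LSN + 4,
+                  d + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4));
+@endcode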
+@param[in]	space_id	page belongs to space id
+@param[in]	d		page
+@param[in]	is_compressed	compressed page
+@return true if page is corrupted or false if it isn't */
+static bool buf_page_full_crc32_is_corrupted(ulint space_id, const byte* d,
+                                             bool is_compressed)
+{
+  if (space_id != mach_read_from_4(d + FIL_PAGE_SPACE_ID))
+    return true;
+
+  static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+
+  return !is_compressed &&
+    memcmp_aligned<4>(FIL_PAGE_LSN + 4 + d,
+                      d + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4);
+}
+
+/** Check if page is maybe compressed, encrypted or both when we encounter
+corrupted page. Note that we can't be 100% sure if page is corrupted
+or decrypt/decompress just failed.
+@param[in,out]	bpage	page
+@param[in]	node	data file
+@return whether the operation succeeded
+@retval DB_SUCCESS if page has been read and is not corrupted
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
+@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
+after decryption normal page checksum does not match. */
+static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
+                                      const fil_node_t &node)
+{
+	ut_ad(node.space->referenced());
+
+	byte* dst_frame = bpage->zip.data ? bpage->zip.data : bpage->frame;
+	dberr_t err = DB_SUCCESS;
+	uint key_version = buf_page_get_key_version(dst_frame,
+						    node.space->flags);
+
+	/* In buf_decrypt_after_read we have either decrypted the page if
+	page post encryption checksum matches and used key_id is found
+	from the encryption plugin. If checksum did not match page was
+	not decrypted and it could be either encrypted and corrupted
+	or corrupted or good page. If we decrypted, the page could
+	still be corrupted if used key does not match. */
+	const bool seems_encrypted = !node.space->full_crc32() && key_version
+		&& node.space->crypt_data
+		&& node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
+	ut_ad(node.space->purpose != FIL_TYPE_TEMPORARY ||
+	      node.space->full_crc32());
+
+	/* If traditional checksums match, we assume that page is
+	not anymore encrypted. */
+	if (node.space->full_crc32()
+	    && !buf_is_zeroes(span<const byte>(dst_frame,
+					       node.space->physical_size()))
+	    && (key_version || node.space->is_compressed()
+		|| node.space->purpose == FIL_TYPE_TEMPORARY)) {
+		if (buf_page_full_crc32_is_corrupted(
+			    bpage->id().space(), dst_frame,
+			    node.space->is_compressed())) {
+			err = DB_PAGE_CORRUPTED;
+		}
+	} else if (buf_page_is_corrupted(true, dst_frame, node.space->flags)) {
+		err = DB_PAGE_CORRUPTED;
+	}
+
+	if (seems_encrypted && err == DB_PAGE_CORRUPTED
+	    && bpage->id().page_no() != 0) {
+		err = DB_DECRYPTION_FAILED;
+
+		ib::error()
+			<< "The page " << bpage->id()
+			<< " in file '" << node.name
+			<< "' cannot be decrypted; key_version="
+			<< key_version;
+	}
+
+	return (err);
+}
+
+/** Complete a read of a page.
+@param node	data file
+@return whether the operation succeeded
+@retval DB_PAGE_CORRUPTED	if the checksum fails
+@retval DB_DECRYPTION_FAILED	if the page cannot be decrypted
+@retval DB_FAIL			if the page contains the wrong ID */
+dberr_t buf_page_t::read_complete(const fil_node_t &node)
+{
+  const page_id_t expected_id{id()};
+  ut_ad(is_read_fixed());
+  ut_ad(!buf_dblwr.is_inside(id()));
+  ut_ad(id().space() == node.space->id);
+  ut_ad(zip_size() == node.space->zip_size());
+  ut_ad(!!zip.ssize == !!zip.data);
+
+  const byte *read_frame= zip.data
+    ? zip.data : frame;
+  ut_ad(read_frame);
+
+  dberr_t err;
+  if (!buf_page_decrypt_after_read(this, node))
+  {
+    err= DB_DECRYPTION_FAILED;
+    goto database_corrupted;
+  }
+
+  if (belongs_to_unzip_LRU())
+  {
+    buf_pool.n_pend_unzip++;
+    auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(this), false);
+    buf_pool.n_pend_unzip--;
+
+    if (!ok)
+    {
+      ib::info() << "Page " << expected_id << " zip_decompress failure.";
+      err= DB_PAGE_CORRUPTED;
+      goto database_corrupted;
+    }
+  }
+
+  {
+    const page_id_t read_id(mach_read_from_4(read_frame + FIL_PAGE_SPACE_ID),
+                            mach_read_from_4(read_frame + FIL_PAGE_OFFSET));
+
+    if (read_id == expected_id);
+    else if (read_id == page_id_t(0, 0))
+    {
+      /* This is likely an uninitialized (all-zero) page. */
+      err= DB_FAIL;
+      goto release_page;
+    }
+    else if (!node.space->full_crc32() &&
+             page_id_t(0, read_id.page_no()) == expected_id)
+      /* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace
+      before MySQL 4.1.1, which introduced innodb_file_per_table. */;
+    else if (node.space->full_crc32() &&
+             *reinterpret_cast<const uint32_t*>
+             (&read_frame[FIL_PAGE_FCRC32_KEY_VERSION]) &&
+             node.space->crypt_data &&
+             node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)
+    {
+      ib::error() << "Cannot decrypt " << expected_id;
+      err= DB_DECRYPTION_FAILED;
+      goto release_page;
+    }
+    else
+    {
+      ib::error() << "Space id and page no stored in the page, read in are "
+                  << read_id << ", should be " << expected_id;
+      err= DB_PAGE_CORRUPTED;
+      goto release_page;
+    }
+  }
+
+  err= buf_page_check_corrupt(this, node);
+  if (UNIV_UNLIKELY(err != DB_SUCCESS))
+  {
+database_corrupted:
+    if (belongs_to_unzip_LRU())
+      memset_aligned<UNIV_PAGE_SIZE_MIN>(frame, 0, srv_page_size);
+
+    if (err == DB_PAGE_CORRUPTED)
+    {
+      ib::error() << "Database page corruption on disk"
+                     " or a failed read of file '"
+                  << node.name << "' page " << expected_id
+                  << ". You may have to recover from a backup.";
+
+      buf_page_print(read_frame, zip_size());
+
+      node.space->set_corrupted();
+
+      ib::info() << " You can use CHECK TABLE to scan"
+                    " your table for corruption. "
+                 << FORCE_RECOVERY_MSG;
+    }
+
+    if (!srv_force_recovery)
+      goto release_page;
+  }
+
+  if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED)
+  {
+release_page:
+    buf_pool.corrupted_evict(this, buf_page_t::READ_FIX);
+    return err;
+  }
+
+  const bool recovery= recv_recovery_is_on();
+
+  if (recovery && !recv_recover_page(node.space, this))
+    return DB_PAGE_CORRUPTED;
+
+  const bool ibuf_may_exist= frame && !recv_no_ibuf_operations &&
+    (!expected_id.space() || !is_predefined_tablespace(expected_id.space())) &&
+    fil_page_get_type(read_frame) == FIL_PAGE_INDEX &&
+    page_is_leaf(read_frame);
+
+  if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
+    buf_page_monitor(*this, true);
+  DBUG_PRINT("ib_buf", ("read page %u:%u", id().space(), id().page_no()));
+
+  if (!recovery)
+  {
+    ut_d(auto f=) zip.fix.fetch_sub(ibuf_may_exist
+                                    ? READ_FIX - IBUF_EXIST
+                                    : READ_FIX - UNFIXED);
+    ut_ad(f >= READ_FIX);
+    ut_ad(f < WRITE_FIX);
+  }
+  else if (ibuf_may_exist)
+    set_ibuf_exist();
+
+  lock.x_unlock(true);
+
+  return DB_SUCCESS;
+}
+
+#ifdef UNIV_DEBUG
+/** Check that all blocks are in a replaceable state.
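+Illustrative debug-build usage (as in buf_pool_invalidate() below):
+@code
+  ut_d(buf_pool.assert_all_freed());
+@endcode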
+@return address of a non-free block +@retval nullptr if all freed */ +void buf_pool_t::assert_all_freed() +{ + mysql_mutex_lock(&mutex); + const chunk_t *chunk= chunks; + for (auto i= n_chunks; i--; chunk++) + if (const buf_block_t* block= chunk->not_freed()) + ib::fatal() << "Page " << block->page.id() << " still fixed or dirty"; + mysql_mutex_unlock(&mutex); +} +#endif /* UNIV_DEBUG */ + +/** Refresh the statistics used to print per-second averages. */ +void buf_refresh_io_stats() +{ + buf_pool.last_printout_time = time(NULL); + buf_pool.old_stat = buf_pool.stat; +} + +/** Invalidate all pages in the buffer pool. +All pages must be in a replaceable state (not modified or latched). */ +void buf_pool_invalidate() +{ + mysql_mutex_lock(&buf_pool.mutex); + + /* It is possible that a write batch that has been posted + earlier is still not complete. For buffer pool invalidation to + proceed we must ensure there is NO write activity happening. */ + + ut_d(mysql_mutex_unlock(&buf_pool.mutex)); + ut_d(buf_pool.assert_all_freed()); + ut_d(mysql_mutex_lock(&buf_pool.mutex)); + + while (UT_LIST_GET_LEN(buf_pool.LRU)) { + buf_LRU_scan_and_free_block(); + } + + ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); + + buf_pool.freed_page_clock = 0; + buf_pool.LRU_old = NULL; + buf_pool.LRU_old_len = 0; + buf_pool.stat.init(); + + buf_refresh_io_stats(); + mysql_mutex_unlock(&buf_pool.mutex); +} + +#ifdef UNIV_DEBUG +/** Validate the buffer pool. */ +void buf_pool_t::validate() +{ + ulint n_lru = 0; + ulint n_flushing = 0; + ulint n_free = 0; + ulint n_zip = 0; + + mysql_mutex_lock(&mutex); + + chunk_t* chunk = chunks; + + /* Check the uncompressed blocks. */ + + for (auto i = n_chunks; i--; chunk++) { + buf_block_t* block = chunk->blocks; + + for (auto j = chunk->size; j--; block++) { + ut_ad(block->page.frame); + switch (const auto f = block->page.state()) { + case buf_page_t::NOT_USED: + n_free++; + break; + + case buf_page_t::MEMORY: + case buf_page_t::REMOVE_HASH: + /* do nothing */ + break; + + default: + if (f >= buf_page_t::READ_FIX + && f < buf_page_t::WRITE_FIX) { + /* A read-fixed block is not + necessarily in the page_hash yet. */ + break; + } + ut_ad(f >= buf_page_t::FREED); + const page_id_t id{block->page.id()}; + ut_ad(page_hash.get( + id, + page_hash.cell_get(id.fold())) + == &block->page); + n_lru++; + } + } + } + + /* Check dirty blocks. */ + + mysql_mutex_lock(&flush_list_mutex); + for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_ad(b->in_file()); + ut_ad(b->oldest_modification()); + ut_ad(!fsp_is_system_temporary(b->id().space())); + n_flushing++; + + if (UNIV_UNLIKELY(!b->frame)) { + n_lru++; + n_zip++; + } + const page_id_t id{b->id()}; + ut_ad(page_hash.get(id, page_hash.cell_get(id.fold())) == b); + } + + ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing); + + mysql_mutex_unlock(&flush_list_mutex); + + if (n_chunks_new == n_chunks + && n_lru + n_free > curr_size + n_zip) { + + ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free + << ", pool " << curr_size + << " zip " << n_zip << ". Aborting..."; + } + + ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru); + + if (n_chunks_new == n_chunks + && UT_LIST_GET_LEN(free) != n_free) { + + ib::fatal() << "Free list len " + << UT_LIST_GET_LEN(free) + << ", free blocks " << n_free << ". 
Aborting...";
+  }
+
+  mysql_mutex_unlock(&mutex);
+
+  ut_d(buf_LRU_validate());
+  ut_d(buf_flush_validate());
+}
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Write information of the buf_pool to the error log. */
+void buf_pool_t::print()
+{
+  index_id_t*   index_ids;
+  ulint*        counts;
+  ulint         size;
+  ulint         i;
+  ulint         j;
+  index_id_t    id;
+  ulint         n_found;
+  chunk_t*      chunk;
+  dict_index_t* index;
+
+  size = curr_size;
+
+  index_ids = static_cast<index_id_t*>(
+    ut_malloc_nokey(size * sizeof *index_ids));
+
+  counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
+
+  mysql_mutex_lock(&mutex);
+  mysql_mutex_lock(&flush_list_mutex);
+
+  ib::info()
+    << "[buffer pool: size=" << curr_size
+    << ", database pages=" << UT_LIST_GET_LEN(LRU)
+    << ", free pages=" << UT_LIST_GET_LEN(free)
+    << ", modified database pages="
+    << UT_LIST_GET_LEN(flush_list)
+    << ", n pending decompressions=" << n_pend_unzip
+    << ", n pending flush LRU=" << n_flush()
+    << " list=" << os_aio_pending_writes()
+    << ", pages made young=" << stat.n_pages_made_young
+    << ", not young=" << stat.n_pages_not_made_young
+    << ", pages read=" << stat.n_pages_read
+    << ", created=" << stat.n_pages_created
+    << ", written=" << stat.n_pages_written << "]";
+
+  mysql_mutex_unlock(&flush_list_mutex);
+
+  /* Count the number of blocks belonging to each index in the buffer */
+
+  n_found = 0;
+
+  chunk = chunks;
+
+  for (i = n_chunks; i--; chunk++) {
+    buf_block_t* block    = chunk->blocks;
+    ulint        n_blocks = chunk->size;
+
+    for (; n_blocks--; block++) {
+      const buf_frame_t* frame = block->page.frame;
+
+      if (fil_page_index_page_check(frame)) {
+
+        id = btr_page_get_index_id(frame);
+
+        /* Look for the id in the index_ids array */
+        j = 0;
+
+        while (j < n_found) {
+
+          if (index_ids[j] == id) {
+            counts[j]++;
+
+            break;
+          }
+          j++;
+        }
+
+        if (j == n_found) {
+          n_found++;
+          index_ids[j] = id;
+          counts[j] = 1;
+        }
+      }
+    }
+  }
+
+  mysql_mutex_unlock(&mutex);
+
+  for (i = 0; i < n_found; i++) {
+    index = dict_index_get_if_in_cache(index_ids[i]);
+
+    if (!index) {
+      ib::info() << "Block count for index "
+        << index_ids[i] << " in buffer is about "
+        << counts[i];
+    } else {
+      ib::info() << "Block count for index " << index_ids[i]
+        << " in buffer is about " << counts[i]
+        << ", index " << index->name
+        << " of table " << index->table->name;
+    }
+  }
+
+  ut_free(index_ids);
+  ut_free(counts);
+
+  validate();
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/** @return the number of latched pages in the buffer pool */
+ulint buf_get_latched_pages_number()
+{
+  ulint fixed_pages_number= 0;
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b;
+       b= UT_LIST_GET_NEXT(LRU, b))
+    if (b->state() > buf_page_t::UNFIXED)
+      fixed_pages_number++;
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  return fixed_pages_number;
+}
+#endif /* UNIV_DEBUG */
+
+/** Collect buffer pool metadata.
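+The per-second rates are computed from the counter deltas accumulated
+since the previous buf_refresh_io_stats() call; schematically:
+  pages_read_rate = (stat.n_pages_read - old_stat.n_pages_read)
+                    / time_elapsed
+where time_elapsed is padded by 0.001 s so the division is never by zero.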
+@param[out] pool_info buffer pool metadata */
+void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
+{
+  time_t current_time;
+  double time_elapsed;
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  pool_info->pool_size = buf_pool.curr_size;
+
+  pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+  pool_info->old_lru_len = buf_pool.LRU_old_len;
+
+  pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free);
+
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list);
+
+  pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+
+  pool_info->n_pend_reads = os_aio_pending_reads_approx();
+
+  pool_info->n_pending_flush_lru = buf_pool.n_flush();
+
+  pool_info->n_pending_flush_list = os_aio_pending_writes();
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  current_time = time(NULL);
+  time_elapsed = 0.001 + difftime(current_time,
+                                  buf_pool.last_printout_time);
+
+  pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young;
+
+  pool_info->n_pages_not_made_young =
+    buf_pool.stat.n_pages_not_made_young;
+
+  pool_info->n_pages_read = buf_pool.stat.n_pages_read;
+
+  pool_info->n_pages_created = buf_pool.stat.n_pages_created;
+
+  pool_info->n_pages_written = buf_pool.stat.n_pages_written;
+
+  pool_info->n_page_gets = buf_pool.stat.n_page_gets;
+
+  pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd;
+  pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read;
+
+  pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted;
+
+  pool_info->page_made_young_rate =
+    static_cast<double>(buf_pool.stat.n_pages_made_young
+                        - buf_pool.old_stat.n_pages_made_young)
+    / time_elapsed;
+
+  pool_info->page_not_made_young_rate =
+    static_cast<double>(buf_pool.stat.n_pages_not_made_young
+                        - buf_pool.old_stat.n_pages_not_made_young)
+    / time_elapsed;
+
+  pool_info->pages_read_rate =
+    static_cast<double>(buf_pool.stat.n_pages_read
+                        - buf_pool.old_stat.n_pages_read)
+    / time_elapsed;
+
+  pool_info->pages_created_rate =
+    static_cast<double>(buf_pool.stat.n_pages_created
+                        - buf_pool.old_stat.n_pages_created)
+    / time_elapsed;
+
+  pool_info->pages_written_rate =
+    static_cast<double>(buf_pool.stat.n_pages_written
+                        - buf_pool.old_stat.n_pages_written)
+    / time_elapsed;
+
+  pool_info->n_page_get_delta = buf_pool.stat.n_page_gets
+    - buf_pool.old_stat.n_page_gets;
+
+  if (pool_info->n_page_get_delta) {
+    pool_info->page_read_delta = buf_pool.stat.n_pages_read
+      - buf_pool.old_stat.n_pages_read;
+
+    pool_info->young_making_delta =
+      buf_pool.stat.n_pages_made_young
+      - buf_pool.old_stat.n_pages_made_young;
+
+    pool_info->not_young_making_delta =
+      buf_pool.stat.n_pages_not_made_young
+      - buf_pool.old_stat.n_pages_not_made_young;
+  }
+  pool_info->pages_readahead_rnd_rate =
+    static_cast<double>(buf_pool.stat.n_ra_pages_read_rnd
+                        - buf_pool.old_stat.n_ra_pages_read_rnd)
+    / time_elapsed;
+
+  pool_info->pages_readahead_rate =
+    static_cast<double>(buf_pool.stat.n_ra_pages_read
+                        - buf_pool.old_stat.n_ra_pages_read)
+    / time_elapsed;
+
+  pool_info->pages_evicted_rate =
+    static_cast<double>(buf_pool.stat.n_ra_pages_evicted
+                        - buf_pool.old_stat.n_ra_pages_evicted)
+    / time_elapsed;
+
+  pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+
+  pool_info->io_sum = buf_LRU_stat_sum.io;
+
+  pool_info->io_cur = buf_LRU_stat_cur.io;
+
+  pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
+
+  pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
+
+  buf_refresh_io_stats();
+  mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+static
+void
+buf_print_io_instance(
+/*==================*/
+  buf_pool_info_t* pool_info, /*!< in: buffer pool info */
+  FILE*            file)      /*!< in/out: buffer where to print */
+{
+  ut_ad(pool_info);
+
+  fprintf(file,
+          "Buffer pool size " ULINTPF "\n"
+          "Free buffers " ULINTPF "\n"
+          "Database pages " ULINTPF "\n"
+          "Old database pages " ULINTPF "\n"
+          "Modified db pages " ULINTPF "\n"
+          "Percent of dirty pages(LRU & free pages): %.3f\n"
+          "Max dirty pages percent: %.3f\n"
+          "Pending reads " ULINTPF "\n"
+          "Pending writes: LRU " ULINTPF ", flush list " ULINTPF "\n",
+          pool_info->pool_size,
+          pool_info->free_list_len,
+          pool_info->lru_len,
+          pool_info->old_lru_len,
+          pool_info->flush_list_len,
+          static_cast<double>(pool_info->flush_list_len)
+          / (static_cast<double>(pool_info->lru_len
+                                 + pool_info->free_list_len) + 1.0)
+          * 100.0,
+          srv_max_buf_pool_modified_pct,
+          pool_info->n_pend_reads,
+          pool_info->n_pending_flush_lru,
+          pool_info->n_pending_flush_list);
+
+  fprintf(file,
+          "Pages made young " ULINTPF ", not young " ULINTPF "\n"
+          "%.2f youngs/s, %.2f non-youngs/s\n"
+          "Pages read " ULINTPF ", created " ULINTPF
+          ", written " ULINTPF "\n"
+          "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+          pool_info->n_pages_made_young,
+          pool_info->n_pages_not_made_young,
+          pool_info->page_made_young_rate,
+          pool_info->page_not_made_young_rate,
+          pool_info->n_pages_read,
+          pool_info->n_pages_created,
+          pool_info->n_pages_written,
+          pool_info->pages_read_rate,
+          pool_info->pages_created_rate,
+          pool_info->pages_written_rate);
+
+  if (pool_info->n_page_get_delta) {
+    double hit_rate = static_cast<double>(
+      pool_info->page_read_delta)
+      / static_cast<double>(pool_info->n_page_get_delta);
+
+    if (hit_rate > 1) {
+      hit_rate = 1;
+    }
+
+    fprintf(file,
+            "Buffer pool hit rate " ULINTPF " / 1000,"
+            " young-making rate " ULINTPF " / 1000 not "
+            ULINTPF " / 1000\n",
+            ulint(1000 * (1 - hit_rate)),
+            ulint(1000
+                  * double(pool_info->young_making_delta)
+                  / double(pool_info->n_page_get_delta)),
+            ulint(1000 * double(pool_info->not_young_making_delta)
+                  / double(pool_info->n_page_get_delta)));
+  } else {
+    fputs("No buffer pool page gets since the last printout\n",
+          file);
+  }
+
+  /* Statistics about read ahead algorithm */
+  fprintf(file, "Pages read ahead %.2f/s,"
+          " evicted without access %.2f/s,"
+          " Random read ahead %.2f/s\n",
+          pool_info->pages_readahead_rate,
+          pool_info->pages_evicted_rate,
+          pool_info->pages_readahead_rnd_rate);
+
+  /* Print some values to help us with visualizing what is
+  happening with LRU eviction. */
+  fprintf(file,
+          "LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n"
+          "I/O sum[" ULINTPF "]:cur[" ULINTPF "], "
+          "unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n",
+          pool_info->lru_len, pool_info->unzip_lru_len,
+          pool_info->io_sum, pool_info->io_cur,
+          pool_info->unzip_sum, pool_info->unzip_cur);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+void
+buf_print_io(
+/*=========*/
+  FILE* file) /*!< in/out: buffer where to print */
+{
+  buf_pool_info_t pool_info;
+
+  buf_stats_get_pool_info(&pool_info);
+  buf_print_io_instance(&pool_info, file);
+}
+
+/** Verify that the post-encryption checksum matches the calculated
+checksum. This function should be called only if the tablespace
+contains crypt data metadata.
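+A hypothetical caller sketch (the names read_buf and space are assumed
+here for illustration, not taken from this file):
+  if (!buf_page_verify_crypt_checksum(read_buf, space->flags))
+    err= DB_CORRUPTION;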
+@param page page frame +@param fsp_flags contents of FSP_SPACE_FLAGS +@return whether the page is encrypted and valid */ +bool buf_page_verify_crypt_checksum(const byte *page, uint32_t fsp_flags) +{ + if (!fil_space_t::full_crc32(fsp_flags)) { + return fil_space_verify_crypt_checksum( + page, fil_space_t::zip_size(fsp_flags)); + } + + return !buf_page_is_corrupted(true, page, fsp_flags); +} + +/** Print the given page_id_t object. +@param[in,out] out the output stream +@param[in] page_id the page_id_t object to be printed +@return the output stream */ +std::ostream& operator<<(std::ostream &out, const page_id_t page_id) +{ + out << "[page id: space=" << page_id.space() + << ", page number=" << page_id.page_no() << "]"; + return out; +} +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc new file mode 100644 index 00000000..662343ae --- /dev/null +++ b/storage/innobase/buf/buf0checksum.cc @@ -0,0 +1,98 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0checksum.cc +Buffer pool checksum functions, also linked from /extra/innochecksum.cc + +Created Aug 11, 2011 Vasil Dimov +*******************************************************/ + +#include "buf0checksum.h" +#include "fil0fil.h" +#include "ut0rnd.h" + +#ifndef UNIV_INNOCHECKSUM +#include "srv0srv.h" +#endif /* !UNIV_INNOCHECKSUM */ + +/** Calculate the CRC32 checksum of a page. The value is stored to the page +when it is written to a file and also checked for a match when reading from +the file. Note that we must be careful to calculate the same value on all +architectures. +@param[in] page buffer page (srv_page_size bytes) +@return CRC-32C */ +uint32_t buf_calc_page_crc32(const byte* page) +{ + /* Note: innodb_checksum_algorithm=crc32 could and should have + included the entire page in the checksum, and CRC-32 values + should be combined with the CRC-32 function, not with + exclusive OR. We stick to the current algorithm in order to + remain compatible with old data files. */ + return my_crc32c(0, page + FIL_PAGE_OFFSET, + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + - FIL_PAGE_OFFSET) + ^ my_crc32c(0, page + FIL_PAGE_DATA, + srv_page_size + - (FIL_PAGE_DATA + FIL_PAGE_END_LSN_OLD_CHKSUM)); +} + +#ifndef UNIV_INNOCHECKSUM +/** Calculate a checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. 
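+As the note at buf_calc_page_old_checksum() below explains, a writer
+must store the new-formula value first, because the old formula reads
+the FIL_PAGE_SPACE_OR_CHKSUM field as input; an illustrative sketch:
+  mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+                  buf_calc_page_new_checksum(page));
+  mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
+                  buf_calc_page_old_checksum(page));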
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_new_checksum(const byte* page)
+{
+  ulint checksum;
+
+  /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+  FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+  to the first pages of data files, we have to skip them in the page
+  checksum calculation.
+  We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+  checksum is stored, and also the last 8 bytes of page because
+  there we store the old formula checksum. */
+
+  checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
+                            FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+                            - FIL_PAGE_OFFSET)
+    + ut_fold_binary(page + FIL_PAGE_DATA,
+                     srv_page_size - FIL_PAGE_DATA
+                     - FIL_PAGE_END_LSN_OLD_CHKSUM);
+  return(static_cast<uint32_t>(checksum));
+}
+
+/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that
+the checksum only looked at the first few bytes of the page.
+This calculates that old checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_old_checksum(const byte* page)
+{
+  return(static_cast<uint32_t>
+         (ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)));
+}
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
new file mode 100644
index 00000000..e9aea355
--- /dev/null
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -0,0 +1,779 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dblwr.cc
+Doublewrite buffer module
+
+Created 2011/12/19
+*******************************************************/
+
+#include "buf0dblwr.h"
+#include "buf0flu.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "fil0crypt.h"
+#include "fil0pagecompress.h"
+
+using st_::span;
+
+/** The doublewrite buffer */
+buf_dblwr_t buf_dblwr;
+
+/** @return the TRX_SYS page */
+inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr)
+{
+  return buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+                      0, RW_X_LATCH, mtr);
+}
+
+void buf_dblwr_t::init()
+{
+  if (!active_slot)
+  {
+    active_slot= &slots[0];
+    mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
+    pthread_cond_init(&cond, nullptr);
+  }
+}
+
+/** Initialise the persistent storage of the doublewrite buffer.
+@param header doublewrite page header in the TRX_SYS page */
+inline void buf_dblwr_t::init(const byte *header)
+{
+  ut_ad(!active_slot->first_free);
+  ut_ad(!active_slot->reserved);
+  ut_ad(!batch_running);
+
+  block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1));
+  block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2));
+
+  const uint32_t buf_size= 2 * block_size();
+  for (int i= 0; i < 2; i++)
+  {
+    slots[i].write_buf= static_cast<byte*>
+      (aligned_malloc(buf_size << srv_page_size_shift, srv_page_size));
+    slots[i].buf_block_arr= static_cast<element*>
+      (ut_zalloc_nokey(buf_size * sizeof(element)));
+  }
+  active_slot= &slots[0];
+}
+
+/** Create or restore the doublewrite buffer in the TRX_SYS page.
+@return whether the operation succeeded */
+bool buf_dblwr_t::create()
+{
+  if (is_created())
+    return true;
+
+  mtr_t mtr;
+  const ulint size= block_size();
+
+start_again:
+  mtr.start();
+
+  dberr_t err;
+  buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+  if (!trx_sys_block)
+  {
+    mtr.commit();
+    return false;
+  }
+
+  if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+                       trx_sys_block->page.frame) ==
+      TRX_SYS_DOUBLEWRITE_MAGIC_N)
+  {
+    /* The doublewrite buffer has already been created: just read in
+    some numbers */
+    init(TRX_SYS_DOUBLEWRITE + trx_sys_block->page.frame);
+    mtr.commit();
+    return true;
+  }
+
+  if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size)
+  {
+    ib::error() << "Cannot create doublewrite buffer: "
+                   "the first file in innodb_data_file_path must be at least "
+                << (3 * (size >> (20U - srv_page_size_shift))) << "M.";
+fail:
+    mtr.commit();
+    return false;
+  }
+  else
+  {
+    buf_block_t *b= fseg_create(fil_system.sys_space,
+                                TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
+                                &mtr, &err, false, trx_sys_block);
+    if (!b)
+    {
+      ib::error() << "Cannot create doublewrite buffer: " << err;
+      goto fail;
+    }
+
+    ib::info() << "Doublewrite buffer not found: creating new";
+
+    /* FIXME: After this point, the doublewrite buffer creation
+    is not atomic. The doublewrite buffer should not exist in
+    the InnoDB system tablespace file in the first place.
+    It could be located in separate optional file(s) in a
+    user-specified location. */
+  }
+
+  byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+    trx_sys_block->page.frame;
+  for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
+       i < 2 * size + extent_size / 2; i++)
+  {
+    buf_block_t *new_block=
+      fseg_alloc_free_page_general(fseg_header, prev_page_no + 1, FSP_UP,
+                                   false, &mtr, &mtr, &err);
+    if (!new_block)
+    {
+      ib::error() << "Cannot create doublewrite buffer: "
+                     " you must increase your tablespace size."
+                     " Cannot continue operation.";
+      /* This may essentially corrupt the doublewrite
+      buffer. However, usually the doublewrite buffer
+      is created at database initialization, and it
+      should not matter (just remove all newly created
+      InnoDB files and restart). */
+      mtr.commit();
+      return false;
+    }
+
+    /* We read the allocated pages to the buffer pool; when they are
+    written to disk in a flush, the space id and page number fields
+    are also written to the pages. When we at database startup read
+    pages from the doublewrite buffer, we know that if the space id
+    and page number in them are the same as the page position in the
+    tablespace, then the page has not been written to in
+    doublewrite.
*/ + + ut_ad(new_block->page.lock.not_recursive()); + const page_id_t id= new_block->page.id(); + /* We only do this in the debug build, to ensure that the check in + buf_flush_init_for_writing() will see a valid page type. The + flushes of new_block are actually unnecessary here. */ + ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->page.frame, + FIL_PAGE_TYPE_SYS)); + + if (i == size / 2) + { + ut_a(id.page_no() == size); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 + + trx_sys_block->page.frame, id.page_no()); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->page.frame, + id.page_no()); + } + else if (i == size / 2 + size) + { + ut_a(id.page_no() == 2 * size); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 + + trx_sys_block->page.frame, id.page_no()); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->page.frame, + id.page_no()); + } + else if (i > size / 2) + ut_a(id.page_no() == prev_page_no + 1); + + if (((i + 1) & 15) == 0) { + /* rw_locks can only be recursively x-locked 2048 times. (on 32 + bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a + negative number, and thus lock_word becomes like a shared lock). + For 4k page size this loop will lock the fseg header too many + times. Since this code is not done while any other threads are + active, restart the MTR occasionally. */ + mtr.commit(); + mtr.start(); + trx_sys_block= buf_dblwr_trx_sys_get(&mtr); + fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + + trx_sys_block->page.frame; + } + + prev_page_no= id.page_no(); + } + + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + + trx_sys_block->page.frame, TRX_SYS_DOUBLEWRITE_MAGIC_N); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + + TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->page.frame, + TRX_SYS_DOUBLEWRITE_MAGIC_N); + + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + + trx_sys_block->page.frame, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N); + mtr.commit(); + + buf_flush_wait_flushed(mtr.commit_lsn()); + + /* Remove doublewrite pages from LRU */ + buf_pool_invalidate(); + goto start_again; +} + +/** Initialize the doublewrite buffer memory structure on recovery. +If we are upgrading from a version before MySQL 4.1, then this +function performs the necessary update operations to support +innodb_file_per_table. If we are in a crash recovery, this function +loads the pages from double write buffer into memory. 
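+An illustrative startup sketch (assuming the first system tablespace
+file is already open; the names file and path are placeholders):
+  if (buf_dblwr.init_or_load_pages(file, path) != DB_SUCCESS)
+    /* treat this as a fatal startup error */;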
+@param file File handle +@param path Path name of file +@return DB_SUCCESS or error code */ +dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path) +{ + ut_ad(this == &buf_dblwr); + const uint32_t size= block_size(); + + /* We do the file i/o past the buffer pool */ + byte *read_buf= static_cast(aligned_malloc(srv_page_size, + srv_page_size)); + /* Read the TRX_SYS header to check if we are using the doublewrite buffer */ + dberr_t err= os_file_read(IORequestRead, file, read_buf, + TRX_SYS_PAGE_NO << srv_page_size_shift, + srv_page_size, nullptr); + + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the system tablespace header page"; +func_exit: + aligned_free(read_buf); + return err; + } + + /* TRX_SYS_PAGE_NO is not encrypted see fil_crypt_rotate_page() */ + if (mach_read_from_4(TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE + + read_buf) != TRX_SYS_DOUBLEWRITE_MAGIC_N) + { + /* There is no doublewrite buffer initialized in the TRX_SYS page. + This should normally not be possible; the doublewrite buffer should + be initialized when creating the database. */ + err= DB_SUCCESS; + goto func_exit; + } + + init(TRX_SYS_DOUBLEWRITE + read_buf); + + const bool upgrade_to_innodb_file_per_table= + mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + + TRX_SYS_DOUBLEWRITE + read_buf) != + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N; + + auto write_buf= active_slot->write_buf; + /* Read the pages from the doublewrite buffer to memory */ + err= os_file_read(IORequestRead, file, write_buf, + block1.page_no() << srv_page_size_shift, + size << srv_page_size_shift, nullptr); + + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the first double write buffer extent"; + goto func_exit; + } + + err= os_file_read(IORequestRead, file, + write_buf + (size << srv_page_size_shift), + block2.page_no() << srv_page_size_shift, + size << srv_page_size_shift, nullptr); + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the second double write buffer extent"; + goto func_exit; + } + + byte *page= write_buf; + + if (UNIV_UNLIKELY(upgrade_to_innodb_file_per_table)) + { + ib::info() << "Resetting space id's in the doublewrite buffer"; + + for (ulint i= 0; i < size * 2; i++, page += srv_page_size) + { + memset(page + FIL_PAGE_SPACE_ID, 0, 4); + /* For pre-MySQL-4.1 innodb_checksum_algorithm=innodb, we do not need to + calculate new checksums for the pages because the field + .._SPACE_ID does not affect them. Write the page back to where + we read it from. */ + const ulint source_page_no= i < size + ? block1.page_no() + i + : block2.page_no() + i - size; + err= os_file_write(IORequestWrite, path, file, page, + source_page_no << srv_page_size_shift, srv_page_size); + if (err != DB_SUCCESS) + { + ib::error() << "Failed to upgrade the double write buffer"; + goto func_exit; + } + } + os_file_flush(file); + } + else + for (ulint i= 0; i < size * 2; i++, page += srv_page_size) + if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN))) + /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. */ + recv_sys.dblwr.add(page); + + err= DB_SUCCESS; + goto func_exit; +} + +/** Process and remove the double write buffer pages for all tablespaces. 
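+Only a copy whose FIL_PAGE_LSN lies in the window
+  log_sys.last_checkpoint_lsn <= lsn <= recv_sys.scanned_lsn
+is considered (see the checks below): an older copy predates the latest
+checkpoint, and a newer one would come from the future.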
*/
+void buf_dblwr_t::recover()
+{
+  ut_ad(log_sys.last_checkpoint_lsn);
+  if (!is_created())
+    return;
+
+  uint32_t page_no_dblwr= 0;
+  byte *read_buf= static_cast<byte*>(aligned_malloc(3 * srv_page_size,
+                                                    srv_page_size));
+  byte *const buf= read_buf + srv_page_size;
+
+  for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin();
+       i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr)
+  {
+    byte *page= *i;
+    const uint32_t page_no= page_get_page_no(page);
+    if (!page_no) /* recovered via recv_dblwr_t::restore_first_page() */
+      continue;
+
+    const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN);
+    if (log_sys.last_checkpoint_lsn > lsn)
+      /* Pages written before the checkpoint are not useful for recovery. */
+      continue;
+    const uint32_t space_id= page_get_space_id(page);
+    const page_id_t page_id(space_id, page_no);
+
+    if (recv_sys.scanned_lsn < lsn)
+    {
+      ib::info() << "Ignoring a doublewrite copy of page " << page_id
+                 << " with future log sequence number " << lsn;
+      continue;
+    }
+
+    fil_space_t *space= fil_space_t::get(space_id);
+
+    if (!space)
+      /* The tablespace that this page once belonged to does not exist */
+      continue;
+
+    if (UNIV_UNLIKELY(page_no >= space->get_size()))
+    {
+      /* Do not report the warning for undo tablespaces, because they
+      can be truncated in place. */
+      if (!srv_is_undo_tablespace(space_id))
+        ib::warn() << "A copy of page " << page_no
+                   << " in the doublewrite buffer slot " << page_no_dblwr
+                   << " is beyond the end of " << space->chain.start->name
+                   << " (" << space->size << " pages)";
+next_page:
+      space->release();
+      continue;
+    }
+
+    const ulint physical_size= space->physical_size();
+    ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
+
+    /* We want to ensure that for partial reads the unread portion of
+    the page is NUL. */
+    memset(read_buf, 0x0, physical_size);
+
+    /* Read in the actual page from the file */
+    fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER),
+                            os_offset_t{page_no} * physical_size,
+                            physical_size, read_buf);
+
+    if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
+    {
+      ib::warn() << "Double write buffer recovery: " << page_id
+                 << " ('" << space->chain.start->name
+                 << "') read failed with error: " << fio.err;
+      continue;
+    }
+
+    if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
+    {
+      /* We will check if the copy in the doublewrite buffer is
+      valid. If not, we will ignore this page (there should be redo
+      log records to initialize it). */
+    }
+    else if (recv_sys.dblwr.validate_page(page_id, read_buf, space, buf))
+      goto next_page;
+    else
+      /* We intentionally skip this message for all-zero pages. */
+      ib::info() << "Trying to recover page " << page_id
+                 << " from the doublewrite buffer.";
+
+    page= recv_sys.dblwr.find_page(page_id, space, buf);
+
+    if (!page)
+      goto next_page;
+
+    /* Write the good page from the doublewrite buffer to the intended
+    position. */
+    space->reacquire();
+    fio= space->io(IORequestWrite,
+                   os_offset_t{page_id.page_no()} * physical_size,
+                   physical_size, page);
+
+    if (fio.err == DB_SUCCESS)
+      ib::info() << "Recovered page " << page_id << " to '" << fio.node->name
+                 << "' from the doublewrite buffer.";
+    goto next_page;
+  }
+
+  recv_sys.dblwr.pages.clear();
+  fil_flush_file_spaces();
+  aligned_free(read_buf);
+}
+
+/** Free the doublewrite buffer.
*/ +void buf_dblwr_t::close() +{ + if (!active_slot) + return; + + ut_ad(!active_slot->reserved); + ut_ad(!active_slot->first_free); + ut_ad(!batch_running); + + pthread_cond_destroy(&cond); + for (int i= 0; i < 2; i++) + { + aligned_free(slots[i].write_buf); + ut_free(slots[i].buf_block_arr); + } + mysql_mutex_destroy(&mutex); + + memset((void*) this, 0, sizeof *this); +} + +/** Update the doublewrite buffer on write completion. */ +void buf_dblwr_t::write_completed() +{ + ut_ad(this == &buf_dblwr); + ut_ad(!srv_read_only_mode); + + mysql_mutex_lock(&mutex); + + ut_ad(is_created()); + ut_ad(srv_use_doublewrite_buf); + ut_ad(batch_running); + slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + ut_ad(flush_slot->reserved); + ut_ad(flush_slot->reserved <= flush_slot->first_free); + + if (!--flush_slot->reserved) + { + mysql_mutex_unlock(&mutex); + /* This will finish the batch. Sync data files to the disk. */ + fil_flush_file_spaces(); + mysql_mutex_lock(&mutex); + + /* We can now reuse the doublewrite memory buffer: */ + flush_slot->first_free= 0; + batch_running= false; + pthread_cond_broadcast(&cond); + } + + mysql_mutex_unlock(&mutex); +} + +#ifdef UNIV_DEBUG +/** Check the LSN values on the page. +@param[in] page page to check +@param[in] s tablespace */ +static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s) +{ + /* Ignore page_compressed or encrypted pages */ + if (s.is_compressed() || buf_page_get_key_version(page, s.flags)) + return; + const byte* lsn_start= FIL_PAGE_LSN + 4 + page; + const byte* lsn_end= page + srv_page_size - + (s.full_crc32() + ? FIL_PAGE_FCRC32_END_LSN + : FIL_PAGE_END_LSN_OLD_CHKSUM - 4); + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + ut_ad(!memcmp_aligned<4>(lsn_start, lsn_end, 4)); +} + +static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page) +{ + if (fil_space_t *space= fil_space_t::get_for_write(b.id().space())) + { + buf_dblwr_check_page_lsn(page, *space); + space->release(); + } +} + +/** Check the LSN values on the page with which this block is associated. */ +static void buf_dblwr_check_block(const buf_page_t *bpage) +{ + ut_ad(bpage->in_file()); + const page_t *page= bpage->frame; + ut_ad(page); + + switch (fil_page_get_type(page)) { + case FIL_PAGE_INDEX: + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_RTREE: + if (page_is_comp(page)) + { + if (page_simple_validate_new(page)) + return; + } + else if (page_simple_validate_old(page)) + return; + /* While it is possible that this is not an index page but just + happens to have wrongly set FIL_PAGE_TYPE, such pages should never + be modified to without also adjusting the page type during page + allocation or buf_flush_init_for_writing() or + fil_block_reset_type(). */ + buf_page_print(page); + + ib::fatal() << "Apparent corruption of an index page " << bpage->id() + << " to be written to data file. We intentionally crash" + " the server to prevent corrupt data from ending up in" + " data files."; + } +} +#endif /* UNIV_DEBUG */ + +bool buf_dblwr_t::flush_buffered_writes(const ulint size) +{ + mysql_mutex_assert_owner(&mutex); + ut_ad(size == block_size()); + + for (;;) + { + if (!active_slot->first_free) + return false; + if (!batch_running) + break; + my_cond_wait(&cond, &mutex.m_mutex); + } + + ut_ad(active_slot->reserved == active_slot->first_free); + ut_ad(!flushing_buffered_writes); + + /* Disallow anyone else to start another batch of flushing. 
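+The two slots form a double buffer; the batch life cycle, as a sketch:
+  add_to_batch() fills active_slot;
+  flush_buffered_writes() swaps the slots and submits flush_slot
+  via os_aio();
+  flush_buffered_writes_completed() then writes the data pages;
+  write_completed() finally resets first_free so the slot can be reused.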
*/ + slot *flush_slot= active_slot; + /* Switch the active slot */ + active_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + ut_a(active_slot->first_free == 0); + batch_running= true; + const ulint old_first_free= flush_slot->first_free; + auto write_buf= flush_slot->write_buf; + const bool multi_batch= block1 + static_cast(size) != block2 && + old_first_free > size; + flushing_buffered_writes= 1 + multi_batch; + /* Now safe to release the mutex. */ + mysql_mutex_unlock(&mutex); +#ifdef UNIV_DEBUG + for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++) + { + buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage; + + if (bpage->zip.data) + /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */ + continue; + + /* Check that the actual page in the buffer pool is not corrupt + and the LSN values are sane. */ + buf_dblwr_check_block(bpage); + ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2)); + } +#endif /* UNIV_DEBUG */ + const IORequest request{nullptr, nullptr, fil_system.sys_space->chain.start, + IORequest::DBLWR_BATCH}; + ut_a(fil_system.sys_space->acquire()); + if (multi_batch) + { + fil_system.sys_space->reacquire(); + os_aio(request, write_buf, + os_offset_t{block1.page_no()} << srv_page_size_shift, + size << srv_page_size_shift); + os_aio(request, write_buf + (size << srv_page_size_shift), + os_offset_t{block2.page_no()} << srv_page_size_shift, + (old_first_free - size) << srv_page_size_shift); + } + else + os_aio(request, write_buf, + os_offset_t{block1.page_no()} << srv_page_size_shift, + old_first_free << srv_page_size_shift); + return true; +} + +static void *get_frame(const IORequest &request) +{ + if (request.slot) + return request.slot->out_buf; + const buf_page_t *bpage= request.bpage; + return bpage->zip.data ? bpage->zip.data : bpage->frame; +} + +void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) +{ + ut_ad(this == &buf_dblwr); + ut_ad(srv_use_doublewrite_buf); + ut_ad(is_created()); + ut_ad(!srv_read_only_mode); + ut_ad(!request.bpage); + ut_ad(request.node == fil_system.sys_space->chain.start); + ut_ad(request.type == IORequest::DBLWR_BATCH); + mysql_mutex_lock(&mutex); + ut_ad(batch_running); + ut_ad(flushing_buffered_writes); + ut_ad(flushing_buffered_writes <= 2); + writes_completed++; + if (UNIV_UNLIKELY(--flushing_buffered_writes)) + { + mysql_mutex_unlock(&mutex); + return; + } + + slot *const flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + ut_ad(flush_slot->reserved == flush_slot->first_free); + /* increment the doublewrite flushed pages counter */ + pages_written+= flush_slot->first_free; + mysql_mutex_unlock(&mutex); + + /* Now flush the doublewrite buffer data to disk */ + fil_system.sys_space->flush(); + + /* The writes have been flushed to disk now and in recovery we will + find them in the doublewrite buffer blocks. Next, write the data pages. 
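+This ordering is the doublewrite invariant: a data page is overwritten
+in place only after its copy is durable in the doublewrite area, so a
+torn in-place write can always be repaired by buf_dblwr_t::recover().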
*/
+  for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++)
+  {
+    auto e= flush_slot->buf_block_arr[i];
+    buf_page_t* bpage= e.request.bpage;
+    ut_ad(bpage->in_file());
+
+    void *frame= get_frame(e.request);
+    ut_ad(frame);
+
+    auto e_size= e.size;
+
+    if (UNIV_LIKELY_NULL(bpage->zip.data))
+    {
+      e_size= bpage->zip_size();
+      ut_ad(e_size);
+    }
+    else
+    {
+      ut_ad(!bpage->zip_size());
+      ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
+    }
+
+    const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+                                      (FIL_PAGE_LSN +
+                                       static_cast<const byte*>(frame)));
+    ut_ad(lsn);
+    ut_ad(lsn >= bpage->oldest_modification());
+    log_write_up_to(lsn, true);
+    e.request.node->space->io(e.request, bpage->physical_offset(), e_size,
+                              frame, bpage);
+  }
+}
+
+/** Flush possible buffered writes to persistent storage.
+It is very important to call this function after a batch of writes has been
+posted, and also when we may have to wait for a page latch!
+Otherwise a deadlock of threads can occur. */
+void buf_dblwr_t::flush_buffered_writes()
+{
+  if (!is_created() || !srv_use_doublewrite_buf)
+  {
+    fil_flush_file_spaces();
+    return;
+  }
+
+  ut_ad(!srv_read_only_mode);
+  const ulint size= block_size();
+
+  mysql_mutex_lock(&mutex);
+  if (!flush_buffered_writes(size))
+    mysql_mutex_unlock(&mutex);
+}
+
+/** Schedule a page write. If the doublewrite memory buffer is full,
+flush_buffered_writes() will be invoked to make space.
+@param request asynchronous write request
+@param size    payload size in bytes */
+void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
+{
+  ut_ad(request.is_async());
+  ut_ad(request.is_write());
+  ut_ad(request.bpage);
+  ut_ad(request.bpage->in_file());
+  ut_ad(request.node);
+  ut_ad(request.node->space->purpose == FIL_TYPE_TABLESPACE);
+  ut_ad(request.node->space->id == request.bpage->id().space());
+  ut_ad(request.node->space->referenced());
+  ut_ad(!srv_read_only_mode);
+
+  const ulint buf_size= 2 * block_size();
+
+  mysql_mutex_lock(&mutex);
+
+  for (;;)
+  {
+    ut_ad(active_slot->first_free <= buf_size);
+    if (active_slot->first_free != buf_size)
+      break;
+
+    if (flush_buffered_writes(buf_size / 2))
+      mysql_mutex_lock(&mutex);
+  }
+
+  byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free;
+
+  /* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages,
+  and at least srv_page_size (4096-byte) for everything else. */
+  memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, get_frame(request), size);
+  /* fil_page_compress() for page_compressed guarantees 256-byte alignment */
+  memset_aligned<256>(p + size, 0, srv_page_size - size);
+  /* FIXME: Inform the compiler that "size" and "srv_page_size - size"
+  are integer multiples of 256, so the above can translate into simple
+  SIMD instructions. Currently, we make no such assumptions about the
+  non-pointer parameters that are passed to the _aligned templates.
*/
+  ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
+  ut_ad(active_slot->reserved == active_slot->first_free);
+  ut_ad(active_slot->reserved < buf_size);
+  new (active_slot->buf_block_arr + active_slot->first_free++)
+    element{request, size};
+  active_slot->reserved= active_slot->first_free;
+
+  if (active_slot->first_free != buf_size ||
+      !flush_buffered_writes(buf_size / 2))
+    mysql_mutex_unlock(&mutex);
+}
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
new file mode 100644
index 00000000..957632db
--- /dev/null
+++ b/storage/innobase/buf/buf0dump.cc
@@ -0,0 +1,765 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.cc
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#include "my_global.h"
+#include "mysqld.h"
+#include "my_sys.h"
+
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+
+#include "buf0rea.h"
+#include "buf0dump.h"
+#include "dict0dict.h"
+#include "os0file.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "ut0byte.h"
+
+#include <algorithm>
+
+#include "mysql/service_wsrep.h" /* wsrep_recovery */
+#include <my_service_manager.h>
+
+static void buf_do_load_dump();
+
+enum status_severity {
+  STATUS_INFO,
+  STATUS_ERR
+};
+
+#define SHUTTING_DOWN() (srv_shutdown_state != SRV_SHUTDOWN_NONE)
+
+/* Flags that tell the buffer pool dump/load thread which action it
+should take after being woken up. */
+static volatile bool buf_dump_should_start;
+static volatile bool buf_load_should_start;
+
+static bool buf_load_abort_flag;
+
+/** Start the buffer pool dump/load task and instruct it to start a dump. */
+void buf_dump_start()
+{
+  buf_dump_should_start= true;
+  buf_do_load_dump();
+}
+
+/** Start the buffer pool dump/load task and instruct it to start a load. */
+void buf_load_start()
+{
+  buf_load_should_start= true;
+  buf_do_load_dump();
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3).
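+For example (this exact call appears in buf_dump() below):
+  buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s",
+                  full_filename);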
The value of this variable can be +retrieved by: +SELECT variable_value FROM information_schema.global_status WHERE +variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS'; +or by: +SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */ +static MY_ATTRIBUTE((nonnull, format(printf, 2, 3))) +void +buf_dump_status( +/*============*/ + enum status_severity severity,/*!< in: status severity */ + const char* fmt, /*!< in: format */ + ...) /*!< in: extra parameters according + to fmt */ +{ + va_list ap; + + va_start(ap, fmt); + + vsnprintf( + export_vars.innodb_buffer_pool_dump_status, + sizeof(export_vars.innodb_buffer_pool_dump_status), + fmt, ap); + + switch (severity) { + case STATUS_INFO: + ib::info() << export_vars.innodb_buffer_pool_dump_status; + break; + + case STATUS_ERR: + ib::error() << export_vars.innodb_buffer_pool_dump_status; + break; + } + + va_end(ap); +} + +/*****************************************************************//** +Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status +to the specified string. The format and the following parameters are the +same as the ones used for printf(3). The value of this variable can be +retrieved by: +SELECT variable_value FROM information_schema.global_status WHERE +variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS'; +or by: +SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */ +static MY_ATTRIBUTE((nonnull, format(printf, 2, 3))) +void +buf_load_status( +/*============*/ + enum status_severity severity,/*!< in: status severity */ + const char* fmt, /*!< in: format */ + ...) /*!< in: extra parameters according to fmt */ +{ + va_list ap; + + va_start(ap, fmt); + + vsnprintf( + export_vars.innodb_buffer_pool_load_status, + sizeof(export_vars.innodb_buffer_pool_load_status), + fmt, ap); + + switch (severity) { + case STATUS_INFO: + ib::info() << export_vars.innodb_buffer_pool_load_status; + break; + + case STATUS_ERR: + ib::error() << export_vars.innodb_buffer_pool_load_status; + break; + } + + va_end(ap); +} + +/** Returns the directory path where the buffer pool dump file will be created. +@return directory path */ +static +const char* +get_buf_dump_dir() +{ + const char* dump_dir; + + /* The dump file should be created in the default data directory if + innodb_data_home_dir is set as an empty string. */ + if (!*srv_data_home) { + dump_dir = fil_path_to_mysql_datadir; + } else { + dump_dir = srv_data_home; + } + + return(dump_dir); +} + +/** Generate the path to the buffer pool dump/load file. +@param[out] path generated path +@param[in] path_size size of 'path', used as in snprintf(3). */ +static void buf_dump_generate_path(char *path, size_t path_size) +{ + char buf[FN_REFLEN]; + + mysql_mutex_lock(&LOCK_global_system_variables); + snprintf(buf, sizeof buf, "%s/%s", get_buf_dump_dir(), + srv_buf_dump_filename); + mysql_mutex_unlock(&LOCK_global_system_variables); + + os_file_type_t type; + bool exists = false; + bool ret; + + ret = os_file_status(buf, &exists, &type); + + /* For realpath() to succeed the file must exist. */ + + if (ret && exists) { + /* my_realpath() assumes the destination buffer is big enough + to hold FN_REFLEN bytes. */ + ut_a(path_size >= FN_REFLEN); + + my_realpath(path, buf, 0); + } else { + /* If it does not exist, then resolve only srv_data_home + and append srv_buf_dump_filename to it. 
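+For example (illustrative, with the default
+innodb_buffer_pool_filename of "ib_buffer_pool"): a srv_data_home of
+"/var/lib/mysql" resolves to "/var/lib/mysql/ib_buffer_pool".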
*/ + char srv_data_home_full[FN_REFLEN]; + + my_realpath(srv_data_home_full, get_buf_dump_dir(), 0); + const char *format; + + switch (srv_data_home_full[strlen(srv_data_home_full) - 1]) { +#ifdef _WIN32 + case '\\': +#endif + case '/': + format = "%s%s"; + break; + default: + format = "%s/%s"; + } + + snprintf(path, path_size, format, + srv_data_home_full, srv_buf_dump_filename); + } +} + + +/*****************************************************************//** +Perform a buffer pool dump into the file specified by +innodb_buffer_pool_filename. If any errors occur then the value of +innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status(). +The dump filename can be specified by (relative to srv_data_home): +SET GLOBAL innodb_buffer_pool_filename='filename'; */ +static +void +buf_dump( +/*=====*/ + ibool obey_shutdown) /*!< in: quit if we are in a shutting down + state */ +{ +#define SHOULD_QUIT() (SHUTTING_DOWN() && obey_shutdown) + + char full_filename[OS_FILE_MAX_PATH]; + char tmp_filename[OS_FILE_MAX_PATH + sizeof "incomplete"]; + char now[32]; + FILE* f; + int ret; + + buf_dump_generate_path(full_filename, sizeof(full_filename)); + + snprintf(tmp_filename, sizeof(tmp_filename), + "%s.incomplete", full_filename); + + buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s", + full_filename); + +#ifdef _WIN32 + /* use my_fopen() for correct permissions during bootstrap*/ + f = my_fopen(tmp_filename, O_RDWR|O_TRUNC|O_CREAT, 0); +#elif defined(__GLIBC__) || O_CLOEXEC == 0 + f = fopen(tmp_filename, "w" STR_O_CLOEXEC); +#else + { + int fd; + fd = open(tmp_filename, O_CREAT | O_TRUNC | O_CLOEXEC | O_WRONLY, 0640); + if (fd >= 0) { + f = fdopen(fd, "w"); + } + else { + f = NULL; + } + } +#endif + if (f == NULL) { + buf_dump_status(STATUS_ERR, + "Cannot open '%s' for writing: %s", + tmp_filename, strerror(errno)); + return; + } + const buf_page_t* bpage; + page_id_t* dump; + ulint n_pages; + ulint j; + + mysql_mutex_lock(&buf_pool.mutex); + + n_pages = UT_LIST_GET_LEN(buf_pool.LRU); + + /* skip empty buffer pools */ + if (n_pages == 0) { + mysql_mutex_unlock(&buf_pool.mutex); + goto done; + } + + if (srv_buf_pool_dump_pct != 100) { + ulint t_pages; + + /* limit the number of total pages dumped to X% of the + total number of pages */ + t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100; + if (n_pages > t_pages) { + buf_dump_status(STATUS_INFO, + "Restricted to " ULINTPF + " pages due to " + "innodb_buf_pool_dump_pct=%lu", + t_pages, srv_buf_pool_dump_pct); + n_pages = t_pages; + } + + if (n_pages == 0) { + n_pages = 1; + } + } + + dump = static_cast(ut_malloc_nokey( + n_pages * sizeof(*dump))); + + if (dump == NULL) { + std::ostringstream str_bytes; + mysql_mutex_unlock(&buf_pool.mutex); + fclose(f); + str_bytes << ib::bytes_iec{n_pages * sizeof(*dump)}; + buf_dump_status(STATUS_ERR, + "Cannot allocate %s: %s", + str_bytes.str().c_str(), + strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + + for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0; + bpage != NULL && j < n_pages; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + const auto status = bpage->state(); + if (status < buf_page_t::UNFIXED) { + ut_a(status >= buf_page_t::FREED); + continue; + } + const page_id_t id{bpage->id()}; + + if (id.space() == SRV_TMP_SPACE_ID) { + /* Ignore the innodb_temporary tablespace. 
*/ + continue; + } + + dump[j++] = id; + } + + mysql_mutex_unlock(&buf_pool.mutex); + + ut_a(j <= n_pages); + n_pages = j; + + for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) { + ret = fprintf(f, "%u,%u\n", + dump[j].space(), dump[j].page_no()); + if (ret < 0) { + ut_free(dump); + fclose(f); + buf_dump_status(STATUS_ERR, + "Cannot write to '%s': %s", + tmp_filename, strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + if (SHUTTING_DOWN() && !(j & 1023)) { + service_manager_extend_timeout( + INNODB_EXTEND_TIMEOUT_INTERVAL, + "Dumping buffer pool page " + ULINTPF "/" ULINTPF, j + 1, n_pages); + } + } + + ut_free(dump); + +done: + ret = IF_WIN(my_fclose(f,0),fclose(f)); + if (ret != 0) { + buf_dump_status(STATUS_ERR, + "Cannot close '%s': %s", + tmp_filename, strerror(errno)); + return; + } + /* else */ + + ret = unlink(full_filename); + if (ret != 0 && errno != ENOENT) { + buf_dump_status(STATUS_ERR, + "Cannot delete '%s': %s", + full_filename, strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + /* else */ + + ret = rename(tmp_filename, full_filename); + if (ret != 0) { + buf_dump_status(STATUS_ERR, + "Cannot rename '%s' to '%s': %s", + tmp_filename, full_filename, + strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + /* else */ + + /* success */ + + ut_sprintf_timestamp(now); + + buf_dump_status(STATUS_INFO, + "Buffer pool(s) dump completed at %s", now); + + /* Though dumping doesn't related to an incomplete load, + we reset this to 0 here to indicate that a shutdown can also perform + a dump */ + export_vars.innodb_buffer_pool_load_incomplete = 0; +} + +/*****************************************************************//** +Perform a buffer pool load from the file specified by +innodb_buffer_pool_filename. If any errors occur then the value of +innodb_buffer_pool_load_status will be set accordingly, see buf_load_status(). +The dump filename can be specified by (relative to srv_data_home): +SET GLOBAL innodb_buffer_pool_filename='filename'; */ +static +void +buf_load() +/*======*/ +{ + char full_filename[OS_FILE_MAX_PATH]; + char now[32]; + FILE* f; + page_id_t* dump; + ulint dump_n; + ulint i; + uint32_t space_id; + uint32_t page_no; + int fscanf_ret; + + /* Ignore any leftovers from before */ + buf_load_abort_flag = false; + + buf_dump_generate_path(full_filename, sizeof(full_filename)); + + buf_load_status(STATUS_INFO, + "Loading buffer pool(s) from %s", full_filename); + + f = fopen(full_filename, "r" STR_O_CLOEXEC); + if (f == NULL) { + buf_load_status(STATUS_INFO, + "Cannot open '%s' for reading: %s", + full_filename, strerror(errno)); + return; + } + /* else */ + + /* First scan the file to estimate how many entries are in it. + This file is tiny (approx 500KB per 1GB buffer pool), reading it + two times is fine. */ + dump_n = 0; + while (fscanf(f, "%u,%u", &space_id, &page_no) == 2 + && !SHUTTING_DOWN()) { + dump_n++; + } + + if (!SHUTTING_DOWN() && !feof(f)) { + /* fscanf() returned != 2 */ + const char* what; + if (ferror(f)) { + what = "reading"; + } else { + what = "parsing"; + } + fclose(f); + buf_load_status(STATUS_ERR, "Error %s '%s'," + " unable to load buffer pool (stage 1)", + what, full_filename); + return; + } + + /* If dump is larger than the buffer pool(s), then we ignore the + extra trailing. This could happen if a dump is made, then buffer + pool is shrunk and then load is attempted. 
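+Each record in the file is a plain-text "space_id,page_no" pair, one
+per line, e.g. (values illustrative):
+  0,7
+  5,1
+so the clamp below simply ignores any trailing records.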
*/ + dump_n = std::min(dump_n, buf_pool.get_n_pages()); + + if (dump_n != 0) { + dump = static_cast(ut_malloc_nokey( + dump_n * sizeof(*dump))); + } else { + fclose(f); + ut_sprintf_timestamp(now); + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s" + " (%s was empty)", now, full_filename); + return; + } + + if (dump == NULL) { + std::ostringstream str_bytes; + fclose(f); + str_bytes << ib::bytes_iec{dump_n * sizeof(*dump)}; + buf_dump_status(STATUS_ERR, + "Cannot allocate %s: %s", + str_bytes.str().c_str(), + strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + + rewind(f); + + export_vars.innodb_buffer_pool_load_incomplete = 1; + + for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) { + fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no); + + if (fscanf_ret != 2) { + if (feof(f)) { + break; + } + /* else */ + + ut_free(dump); + fclose(f); + buf_load_status(STATUS_ERR, + "Error parsing '%s', unable" + " to load buffer pool (stage 2)", + full_filename); + return; + } + + if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) { + ut_free(dump); + fclose(f); + buf_load_status(STATUS_ERR, + "Error parsing '%s': bogus" + " space,page %u,%u at line " ULINTPF + ", unable to load buffer pool", + full_filename, + space_id, page_no, + i); + return; + } + + dump[i] = page_id_t(space_id, page_no); + } + + /* Set dump_n to the actual number of initialized elements, + i could be smaller than dump_n here if the file got truncated after + we read it the first time. */ + dump_n = i; + + fclose(f); + + if (dump_n == 0) { + ut_free(dump); + ut_sprintf_timestamp(now); + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s" + " (%s was empty or had errors)", now, full_filename); + return; + } + + if (!SHUTTING_DOWN()) { + std::sort(dump, dump + dump_n); + } + + /* Avoid calling the expensive fil_space_t::get() for each + page within the same tablespace. dump[] is sorted by (space, page), + so all pages from a given tablespace are consecutive. */ + uint32_t cur_space_id = dump[0].space(); + fil_space_t* space = fil_space_t::get(cur_space_id); + ulint zip_size = space ? space->zip_size() : 0; + + PSI_stage_progress* pfs_stage_progress __attribute__((unused)) + = mysql_set_stage(srv_stage_buffer_pool_load.m_key); + mysql_stage_set_work_estimated(pfs_stage_progress, dump_n); + mysql_stage_set_work_completed(pfs_stage_progress, 0); + + for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) { + + /* space_id for this iteration of the loop */ + const uint32_t this_space_id = dump[i].space(); + + if (this_space_id >= SRV_SPACE_ID_UPPER_BOUND) { + continue; + } + + if (this_space_id != cur_space_id) { + if (space) { + space->release(); + } + + cur_space_id = this_space_id; + space = fil_space_t::get(cur_space_id); + + if (!space) { + continue; + } + + zip_size = space->zip_size(); + } + + /* JAN: TODO: As we use background page read below, + if tablespace is encrypted we cant use it. 
*/ + if (!space || dump[i].page_no() >= space->get_size() || + (space->crypt_data && + space->crypt_data->encryption != FIL_ENCRYPTION_OFF && + space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) { + continue; + } + + if (space->is_stopping()) { + space->release(); + space = nullptr; + continue; + } + + space->reacquire(); + buf_read_page_background(space, dump[i], zip_size); + + if (buf_load_abort_flag) { + if (space) { + space->release(); + } + buf_load_abort_flag = false; + ut_free(dump); + buf_load_status( + STATUS_INFO, + "Buffer pool(s) load aborted on request"); + /* Premature end, set estimated = completed = i and + end the current stage event. */ + + mysql_stage_set_work_estimated(pfs_stage_progress, i); + mysql_stage_set_work_completed(pfs_stage_progress, i); + + mysql_end_stage(); + return; + } + +#ifdef UNIV_DEBUG + if ((i+1) >= srv_buf_pool_load_pages_abort) { + buf_load_abort_flag = true; + } +#endif + } + + if (space) { + space->release(); + } + + ut_free(dump); + + if (i == dump_n) { + os_aio_wait_until_no_pending_reads(true); + } + + ut_sprintf_timestamp(now); + + if (i == dump_n) { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s", now); + export_vars.innodb_buffer_pool_load_incomplete = 0; + } else if (!buf_load_abort_flag) { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load aborted due to user instigated abort at %s", + now); + /* intentionally don't reset innodb_buffer_pool_load_incomplete + as we don't want a shutdown to save the buffer pool */ + } else { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load aborted due to shutdown at %s", + now); + /* intentionally don't reset innodb_buffer_pool_load_incomplete + as we want to abort without saving the buffer pool */ + } + + /* Make sure that estimated = completed when we end. */ + mysql_stage_set_work_completed(pfs_stage_progress, dump_n); + /* End the stage progress event. */ + mysql_end_stage(); +} + +/** Abort a currently running buffer pool load. */ +void buf_load_abort() +{ + buf_load_abort_flag= true; +} + +/*****************************************************************//** +This is the main task for buffer pool dump/load. 
When scheduled, it
+either performs a dump or a load, depending on the server state, the state of
+the variables, etc. */
+static void buf_dump_load_func(void *)
+{
+ ut_ad(!srv_read_only_mode);
+ static bool first_time = true;
+ if (first_time && srv_buffer_pool_load_at_startup) {
+
+#ifdef WITH_WSREP
+ if (!get_wsrep_recovery()) {
+#endif /* WITH_WSREP */
+ srv_thread_pool->set_concurrency(srv_n_read_io_threads);
+ buf_load();
+ srv_thread_pool->set_concurrency();
+#ifdef WITH_WSREP
+ }
+#endif /* WITH_WSREP */
+ }
+ first_time = false;
+
+ while (!SHUTTING_DOWN()) {
+ if (buf_dump_should_start) {
+ buf_dump_should_start = false;
+ buf_dump(true);
+ }
+ if (buf_load_should_start) {
+ buf_load_should_start = false;
+ buf_load();
+ }
+
+ if (!buf_dump_should_start && !buf_load_should_start) {
+ return;
+ }
+ }
+
+ /* In shutdown */
+ if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
+ if (export_vars.innodb_buffer_pool_load_incomplete) {
+ buf_dump_status(STATUS_INFO,
+ "Dumping of buffer pool not started"
+ " as load was incomplete");
+#ifdef WITH_WSREP
+ } else if (get_wsrep_recovery()) {
+#endif /* WITH_WSREP */
+ } else {
+ buf_dump(false/* do complete dump at shutdown */);
+ }
+ }
+}
+
+
+/* Execute the task with max. concurrency */
+static tpool::task_group tpool_group(1);
+static tpool::waitable_task buf_dump_load_task(buf_dump_load_func, &tpool_group);
+static bool load_dump_enabled;
+
+/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set. */
+void buf_load_at_startup()
+{
+ load_dump_enabled= true;
+ if (srv_buffer_pool_load_at_startup)
+ buf_do_load_dump();
+}
+
+static void buf_do_load_dump()
+{
+ if (load_dump_enabled && !buf_dump_load_task.is_running())
+ srv_thread_pool->submit_task(&buf_dump_load_task);
+}
+
+/** Wait for currently running load/dumps to finish */
+void buf_load_dump_end()
+{
+ ut_ad(SHUTTING_DOWN());
+ buf_dump_load_task.wait();
+}
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
new file mode 100644
index 00000000..b6357989
--- /dev/null
+++ b/storage/innobase/buf/buf0flu.cc
@@ -0,0 +1,2765 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2014, Fusion-io
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.cc
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <my_service_manager.h>
+#include <mysql/service_thd_wait.h>
+#include <sql_class.h>
+
+#include "buf0flu.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "buf0dblwr.h"
+#include "srv0start.h"
+#include "page0zip.h"
+#include "fil0fil.h"
+#include "log0crypt.h"
+#include "srv0mon.h"
+#include "fil0pagecompress.h"
+#include "lzo/lzo1x.h"
+#include "snappy-c.h"
+
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_pool.stat.n_pages_written. */
+ulint buf_lru_flush_page_count;
+
+/** Number of pages freed without flushing. Protected by buf_pool.mutex. */
+ulint buf_lru_freed_page_count;
+
+/** Flag indicating if the page_cleaner is in active state. */
+Atomic_relaxed<bool> buf_page_cleaner_is_active;
+
+/** Factor for scan length to determine n_pages for intended oldest LSN
+progress */
+static constexpr ulint buf_flush_lsn_scan_factor = 3;
+
+/** Average redo generation rate */
+static lsn_t lsn_avg_rate = 0;
+
+/** Target oldest_modification for the page cleaner background flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_async_lsn;
+/** Target oldest_modification for the page cleaner furious flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_sync_lsn;
+
+#ifdef UNIV_PFS_THREAD
+mysql_pfs_key_t page_cleaner_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/** Page cleaner structure */
+static struct
+{
+ /** total elapsed time in adaptive flushing, in seconds */
+ ulint flush_time;
+ /** number of adaptive flushing passes */
+ ulint flush_pass;
+} page_cleaner;
+
+/* @} */
+
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+static void buf_flush_validate_low();
+
+/** Validates the flush list some of the time. */
+static void buf_flush_validate_skip()
+{
+/** Try buf_flush_validate_low() every this many times */
+# define BUF_FLUSH_VALIDATE_SKIP 23
+
+ /** The buf_flush_validate_low() call skip counter.
+ Use a signed type because of the race condition below. */
+ static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+
+ /* There is a race condition below, but it does not matter,
+ because this call is only for heuristic purposes. We want to
+ reduce the call frequency of the costly buf_flush_validate_low()
+ check in debug builds. */
+ if (--buf_flush_validate_count > 0) {
+ return;
+ }
+
+ buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+ buf_flush_validate_low();
+}
+#endif /* UNIV_DEBUG */
+
+void buf_pool_t::page_cleaner_wakeup(bool for_LRU)
+{
+ ut_d(buf_flush_validate_skip());
+ if (!page_cleaner_idle())
+ {
+ if (for_LRU)
+ /* Ensure that the page cleaner is not in a timed wait. */
+ pthread_cond_signal(&do_flush_list);
+ return;
+ }
+ double dirty_pct= double(UT_LIST_GET_LEN(buf_pool.flush_list)) * 100.0 /
+ double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
+ double pct_lwm= srv_max_dirty_pages_pct_lwm;
+
+ /* if pct_lwm != 0.0, adaptive flushing is enabled.
+ signal the buf page cleaner thread:
+ - if pct_lwm <= dirty_pct then it will invoke the adaptive flushing flow;
+ - if pct_lwm > dirty_pct then it will invoke the idle flushing flow.
+
+ idle_flushing:
+ dirty_pct < innodb_max_dirty_pages_pct_lwm, so it could be an
+ idle flushing use-case.
+
+ Why is last_activity_count not always updated?
+ - let's first understand when the server activity count is updated:
+ - it is updated on commit of a transaction in trx_t::commit() and not
+ on adding a page to the flush list;
+ - page_cleaner_wakeup is called when a page is added to the flush list.
+
+ - now let's say the first user thread updates the count from X -> Y but
+ is yet to commit the transaction (so the activity count is still Y).
+ Follow-up user threads will see the updated count (Y) matching
+ the universal server activity count (Y), giving a false impression that
+ the server is idle.
+
+ How to avoid this?
+ - by allowing last_activity_count to be updated when the page cleaner is
+ made active and has work to do. This ensures that the last_activity signal
+ is consumed by the page cleaner before the next one is generated. */
+ if (for_LRU ||
+ (pct_lwm != 0.0 && (pct_lwm <= dirty_pct ||
+ last_activity_count == srv_get_activity_count())) ||
+ srv_max_buf_pool_modified_pct <= dirty_pct)
+ {
+ page_cleaner_status-= PAGE_CLEANER_IDLE;
+ pthread_cond_signal(&do_flush_list);
+ }
+}
+
+/** Remove a block from flush_list.
+@param bpage buffer pool page */
+void buf_pool_t::delete_from_flush_list(buf_page_t *bpage) noexcept
+{
+ ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ flush_hp.adjust(bpage);
+ UT_LIST_REMOVE(flush_list, bpage);
+ flush_list_bytes-= bpage->physical_size();
+ bpage->clear_oldest_modification();
+#ifdef UNIV_DEBUG
+ buf_flush_validate_skip();
+#endif /* UNIV_DEBUG */
+}
+
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages still remain a part of the LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@param id tablespace identifier */
+void buf_flush_remove_pages(uint32_t id)
+{
+ const page_id_t first(id, 0), end(id + 1, 0);
+ ut_ad(id);
+
+ for (;;)
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ bool deferred= false;
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+ {
+ const auto s= bpage->state();
+ ut_ad(s >= buf_page_t::REMOVE_HASH);
+ ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+ const page_id_t bpage_id(bpage->id());
+
+ if (bpage_id < first || bpage_id >= end);
+ else if (s >= buf_page_t::WRITE_FIX)
+ deferred= true;
+ else
+ buf_pool.delete_from_flush_list(bpage);
+
+ bpage= prev;
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (!deferred)
+ break;
+
+ os_aio_wait_until_no_pending_writes(true);
+ }
+}
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage.
+IMPORTANT: When this function is called, bpage and dpage are not
+exact copies of each other. For example, they both will have different
+::state. Also the ::list pointers in dpage may be stale.
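The wake-up condition above folds three triggers into one test: LRU pressure, the adaptive low-water mark (or an idle server), and the hard dirty-page maximum. Extracted as a pure function for readability; this is only a sketch, the idle test via activity counters is reduced to a boolean, and a non-empty buffer pool is assumed, as in the original:

```cpp
// Sketch of the page_cleaner_wakeup() predicate. Inputs correspond to
// the lengths of buf_pool.flush_list, buf_pool.LRU and buf_pool.free
// plus the two dirty-page thresholds.
#include <cstddef>

bool page_cleaner_should_wake(bool for_LRU,
                              size_t flush_list_len,
                              size_t lru_len, size_t free_len,
                              double pct_lwm,       // adaptive low-water mark
                              double max_dirty_pct, // hard maximum
                              bool server_idle)
{
  const double dirty_pct =
      double(flush_list_len) * 100.0 / double(lru_len + free_len);
  if (for_LRU)
    return true;                      // eviction pressure: always wake
  if (pct_lwm != 0.0 && (pct_lwm <= dirty_pct || server_idle))
    return true;                      // adaptive or idle flushing
  return max_dirty_pct <= dirty_pct;  // "furious" flushing threshold
}
```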
We need to +use the current list node (bpage) to do the list manipulation because +the list pointers could have changed between the time that we copied +the contents of bpage to the dpage and the flush list manipulation +below. */ +ATTRIBUTE_COLD +void +buf_flush_relocate_on_flush_list( +/*=============================*/ + buf_page_t* bpage, /*!< in/out: control block being moved */ + buf_page_t* dpage) /*!< in/out: destination block */ +{ + buf_page_t* prev; + + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + ut_ad(!fsp_is_system_temporary(bpage->id().space())); + + const lsn_t lsn = bpage->oldest_modification(); + + if (!lsn) { + return; + } + + ut_ad(lsn == 1 || lsn > 2); + ut_ad(dpage->oldest_modification() == lsn); + + /* Important that we adjust the hazard pointer before removing + the bpage from the flush list. */ + buf_pool.flush_hp.adjust(bpage); + + prev = UT_LIST_GET_PREV(list, bpage); + UT_LIST_REMOVE(buf_pool.flush_list, bpage); + + bpage->clear_oldest_modification(); + + if (lsn == 1) { + buf_pool.flush_list_bytes -= dpage->physical_size(); + dpage->list.prev = nullptr; + dpage->list.next = nullptr; + dpage->clear_oldest_modification(); + } else if (prev) { + ut_ad(prev->oldest_modification()); + UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage); + } else { + UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage); + } + + ut_d(buf_flush_validate_low()); +} + +/** Note that a block is no longer dirty, while not removing +it from buf_pool.flush_list +@param temporary whether the page belongs to the temporary tablespace +@param error whether an error may have occurred while writing */ +inline void buf_page_t::write_complete(bool temporary, bool error) +{ + ut_ad(temporary == fsp_is_system_temporary(id().space())); + if (UNIV_UNLIKELY(error)); + else if (temporary) + { + ut_ad(oldest_modification() == 2); + oldest_modification_= 0; + } + else + { + /* We use release memory order to guarantee that callers of + oldest_modification_acquire() will observe the block as + being detached from buf_pool.flush_list, after reading the value 0. */ + ut_ad(oldest_modification() > 2); + oldest_modification_.store(1, std::memory_order_release); + } + const auto s= state(); + ut_ad(s >= WRITE_FIX); + zip.fix.fetch_sub((s >= WRITE_FIX_REINIT) + ? (WRITE_FIX_REINIT - UNFIXED) + : (WRITE_FIX - UNFIXED)); + lock.u_unlock(true); +} + +inline void buf_pool_t::n_flush_inc() +{ + mysql_mutex_assert_owner(&flush_list_mutex); + page_cleaner_status+= LRU_FLUSH; +} + +inline void buf_pool_t::n_flush_dec() +{ + mysql_mutex_lock(&flush_list_mutex); + ut_ad(page_cleaner_status >= LRU_FLUSH); + if ((page_cleaner_status-= LRU_FLUSH) < LRU_FLUSH) + pthread_cond_broadcast(&done_flush_LRU); + mysql_mutex_unlock(&flush_list_mutex); +} + +inline void buf_pool_t::n_flush_dec_holding_mutex() +{ + mysql_mutex_assert_owner(&flush_list_mutex); + ut_ad(page_cleaner_status >= LRU_FLUSH); + page_cleaner_status-= LRU_FLUSH; +} + +/** Complete write of a file page from buf_pool. 
+@param request write request +@param error whether the write may have failed */ +void buf_page_write_complete(const IORequest &request, bool error) +{ + ut_ad(request.is_write()); + ut_ad(!srv_read_only_mode); + buf_page_t *bpage= request.bpage; + ut_ad(bpage); + const auto state= bpage->state(); + /* io-fix can only be cleared by buf_page_t::write_complete() + and buf_page_t::read_complete() */ + ut_ad(state >= buf_page_t::WRITE_FIX); + ut_ad(!buf_dblwr.is_inside(bpage->id())); + ut_ad(request.node->space->id == bpage->id().space()); + + if (request.slot) + request.slot->release(); + + if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE))) + buf_page_monitor(*bpage, false); + DBUG_PRINT("ib_buf", ("write page %u:%u", + bpage->id().space(), bpage->id().page_no())); + + mysql_mutex_assert_not_owner(&buf_pool.mutex); + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + + if (request.is_LRU()) + { + const bool temp= bpage->oldest_modification() == 2; + if (!temp && state < buf_page_t::WRITE_FIX_REINIT && + request.node->space->use_doublewrite()) + buf_dblwr.write_completed(); + /* We must hold buf_pool.mutex while releasing the block, so that + no other thread can access it before we have freed it. */ + mysql_mutex_lock(&buf_pool.mutex); + bpage->write_complete(temp, error); + if (!error) + buf_LRU_free_page(bpage, true); + mysql_mutex_unlock(&buf_pool.mutex); + + buf_pool.n_flush_dec(); + } + else + { + if (state < buf_page_t::WRITE_FIX_REINIT && + request.node->space->use_doublewrite()) + buf_dblwr.write_completed(); + bpage->write_complete(false, error); + } +} + +/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page. +@param[in,out] page page to update +@param[in] size compressed page size */ +void buf_flush_update_zip_checksum(buf_frame_t *page, ulint size) +{ + ut_ad(size > 0); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + page_zip_calc_checksum(page, size, false)); +} + +/** Assign the full crc32 checksum for non-compressed page. +@param[in,out] page page to be updated */ +void buf_flush_assign_full_crc32_checksum(byte* page) +{ + ut_d(bool compressed = false); + ut_d(bool corrupted = false); + ut_d(const uint size = buf_page_full_crc32_size(page, &compressed, + &corrupted)); + ut_ad(!compressed); + ut_ad(!corrupted); + ut_ad(size == uint(srv_page_size)); + const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(page + payload, my_crc32c(0, page, payload)); +} + +/** Initialize a page for writing to the tablespace. 
+@param[in] block buffer block; NULL if bypassing + the buffer pool +@param[in,out] page page frame +@param[in,out] page_zip_ compressed page, or NULL if + uncompressed +@param[in] use_full_checksum whether tablespace uses full checksum */ +void +buf_flush_init_for_writing( + const buf_block_t* block, + byte* page, + void* page_zip_, + bool use_full_checksum) +{ + if (block && block->page.frame != page) { + /* If page is encrypted in full crc32 format then + checksum stored already as a part of fil_encrypt_buf() */ + ut_ad(use_full_checksum); + return; + } + + ut_ad(!block || block->page.frame == page); + ut_ad(page); + + if (page_zip_) { + page_zip_des_t* page_zip; + ulint size; + + page_zip = static_cast(page_zip_); + ut_ad(!block || &block->page.zip == page_zip); + size = page_zip_get_size(page_zip); + + ut_ad(size); + ut_ad(ut_is_2pow(size)); + ut_ad(size <= UNIV_ZIP_SIZE_MAX); + + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + /* These are essentially uncompressed pages. */ + memcpy(page_zip->data, page, size); + /* fall through */ + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + buf_flush_update_zip_checksum(page_zip->data, size); + return; + } + + ib::error() << "The compressed page to be written" + " seems corrupt:"; + ut_print_buf(stderr, page, size); + fputs("\nInnoDB: Possibly older version of the page:", stderr); + ut_print_buf(stderr, page_zip->data, size); + putc('\n', stderr); + ut_error; + } + + if (use_full_checksum) { + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "aligned"); + static_assert(FIL_PAGE_LSN % 4 == 0, "aligned"); + memcpy_aligned<4>(page + srv_page_size + - FIL_PAGE_FCRC32_END_LSN, + FIL_PAGE_LSN + 4 + page, 4); + return buf_flush_assign_full_crc32_checksum(page); + } + + static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 8 == 0, "aligned"); + static_assert(FIL_PAGE_LSN % 8 == 0, "aligned"); + memcpy_aligned<8>(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, + FIL_PAGE_LSN + page, 8); + + if (block && srv_page_size == 16384) { + /* The page type could be garbage in old files + created before MySQL 5.5. Such files always + had a page size of 16 kilobytes. */ + ulint page_type = fil_page_get_type(page); + ulint reset_type = page_type; + + switch (block->page.id().page_no() % 16384) { + case 0: + reset_type = block->page.id().page_no() == 0 + ? FIL_PAGE_TYPE_FSP_HDR + : FIL_PAGE_TYPE_XDES; + break; + case 1: + reset_type = FIL_PAGE_IBUF_BITMAP; + break; + case FSP_TRX_SYS_PAGE_NO: + if (block->page.id() + == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO)) { + reset_type = FIL_PAGE_TYPE_TRX_SYS; + break; + } + /* fall through */ + default: + switch (page_type) { + case FIL_PAGE_INDEX: + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_RTREE: + case FIL_PAGE_UNDO_LOG: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_FREE_LIST: + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_TYPE_SYS: + case FIL_PAGE_TYPE_TRX_SYS: + case FIL_PAGE_TYPE_BLOB: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_IBUF_BITMAP: + /* These pages should have + predetermined page numbers + (see above). 
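For full_crc32 tablespaces the rule applied above is simple: checksum everything except the trailing 4 bytes and store the result there, big-endian. A self-contained sketch, using a plain bitwise CRC-32C as a slow stand-in for my_crc32c():

```cpp
#include <cstddef>
#include <cstdint>

// Bitwise CRC-32C (Castagnoli), a stand-in for the real my_crc32c().
static uint32_t crc32c(uint32_t crc, const unsigned char* s, size_t len)
{
  crc = ~crc;
  while (len--)
  {
    crc ^= *s++;
    for (int k = 0; k < 8; k++)
      crc = (crc >> 1) ^ (0x82F63B78 & (0U - (crc & 1)));
  }
  return ~crc;
}

// Mirror of buf_flush_assign_full_crc32_checksum(): the final 4 bytes
// (FIL_PAGE_FCRC32_CHECKSUM) hold a CRC of the preceding payload.
void assign_full_crc32(unsigned char* page, size_t page_size)
{
  const size_t payload = page_size - 4;
  const uint32_t c = crc32c(0, page, payload);
  page[payload] = (unsigned char)(c >> 24);     // big-endian store,
  page[payload + 1] = (unsigned char)(c >> 16); // as mach_write_to_4() does
  page[payload + 2] = (unsigned char)(c >> 8);
  page[payload + 3] = (unsigned char)c;
}
```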
*/ + default: + reset_type = FIL_PAGE_TYPE_UNKNOWN; + break; + } + } + + if (UNIV_UNLIKELY(page_type != reset_type)) { + ib::info() + << "Resetting invalid page " + << block->page.id() << " type " + << page_type << " to " + << reset_type << " when flushing."; + fil_page_set_type(page, reset_type); + } + } + + const uint32_t checksum = buf_calc_page_crc32(page); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); + mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, + checksum); +} + +/** Reserve a buffer for compression. +@param[in,out] slot reserved slot */ +static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot) +{ + if (slot->comp_buf) + return; + /* Both Snappy and LZO compression methods require that the output + buffer be bigger than input buffer. Adjust the allocated size. */ + ulint size= srv_page_size; + if (provider_service_lzo->is_loaded) + size= LZO1X_1_15_MEM_COMPRESS; + else if (provider_service_snappy->is_loaded) + size= snappy_max_compressed_length(size); + slot->comp_buf= static_cast(aligned_malloc(size, srv_page_size)); +} + +/** Encrypt a buffer of temporary tablespace +@param[in] offset Page offset +@param[in] s Page to encrypt +@param[in,out] d Output buffer +@return encrypted buffer or NULL */ +static byte* buf_tmp_page_encrypt(ulint offset, const byte* s, byte* d) +{ + /* Calculate the start offset in a page */ + uint srclen= static_cast(srv_page_size) - + (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + + FIL_PAGE_FCRC32_CHECKSUM); + const byte* src= s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + byte* dst= d + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + + memcpy(d, s, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size), true)) + return NULL; + + const ulint payload= srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(d + payload, my_crc32c(0, d, payload)); + + srv_stats.pages_encrypted.inc(); + srv_stats.n_temp_blocks_encrypted.inc(); + return d; +} + +/** Encryption and page_compression hook that is called just before +a page is written to disk. +@param[in,out] space tablespace +@param[in,out] bpage buffer page +@param[in] s physical page frame that is being encrypted +@param[in,out] size payload size in bytes +@return page frame to be written to file +(may be src_frame or an encrypted/compressed copy of it) */ +static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s, + buf_tmp_buffer_t **slot, size_t *size) +{ + ut_ad(!bpage->is_freed()); + ut_ad(space->id == bpage->id().space()); + ut_ad(!*slot); + + const uint32_t page_no= bpage->id().page_no(); + + switch (page_no) { + case TRX_SYS_PAGE_NO: + if (bpage->id().space() != TRX_SYS_SPACE) + break; + /* The TRX_SYS page is neither encrypted nor compressed, because + it contains the address of the doublewrite buffer. 
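The switch around this point encodes a small invariant worth stating on its own: page 0 of any tablespace and the TRX_SYS page of the system tablespace are never encrypted or page-compressed, since the latter holds the doublewrite buffer address. A sketch of that predicate; the numeric constants are my assumed values of TRX_SYS_SPACE and TRX_SYS_PAGE_NO, not taken from this file:

```cpp
#include <cstdint>

constexpr uint32_t SYS_SPACE = 0;    // assumed value of TRX_SYS_SPACE
constexpr uint32_t SYS_TRX_PAGE = 5; // assumed value of TRX_SYS_PAGE_NO

bool page_may_be_transformed(uint32_t space_id, uint32_t page_no)
{
  if (page_no == 0)
    return false;                    // tablespace header page, written as-is
  if (space_id == SYS_SPACE && page_no == SYS_TRX_PAGE)
    return false;                    // holds the doublewrite buffer address
  return true;                       // eligible for encryption/compression
}
```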
*/ + /* fall through */ + case 0: + /* Page 0 of a tablespace is not encrypted/compressed */ + return s; + } + + fil_space_crypt_t *crypt_data= space->crypt_data; + bool encrypted, page_compressed; + if (space->purpose == FIL_TYPE_TEMPORARY) + { + ut_ad(!crypt_data); + encrypted= innodb_encrypt_temporary_tables; + page_compressed= false; + } + else + { + encrypted= crypt_data && !crypt_data->not_encrypted() && + crypt_data->type != CRYPT_SCHEME_UNENCRYPTED && + (!crypt_data->is_default_encryption() || srv_encrypt_tables); + page_compressed= space->is_compressed(); + } + + const bool full_crc32= space->full_crc32(); + + if (!encrypted && !page_compressed) + { + /* No need to encrypt or compress. Clear key-version & crypt-checksum. */ + static_assert(FIL_PAGE_FCRC32_KEY_VERSION % 4 == 0, "alignment"); + static_assert(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION % 4 == 2, + "not perfect alignment"); + if (full_crc32) + memset_aligned<4>(s + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4); + else + memset_aligned<2>(s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); + return s; + } + + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_LSN % 8 == 0, "alignment"); + if (full_crc32) + memcpy_aligned<4>(s + srv_page_size - FIL_PAGE_FCRC32_END_LSN, + FIL_PAGE_LSN + 4 + s, 4); + + ut_ad(!bpage->zip_size() || !page_compressed); + /* Find free slot from temporary memory array */ + *slot= buf_pool.io_buf_reserve(); + ut_a(*slot); + (*slot)->allocate(); + + byte *d= (*slot)->crypt_buf; + + if (!page_compressed) + { +not_compressed: + d= space->purpose == FIL_TYPE_TEMPORARY + ? buf_tmp_page_encrypt(page_no, s, d) + : fil_space_encrypt(space, page_no, s, d); + } + else + { + ut_ad(space->purpose != FIL_TYPE_TEMPORARY); + /* First we compress the page content */ + buf_tmp_reserve_compression_buf(*slot); + byte *tmp= (*slot)->comp_buf; + ulint len= fil_page_compress(s, tmp, space->flags, + fil_space_get_block_size(space, page_no), + encrypted); + + if (!len) + goto not_compressed; + + *size= len; + + if (full_crc32) + { + ut_d(bool compressed = false); + len= buf_page_full_crc32_size(tmp, +#ifdef UNIV_DEBUG + &compressed, +#else + NULL, +#endif + NULL); + ut_ad(compressed); + } + + /* Workaround for MDEV-15527. */ + memset(tmp + len, 0 , srv_page_size - len); + + if (encrypted) + tmp= fil_space_encrypt(space, page_no, tmp, d); + + if (full_crc32) + { + static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); + mach_write_to_4(tmp + len - 4, my_crc32c(0, tmp, len - 4)); + ut_ad(!buf_page_is_corrupted(true, tmp, space->flags)); + } + + d= tmp; + } + + (*slot)->out_buf= d; + return d; +} + +/** Free a page whose underlying file page has been freed. */ +ATTRIBUTE_COLD void buf_pool_t::release_freed_page(buf_page_t *bpage) noexcept +{ + mysql_mutex_assert_owner(&mutex); + ut_d(const lsn_t oldest_modification= bpage->oldest_modification();) + if (fsp_is_system_temporary(bpage->id().space())) + { + ut_ad(bpage->frame); + ut_ad(oldest_modification == 2); + bpage->clear_oldest_modification(); + } + else + { + mysql_mutex_lock(&flush_list_mutex); + ut_ad(oldest_modification > 2); + delete_from_flush_list(bpage); + mysql_mutex_unlock(&flush_list_mutex); + } + + bpage->lock.u_unlock(true); + buf_LRU_free_page(bpage, true); +} + +/** Write a flushable page to a file or free a freeable block. 
+@param evict whether to evict the page on write completion +@param space tablespace +@return whether a page write was initiated and buf_pool.mutex released */ +bool buf_page_t::flush(bool evict, fil_space_t *space) +{ + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + ut_ad(in_file()); + ut_ad(in_LRU_list); + ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == + (space == fil_system.temp_space)); + ut_ad(evict || space != fil_system.temp_space); + ut_ad(space->referenced()); + + const auto s= state(); + ut_a(s >= FREED); + + if (s < UNFIXED) + { + if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) + { + const lsn_t lsn= + mach_read_from_8(my_assume_aligned<8> + (FIL_PAGE_LSN + (zip.data ? zip.data : frame))); + ut_ad(lsn >= oldest_modification()); + if (lsn > log_sys.get_flushed_lsn()) + { + mysql_mutex_unlock(&buf_pool.mutex); + log_write_up_to(lsn, true); + mysql_mutex_lock(&buf_pool.mutex); + } + } + buf_pool.release_freed_page(this); + return false; + } + + ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED); + ut_ad(f >= UNFIXED); + ut_ad(f < READ_FIX); + ut_ad((space == fil_system.temp_space) + ? oldest_modification() == 2 + : oldest_modification() > 2); + + /* Increment the I/O operation count used for selecting LRU policy. */ + buf_LRU_stat_inc_io(); + mysql_mutex_unlock(&buf_pool.mutex); + + IORequest::Type type= IORequest::WRITE_ASYNC; + if (UNIV_UNLIKELY(evict)) + { + type= IORequest::WRITE_LRU; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.n_flush_inc(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } + + /* Apart from the U-lock, this block will also be protected by + is_write_fixed() and oldest_modification()>1. + Thus, it cannot be relocated or removed. */ + + DBUG_PRINT("ib_buf", ("%s %u page %u:%u", + evict ? "LRU" : "flush_list", + id().space(), id().page_no())); + + buf_block_t *block= reinterpret_cast(this); + page_t *write_frame= zip.data; + + space->reacquire(); + size_t size; +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + size_t orig_size; +#endif + buf_tmp_buffer_t *slot= nullptr; + + if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */ + { + ut_ad(!space->full_crc32()); + ut_ad(!space->is_compressed()); /* not page_compressed */ + size= zip_size(); +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + orig_size= size; +#endif + buf_flush_update_zip_checksum(write_frame, size); + write_frame= buf_page_encrypt(space, this, write_frame, &slot, &size); + ut_ad(size == zip_size()); + } + else + { + byte *page= frame; + size= block->physical_size(); +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + orig_size= size; +#endif + + if (space->full_crc32()) + { + /* innodb_checksum_algorithm=full_crc32 is not implemented for + ROW_FORMAT=COMPRESSED pages. */ + ut_ad(!write_frame); + page= buf_page_encrypt(space, this, page, &slot, &size); + buf_flush_init_for_writing(block, page, nullptr, true); + } + else + { + buf_flush_init_for_writing(block, page, write_frame ? &zip : nullptr, + false); + page= buf_page_encrypt(space, this, write_frame ? 
write_frame : page, + &slot, &size); + } + +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + if (size != orig_size) + { + switch (space->chain.start->punch_hole) { + case 1: + static_assert(IORequest::PUNCH_LRU - IORequest::PUNCH == + IORequest::WRITE_LRU - IORequest::WRITE_ASYNC, ""); + type= + IORequest::Type(type + (IORequest::PUNCH - IORequest::WRITE_ASYNC)); + break; + case 2: + size= orig_size; + } + } +#endif + write_frame= page; + } + + if ((s & LRU_MASK) == REINIT || !space->use_doublewrite()) + { + if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) + { + const lsn_t lsn= + mach_read_from_8(my_assume_aligned<8>(FIL_PAGE_LSN + + (write_frame ? write_frame + : frame))); + ut_ad(lsn >= oldest_modification()); + log_write_up_to(lsn, true); + } + space->io(IORequest{type, this, slot}, physical_offset(), size, + write_frame, this); + } + else + buf_dblwr.add_to_batch(IORequest{this, slot, space->chain.start, type}, + size); + return true; +} + +/** Check whether a page can be flushed from the buf_pool. +@param id page identifier +@param fold id.fold() +@param evict true=buf_pool.LRU; false=buf_pool.flush_list +@return whether the page can be flushed */ +static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, + bool evict) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(fold == id.fold()); + + /* FIXME: cell_get() is being invoked while holding buf_pool.mutex */ + const buf_page_t *bpage= + buf_pool.page_hash.get(id, buf_pool.page_hash.cell_get(fold)); + + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) + return false; + + /* We avoid flushing 'non-old' blocks in an eviction flush, because the + flushed blocks are soon freed */ + if (evict && !bpage->is_old()) + return false; + + return bpage->oldest_modification() > 1 && !bpage->is_io_fixed(); +} + +/** Check which neighbors of a page can be flushed from the buf_pool. +@param space tablespace +@param id page identifier of a dirty page +@param contiguous whether to consider contiguous areas of pages +@param evict true=buf_pool.LRU; false=buf_pool.flush_list +@return last page number that can be flushed */ +static page_id_t buf_flush_check_neighbors(const fil_space_t &space, + page_id_t &id, bool contiguous, + bool evict) +{ + ut_ad(id.page_no() < space.size + + (space.physical_size() == 2048 ? 1 + : space.physical_size() == 1024 ? 3 : 0)); + /* When flushed, dirty blocks are searched in neighborhoods of this + size, and flushed along with the original page. */ + const ulint s= buf_pool.curr_size / 16; + const uint32_t read_ahead= buf_pool.read_ahead_area; + const uint32_t buf_flush_area= read_ahead > s + ? static_cast(s) : read_ahead; + page_id_t low= id - (id.page_no() % buf_flush_area); + page_id_t high= low + buf_flush_area; + high.set_page_no(std::min(high.page_no(), space.last_page_number())); + + if (!contiguous) + { + high= std::max(id + 1, high); + id= low; + return high; + } + + /* Determine the contiguous dirty area around id. */ + const ulint id_fold= id.fold(); + + mysql_mutex_lock(&buf_pool.mutex); + + if (id > low) + { + ulint fold= id_fold; + for (page_id_t i= id - 1;; --i) + { + fold--; + if (!buf_flush_check_neighbor(i, fold, evict)) + { + low= i + 1; + break; + } + if (i == low) + break; + } + } + + page_id_t i= id; + id= low; + ulint fold= id_fold; + while (++i < high) + { + ++fold; + if (!buf_flush_check_neighbor(i, fold, evict)) + break; + } + + mysql_mutex_unlock(&buf_pool.mutex); + return i; +} + +MY_ATTRIBUTE((warn_unused_result)) +/** Apply freed_ranges to the file. 
+@param writable whether the file is writable +@return number of pages written or hole-punched */ +uint32_t fil_space_t::flush_freed(bool writable) +{ + const bool punch_hole= chain.start->punch_hole == 1; + if (!punch_hole && !srv_immediate_scrub_data_uncompressed) + return 0; + + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + mysql_mutex_assert_not_owner(&buf_pool.mutex); + + for (;;) + { + freed_range_mutex.lock(); + if (freed_ranges.empty()) + { + freed_range_mutex.unlock(); + return 0; + } + const lsn_t flush_lsn= last_freed_lsn; + if (log_sys.get_flushed_lsn() >= flush_lsn) + break; + freed_range_mutex.unlock(); + log_write_up_to(flush_lsn, true); + } + + const unsigned physical{physical_size()}; + + range_set freed= std::move(freed_ranges); + uint32_t written= 0; + + if (!writable); + else if (punch_hole) + { + for (const auto &range : freed) + { + written+= range.last - range.first + 1; + reacquire(); + io(IORequest(IORequest::PUNCH_RANGE), + os_offset_t{range.first} * physical, + (range.last - range.first + 1) * physical, nullptr); + } + } + else + { + for (const auto &range : freed) + { + written+= range.last - range.first + 1; + for (os_offset_t i= range.first; i <= range.last; i++) + { + reacquire(); + io(IORequest(IORequest::WRITE_ASYNC), i * physical, physical, + const_cast(field_ref_zero)); + } + } + } + + freed_range_mutex.unlock(); + return written; +} + +/** Flushes to disk all flushable pages within the flush area +and also write zeroes or punch the hole for the freed ranges of pages. +@param space tablespace +@param page_id page identifier +@param bpage buffer page +@param contiguous whether to consider contiguous areas of pages +@param evict true=buf_pool.LRU; false=buf_pool.flush_list +@param n_flushed number of pages flushed so far in this batch +@param n_to_flush maximum number of pages we are allowed to flush +@return number of pages flushed */ +static ulint buf_flush_try_neighbors(fil_space_t *space, + const page_id_t page_id, + buf_page_t *bpage, + bool contiguous, bool evict, + ulint n_flushed, ulint n_to_flush) +{ + mysql_mutex_unlock(&buf_pool.mutex); + + ut_ad(space->id == page_id.space()); + ut_ad(bpage->id() == page_id); + + ulint count= 0; + page_id_t id= page_id; + page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict); + + ut_ad(page_id >= id); + ut_ad(page_id < high); + + for (ulint id_fold= id.fold(); id < high; ++id, ++id_fold) + { + if (UNIV_UNLIKELY(space->is_stopping_writes())) + { + if (bpage) + bpage->lock.u_unlock(true); + break; + } + + if (count + n_flushed >= n_to_flush) + { + if (id > page_id) + break; + /* If the page whose neighbors we are flushing has not been + flushed yet, we must flush the page that we selected originally. 
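flush_freed() above turns each contiguous range of freed pages into a byte interval and either punches one hole per range or overwrites the pages one by one with zeroes. The offset arithmetic, reduced to a sketch with simplified types, where the prints stand in for the io() calls:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct range { uint32_t first, last; }; // inclusive page-number range

uint32_t flush_freed_ranges(const std::vector<range>& freed,
                            uint64_t physical_size, bool punch_hole)
{
  uint32_t written = 0;
  for (const range& r : freed)
  {
    const uint32_t n_pages = r.last - r.first + 1;
    written += n_pages;
    if (punch_hole)
      // one deallocation call covering the whole contiguous interval
      printf("punch off=%llu len=%llu\n",
             (unsigned long long)(r.first * physical_size),
             (unsigned long long)(n_pages * physical_size));
    else
      for (uint32_t i = r.first; i <= r.last; i++)
        // zero-fill one page at a time, as the WRITE_ASYNC branch does
        printf("zero  off=%llu len=%llu\n",
               (unsigned long long)(i * physical_size),
               (unsigned long long)physical_size);
  }
  return written; // number of pages written or hole-punched
}
```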
*/ + id= page_id; + id_fold= id.fold(); + } + + const buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id_fold); + mysql_mutex_lock(&buf_pool.mutex); + + if (buf_page_t *b= buf_pool.page_hash.get(id, chain)) + { + ut_ad(b->in_file()); + if (id == page_id) + { + ut_ad(bpage == b); + bpage= nullptr; + ut_ad(!buf_pool.watch_is_sentinel(*b)); + ut_ad(b->oldest_modification() > 1); + flush: + if (b->flush(evict, space)) + { + ++count; + continue; + } + } + /* We avoid flushing 'non-old' blocks in an eviction flush, + because the flushed blocks are soon freed */ + else if ((!evict || b->is_old()) && !buf_pool.watch_is_sentinel(*b) && + b->oldest_modification() > 1 && b->lock.u_lock_try(true)) + { + if (b->oldest_modification() < 2) + b->lock.u_unlock(true); + else + goto flush; + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + } + + if (count > 1) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_COUNT, + MONITOR_FLUSH_NEIGHBOR_PAGES, count - 1); + } + + return count; +} + +/*******************************************************************//** +This utility moves the uncompressed frames of pages to the free list. +Note that this function does not actually flush any data to disk. It +just detaches the uncompressed frames from the compressed pages at the +tail of the unzip_LRU and puts those freed frames in the free list. +@return number of blocks moved to the free list. */ +static ulint buf_free_from_unzip_LRU_list_batch() +{ + ulint scanned = 0; + ulint count = 0; + + mysql_mutex_assert_owner(&buf_pool.mutex); + + buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + + while (block + && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth + && UT_LIST_GET_LEN(buf_pool.unzip_LRU) + > UT_LIST_GET_LEN(buf_pool.LRU) / 10) { + + ++scanned; + if (buf_LRU_free_page(&block->page, false)) { + /* Block was freed. buf_pool.mutex potentially + released and reacquired */ + ++count; + block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + } else { + block = UT_LIST_GET_PREV(unzip_LRU, block); + } + } + + mysql_mutex_assert_owner(&buf_pool.mutex); + + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + scanned); + } + + return(count); +} + +/** Acquire a tablespace reference for writing. +@param id tablespace identifier +@return tablespace +@retval nullptr if the tablespace is missing or inaccessible */ +fil_space_t *fil_space_t::get_for_write(uint32_t id) +{ + mysql_mutex_lock(&fil_system.mutex); + fil_space_t *space= fil_space_get_by_id(id); + const uint32_t n= space ? space->acquire_low(STOPPING_WRITES) : 0; + + if (n & STOPPING_WRITES) + space= nullptr; + else if ((n & CLOSING) && !space->prepare_acquired()) + space= nullptr; + + mysql_mutex_unlock(&fil_system.mutex); + return space; +} + +/** Start writing out pages for a tablespace. +@param id tablespace identifier +@return tablespace and number of pages written */ +static std::pair buf_flush_space(const uint32_t id) +{ + if (fil_space_t *space= fil_space_t::get_for_write(id)) + return {space, space->flush_freed(true)}; + return {nullptr, 0}; +} + +struct flush_counters_t +{ + /** number of dirty pages flushed */ + ulint flushed; + /** number of clean pages evicted */ + ulint evicted; +}; + +/** Discard a dirty page, and release buf_pool.flush_list_mutex. 
+@param bpage dirty page whose tablespace is not accessible */ +static void buf_flush_discard_page(buf_page_t *bpage) +{ + ut_ad(bpage->in_file()); + ut_ad(bpage->oldest_modification()); + + buf_pool.delete_from_flush_list(bpage); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + ut_d(const auto state= bpage->state()); + ut_ad(state == buf_page_t::FREED || state == buf_page_t::UNFIXED || + state == buf_page_t::IBUF_EXIST || state == buf_page_t::REINIT); + bpage->lock.u_unlock(true); + buf_LRU_free_page(bpage, true); +} + +/** Flush dirty blocks from the end buf_pool.LRU, +and move clean blocks to buf_pool.free. +@param max maximum number of blocks to flush +@param evict whether dirty pages are to be evicted after flushing them +@param n counts of flushed and evicted pages */ +static void buf_flush_LRU_list_batch(ulint max, bool evict, + flush_counters_t *n) +{ + ulint scanned= 0; + ulint free_limit= srv_LRU_scan_depth; + + mysql_mutex_assert_owner(&buf_pool.mutex); + if (buf_pool.withdraw_target && buf_pool.is_shrinking()) + free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw); + + const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN + ? 0 : srv_flush_neighbors; + fil_space_t *space= nullptr; + uint32_t last_space_id= FIL_NULL; + static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency"); + static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency"); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); + bpage && + ((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN && + UT_LIST_GET_LEN(buf_pool.free) < free_limit) || + recv_recovery_is_on()); + ++scanned, bpage= buf_pool.lru_hp.get()) + { + buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); + buf_pool.lru_hp.set(prev); + auto state= bpage->state(); + ut_ad(state >= buf_page_t::FREED); + ut_ad(bpage->in_LRU_list); + + if (!bpage->oldest_modification()) + { + evict: + if (state != buf_page_t::FREED && + (state >= buf_page_t::READ_FIX || (~buf_page_t::LRU_MASK & state))) + continue; + buf_LRU_free_page(bpage, true); + ++n->evicted; + if (UNIV_LIKELY(scanned & 31)) + continue; + mysql_mutex_unlock(&buf_pool.mutex); + reacquire_mutex: + mysql_mutex_lock(&buf_pool.mutex); + continue; + } + + if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true)) + { + ut_ad(!bpage->is_io_fixed()); + bool do_evict= evict; + switch (bpage->oldest_modification()) { + case 1: + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (ut_d(lsn_t lsn=) bpage->oldest_modification()) + { + ut_ad(lsn == 1); /* It must be clean while we hold bpage->lock */ + buf_pool.delete_from_flush_list(bpage); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + /* fall through */ + case 0: + bpage->lock.u_unlock(true); + goto evict; + case 2: + /* LRU flushing will always evict pages of the temporary tablespace. */ + do_evict= true; + } + /* Block is ready for flush. Dispatch an IO request. + If do_evict, the page may be evicted by buf_page_write_complete(). 
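The switch on oldest_modification() above relies on reserved small values rather than real LSNs. As used throughout this file (my summary of the surrounding assertions, e.g. `lsn == 1 || lsn > 2` and the `== 2` temporary-tablespace checks), the encoding can be sketched as:

```cpp
#include <cstdint>

// Classifier for buf_page_t::oldest_modification() sentinel values:
//   0  - clean page, not on buf_pool.flush_list
//   1  - write completed; still linked, pending detach from flush_list
//   2  - dirty page of the temporary tablespace (not LSN-ordered)
//  >2  - oldest LSN at which a persistent page was modified
enum class dirty_state { clean, detaching, dirty_temporary, dirty_persistent };

dirty_state classify(uint64_t oldest_modification) // lsn_t in InnoDB
{
  switch (oldest_modification)
  {
  case 0: return dirty_state::clean;
  case 1: return dirty_state::detaching;
  case 2: return dirty_state::dirty_temporary;
  default: return dirty_state::dirty_persistent;
  }
}
```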
*/ + const page_id_t page_id(bpage->id()); + const uint32_t space_id= page_id.space(); + if (!space || space->id != space_id) + { + if (last_space_id != space_id) + { + buf_pool.lru_hp.set(bpage); + mysql_mutex_unlock(&buf_pool.mutex); + if (space) + space->release(); + auto p= buf_flush_space(space_id); + space= p.first; + last_space_id= space_id; + if (!space) + { + mysql_mutex_lock(&buf_pool.mutex); + goto no_space; + } + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.stat.n_pages_written+= p.second; + } + else + { + ut_ad(!space); + goto no_space; + } + } + else if (space->is_stopping_writes()) + { + space->release(); + space= nullptr; + no_space: + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_discard_page(bpage); + continue; + } + + if (n->flushed >= max && !recv_recovery_is_on()) + { + bpage->lock.u_unlock(true); + break; + } + + if (neighbors && space->is_rotational()) + n->flushed+= buf_flush_try_neighbors(space, page_id, bpage, + neighbors == 1, + do_evict, n->flushed, max); + else if (bpage->flush(do_evict, space)) + ++n->flushed; + else + continue; + + goto reacquire_mutex; + } + else + /* Can't evict or dispatch this block. Go to previous. */ + ut_ad(buf_pool.lru_hp.is_hp(prev)); + } + + buf_pool.lru_hp.set(nullptr); + + if (space) + space->release(); + + if (scanned) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + scanned); +} + +/** Flush and move pages from LRU or unzip_LRU list to the free list. +Whether LRU or unzip_LRU is used depends on the state of the system. +@param max maximum number of blocks to flush +@param evict whether dirty pages are to be evicted after flushing them +@param n counts of flushed and evicted pages */ +static void buf_do_LRU_batch(ulint max, bool evict, flush_counters_t *n) +{ + if (buf_LRU_evict_from_unzip_LRU()) + buf_free_from_unzip_LRU_list_batch(); + n->evicted= 0; + n->flushed= 0; + buf_flush_LRU_list_batch(max, evict, n); + + mysql_mutex_assert_owner(&buf_pool.mutex); + buf_lru_freed_page_count+= n->evicted; + buf_lru_flush_page_count+= n->flushed; + buf_pool.stat.n_pages_written+= n->flushed; +} + +/** This utility flushes dirty blocks from the end of the flush_list. +The calling thread is not allowed to own any latches on pages! +@param max_n maximum mumber of blocks to flush +@param lsn once an oldest_modification>=lsn is found, terminate the batch +@return number of blocks for which the write request was queued */ +static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) +{ + ulint count= 0; + ulint scanned= 0; + + mysql_mutex_assert_owner(&buf_pool.mutex); + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + + const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN + ? 0 : srv_flush_neighbors; + fil_space_t *space= nullptr; + uint32_t last_space_id= FIL_NULL; + static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency"); + static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency"); + + /* Start from the end of the list looking for a suitable block to be + flushed. 
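Both batch loops protect their scan position with a hazard pointer (buf_pool.lru_hp above, buf_pool.flush_hp below): before buf_pool.mutex is released, the next node is parked in a shared slot, and every code path that removes a node must redirect that slot so the scan never follows a dangling link. A minimal sketch of the protocol; illustrative only, InnoDB's real HazardPointer class is more elaborate, and all accesses are assumed to happen under the list's mutex:

```cpp
struct node { node* prev; };

struct hazard_pointer
{
  node* hp = nullptr;
  void set(node* n) { hp = n; }  // park the resume position
  node* get() const { return hp; }
  void adjust(node* removed)     // every remover must call this
  {
    if (hp == removed)
      hp = removed->prev;        // skip over the node being removed
  }
};
```

This is what keeps the scans O(n): after the mutex is reacquired, get() yields a node that is guaranteed to still be on the list, so the loop resumes instead of restarting from the tail.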
*/ + ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); + bpage && len && count < max_n; ++scanned, len--) + { + const lsn_t oldest_modification= bpage->oldest_modification(); + if (oldest_modification >= lsn) + break; + ut_ad(bpage->in_file()); + + { + buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); + + if (oldest_modification == 1) + { + clear: + buf_pool.delete_from_flush_list(bpage); + skip: + bpage= prev; + continue; + } + + ut_ad(oldest_modification > 2); + + if (!bpage->lock.u_lock_try(true)) + goto skip; + + ut_ad(!bpage->is_io_fixed()); + + if (bpage->oldest_modification() == 1) + { + bpage->lock.u_unlock(true); + goto clear; + } + + /* In order not to degenerate this scan to O(n*n) we attempt to + preserve the pointer position. Any thread that would remove 'prev' + from buf_pool.flush_list must adjust the hazard pointer. + + Note: A concurrent execution of buf_flush_list_space() may + terminate this scan prematurely. The buf_pool.flush_list_active + should prevent multiple threads from executing + buf_do_flush_list_batch() concurrently, + but buf_flush_list_space() is ignoring that. */ + buf_pool.flush_hp.set(prev); + } + + const page_id_t page_id(bpage->id()); + const uint32_t space_id= page_id.space(); + if (!space || space->id != space_id) + { + if (last_space_id != space_id) + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_unlock(&buf_pool.mutex); + if (space) + space->release(); + auto p= buf_flush_space(space_id); + space= p.first; + last_space_id= space_id; + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.stat.n_pages_written+= p.second; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + else + ut_ad(!space); + } + else if (space->is_stopping_writes()) + { + space->release(); + space= nullptr; + } + + if (!space) + buf_flush_discard_page(bpage); + else + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + do + { + if (neighbors && space->is_rotational()) + count+= buf_flush_try_neighbors(space, page_id, bpage, + neighbors == 1, false, count, max_n); + else if (bpage->flush(false, space)) + ++count; + else + continue; + mysql_mutex_lock(&buf_pool.mutex); + } + while (0); + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + bpage= buf_pool.flush_hp.get(); + } + + buf_pool.flush_hp.set(nullptr); + + if (space) + space->release(); + + if (scanned) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, + MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, + scanned); + return count; +} + +/** Wait until a LRU flush batch ends. */ +void buf_flush_wait_LRU_batch_end() +{ + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + mysql_mutex_assert_not_owner(&buf_pool.mutex); + + if (buf_pool.n_flush()) + { + tpool::tpool_wait_begin(); + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + do + my_cond_wait(&buf_pool.done_flush_LRU, + &buf_pool.flush_list_mutex.m_mutex); + while (buf_pool.n_flush()); + tpool::tpool_wait_end(); + thd_wait_end(nullptr); + } +} + +/** Write out dirty blocks from buf_pool.flush_list. +The caller must invoke buf_dblwr.flush_buffered_writes() +after releasing buf_pool.mutex. 
+@param max_n wished maximum mumber of blocks flushed +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target +@return the number of processed pages +@retval 0 if a buf_pool.flush_list batch is already running */ +static ulint buf_flush_list_holding_mutex(ulint max_n= ULINT_UNDEFINED, + lsn_t lsn= LSN_MAX) +{ + ut_ad(lsn); + mysql_mutex_assert_owner(&buf_pool.mutex); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (buf_pool.flush_list_active()) + { +nothing_to_do: + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + return 0; + } + if (!buf_pool.get_oldest_modification(0)) + { + pthread_cond_broadcast(&buf_pool.done_flush_list); + goto nothing_to_do; + } + buf_pool.flush_list_set_active(); + const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn); + if (n_flushed) + buf_pool.stat.n_pages_written+= n_flushed; + buf_pool.flush_list_set_inactive(); + pthread_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (n_flushed) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + n_flushed); + + DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed)); + return n_flushed; +} + +/** Write out dirty blocks from buf_pool.flush_list. +@param max_n wished maximum mumber of blocks flushed +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target +@return the number of processed pages +@retval 0 if a buf_pool.flush_list batch is already running */ +static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, + lsn_t lsn= LSN_MAX) +{ + mysql_mutex_lock(&buf_pool.mutex); + ulint n= buf_flush_list_holding_mutex(max_n, lsn); + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + return n; +} + +/** Try to flush all the dirty pages that belong to a given tablespace. +@param space tablespace +@param n_flushed number of pages written +@return whether the flush for some pages might not have been initiated */ +bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) +{ + const auto space_id= space->id; + ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND); + + bool may_have_skipped= false; + ulint max_n_flush= srv_io_capacity; + ulint n_flush= 0; + + bool acquired= space->acquire_for_write(); + { + const uint32_t written{space->flush_freed(acquired)}; + mysql_mutex_lock(&buf_pool.mutex); + if (written) + buf_pool.stat.n_pages_written+= written; + } + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; ) + { + ut_ad(bpage->oldest_modification()); + ut_ad(bpage->in_file()); + + buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); + if (bpage->oldest_modification() == 1) + clear: + buf_pool.delete_from_flush_list(bpage); + else if (bpage->id().space() != space_id); + else if (!bpage->lock.u_lock_try(true)) + may_have_skipped= true; + else if (bpage->oldest_modification() == 1) + { + bpage->lock.u_unlock(true); + goto clear; + } + else + { + /* In order not to degenerate this scan to O(n*n) we attempt to + preserve the pointer position. Any thread that would remove 'prev' + from buf_pool.flush_list must adjust the hazard pointer. + + Note: Multiple executions of buf_flush_list_space() may be + interleaved, and also buf_do_flush_list_batch() may be running + concurrently. This may terminate our iteration prematurely, + leading us to return may_have_skipped=true. 
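Given that return contract, a hypothetical caller that must guarantee every dirty page of a tablespace was submitted would simply retry until buf_flush_list_space() reports no skips. This is a usage sketch under that assumption, not a call site from this patch; real callers also wait for pending writes:

```cpp
void flush_space_fully(fil_space_t* space)
{
  ulint n_flushed;
  while (buf_flush_list_space(space, &n_flushed))
  {
    /* Another thread interfered with the flush-list scan;
    rescan until every dirty page of the space was processed. */
  }
}
```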
*/ + buf_pool.flush_hp.set(prev); + + if (!acquired) + was_freed: + buf_flush_discard_page(bpage); + else + { + if (space->is_stopping_writes()) + { + space->release(); + acquired= false; + goto was_freed; + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (bpage->flush(false, space)) + { + ++n_flush; + if (!--max_n_flush) + { + mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + may_have_skipped= true; + goto done; + } + mysql_mutex_lock(&buf_pool.mutex); + } + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (!buf_pool.flush_hp.is_hp(prev)) + may_have_skipped= true; + bpage= buf_pool.flush_hp.get(); + continue; + } + + bpage= prev; + } + + /* Note: this loop may have been executed concurrently with + buf_do_flush_list_batch() as well as other threads executing + buf_flush_list_space(). We should always return true from + buf_flush_list_space() if that should be the case; in + buf_do_flush_list_batch() we will simply perform less work. */ +done: + buf_pool.flush_hp.set(nullptr); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + buf_pool.stat.n_pages_written+= n_flush; + + buf_pool.try_LRU_scan= true; + pthread_cond_broadcast(&buf_pool.done_free); + mysql_mutex_unlock(&buf_pool.mutex); + + if (n_flushed) + *n_flushed= n_flush; + + if (acquired) + space->release(); + + if (space->purpose == FIL_TYPE_IMPORT) + os_aio_wait_until_no_pending_writes(true); + else + buf_dblwr.flush_buffered_writes(); + + return may_have_skipped; +} + +/** Write out dirty blocks from buf_pool.LRU, +and move clean blocks to buf_pool.free. +The caller must invoke buf_dblwr.flush_buffered_writes() +after releasing buf_pool.mutex. +@param max_n wished maximum mumber of blocks flushed +@param evict whether to evict pages after flushing +@return evict ? number of processed pages : number of pages written */ +ulint buf_flush_LRU(ulint max_n, bool evict) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + flush_counters_t n; + buf_do_LRU_batch(max_n, evict, &n); + + ulint pages= n.flushed; + + if (n.evicted) + { + if (evict) + pages+= n.evicted; + buf_pool.try_LRU_scan= true; + pthread_cond_broadcast(&buf_pool.done_free); + } + + return pages; +} + +#ifdef HAVE_PMEM +# include +#endif + +/** Write checkpoint information to the log header and release mutex. +@param end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ +inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept +{ + ut_ad(!srv_read_only_mode); + ut_ad(end_lsn >= next_checkpoint_lsn); + ut_ad(end_lsn <= get_lsn()); + ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= get_lsn() || + srv_shutdown_state > SRV_SHUTDOWN_INITIATED); + + DBUG_PRINT("ib_log", + ("checkpoint at " LSN_PF " written", next_checkpoint_lsn)); + + auto n= next_checkpoint_no; + const size_t offset{(n & 1) ? CHECKPOINT_2 : CHECKPOINT_1}; + static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "efficiency"); + static_assert(CPU_LEVEL1_DCACHE_LINESIZE <= 4096, "compatibility"); + byte* c= my_assume_aligned + (is_pmem() ? 
buf + offset : checkpoint_buf); + memset_aligned(c, 0, CPU_LEVEL1_DCACHE_LINESIZE); + mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn); + mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn); + mach_write_to_4(my_assume_aligned<4>(c + 60), my_crc32c(0, c, 60)); + + lsn_t resizing; + +#ifdef HAVE_PMEM + if (is_pmem()) + { + resizing= resize_lsn.load(std::memory_order_relaxed); + + if (resizing > 1 && resizing <= next_checkpoint_lsn) + { + memcpy_aligned<64>(resize_buf + CHECKPOINT_1, c, 64); + header_write(resize_buf, resizing, is_encrypted()); + pmem_persist(resize_buf, resize_target); + } + pmem_persist(c, 64); + } + else +#endif + { + ut_ad(!checkpoint_pending); + checkpoint_pending= true; + latch.wr_unlock(); + log_write_and_flush_prepare(); + resizing= resize_lsn.load(std::memory_order_relaxed); + /* FIXME: issue an asynchronous write */ + log.write(offset, {c, get_block_size()}); + if (resizing > 1 && resizing <= next_checkpoint_lsn) + { + byte *buf= static_cast(aligned_malloc(4096, 4096)); + memset_aligned<4096>(buf, 0, 4096); + header_write(buf, resizing, is_encrypted()); + resize_log.write(0, {buf, 4096}); + aligned_free(buf); + resize_log.write(CHECKPOINT_1, {c, get_block_size()}); + } + + if (srv_file_flush_method != SRV_O_DSYNC) + ut_a(log.flush()); + latch.wr_lock(SRW_LOCK_CALL); + ut_ad(checkpoint_pending); + checkpoint_pending= false; + resizing= resize_lsn.load(std::memory_order_relaxed); + } + + ut_ad(!checkpoint_pending); + next_checkpoint_no++; + const lsn_t checkpoint_lsn{next_checkpoint_lsn}; + last_checkpoint_lsn= checkpoint_lsn; + + DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF ", flushed to " LSN_PF, + checkpoint_lsn, get_flushed_lsn())); + if (overwrite_warned) + { + sql_print_information("InnoDB: Crash recovery was broken " + "between LSN=" LSN_PF + " and checkpoint LSN=" LSN_PF ".", + overwrite_warned, checkpoint_lsn); + overwrite_warned= 0; + } + + lsn_t resizing_completed= 0; + + if (resizing > 1 && resizing <= checkpoint_lsn) + { + ut_ad(is_pmem() == !resize_flush_buf); + + if (!is_pmem()) + { + if (srv_file_flush_method != SRV_O_DSYNC) + ut_a(resize_log.flush()); + IF_WIN(log.close(),); + } + + if (resize_rename()) + { + /* Resizing failed. Discard the log_sys.resize_log. */ +#ifdef HAVE_PMEM + if (is_pmem()) + my_munmap(resize_buf, resize_target); + else +#endif + { + ut_free_dodump(resize_buf, buf_size); + ut_free_dodump(resize_flush_buf, buf_size); +#ifdef _WIN32 + ut_ad(!log.is_opened()); + bool success; + log.m_file= + os_file_create_func(get_log_file_path().c_str(), + OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_NORMAL, OS_LOG_FILE, false, &success); + ut_a(success); + ut_a(log.is_opened()); +#endif + } + } + else + { + /* Adopt the resized log. 
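The checkpoint record that write_checkpoint() prepares above is a 64-byte block: the checkpoint LSN at offset 0, the end LSN at offset 8, zero padding, and a CRC of the first 60 bytes at offset 60. A sketch of that layout; crc32c() is assumed to behave like my_crc32c() (the bitwise stand-in shown earlier would do), and stores are big-endian as in mach_write_to_8()/mach_write_to_4():

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

uint32_t crc32c(uint32_t crc, const unsigned char* s, size_t len); // assumed

static void store_be(unsigned char* p, uint64_t v, int n)
{
  for (int i = n; i--; v >>= 8)
    p[i] = (unsigned char)v; // most significant byte first
}

void make_checkpoint_block(unsigned char block[64],
                           uint64_t checkpoint_lsn, uint64_t end_lsn)
{
  memset(block, 0, 64);
  store_be(block, checkpoint_lsn, 8);            // offset 0: checkpoint LSN
  store_be(block + 8, end_lsn, 8);               // offset 8: end LSN
  store_be(block + 60, crc32c(0, block, 60), 4); // offset 60: CRC of bytes 0..59
}
```

The block lands at CHECKPOINT_1 or CHECKPOINT_2 depending on the low bit of the checkpoint number, which is what makes the two checkpoint slots alternate.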
*/ +#ifdef HAVE_PMEM + if (is_pmem()) + { + my_munmap(buf, file_size); + buf= resize_buf; + buf_free= START_OFFSET + (get_lsn() - resizing); + } + else +#endif + { + IF_WIN(,log.close()); + std::swap(log, resize_log); + ut_free_dodump(buf, buf_size); + ut_free_dodump(flush_buf, buf_size); + buf= resize_buf; + flush_buf= resize_flush_buf; + } + srv_log_file_size= resizing_completed= file_size= resize_target; + first_lsn= resizing; + set_capacity(); + } + ut_ad(!resize_log.is_opened()); + resize_buf= nullptr; + resize_flush_buf= nullptr; + resize_target= 0; + resize_lsn.store(0, std::memory_order_relaxed); + } + + log_resize_release(); + + if (UNIV_LIKELY(resizing <= 1)); + else if (resizing > checkpoint_lsn) + buf_flush_ahead(resizing, false); + else if (resizing_completed) + ib::info() << "Resized log to " << ib::bytes_iec{resizing_completed} + << "; start LSN=" << resizing; + else + buf_flush_ahead(end_lsn + 1, false); +} + +/** Initiate a log checkpoint, discarding the start of the log. +@param oldest_lsn the checkpoint LSN +@param end_lsn log_sys.get_lsn() +@return true if success, false if a checkpoint write was already running */ +static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) +{ + ut_ad(!srv_read_only_mode); +#ifndef SUX_LOCK_GENERIC + ut_ad(log_sys.latch.is_write_locked()); +#endif + ut_ad(oldest_lsn <= end_lsn); + ut_ad(end_lsn == log_sys.get_lsn()); + + if (oldest_lsn == log_sys.last_checkpoint_lsn || + (oldest_lsn == end_lsn && + !log_sys.resize_in_progress() && + oldest_lsn == log_sys.last_checkpoint_lsn + + (log_sys.is_encrypted() + ? SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT))) + { + /* Do nothing, because nothing was logged (other than a + FILE_CHECKPOINT record) since the previous checkpoint. */ + do_nothing: + log_sys.latch.wr_unlock(); + return true; + } + + ut_ad(!recv_no_log_write); + ut_ad(oldest_lsn > log_sys.last_checkpoint_lsn); + /* Repeat the FILE_MODIFY records after the checkpoint, in case some + log records between the checkpoint and log_sys.lsn need them. + Finally, write a FILE_CHECKPOINT record. Redo log apply expects to + see a FILE_CHECKPOINT after the checkpoint, except on clean + shutdown, where the log will be empty after the checkpoint. + + It is important that we write out the redo log before any further + dirty pages are flushed to the tablespace files. At this point, + because we hold exclusive log_sys.latch, + mtr_t::commit() in other threads will be blocked, + and no pages can be added to buf_pool.flush_list. */ + const lsn_t flush_lsn{fil_names_clear(oldest_lsn)}; + ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT); + log_sys.latch.wr_unlock(); + log_write_up_to(flush_lsn, true); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + if (log_sys.last_checkpoint_lsn >= oldest_lsn) + goto do_nothing; + + ut_ad(log_sys.get_flushed_lsn() >= flush_lsn); + + if (log_sys.checkpoint_pending) + { + /* A checkpoint write is running */ + log_sys.latch.wr_unlock(); + return false; + } + + log_sys.next_checkpoint_lsn= oldest_lsn; + log_sys.write_checkpoint(end_lsn); + + return true; +} + +/** Make a checkpoint. Note that this function does not flush dirty +blocks from the buffer pool: it only checks what is lsn of the oldest +modification in the pool, and writes information about the lsn in +log file. Use log_make_checkpoint() to flush also the pool. 
+@retval true if the checkpoint was or had been made +@retval false if a checkpoint write was already running */ +static bool log_checkpoint() +{ + if (recv_recovery_is_on()) + recv_sys.apply(true); + + switch (srv_file_flush_method) { + case SRV_NOSYNC: + case SRV_O_DIRECT_NO_FSYNC: + break; + default: + fil_flush_file_spaces(); + } + + log_sys.latch.wr_lock(SRW_LOCK_CALL); + const lsn_t end_lsn= log_sys.get_lsn(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + return log_checkpoint_low(oldest_lsn, end_lsn); +} + +/** Make a checkpoint. */ +ATTRIBUTE_COLD void log_make_checkpoint() +{ + buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire)); + while (!log_checkpoint()); +} + +/** Wait for all dirty pages up to an LSN to be written out. +NOTE: The calling thread is not allowed to hold any buffer page latches! */ +static void buf_flush_wait(lsn_t lsn) +{ + ut_ad(lsn <= log_sys.get_lsn()); + + lsn_t oldest_lsn; + + while ((oldest_lsn= buf_pool.get_oldest_modification(lsn)) < lsn) + { + if (buf_flush_sync_lsn < lsn) + { + buf_flush_sync_lsn= lsn; + buf_pool.page_cleaner_set_idle(false); + pthread_cond_signal(&buf_pool.do_flush_list); + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + oldest_lsn= buf_pool.get_oldest_modification(lsn); + if (oldest_lsn >= lsn) + break; + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + os_aio_wait_until_no_pending_writes(false); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + + if (oldest_lsn >= buf_flush_sync_lsn) + { + buf_flush_sync_lsn= 0; + pthread_cond_broadcast(&buf_pool.done_flush_list); + } +} + +/** Wait until all persistent pages are flushed up to a limit. +@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */ +ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) +{ + ut_ad(sync_lsn); + ut_ad(sync_lsn < LSN_MAX); + ut_ad(!srv_read_only_mode); + + if (recv_recovery_is_on()) + recv_sys.apply(true); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn) + { + MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); + +#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */ + if (UNIV_UNLIKELY(!buf_page_cleaner_is_active)) + { + do + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn); + if (n_pages) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, n_pages); + } + os_aio_wait_until_no_pending_writes(false); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn); + } + else +#endif + { + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + tpool::tpool_wait_begin(); + buf_flush_wait(sync_lsn); + tpool::tpool_wait_end(); + thd_wait_end(nullptr); + } + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn)) + { + /* If the buffer pool was clean, no log write was guaranteed + to happen until now. There could be an outstanding FILE_CHECKPOINT + record from a previous fil_names_clear() call, which we must + write out before we can advance the checkpoint. */ + log_write_up_to(sync_lsn, true); + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;); + log_checkpoint(); + } +} + +/** Initiate more eager page flushing if the log checkpoint age is too old. 
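+For orientation, a hedged sketch of the intended call pattern
+(hypothetical caller):
+@code
+  // Ask the page cleaner to advance the oldest modification past lsn,
+  // throttled by innodb_io_capacity:
+  buf_flush_ahead(lsn, false);
+  // ...or without a throttle when the log is about to overflow:
+  buf_flush_ahead(lsn, true);
+@endcode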
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
+{
+  ut_ad(!srv_read_only_mode);
+
+  if (recv_recovery_is_on())
+    recv_sys.apply(true);
+
+  DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;);
+
+  Atomic_relaxed<lsn_t> &limit= furious
+    ? buf_flush_sync_lsn : buf_flush_async_lsn;
+
+  if (limit < lsn)
+  {
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    if (limit < lsn)
+    {
+      limit= lsn;
+      buf_pool.page_cleaner_set_idle(false);
+      pthread_cond_signal(&buf_pool.do_flush_list);
+    }
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  }
+}
+
+/** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
+and try to initiate checkpoints until the target is met.
+@param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
+ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
+{
+  ut_ad(!srv_read_only_mode);
+  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+
+  for (;;)
+  {
+    if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
+    {
+      MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+                                   MONITOR_FLUSH_SYNC_COUNT,
+                                   MONITOR_FLUSH_SYNC_PAGES, n_flushed);
+    }
+
+    switch (srv_file_flush_method) {
+    case SRV_NOSYNC:
+    case SRV_O_DIRECT_NO_FSYNC:
+      break;
+    default:
+      fil_flush_file_spaces();
+    }
+
+    log_sys.latch.wr_lock(SRW_LOCK_CALL);
+    const lsn_t newest_lsn= log_sys.get_lsn();
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    lsn_t measure= buf_pool.get_oldest_modification(0);
+    const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
+
+    if (!recv_recovery_is_on() &&
+        checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+    {
+      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+      log_checkpoint_low(checkpoint_lsn, newest_lsn);
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      measure= buf_pool.get_oldest_modification(LSN_MAX);
+    }
+    else
+    {
+      log_sys.latch.wr_unlock();
+      if (!measure)
+        measure= LSN_MAX;
+    }
+
+    /* After attempting log checkpoint, check if we have reached our target. */
+    const lsn_t target= buf_flush_sync_lsn;
+
+    if (measure >= target)
+      buf_flush_sync_lsn= 0;
+    else if (measure >= buf_flush_async_lsn)
+      buf_flush_async_lsn= 0;
+
+    /* wake up buf_flush_wait() */
+    pthread_cond_broadcast(&buf_pool.done_flush_list);
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+    lsn= std::max(lsn, target);
+
+    if (measure >= lsn)
+      return;
+  }
+}
+
+/** Check if the adaptive flushing threshold is recommended based on
+how much of the redo log capacity is filled.
+@param oldest_lsn buf_pool.get_oldest_modification()
+@return true if adaptive flushing is recommended. */
+static bool af_needed_for_redo(lsn_t oldest_lsn)
+{
+  lsn_t age= (log_sys.get_lsn() - oldest_lsn);
+  lsn_t af_lwm= static_cast<lsn_t>(srv_adaptive_flushing_lwm *
+                                   static_cast<double>(log_sys.log_capacity) / 100);
+
+  /* if age > af_lwm adaptive flushing is recommended */
+  return (age > af_lwm);
+}
+
+/*********************************************************************//**
+Calculates if flushing is required based on redo generation rate.
+@return percent of io_capacity to flush to manage redo space */
+static
+ulint
+af_get_pct_for_lsn(
+/*===============*/
+  lsn_t age) /*!< in: current age of LSN. */
+{
+  lsn_t af_lwm = static_cast<lsn_t>(
+    srv_adaptive_flushing_lwm
+    * static_cast<double>(log_sys.log_capacity) / 100);
+
+  if (age < af_lwm) {
+    /* No adaptive flushing.
+    */
+    return(0);
+  }
+
+  lsn_t lsn_age_factor = (age * 100) / log_sys.max_modified_age_async;
+
+  ut_ad(srv_max_io_capacity >= srv_io_capacity);
+  return static_cast<ulint>(
+    (static_cast<double>(srv_max_io_capacity / srv_io_capacity
+                         * lsn_age_factor)
+     * sqrt(static_cast<double>(lsn_age_factor))
+     / 7.5));
+}
+
+/** This function is called approximately once every second by
+buf_flush_page_cleaner() if innodb_max_dirty_pages_pct_lwm>0
+and innodb_adaptive_flushing=ON.
+Based on various factors it decides if there is a need to do flushing.
+@return number of pages recommended to be flushed
+@param last_pages_in number of pages flushed in previous batch
+@param oldest_lsn buf_pool.get_oldest_modification(0)
+@param pct_lwm innodb_max_dirty_pages_pct_lwm, or 0 to ignore it
+@param dirty_blocks UT_LIST_GET_LEN(buf_pool.flush_list)
+@param dirty_pct 100*flush_list.count / (LRU.count + free.count) */
+static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
+                                                     lsn_t oldest_lsn,
+                                                     double pct_lwm,
+                                                     ulint dirty_blocks,
+                                                     double dirty_pct)
+{
+  static lsn_t prev_lsn = 0;
+  static ulint sum_pages = 0;
+  static ulint avg_page_rate = 0;
+  static ulint n_iterations = 0;
+  static time_t prev_time;
+  lsn_t lsn_rate;
+  ulint n_pages = 0;
+
+  const lsn_t cur_lsn = log_sys.get_lsn();
+  ut_ad(oldest_lsn <= cur_lsn);
+  ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn);
+  time_t curr_time = time(nullptr);
+  const double max_pct = srv_max_buf_pool_modified_pct;
+
+  if (!prev_lsn || !pct_for_lsn) {
+    prev_time = curr_time;
+    prev_lsn = cur_lsn;
+    if (max_pct > 0.0) {
+      dirty_pct /= max_pct;
+    }
+
+    n_pages = ulint(dirty_pct * double(srv_io_capacity));
+    if (n_pages < dirty_blocks) {
+      n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks);
+    }
+
+func_exit:
+    page_cleaner.flush_pass++;
+    return n_pages;
+  }
+
+  sum_pages += last_pages_in;
+
+  const ulint time_elapsed = std::max<ulint>(curr_time - prev_time, 1);
+
+  /* We update our variables every innodb_flushing_avg_loops
+  iterations to smooth out transitions in the workload. */
+  if (++n_iterations >= srv_flushing_avg_loops
+      || time_elapsed >= srv_flushing_avg_loops) {
+
+    avg_page_rate = (sum_pages / time_elapsed + avg_page_rate) / 2;
+
+    /* How much LSN we have generated since the last call.
*/ + lsn_rate = (cur_lsn - prev_lsn) / time_elapsed; + + lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2; + + if (page_cleaner.flush_pass) { + page_cleaner.flush_time /= page_cleaner.flush_pass; + } + + prev_lsn = cur_lsn; + prev_time = curr_time; + + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, + page_cleaner.flush_time); + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, + page_cleaner.flush_pass); + + page_cleaner.flush_time = 0; + page_cleaner.flush_pass = 0; + + n_iterations = 0; + sum_pages = 0; + } + + MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn); + + double total_ratio; + if (pct_lwm == 0.0 || max_pct == 0.0) { + total_ratio = 1; + } else { + total_ratio = std::max(double(pct_for_lsn) / 100, + (dirty_pct / max_pct)); + } + + MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, ulint(total_ratio * 100)); + + /* Estimate pages to be flushed for the lsn progress */ + lsn_t target_lsn = oldest_lsn + + lsn_avg_rate * buf_flush_lsn_scan_factor; + ulint pages_for_lsn = 0; + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list); + b != NULL; + b = UT_LIST_GET_PREV(list, b)) { + if (b->oldest_modification() > target_lsn) { + break; + } + if (++pages_for_lsn >= srv_max_io_capacity) { + break; + } + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + pages_for_lsn /= buf_flush_lsn_scan_factor; + if (pages_for_lsn < 1) { + pages_for_lsn = 1; + } + + n_pages = (ulint(double(srv_io_capacity) * total_ratio) + + avg_page_rate + pages_for_lsn) / 3; + + if (n_pages > srv_max_io_capacity) { + n_pages = srv_max_io_capacity; + } + + MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages); + + MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, pages_for_lsn); + + MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate); + MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate); + + goto func_exit; +} + +#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__ +/* Avoid GCC 4.8.5 internal compiler error "could not split insn". +We would only need this for buf_flush_page_cleaner(), +but GCC 4.8.5 does not support pop_options. */ +# pragma GCC optimize ("O0") +#endif +/** page_cleaner thread tasked with flushing dirty pages from the buffer +pools. As of now we'll have only one coordinator. 
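+
+A simplified, hedged sketch of the loop below (locking, shutdown
+handling and monitor updates are elided; last_pages, oldest_lsn,
+pct_lwm, dirty_blocks and dirty_pct are placeholders for state that
+the real loop maintains):
+@code
+  for (;;)
+  {
+    if (buf_flush_sync_lsn && srv_flush_sync)
+      buf_flush_sync_for_checkpoint(buf_flush_sync_lsn); // furious flushing
+    else if (ulint n= page_cleaner_flush_pages_recommendation(
+               last_pages, oldest_lsn, pct_lwm, dirty_blocks, dirty_pct))
+      buf_flush_list(n);                                 // adaptive batch
+    else
+      my_cond_wait(&buf_pool.do_flush_list,
+                   &buf_pool.flush_list_mutex.m_mutex);  // idle
+  }
+@endcode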
*/ +static void buf_flush_page_cleaner() +{ + my_thread_init(); +#ifdef UNIV_PFS_THREAD + pfs_register_thread(page_cleaner_thread_key); +#endif /* UNIV_PFS_THREAD */ + ut_ad(!srv_read_only_mode); + ut_ad(buf_page_cleaner_is_active); + + ulint last_pages= 0; + timespec abstime; + set_timespec(abstime, 1); + + lsn_t lsn_limit; + ulint last_activity_count= srv_get_activity_count(); + + for (;;) + { + lsn_limit= buf_flush_sync_lsn; + + if (UNIV_UNLIKELY(lsn_limit != 0) && UNIV_LIKELY(srv_flush_sync)) + { + furious_flush: + buf_flush_sync_for_checkpoint(lsn_limit); + last_pages= 0; + set_timespec(abstime, 1); + continue; + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (buf_pool.ran_out()) + goto no_wait; + else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) + break; + + if (buf_pool.page_cleaner_idle() && + (!UT_LIST_GET_LEN(buf_pool.flush_list) || + srv_max_dirty_pages_pct_lwm == 0.0)) + /* We are idle; wait for buf_pool.page_cleaner_wakeup() */ + my_cond_wait(&buf_pool.do_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + else + my_cond_timedwait(&buf_pool.do_flush_list, + &buf_pool.flush_list_mutex.m_mutex, &abstime); + no_wait: + set_timespec(abstime, 1); + + lsn_limit= buf_flush_sync_lsn; + lsn_t oldest_lsn= buf_pool.get_oldest_modification(0); + + if (!oldest_lsn) + { + fully_unemployed: + buf_flush_sync_lsn= 0; + set_idle: + buf_pool.page_cleaner_set_idle(true); + set_almost_idle: + pthread_cond_broadcast(&buf_pool.done_flush_LRU); + pthread_cond_broadcast(&buf_pool.done_flush_list); + if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) + break; + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_dblwr.flush_buffered_writes(); + + do + { + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;); + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;); + + if (!recv_recovery_is_on() && + !srv_startup_is_before_trx_rollback_phase && + srv_operation <= SRV_OPERATION_EXPORT_RESTORED) + log_checkpoint(); + } + while (false); + + if (!buf_pool.ran_out()) + continue; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + oldest_lsn= buf_pool.get_oldest_modification(0); + } + + lsn_t soft_lsn_limit= buf_flush_async_lsn; + + if (UNIV_UNLIKELY(lsn_limit != 0)) + { + if (srv_flush_sync) + goto do_furious_flush; + if (oldest_lsn >= lsn_limit) + { + buf_flush_sync_lsn= 0; + pthread_cond_broadcast(&buf_pool.done_flush_list); + } + else if (lsn_limit > soft_lsn_limit) + soft_lsn_limit= lsn_limit; + } + + double pct_lwm= 0.0; + ulint n_flushed= 0, n; + + if (UNIV_UNLIKELY(soft_lsn_limit != 0)) + { + if (oldest_lsn >= soft_lsn_limit) + buf_flush_async_lsn= soft_lsn_limit= 0; + } + else if (buf_pool.ran_out()) + { + buf_pool.page_cleaner_set_idle(false); + buf_pool.n_flush_inc(); + /* Remove clean blocks from buf_pool.flush_list before the LRU scan. 
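+    Here a value of 1 from oldest_modification() is a sentinel: the
+    page write has already completed and the block merely awaits
+    removal from flush_list; anything greater than 2 is a real LSN,
+    which is what the assertion below checks.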
*/ + for (buf_page_t *p= UT_LIST_GET_FIRST(buf_pool.flush_list); p; ) + { + const lsn_t lsn{p->oldest_modification()}; + ut_ad(lsn > 2 || lsn == 1); + buf_page_t *n= UT_LIST_GET_NEXT(list, p); + if (lsn <= 1) + buf_pool.delete_from_flush_list(p); + p= n; + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + n= srv_max_io_capacity; + mysql_mutex_lock(&buf_pool.mutex); + LRU_flush: + n= buf_flush_LRU(n, false); + mysql_mutex_unlock(&buf_pool.mutex); + last_pages+= n; + check_oldest_and_set_idle: + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.n_flush_dec_holding_mutex(); + oldest_lsn= buf_pool.get_oldest_modification(0); + if (!oldest_lsn) + goto fully_unemployed; + if (oldest_lsn >= buf_flush_async_lsn) + buf_flush_async_lsn= 0; + buf_pool.page_cleaner_set_idle(false); + goto set_almost_idle; + } + else if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) + break; + + const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list); + /* We perform dirty reads of the LRU+free list lengths here. + Division by zero is not possible, because buf_pool.flush_list is + guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */ + const double dirty_pct= double(dirty_blocks) * 100.0 / + double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free)); + pct_lwm= srv_max_dirty_pages_pct_lwm; + if (pct_lwm != 0.0) + { + const ulint activity_count= srv_get_activity_count(); + if (activity_count != last_activity_count) + { + last_activity_count= activity_count; + goto maybe_unemployed; + } + else if (buf_pool.page_cleaner_idle() && !os_aio_pending_reads()) + { + /* reaching here means 3 things: + - last_activity_count == activity_count: suggesting server is idle + (no trx_t::commit() activity) + - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm) + - there are no pending reads but there are dirty pages to flush */ + buf_pool.update_last_activity_count(activity_count); + buf_pool.n_flush_inc(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + goto idle_flush; + } + else + { + maybe_unemployed: + const bool below{dirty_pct < pct_lwm}; + pct_lwm= 0.0; + if (below) + goto possibly_unemployed; + } + } + else if (dirty_pct < srv_max_buf_pool_modified_pct) + possibly_unemployed: + if (!soft_lsn_limit && !af_needed_for_redo(oldest_lsn)) + goto set_idle; + + buf_pool.page_cleaner_set_idle(false); + buf_pool.n_flush_inc(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (UNIV_UNLIKELY(soft_lsn_limit != 0)) + { + n= srv_max_io_capacity; + goto background_flush; + } + + if (!srv_adaptive_flushing) + { + idle_flush: + n= srv_io_capacity; + soft_lsn_limit= LSN_MAX; + background_flush: + mysql_mutex_lock(&buf_pool.mutex); + n_flushed= buf_flush_list_holding_mutex(n, soft_lsn_limit); + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + n_flushed); + } + else if ((n= page_cleaner_flush_pages_recommendation(last_pages, + oldest_lsn, + pct_lwm, + dirty_blocks, + dirty_pct)) != 0) + { + const ulint tm= ut_time_ms(); + mysql_mutex_lock(&buf_pool.mutex); + last_pages= n_flushed= buf_flush_list_holding_mutex(n); + page_cleaner.flush_time+= ut_time_ms() - tm; + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + n_flushed); + } + else if (buf_flush_async_lsn <= oldest_lsn) + goto check_oldest_and_set_idle; + + n= n >= n_flushed ? 
n - n_flushed : 0; + goto LRU_flush; + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (srv_fast_shutdown != 2) + { + buf_dblwr.flush_buffered_writes(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait_LRU_batch_end(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + os_aio_wait_until_no_pending_writes(false); + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_limit= buf_flush_sync_lsn; + if (UNIV_UNLIKELY(lsn_limit != 0)) + { + do_furious_flush: + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + goto furious_flush; + } + buf_page_cleaner_is_active= false; + pthread_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + my_thread_end(); + +#ifdef UNIV_PFS_THREAD + pfs_delete_thread(); +#endif +} + +/** Initialize page_cleaner. */ +ATTRIBUTE_COLD void buf_flush_page_cleaner_init() +{ + ut_ad(!buf_page_cleaner_is_active); + ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT); + buf_flush_async_lsn= 0; + buf_flush_sync_lsn= 0; + buf_page_cleaner_is_active= true; + std::thread(buf_flush_page_cleaner).detach(); +} + +/** Flush the buffer pool on shutdown. */ +ATTRIBUTE_COLD void buf_flush_buffer_pool() +{ + ut_ad(!os_aio_pending_reads()); + ut_ad(!buf_page_cleaner_is_active); + ut_ad(!buf_flush_sync_lsn); + + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush the buffer pool"); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + while (buf_pool.get_oldest_modification(0)) + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_flush_list(srv_max_io_capacity); + os_aio_wait_until_no_pending_writes(false); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush " ULINTPF " pages", + UT_LIST_GET_LEN(buf_pool.flush_list)); + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + ut_ad(!os_aio_pending_reads()); +} + +/** Synchronously flush dirty blocks during recv_sys_t::apply(). +NOTE: The calling thread is not allowed to hold any buffer page latches! */ +void buf_flush_sync_batch(lsn_t lsn) +{ + lsn= std::max(lsn, log_sys.get_lsn()); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait(lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); +} + +/** Synchronously flush dirty blocks. +NOTE: The calling thread is not allowed to hold any buffer page latches! */ +void buf_flush_sync() +{ + if (recv_recovery_is_on()) + { + mysql_mutex_lock(&recv_sys.mutex); + recv_sys.apply(true); + mysql_mutex_unlock(&recv_sys.mutex); + } + + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + tpool::tpool_wait_begin(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + for (;;) + { + const lsn_t lsn= log_sys.get_lsn(); + buf_flush_wait(lsn); + /* Wait for the page cleaner to be idle (for log resizing at startup) */ + while (buf_flush_sync_lsn) + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + if (lsn == log_sys.get_lsn()) + break; + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + tpool::tpool_wait_end(); + thd_wait_end(nullptr); +} + +#ifdef UNIV_DEBUG +/** Functor to validate the flush list. */ +struct Check { + void operator()(const buf_page_t* elem) const + { + ut_ad(elem->oldest_modification()); + ut_ad(!fsp_is_system_temporary(elem->id().space())); + } +}; + +/** Validate the flush list. 
*/ +static void buf_flush_validate_low() +{ + buf_page_t* bpage; + + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + + ut_list_validate(buf_pool.flush_list, Check()); + + bpage = UT_LIST_GET_FIRST(buf_pool.flush_list); + + while (bpage != NULL) { + const lsn_t om = bpage->oldest_modification(); + /* A page in buf_pool.flush_list can be in + BUF_BLOCK_REMOVE_HASH state. This happens when a page + is in the middle of being relocated. In that case the + original descriptor can have this state and still be + in the flush list waiting to acquire the + buf_pool.flush_list_mutex to complete the relocation. */ + ut_d(const auto s= bpage->state()); + ut_ad(s >= buf_page_t::REMOVE_HASH); + ut_ad(om == 1 || om > 2); + + bpage = UT_LIST_GET_NEXT(list, bpage); + ut_ad(om == 1 || !bpage || recv_recovery_is_on() + || om >= bpage->oldest_modification()); + } +} + +/** Validate the flush list. */ +void buf_flush_validate() +{ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_validate_low(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc new file mode 100644 index 00000000..65ee8fa3 --- /dev/null +++ b/storage/innobase/buf/buf0lru.cc @@ -0,0 +1,1452 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0lru.cc +The database buffer replacement algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0lru.h" +#include "fil0fil.h" +#include "btr0btr.h" +#include "buf0buddy.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0rea.h" +#include "btr0sea.h" +#include "os0file.h" +#include "page0zip.h" +#include "log0recv.h" +#include "srv0srv.h" +#include "srv0mon.h" +#include "my_cpu.h" + +/** Flush this many pages in buf_LRU_get_free_block() */ +size_t innodb_lru_flush_size; + +/** The number of blocks from the LRU_old pointer onward, including +the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV +of the whole LRU list length, except that the tolerance defined below +is allowed. Note that the tolerance must be small enough such that for +even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not +allowed to point to either end of the LRU list. */ + +static constexpr ulint BUF_LRU_OLD_TOLERANCE = 20; + +/** The minimum amount of non-old blocks when the LRU_old list exists +(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks). 
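+For example (hedged arithmetic): with the default
+innodb_old_blocks_pct=37 and an LRU list of 1000 blocks, about 370
+blocks would be kept "old", while at least BUF_LRU_NON_OLD_MIN_LEN
+blocks must always remain "new".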
+@see buf_LRU_old_adjust_len */
+#define BUF_LRU_NON_OLD_MIN_LEN 5
+
+/** If we switch on the InnoDB monitor because there are too few available
+frames in the buffer pool, we set this to TRUE */
+static bool buf_lru_switched_on_innodb_mon = false;
+
+/** True if the diagnostic message about it being difficult to find
+free blocks in the buffer pool has already been printed. */
+static bool buf_lru_free_blocks_error_printed;
+
+/******************************************************************//**
+These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
+and page_zip_decompress() operations. Based on the statistics,
+buf_LRU_evict_from_unzip_LRU() decides if we want to evict from
+unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the
+uncompressed frame (meaning we can evict dirty blocks as well). From
+the regular LRU, we will evict the entire block (i.e.: both the
+uncompressed and compressed data), which must be clean. */
+
+/* @{ */
+
+/** Number of intervals for which we keep the history of these stats.
+Updated at SRV_MONITOR_INTERVAL (the buf_LRU_stat_update() call rate). */
+static constexpr ulint BUF_LRU_STAT_N_INTERVAL= 4;
+
+/** Coefficient with which we multiply I/O operations to equate them
+with page_zip_decompress() operations. */
+static constexpr ulint BUF_LRU_IO_TO_UNZIP_FACTOR= 50;
+
+/** Sampled values buf_LRU_stat_cur.
+Not protected by any mutex. Updated by buf_LRU_stat_update(). */
+static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL];
+
+/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */
+static ulint buf_LRU_stat_arr_ind;
+
+/** Current operation counters. Not protected by any mutex. Cleared
+by buf_LRU_stat_update(). */
+buf_LRU_stat_t buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update(). Not protected by any mutex. */
+buf_LRU_stat_t buf_LRU_stat_sum;
+
+/* @} */
+
+/** @name Heuristics for detecting index scan @{ */
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago. Not protected by any mutex or latch. */
+uint buf_LRU_old_threshold_ms;
+/* @} */
+
+/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
+
+If !bpage->frame && bpage->oldest_modification() <= 1,
+the object will be freed.
+
+@param bpage buffer block
+@param id page identifier
+@param chain locked buf_pool.page_hash chain (will be released here)
+@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
+
+If a compressed page is freed, other compressed pages may be relocated.
+@retval true if bpage with bpage->frame was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if block without bpage->frame was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
+                                        buf_pool_t::hash_chain &chain,
+                                        bool zip);
+
+/** Free a block to buf_pool */
+static void buf_LRU_block_free_hashed_page(buf_block_t *block)
+{
+  block->page.free_file_page();
+  buf_LRU_block_free_non_file_page(block);
+}
+
+/** Increase LRU size in bytes by the page size.
+@param[in] bpage control block */ +static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + buf_pool.stat.LRU_bytes += bpage->physical_size(); + + ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size); +} + +/** @return whether the unzip_LRU list should be used for evicting a victim +instead of the general LRU list */ +bool buf_LRU_evict_from_unzip_LRU() +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + /* If the unzip_LRU list is empty, we can only use the LRU. */ + if (UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0) { + return false; + } + + /* If unzip_LRU is at most 10% of the size of the LRU list, + then use the LRU. This slack allows us to keep hot + decompressed pages in the buffer pool. */ + if (UT_LIST_GET_LEN(buf_pool.unzip_LRU) + <= UT_LIST_GET_LEN(buf_pool.LRU) / 10) { + return false; + } + + /* If eviction hasn't started yet, we assume by default + that a workload is disk bound. */ + if (buf_pool.freed_page_clock == 0) { + return true; + } + + /* Calculate the average over past intervals, and add the values + of the current interval. */ + ulint io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL + + buf_LRU_stat_cur.io; + + ulint unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL + + buf_LRU_stat_cur.unzip; + + /* Decide based on our formula. If the load is I/O bound + (unzip_avg is smaller than the weighted io_avg), evict an + uncompressed frame from unzip_LRU. Otherwise we assume that + the load is CPU bound and evict from the regular LRU. */ + return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR); +} + +/** Try to free an uncompressed page of a compressed block from the unzip +LRU list. The compressed page is preserved, and it need not be clean. +@param limit maximum number of blocks to scan +@return true if freed */ +static bool buf_LRU_free_from_unzip_LRU_list(ulint limit) +{ + if (!buf_LRU_evict_from_unzip_LRU()) { + return(false); + } + + ulint scanned = 0; + bool freed = false; + + for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + block && scanned < limit; ++scanned) { + buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); + + ut_ad(block->page.in_file()); + ut_ad(block->page.belongs_to_unzip_LRU()); + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + + freed = buf_LRU_free_page(&block->page, false); + if (freed) { + scanned++; + break; + } + + block = prev_block; + } + + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, + scanned); + } + + return(freed); +} + +/** Try to free a clean page from the common LRU list. +@param limit maximum number of blocks to scan +@return whether a page was freed */ +static bool buf_LRU_free_from_common_LRU_list(ulint limit) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + ulint scanned = 0; + bool freed = false; + + for (buf_page_t* bpage = buf_pool.lru_scan_itr.start(); + bpage && scanned < limit; + ++scanned, bpage = buf_pool.lru_scan_itr.get()) { + buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); + buf_pool.lru_scan_itr.set(prev); + + const auto accessed = bpage->is_accessed(); + + if (buf_LRU_free_page(bpage, true)) { + if (!accessed) { + /* Keep track of pages that are evicted without + ever being accessed. 
This gives us a measure of
+      the effectiveness of readahead */
+      ++buf_pool.stat.n_ra_pages_evicted;
+    }
+
+    freed = true;
+    scanned++;
+    break;
+  }
+}
+
+  MONITOR_INC_VALUE_CUMULATIVE(
+    MONITOR_LRU_SEARCH_SCANNED,
+    MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+    MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+    scanned);
+
+  return(freed);
+}
+
+/** @return a buffer block from the buf_pool.free list
+@retval NULL if the free list is empty */
+buf_block_t* buf_LRU_get_free_only()
+{
+  buf_block_t* block;
+
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+
+  block = reinterpret_cast<buf_block_t*>(
+    UT_LIST_GET_FIRST(buf_pool.free));
+
+  while (block != NULL) {
+    ut_ad(block->page.in_free_list);
+    ut_d(block->page.in_free_list = FALSE);
+    ut_ad(!block->page.oldest_modification());
+    ut_ad(!block->page.in_LRU_list);
+    ut_a(!block->page.in_file());
+    UT_LIST_REMOVE(buf_pool.free, &block->page);
+
+    if (!buf_pool.is_shrinking()
+        || UT_LIST_GET_LEN(buf_pool.withdraw)
+        >= buf_pool.withdraw_target
+        || !buf_pool.will_be_withdrawn(block->page)) {
+      /* No adaptive hash index entries may point to
+      a free block. */
+      assert_block_ahi_empty(block);
+
+      block->page.set_state(buf_page_t::MEMORY);
+      block->page.set_os_used();
+      break;
+    }
+
+    /* This should be withdrawn */
+    UT_LIST_ADD_LAST(buf_pool.withdraw, &block->page);
+    ut_d(block->in_withdraw_list = true);
+
+    block = reinterpret_cast<buf_block_t*>(
+      UT_LIST_GET_FIRST(buf_pool.free));
+  }
+
+  return(block);
+}
+
+/******************************************************************//**
+Checks how much of buf_pool is occupied by non-data objects like
+AHI, lock heaps etc. Depending on the size of non-data objects this
+function will either assert or issue a warning and switch on the
+status monitor. */
+static void buf_LRU_check_size_of_non_data_objects()
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+
+  if (recv_recovery_is_on() || buf_pool.n_chunks_new != buf_pool.n_chunks)
+    return;
+
+  const auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU);
+
+  if (s < buf_pool.curr_size / 20)
+    ib::fatal() << "Over 95 percent of the buffer pool is"
+      " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+      " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+      "! Check that your transactions do not set too many"
+      " row locks, or review if innodb_buffer_pool_size="
+      << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+      << "M could be bigger.";
+
+  if (s < buf_pool.curr_size / 3)
+  {
+    if (!buf_lru_switched_on_innodb_mon && srv_monitor_timer)
+    {
+      /* Over 67 % of the buffer pool is occupied by lock heaps or
+      the adaptive hash index. This may be a memory leak! */
+      ib::warn() << "Over 67 percent of the buffer pool is"
+        " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+        " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+        "! Check that your transactions do not set too many row locks."
+        " innodb_buffer_pool_size="
+        << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+        << "M. Starting the InnoDB Monitor to print diagnostics.";
+      buf_lru_switched_on_innodb_mon= true;
+      srv_print_innodb_monitor= TRUE;
+      srv_monitor_timer_schedule_now();
+    }
+  }
+  else if (buf_lru_switched_on_innodb_mon)
+  {
+    /* Switch off the InnoDB Monitor; this is a simple way to stop the
+    monitor if the situation becomes less urgent, but may also
+    surprise users who did SET GLOBAL innodb_status_output=ON earlier! */
+    buf_lru_switched_on_innodb_mon= false;
+    srv_print_innodb_monitor= FALSE;
+  }
+}
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in the LRU
+scan, we put it on the free list to be used.
+* iteration 0:
+  * get a block from the buf_pool.free list, success:done
+  * if buf_pool.try_LRU_scan is set
+    * scan LRU up to 100 pages to free a clean block
+    * success:retry the free list
+  * flush up to innodb_lru_flush_size LRU blocks to data files
+    (until UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth)
+    * on buf_page_write_complete() the blocks will be put on the
+      buf_pool.free list
+    * success: retry the free list
+* subsequent iterations: same as iteration 0 except:
+  * scan whole LRU list
+  * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t *buf_LRU_get_free_block(bool have_mutex)
+{
+  ulint n_iterations = 0;
+  ulint flush_failures = 0;
+  MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
+  if (have_mutex) {
+    mysql_mutex_assert_owner(&buf_pool.mutex);
+    goto got_mutex;
+  }
+  DBUG_EXECUTE_IF("recv_ran_out_of_buffer",
+                  if (recv_recovery_is_on()
+                      && recv_sys.apply_log_recs) {
+                    mysql_mutex_lock(&buf_pool.mutex);
+                    goto flush_lru;
+                  });
+get_mutex:
+  mysql_mutex_lock(&buf_pool.mutex);
+got_mutex:
+  buf_LRU_check_size_of_non_data_objects();
+  buf_block_t* block;
+
+  DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
+                  if (!buf_lru_free_blocks_error_printed) {
+                    n_iterations = 21;
+                    goto not_found;});
+
+retry:
+  /* If there is a block in the free list, take it */
+  if ((block = buf_LRU_get_free_only()) != nullptr) {
+got_block:
+    if (!have_mutex) {
+      mysql_mutex_unlock(&buf_pool.mutex);
+    }
+    block->page.zip.clear();
+    return block;
+  }
+
+  MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS );
+  if (n_iterations || buf_pool.try_LRU_scan) {
+    /* If no block was in the free list, search from the
+    end of the LRU list and try to free a block there.
+    If we are doing this for the first time, we'll scan only
+    the tail of the LRU list; otherwise we scan the whole
+    LRU list. */
+    if (buf_LRU_scan_and_free_block(n_iterations
+                                    ? ULINT_UNDEFINED : 100)) {
+      goto retry;
+    }
+
+    /* Tell other threads that there is no point
+    in scanning the LRU list. */
+    buf_pool.try_LRU_scan = false;
+  }
+
+  for (;;) {
+    if ((block = buf_LRU_get_free_only()) != nullptr) {
+      goto got_block;
+    }
+    mysql_mutex_unlock(&buf_pool.mutex);
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    const auto n_flush = buf_pool.n_flush();
+    if (!buf_pool.try_LRU_scan) {
+      buf_pool.page_cleaner_wakeup(true);
+    }
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+    mysql_mutex_lock(&buf_pool.mutex);
+    if (!n_flush) {
+      goto not_found;
+    }
+    if (!buf_pool.try_LRU_scan) {
+      my_cond_wait(&buf_pool.done_free,
+                   &buf_pool.mutex.m_mutex);
+    }
+  }
+
+not_found:
+  if (n_iterations > 1) {
+    MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
+  }
+
+  if (n_iterations == 21 && !buf_lru_free_blocks_error_printed
+      && srv_buf_pool_old_size == srv_buf_pool_size) {
+    buf_lru_free_blocks_error_printed = true;
+    mysql_mutex_unlock(&buf_pool.mutex);
+    ib::warn() << "Difficult to find free blocks in the buffer pool"
+      " (" << n_iterations << " search iterations)! "
+      << flush_failures << " failed attempts to"
+      " flush a page!"
+      " Consider increasing innodb_buffer_pool_size."
+ " Pending flushes (fsync): " + << fil_n_pending_tablespace_flushes + << ". " << os_n_file_reads << " OS file reads, " + << os_n_file_writes << " OS file writes, " + << os_n_fsyncs + << " OS fsyncs."; + mysql_mutex_lock(&buf_pool.mutex); + } + + /* No free block was found: try to flush the LRU list. + The freed blocks will be up for grabs for all threads. + + TODO: A more elegant way would have been to return one freed + up block to the caller here but the code that deals with + removing the block from buf_pool.page_hash and buf_pool.LRU is fairly + involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We + can do that in a separate patch sometime in future. */ +#ifndef DBUG_OFF +flush_lru: +#endif + if (!buf_flush_LRU(innodb_lru_flush_size, true)) { + MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT); + ++flush_failures; + } + + n_iterations++; + buf_pool.stat.LRU_waits++; + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + goto get_mutex; +} + +/** Move the LRU_old pointer so that the length of the old blocks list +is inside the allowed limits. */ +static void buf_LRU_old_adjust_len() +{ + ulint old_len; + ulint new_len; + + ut_a(buf_pool.LRU_old); + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_pool.LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); + ut_ad(buf_pool.LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); + compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN + > BUF_LRU_OLD_RATIO_DIV + * (BUF_LRU_OLD_TOLERANCE + 5)); + compile_time_assert(BUF_LRU_NON_OLD_MIN_LEN < BUF_LRU_OLD_MIN_LEN); + +#ifdef UNIV_LRU_DEBUG + /* buf_pool.LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool.LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old); +#endif /* UNIV_LRU_DEBUG */ + + old_len = buf_pool.LRU_old_len; + new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU) + * buf_pool.LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV, + UT_LIST_GET_LEN(buf_pool.LRU) + - (BUF_LRU_OLD_TOLERANCE + + BUF_LRU_NON_OLD_MIN_LEN)); + + for (;;) { + buf_page_t* LRU_old = buf_pool.LRU_old; + + ut_a(LRU_old); + ut_ad(LRU_old->in_LRU_list); +#ifdef UNIV_LRU_DEBUG + ut_a(LRU_old->old); +#endif /* UNIV_LRU_DEBUG */ + + /* Update the LRU_old pointer if necessary */ + + if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) { + + buf_pool.LRU_old = LRU_old = UT_LIST_GET_PREV( + LRU, LRU_old); +#ifdef UNIV_LRU_DEBUG + ut_a(!LRU_old->old); +#endif /* UNIV_LRU_DEBUG */ + old_len = ++buf_pool.LRU_old_len; + LRU_old->set_old(true); + + } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) { + + buf_pool.LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old); + old_len = --buf_pool.LRU_old_len; + LRU_old->set_old(false); + } else { + return; + } + } +} + +/** Initialize the old blocks pointer in the LRU list. This function should be +called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */ +static void buf_LRU_old_init() +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_a(UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN); + + /* We first initialize all blocks in the LRU list as old and then use + the adjust function to move the LRU_old pointer to the right + position */ + + for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_PREV(LRU, bpage)) { + + ut_ad(bpage->in_LRU_list); + + /* This loop temporarily violates the + assertions of buf_page_t::set_old(). 
*/ + bpage->old = true; + } + + buf_pool.LRU_old = UT_LIST_GET_FIRST(buf_pool.LRU); + buf_pool.LRU_old_len = UT_LIST_GET_LEN(buf_pool.LRU); + + buf_LRU_old_adjust_len(); +} + +/** Remove a block from the unzip_LRU list if it belonged to the list. +@param[in] bpage control block */ +static void buf_unzip_LRU_remove_block_if_needed(buf_page_t* bpage) +{ + ut_ad(bpage->in_file()); + mysql_mutex_assert_owner(&buf_pool.mutex); + + if (bpage->belongs_to_unzip_LRU()) { + buf_block_t* block = reinterpret_cast(bpage); + + ut_ad(block->in_unzip_LRU_list); + ut_d(block->in_unzip_LRU_list = false); + + UT_LIST_REMOVE(buf_pool.unzip_LRU, block); + } +} + +/** Removes a block from the LRU list. +@param[in] bpage control block */ +static inline void buf_LRU_remove_block(buf_page_t* bpage) +{ + /* Important that we adjust the hazard pointers before removing + bpage from the LRU list. */ + buf_page_t* prev_bpage = buf_pool.LRU_remove(bpage); + + /* If the LRU_old pointer is defined and points to just this block, + move it backward one step */ + + if (bpage == buf_pool.LRU_old) { + + /* Below: the previous block is guaranteed to exist, + because the LRU_old pointer is only allowed to differ + by BUF_LRU_OLD_TOLERANCE from strict + buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU + list length. */ + ut_a(prev_bpage); +#ifdef UNIV_LRU_DEBUG + ut_a(!prev_bpage->old); +#endif /* UNIV_LRU_DEBUG */ + buf_pool.LRU_old = prev_bpage; + prev_bpage->set_old(true); + + buf_pool.LRU_old_len++; + } + + buf_pool.stat.LRU_bytes -= bpage->physical_size(); + + buf_unzip_LRU_remove_block_if_needed(bpage); + + /* If the LRU list is so short that LRU_old is not defined, + clear the "old" flags and return */ + if (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN) { + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + + /* This loop temporarily violates the + assertions of buf_page_t::set_old(). */ + bpage->old = false; + } + + buf_pool.LRU_old = NULL; + buf_pool.LRU_old_len = 0; + + return; + } + + ut_ad(buf_pool.LRU_old); + + /* Update the LRU_old_len field if necessary */ + if (bpage->old) { + buf_pool.LRU_old_len--; + } + + /* Adjust the length of the old block list if necessary */ + buf_LRU_old_adjust_len(); +} + +/******************************************************************//** +Adds a block to the LRU list of decompressed zip pages. */ +void +buf_unzip_LRU_add_block( +/*====================*/ + buf_block_t* block, /*!< in: control block */ + ibool old) /*!< in: TRUE if should be put to the end + of the list, else put to the start */ +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_a(block->page.belongs_to_unzip_LRU()); + ut_ad(!block->in_unzip_LRU_list); + ut_d(block->in_unzip_LRU_list = true); + + if (old) { + UT_LIST_ADD_LAST(buf_pool.unzip_LRU, block); + } else { + UT_LIST_ADD_FIRST(buf_pool.unzip_LRU, block); + } +} + +/******************************************************************//** +Adds a block to the LRU list. 
Please make sure that the page_size is +already set when invoking the function, so that we can get correct +page_size from the buffer page when adding a block into LRU */ +void +buf_LRU_add_block( + buf_page_t* bpage, /*!< in: control block */ + bool old) /*!< in: true if should be put to the old blocks + in the LRU list, else put to the start; if the + LRU list is very short, the block is added to + the start, regardless of this parameter */ +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(!bpage->in_LRU_list); + + if (!old || (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN)) { + + UT_LIST_ADD_FIRST(buf_pool.LRU, bpage); + + bpage->freed_page_clock = buf_pool.freed_page_clock + & ((1U << 31) - 1); + } else { +#ifdef UNIV_LRU_DEBUG + /* buf_pool.LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool.LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old); +#endif /* UNIV_LRU_DEBUG */ + UT_LIST_INSERT_AFTER(buf_pool.LRU, buf_pool.LRU_old, + bpage); + + buf_pool.LRU_old_len++; + } + + ut_d(bpage->in_LRU_list = TRUE); + + incr_LRU_size_in_bytes(bpage); + + if (UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_OLD_MIN_LEN) { + + ut_ad(buf_pool.LRU_old); + + /* Adjust the length of the old block list if necessary */ + + bpage->set_old(old); + buf_LRU_old_adjust_len(); + + } else if (UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN) { + + /* The LRU list is now long enough for LRU_old to become + defined: init it */ + + buf_LRU_old_init(); + } else { + bpage->set_old(buf_pool.LRU_old != NULL); + } + + /* If this is a zipped block with decompressed frame as well + then put it on the unzip_LRU list */ + if (bpage->belongs_to_unzip_LRU()) { + buf_unzip_LRU_add_block((buf_block_t*) bpage, old); + } +} + +/** Move a block to the start of the LRU list. */ +void buf_page_make_young(buf_page_t *bpage) +{ + if (bpage->is_read_fixed()) + return; + + ut_ad(bpage->in_file()); + + mysql_mutex_lock(&buf_pool.mutex); + + if (UNIV_UNLIKELY(bpage->old)) + buf_pool.stat.n_pages_made_young++; + + buf_LRU_remove_block(bpage); + buf_LRU_add_block(bpage, false); + + mysql_mutex_unlock(&buf_pool.mutex); +} + +/** Try to free a block. If bpage is a descriptor of a compressed-only +ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well. +The caller must hold buf_pool.mutex. +@param bpage block to be freed +@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page +@retval true if freed and buf_pool.mutex may have been temporarily released +@retval false if the page was not freed */ +bool buf_LRU_free_page(buf_page_t *bpage, bool zip) +{ + const page_id_t id{bpage->id()}; + buf_page_t* b = nullptr; + + mysql_mutex_assert_owner(&buf_pool.mutex); + + /* First, perform a quick check before we acquire hash_lock. */ + if (!bpage->can_relocate()) { + return false; + } + + /* We must hold an exclusive hash_lock to prevent + bpage->can_relocate() from changing due to a concurrent + execution of buf_page_get_low(). */ + buf_pool_t::hash_chain& chain= buf_pool.page_hash.cell_get(id.fold()); + page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain); + /* We cannot use transactional_lock_guard here, + because buf_buddy_relocate() in buf_buddy_free() could get stuck. 
+  */
+  hash_lock.lock();
+  const lsn_t oldest_modification = bpage->oldest_modification_acquire();
+
+  if (UNIV_UNLIKELY(!bpage->can_relocate())) {
+    /* Do not free buffer fixed and I/O-fixed blocks. */
+    goto func_exit;
+  }
+
+  switch (oldest_modification) {
+  case 2:
+    ut_ad(id.space() == SRV_TMP_SPACE_ID);
+    ut_ad(!bpage->zip.data);
+    if (!bpage->is_freed()) {
+      goto func_exit;
+    }
+    bpage->clear_oldest_modification();
+    break;
+  case 1:
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    if (const lsn_t om = bpage->oldest_modification()) {
+      ut_ad(om == 1);
+      buf_pool.delete_from_flush_list(bpage);
+    }
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+    ut_ad(!bpage->oldest_modification());
+    /* fall through */
+  case 0:
+    if (zip || !bpage->zip.data || !bpage->frame) {
+      break;
+    }
+relocate_compressed:
+    b = static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *b));
+    ut_a(b);
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    new (b) buf_page_t(*bpage);
+    b->frame = nullptr;
+    {
+      ut_d(uint32_t s=) b->fix();
+      ut_ad(s == buf_page_t::FREED
+            || s == buf_page_t::UNFIXED
+            || s == buf_page_t::IBUF_EXIST
+            || s == buf_page_t::REINIT);
+    }
+    break;
+  default:
+    if (zip || !bpage->zip.data || !bpage->frame) {
+      /* This would completely free the block. */
+      /* Do not completely free dirty blocks. */
+func_exit:
+      hash_lock.unlock();
+      return(false);
+    }
+    goto relocate_compressed;
+  }
+
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+
+  DBUG_PRINT("ib_buf", ("free page %u:%u", id.space(), id.page_no()));
+
+  ut_ad(bpage->can_relocate());
+
+  if (!buf_LRU_block_remove_hashed(bpage, id, chain, zip)) {
+    ut_ad(!b);
+    mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+    return(true);
+  }
+
+  /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != nullptr
+  then it was a compressed page with an uncompressed frame and
+  we are interested in freeing only the uncompressed frame.
+  Therefore we have to reinsert the compressed page descriptor
+  into the LRU and page_hash (and possibly flush_list).
+  if !b then it was a regular page that has been freed */
+
+  if (UNIV_LIKELY_NULL(b)) {
+    buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
+
+    ut_ad(!buf_pool.page_hash.get(id, chain));
+    ut_ad(b->zip_size());
+
+    /* The field in_LRU_list of
+    the to-be-freed block descriptor should have
+    been cleared in
+    buf_LRU_block_remove_hashed(), which
+    invokes buf_LRU_remove_block(). */
+    ut_ad(!bpage->in_LRU_list);
+    ut_ad(bpage->frame);
+    ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
+
+    /* The fields of bpage were copied to b before
+    buf_LRU_block_remove_hashed() was invoked. */
+    ut_ad(!b->in_zip_hash);
+    ut_ad(b->in_LRU_list);
+    ut_ad(b->in_page_hash);
+    ut_d(b->in_page_hash = false);
+    b->hash = nullptr;
+
+    buf_pool.page_hash.append(chain, b);
+
+    /* Insert b where bpage was in the LRU list.
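+    (The exact position is preserved so that the "old" flag and the
+    buf_pool.LRU_old bookkeeping remain consistent; see
+    buf_LRU_old_adjust_len().)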
+    */
+    if (prev_b) {
+      ulint lru_len;
+
+      ut_ad(prev_b->in_LRU_list);
+      ut_ad(prev_b->in_file());
+
+      UT_LIST_INSERT_AFTER(buf_pool.LRU, prev_b, b);
+
+      incr_LRU_size_in_bytes(b);
+
+      if (b->is_old()) {
+        buf_pool.LRU_old_len++;
+        if (buf_pool.LRU_old
+            == UT_LIST_GET_NEXT(LRU, b)) {
+
+          buf_pool.LRU_old = b;
+        }
+      }
+
+      lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+      if (lru_len > BUF_LRU_OLD_MIN_LEN) {
+        ut_ad(buf_pool.LRU_old);
+        /* Adjust the length of the
+        old block list if necessary */
+        buf_LRU_old_adjust_len();
+      } else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+        /* The LRU list is now long
+        enough for LRU_old to become
+        defined: init it */
+        buf_LRU_old_init();
+      }
+#ifdef UNIV_LRU_DEBUG
+      /* Check that the "old" flag is consistent
+      in the block and its neighbours. */
+      b->set_old(b->is_old());
+#endif /* UNIV_LRU_DEBUG */
+    } else {
+      ut_d(b->in_LRU_list = FALSE);
+      buf_LRU_add_block(b, b->old);
+    }
+
+    buf_flush_relocate_on_flush_list(bpage, b);
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+    bpage->zip.data = nullptr;
+
+    page_zip_set_size(&bpage->zip, 0);
+
+    b->lock.x_lock();
+    hash_lock.unlock();
+  } else if (!zip) {
+    hash_lock.unlock();
+  }
+
+  buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
+
+#ifdef BTR_CUR_HASH_ADAPT
+  if (block->index) {
+    mysql_mutex_unlock(&buf_pool.mutex);
+
+    /* Remove the adaptive hash index on the page.
+    The page was declared uninitialized by
+    buf_LRU_block_remove_hashed(). We need to flag
+    the contents of the page valid (which it still is) in
+    order to avoid bogus Valgrind or MSAN warnings.*/
+
+    MEM_MAKE_DEFINED(block->page.frame, srv_page_size);
+    btr_search_drop_page_hash_index(block, false);
+    MEM_UNDEFINED(block->page.frame, srv_page_size);
+    mysql_mutex_lock(&buf_pool.mutex);
+  }
+#endif
+  if (UNIV_LIKELY_NULL(b)) {
+    ut_ad(b->zip_size());
+    b->lock.x_unlock();
+    b->unfix();
+  }
+
+  buf_LRU_block_free_hashed_page(block);
+
+  return(true);
+}
+
+/******************************************************************//**
+Puts a block back to the free list. */
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+  buf_block_t* block) /*!< in: block, must not contain a file page */
+{
+  void* data;
+
+  ut_ad(block->page.state() == buf_page_t::MEMORY);
+  assert_block_ahi_empty(block);
+  ut_ad(!block->page.in_free_list);
+  ut_ad(!block->page.oldest_modification());
+  ut_ad(!block->page.in_LRU_list);
+  ut_ad(!block->page.hash);
+
+  block->page.set_state(buf_page_t::NOT_USED);
+
+  MEM_UNDEFINED(block->page.frame, srv_page_size);
+  data = block->page.zip.data;
+
+  if (data != NULL) {
+    block->page.zip.data = NULL;
+    buf_pool_mutex_exit_forbid();
+
+    ut_ad(block->zip_size());
+
+    buf_buddy_free(data, block->zip_size());
+
+    buf_pool_mutex_exit_allow();
+    page_zip_set_size(&block->page.zip, 0);
+  }
+
+  if (buf_pool.is_shrinking()
+      && UT_LIST_GET_LEN(buf_pool.withdraw) < buf_pool.withdraw_target
+      && buf_pool.will_be_withdrawn(block->page)) {
+    /* This should be withdrawn */
+    UT_LIST_ADD_LAST(
+      buf_pool.withdraw,
+      &block->page);
+    ut_d(block->in_withdraw_list = true);
+  } else {
+    UT_LIST_ADD_FIRST(buf_pool.free, &block->page);
+    ut_d(block->page.in_free_list = true);
+    buf_pool.try_LRU_scan= true;
+    pthread_cond_broadcast(&buf_pool.done_free);
+  }
+
+  block->page.set_os_unused();
+}
+
+/** Release a memory block to the buffer pool.
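+
+A hedged usage sketch (hypothetical caller): a block obtained from
+buf_LRU_get_free_block() that turns out not to be needed can be
+returned this way.
+@code
+  buf_block_t *block= buf_LRU_get_free_block(false);
+  // ... the caller decides not to use the block after all ...
+  buf_pool.free_block(block);
+@endcode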
*/ +ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block) +{ + ut_ad(this == &buf_pool); + mysql_mutex_lock(&mutex); + buf_LRU_block_free_non_file_page(block); + mysql_mutex_unlock(&mutex); +} + + +/** Remove bpage from buf_pool.LRU and buf_pool.page_hash. + +If !bpage->frame && !bpage->oldest_modification(), the object will be freed. + +@param bpage buffer block +@param id page identifier +@param chain locked buf_pool.page_hash chain (will be released here) +@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed + +If a compressed page is freed other compressed pages may be relocated. +@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The +caller needs to free the page to the free list +@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In +this case the block is already returned to the buddy allocator. */ +static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, + buf_pool_t::hash_chain &chain, + bool zip) +{ + ut_a(bpage->can_relocate()); + ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked()); + + buf_LRU_remove_block(bpage); + + buf_pool.freed_page_clock += 1; + + if (UNIV_LIKELY(!bpage->zip.data)) { + MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t)); + MEM_CHECK_ADDRESSABLE(bpage->frame, srv_page_size); + buf_block_modify_clock_inc((buf_block_t*) bpage); + } else if (const page_t *page = bpage->frame) { + MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t)); + MEM_CHECK_ADDRESSABLE(bpage->frame, srv_page_size); + buf_block_modify_clock_inc((buf_block_t*) bpage); + + ut_a(!zip || !bpage->oldest_modification()); + ut_ad(bpage->zip_size()); + /* Skip consistency checks if the page was freed. + In recovery, we could get a sole FREE_PAGE record + and nothing else, for a ROW_FORMAT=COMPRESSED page. + Its contents would be garbage. */ + if (!bpage->is_freed()) + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + /* These are essentially uncompressed pages. */ + if (!zip) { + /* InnoDB writes the data to the + uncompressed page frame. Copy it + to the compressed page, which will + be preserved. 
+          */
+          memcpy(bpage->zip.data, page,
+                 bpage->zip_size());
+        }
+        break;
+      case FIL_PAGE_TYPE_ZBLOB:
+      case FIL_PAGE_TYPE_ZBLOB2:
+      case FIL_PAGE_INDEX:
+      case FIL_PAGE_RTREE:
+        break;
+      default:
+        ib::error() << "The compressed page to be"
+          " evicted seems corrupt:";
+        ut_print_buf(stderr, page, srv_page_size);
+
+        ib::error() << "Possibly older version of"
+          " the page:";
+
+        ut_print_buf(stderr, bpage->zip.data,
+                     bpage->zip_size());
+        putc('\n', stderr);
+        ut_error;
+      }
+  } else {
+    ut_a(!bpage->oldest_modification());
+    MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size());
+  }
+
+  ut_ad(!bpage->in_zip_hash);
+  buf_pool.page_hash.remove(chain, bpage);
+  page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+
+  if (UNIV_UNLIKELY(!bpage->frame)) {
+    ut_ad(!bpage->in_free_list);
+    ut_ad(!bpage->in_LRU_list);
+    ut_a(bpage->zip.data);
+    ut_a(bpage->zip.ssize);
+    ut_ad(!bpage->oldest_modification());
+
+    hash_lock.unlock();
+    buf_pool_mutex_exit_forbid();
+
+    buf_buddy_free(bpage->zip.data, bpage->zip_size());
+
+    buf_pool_mutex_exit_allow();
+    bpage->lock.free();
+    ut_free(bpage);
+    return false;
+  } else {
+    static_assert(FIL_NULL == 0xffffffffU, "fill pattern");
+    static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+    memset_aligned<4>(bpage->frame + FIL_PAGE_OFFSET, 0xff, 4);
+    static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+                  "not perfect alignment");
+    memset_aligned<2>(bpage->frame
+                      + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+    MEM_UNDEFINED(bpage->frame, srv_page_size);
+    bpage->set_state(buf_page_t::REMOVE_HASH);
+
+    if (!zip) {
+      return true;
+    }
+
+    hash_lock.unlock();
+
+    if (bpage->zip.data) {
+      /* Free the compressed page. */
+      void* data = bpage->zip.data;
+      bpage->zip.data = NULL;
+
+      ut_ad(!bpage->in_free_list);
+      ut_ad(!bpage->oldest_modification());
+      ut_ad(!bpage->in_LRU_list);
+      buf_pool_mutex_exit_forbid();
+
+      buf_buddy_free(data, bpage->zip_size());
+
+      buf_pool_mutex_exit_allow();
+
+      page_zip_set_size(&bpage->zip, 0);
+    }
+
+    return true;
+  }
+}
+
+/** Release and evict a corrupted page.
+@param bpage x-latched page that was found corrupted
+@param state expected current state of the page */
+ATTRIBUTE_COLD
+void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state)
+{
+  const page_id_t id{bpage->id()};
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+  page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+
+  recv_sys.free_corrupted_page(id);
+  mysql_mutex_lock(&mutex);
+  hash_lock.lock();
+
+  ut_ad(!bpage->oldest_modification());
+  bpage->set_corrupt_id();
+  auto unfix= state - buf_page_t::FREED;
+  auto s= bpage->zip.fix.fetch_sub(unfix) - unfix;
+  bpage->lock.x_unlock(true);
+
+  while (s != buf_page_t::FREED || bpage->lock.is_locked_or_waiting())
+  {
+    ut_ad(s >= buf_page_t::FREED);
+    ut_ad(s < buf_page_t::UNFIXED);
+    /* Wait for other threads to release the fix count
+    before releasing the bpage from LRU list. */
+    (void) LF_BACKOFF();
+    s= bpage->state();
+  }
+
+  /* remove from LRU and page_hash */
+  if (buf_LRU_block_remove_hashed(bpage, id, chain, true))
+    buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage));
+
+  mysql_mutex_unlock(&mutex);
+}
+
+/** Update buf_pool.LRU_old_ratio.
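+
+For example (hedged arithmetic): old_pct=37 yields
+ratio = 37 * BUF_LRU_OLD_RATIO_DIV / 100; the function stores the
+clamped ratio and returns the rounded inverse mapping, so the caller
+sees the percentage that is actually in effect.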
+@param[in] old_pct Reserve this percentage of + the buffer pool for "old" blocks +@param[in] adjust true=adjust the LRU list; + false=just assign buf_pool.LRU_old_ratio + during the initialization of InnoDB +@return updated old_pct */ +uint buf_LRU_old_ratio_update(uint old_pct, bool adjust) +{ + uint ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100; + if (ratio < BUF_LRU_OLD_RATIO_MIN) { + ratio = BUF_LRU_OLD_RATIO_MIN; + } else if (ratio > BUF_LRU_OLD_RATIO_MAX) { + ratio = BUF_LRU_OLD_RATIO_MAX; + } + + if (adjust) { + mysql_mutex_lock(&buf_pool.mutex); + + if (ratio != buf_pool.LRU_old_ratio) { + buf_pool.LRU_old_ratio = ratio; + + if (UT_LIST_GET_LEN(buf_pool.LRU) + >= BUF_LRU_OLD_MIN_LEN) { + buf_LRU_old_adjust_len(); + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + } else { + buf_pool.LRU_old_ratio = ratio; + } + /* the reverse of + ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */ + return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5)); +} + +/********************************************************************//** +Update the historical stats that we are collecting for LRU eviction +policy at the end of each interval. */ +void +buf_LRU_stat_update() +{ + buf_LRU_stat_t* item; + buf_LRU_stat_t cur_stat; + + if (!buf_pool.freed_page_clock) { + goto func_exit; + } + + /* Update the index. */ + item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind]; + buf_LRU_stat_arr_ind++; + buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL; + + /* Add the current value and subtract the obsolete entry. + Since buf_LRU_stat_cur is not protected by any mutex, + it can be changing between adding to buf_LRU_stat_sum + and copying to item. Assign it to local variables to make + sure the same value assign to the buf_LRU_stat_sum + and item */ + cur_stat = buf_LRU_stat_cur; + + buf_LRU_stat_sum.io += cur_stat.io - item->io; + buf_LRU_stat_sum.unzip += cur_stat.unzip - item->unzip; + + /* Put current entry in the array. */ + memcpy(item, &cur_stat, sizeof *item); + +func_exit: + /* Clear the current entry. */ + memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur); +} + +#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__ +/* Avoid GCC 4.8.5 internal compiler error "could not split insn". +We would only need this for buf_LRU_scan_and_free_block(), +but GCC 4.8.5 does not support pop_options. */ +# pragma GCC optimize ("O0") +#endif +/** Try to free a replaceable block. +@param limit maximum number of blocks to scan +@return true if found and freed */ +bool buf_LRU_scan_and_free_block(ulint limit) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + return buf_LRU_free_from_unzip_LRU_list(limit) || + buf_LRU_free_from_common_LRU_list(limit); +} + +#ifdef UNIV_DEBUG +/** Validate the LRU list. 
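
The interval bookkeeping in buf_LRU_stat_update() above is a ring buffer that
maintains a moving sum in O(1) per interval: add the incoming sample, subtract
the slot it overwrites. A minimal self-contained sketch of that arithmetic
(the window size and stat fields are illustrative, not the InnoDB ones):

    #include <array>
    #include <cstddef>

    struct lru_stat { unsigned io = 0, unzip = 0; };

    template <std::size_t N>
    struct lru_stat_window
    {
      std::array<lru_stat, N> arr{};   // one slot per past interval
      lru_stat sum{};                  // sum over the last N slots
      std::size_t ind = 0;

      void update(lru_stat cur)
      {
        lru_stat& item = arr[ind];
        ind = (ind + 1) % N;
        sum.io += cur.io - item.io;          // modular arithmetic keeps
        sum.unzip += cur.unzip - item.unzip; // the running sum exact
        item = cur;                          // overwrite the obsolete slot
      }
    };
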
*/ +void buf_LRU_validate() +{ + ulint old_len; + ulint new_len; + + mysql_mutex_lock(&buf_pool.mutex); + + if (UT_LIST_GET_LEN(buf_pool.LRU) >= BUF_LRU_OLD_MIN_LEN) { + + ut_a(buf_pool.LRU_old); + old_len = buf_pool.LRU_old_len; + + new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU) + * buf_pool.LRU_old_ratio + / BUF_LRU_OLD_RATIO_DIV, + UT_LIST_GET_LEN(buf_pool.LRU) + - (BUF_LRU_OLD_TOLERANCE + + BUF_LRU_NON_OLD_MIN_LEN)); + + ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE); + ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE); + } + + CheckInLRUList::validate(); + + old_len = 0; + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + ut_ad(bpage->in_file()); + ut_ad(!bpage->frame + || reinterpret_cast(bpage) + ->in_unzip_LRU_list + == bpage->belongs_to_unzip_LRU()); + + if (bpage->is_old()) { + const buf_page_t* prev + = UT_LIST_GET_PREV(LRU, bpage); + const buf_page_t* next + = UT_LIST_GET_NEXT(LRU, bpage); + + if (!old_len++) { + ut_a(buf_pool.LRU_old == bpage); + } else { + ut_a(!prev || prev->is_old()); + } + + ut_a(!next || next->is_old()); + } + } + + ut_a(buf_pool.LRU_old_len == old_len); + + CheckInFreeList::validate(); + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.free); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(list, bpage)) { + + ut_a(bpage->state() == buf_page_t::NOT_USED); + } + + CheckUnzipLRUAndLRUList::validate(); + + for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool.unzip_LRU); + block != NULL; + block = UT_LIST_GET_NEXT(unzip_LRU, block)) { + + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + ut_a(block->page.belongs_to_unzip_LRU()); + } + + mysql_mutex_unlock(&buf_pool.mutex); +} +#endif /* UNIV_DEBUG */ + +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG +/** Dump the LRU list to stderr. */ +void buf_LRU_print() +{ + mysql_mutex_lock(&buf_pool.mutex); + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + const page_id_t id(bpage->id()); + + fprintf(stderr, "BLOCK space %u page %u ", + id.space(), id.page_no()); + + if (bpage->is_old()) { + fputs("old ", stderr); + } + + const unsigned s = bpage->state(); + if (s > buf_page_t::UNFIXED) { + fprintf(stderr, "fix %u ", s - buf_page_t::UNFIXED); + } else { + ut_ad(s == buf_page_t::UNFIXED + || s == buf_page_t::REMOVE_HASH); + } + + if (bpage->oldest_modification()) { + fputs("modif. ", stderr); + } + + if (const byte* frame = bpage->zip.data) { + fprintf(stderr, "\ntype %u size " ULINTPF + " index id " IB_ID_FMT "\n", + fil_page_get_type(frame), + bpage->zip_size(), + btr_page_get_index_id(frame)); + } else { + fprintf(stderr, "\ntype %u index id " IB_ID_FMT "\n", + fil_page_get_type(bpage->frame), + btr_page_get_index_id(bpage->frame)); + } + } + + mysql_mutex_unlock(&buf_pool.mutex); +} +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc new file mode 100644 index 00000000..c4f07738 --- /dev/null +++ b/storage/innobase/buf/buf0rea.cc @@ -0,0 +1,710 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0rea.cc
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <mysql/service_thd_wait.h>
+
+#include "buf0rea.h"
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0buddy.h"
+#include "buf0dblwr.h"
+#include "ibuf0ibuf.h"
+#include "log0recv.h"
+#include "trx0sys.h"
+#include "os0file.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "log.h"
+#include "mariadb_stats.h"
+
+/** If there are buf_pool.curr_size per the number below pending reads, then
+read-ahead is not done: this is to prevent flooding the buffer pool with
+i/o-fixed buffer blocks */
+#define BUF_READ_AHEAD_PEND_LIMIT 2
+
+/** Remove the sentinel block for the watch before replacing it with a
+real block. watch_unset() or watch_occurred() will notice
+that the block has been replaced with the real block.
+@param w      sentinel
+@param chain  locked hash table chain
+@return w->state() */
+inline uint32_t buf_pool_t::watch_remove(buf_page_t *w,
+                                         buf_pool_t::hash_chain &chain)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(xtest() || page_hash.lock_get(chain).is_write_locked());
+  ut_ad(w >= &watch[0]);
+  ut_ad(w < &watch[array_elements(watch)]);
+  ut_ad(!w->in_zip_hash);
+  ut_ad(!w->zip.data);
+
+  uint32_t s{w->state()};
+  w->set_state(buf_page_t::NOT_USED);
+  ut_ad(s >= buf_page_t::UNFIXED);
+  ut_ad(s < buf_page_t::READ_FIX);
+
+  if (~buf_page_t::LRU_MASK & s)
+    page_hash.remove(chain, w);
+
+  ut_ad(!w->in_page_hash);
+  w->id_= page_id_t(~0ULL);
+  return s;
+}
+
+/** Initialize a page for read to the buffer buf_pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...
+@param[in] page_id  page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] unzip    whether the uncompressed page is
+                    requested (for ROW_FORMAT=COMPRESSED)
+@return pointer to the block
+@retval NULL in case of an error */
+TRANSACTIONAL_TARGET
+static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
+                                          ulint zip_size, bool unzip)
+{
+  mtr_t mtr;
+
+  if (mode == BUF_READ_IBUF_PAGES_ONLY)
+  {
+    /* It is a read-ahead within an ibuf routine */
+    ut_ad(!ibuf_bitmap_page(page_id, zip_size));
+    ibuf_mtr_start(&mtr);
+
+    if (!recv_no_ibuf_operations && !ibuf_page(page_id, zip_size, &mtr))
+    {
+      ibuf_mtr_commit(&mtr);
+      return nullptr;
+    }
+  }
+  else
+    ut_ad(mode == BUF_READ_ANY_PAGE);
+
+  buf_page_t *bpage= nullptr;
+  buf_block_t *block= nullptr;
+  if (!zip_size || unzip || recv_recovery_is_on())
+  {
+    block= buf_LRU_get_free_block(false);
+    block->initialise(page_id, zip_size, buf_page_t::READ_FIX);
+    /* x_unlock() will be invoked
+    in buf_page_t::read_complete() by the io-handler thread. */
+    block->page.lock.x_lock(true);
+  }
+
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain);
+  if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
+  {
+    /* The page is already in the buffer pool. */
+    if (block)
+    {
+      block->page.lock.x_unlock(true);
+      ut_d(block->page.set_state(buf_page_t::MEMORY));
+      buf_LRU_block_free_non_file_page(block);
+    }
+    goto func_exit;
+  }
+
+  if (UNIV_LIKELY(block != nullptr))
+  {
+    bpage= &block->page;
+
+    /* Insert into the hash table of file pages */
+    if (hash_page)
+    {
+      transactional_lock_guard<page_hash_latch> g
+        {buf_pool.page_hash.lock_get(chain)};
+      bpage->set_state(buf_pool.watch_remove(hash_page, chain) +
+                       (buf_page_t::READ_FIX - buf_page_t::UNFIXED));
+      buf_pool.page_hash.append(chain, &block->page);
+    }
+    else
+    {
+      transactional_lock_guard<page_hash_latch> g
+        {buf_pool.page_hash.lock_get(chain)};
+      buf_pool.page_hash.append(chain, &block->page);
+    }
+
+    /* The block must be put to the LRU list, to the old blocks */
+    buf_LRU_add_block(&block->page, true/* to old blocks */);
+
+    if (UNIV_UNLIKELY(zip_size))
+    {
+      /* buf_pool.mutex may be released and reacquired by
+      buf_buddy_alloc(). We must defer this operation until after the
+      block descriptor has been added to buf_pool.LRU and
+      buf_pool.page_hash. */
+      block->page.zip.data= static_cast<page_zip_t*>
+        (buf_buddy_alloc(zip_size));
+
+      /* To maintain the invariant
+      block->in_unzip_LRU_list == block->page.belongs_to_unzip_LRU()
+      we have to add this block to unzip_LRU
+      after block->page.zip.data is set. */
+      ut_ad(block->page.belongs_to_unzip_LRU());
+      buf_unzip_LRU_add_block(block, TRUE);
+    }
+  }
+  else
+  {
+    /* The compressed page must be allocated before the
+    control block (bpage), in order to avoid the
+    invocation of buf_buddy_relocate_block() on
+    uninitialized data. */
+    bool lru= false;
+    void *data= buf_buddy_alloc(zip_size, &lru);
+
+    /* If buf_buddy_alloc() allocated storage from the LRU list,
+    it released and reacquired buf_pool.mutex. Thus, we must
+    check the page_hash again, as it may have been modified. */
+    if (UNIV_UNLIKELY(lru))
+    {
+      hash_page= buf_pool.page_hash.get(page_id, chain);
+
+      if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
+      {
+        /* The block was added by some other thread.
*/
+        buf_buddy_free(data, zip_size);
+        goto func_exit;
+      }
+    }
+
+    bpage= static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *bpage));
+
+    page_zip_des_init(&bpage->zip);
+    page_zip_set_size(&bpage->zip, zip_size);
+    bpage->zip.data = (page_zip_t*) data;
+
+    bpage->init(buf_page_t::READ_FIX, page_id);
+    bpage->lock.x_lock(true);
+
+    {
+      transactional_lock_guard<page_hash_latch> g
+        {buf_pool.page_hash.lock_get(chain)};
+
+      if (hash_page)
+        bpage->set_state(buf_pool.watch_remove(hash_page, chain) +
+                         (buf_page_t::READ_FIX - buf_page_t::UNFIXED));
+
+      buf_pool.page_hash.append(chain, bpage);
+    }
+
+    /* The block must be put to the LRU list, to the old blocks.
+    The zip size is already set into the page zip */
+    buf_LRU_add_block(bpage, true/* to old blocks */);
+  }
+
+  buf_pool.stat.n_pages_read++;
+func_exit:
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  if (mode == BUF_READ_IBUF_PAGES_ONLY)
+    ibuf_mtr_commit(&mtr);
+
+  ut_ad(!bpage || bpage->in_file());
+
+  return bpage;
+}
+
+/** Low-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there, in which case does nothing.
+Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
+flag is cleared and the x-lock released by an i/o-handler thread.
+
+@param[in,out] space    tablespace
+@param[in]     sync     true if synchronous aio is desired
+@param[in]     mode     BUF_READ_IBUF_PAGES_ONLY, ...,
+@param[in]     page_id  page id
+@param[in]     zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]     unzip    true=request uncompressed page
+@return error code
+@retval DB_SUCCESS if the page was read
+@retval DB_SUCCESS_LOCKED_REC if the page exists in the buffer pool already */
+static
+dberr_t
+buf_read_page_low(
+        fil_space_t*            space,
+        bool                    sync,
+        ulint                   mode,
+        const page_id_t         page_id,
+        ulint                   zip_size,
+        bool                    unzip)
+{
+        buf_page_t*     bpage;
+
+        if (buf_dblwr.is_inside(page_id)) {
+                space->release();
+                return DB_PAGE_CORRUPTED;
+        }
+
+        if (sync) {
+        } else if (trx_sys_hdr_page(page_id)
+                   || ibuf_bitmap_page(page_id, zip_size)
+                   || (!recv_no_ibuf_operations
+                       && ibuf_page(page_id, zip_size, nullptr))) {
+
+                /* Trx sys header is so low in the latching order that we play
+                safe and do not leave the i/o-completion to an asynchronous
+                i/o-thread. Change buffer pages must always be read with
+                synchronous i/o, to make sure they do not get involved in
+                thread deadlocks. */
+                sync = true;
+        }
+
+        /* The following call will also check if the tablespace does not exist
+        or is being dropped; if we succeed in initing the page in the buffer
+        pool for read, then DISCARD cannot proceed until the read has
+        completed */
+        bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);
+
+        if (!bpage) {
+                space->release();
+                return DB_SUCCESS_LOCKED_REC;
+        }
+
+        ut_ad(bpage->in_file());
+        ulonglong mariadb_timer= 0;
+
+        if (sync) {
+                thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+                if (mariadb_stats_active())
+                        mariadb_timer= mariadb_measure();
+        }
+
+        DBUG_LOG("ib_buf",
+                 "read page " << page_id << " zip_size=" << zip_size
+                 << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
+
+        void*   dst = zip_size ? bpage->zip.data : bpage->frame;
+        const ulint len = zip_size ? zip_size : srv_page_size;
+
+        auto fio = space->io(IORequest(sync
+                                       ? 
IORequest::READ_SYNC + : IORequest::READ_ASYNC), + os_offset_t{page_id.page_no()} * len, len, + dst, bpage); + + if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) { + buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX); + } else if (sync) { + thd_wait_end(NULL); + /* The i/o was already completed in space->io() */ + fio.err = bpage->read_complete(*fio.node); + space->release(); + if (fio.err == DB_FAIL) { + fio.err = DB_PAGE_CORRUPTED; + } + if (mariadb_timer) + mariadb_increment_pages_read_time(mariadb_timer); + } + + return fio.err; +} + +/** Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! NOTE 2: the calling thread must want +access to the page given: this rule is set to prevent unintended read-aheads +performed by ibuf routines, a situation which could result in a deadlock if +the OS does not support asynchronous i/o. +@param[in] page_id page id of a page which the current thread +wants to access +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether we are inside ibuf routine +@return number of page read requests issued; NOTE that if we read ibuf +pages, it may happen that the page at the given page number does not +get read even if we return a positive value! */ +TRANSACTIONAL_TARGET +ulint +buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf) +{ + if (!srv_random_read_ahead || page_id.space() >= SRV_TMP_SPACE_ID) + /* Disable the read-ahead for temporary tablespace */ + return 0; + + if (srv_startup_is_before_trx_rollback_phase) + /* No read-ahead to avoid thread deadlocks */ + return 0; + + if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) + /* If it is an ibuf bitmap page or trx sys hdr, we do no + read-ahead, as that could break the ibuf page access order */ + return 0; + + if (os_aio_pending_reads_approx() > + buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) + return 0; + + fil_space_t* space= fil_space_t::get(page_id.space()); + if (!space) + return 0; + + const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area; + ulint count= 5 + buf_read_ahead_area / 8; + const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area); + page_id_t high= low + buf_read_ahead_area; + high.set_page_no(std::min(high.page_no(), space->last_page_number())); + + /* Count how many blocks in the area have been recently accessed, + that is, reside near the start of the LRU list. */ + + for (page_id_t i= low; i < high; ++i) + { + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold()); + transactional_shared_lock_guard g + {buf_pool.page_hash.lock_get(chain)}; + if (const buf_page_t *bpage= buf_pool.page_hash.get(i, chain)) + if (bpage->is_accessed() && buf_page_peek_if_young(bpage) && !--count) + goto read_ahead; + } + +no_read_ahead: + space->release(); + return 0; + +read_ahead: + if (space->is_stopping()) + goto no_read_ahead; + + /* Read all the suitable blocks within the area */ + const ulint ibuf_mode= ibuf ? 
BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; + + for (page_id_t i= low; i < high; ++i) + { + if (ibuf_bitmap_page(i, zip_size)) + continue; + if (space->is_stopping()) + break; + space->reacquire(); + if (buf_read_page_low(space, false, ibuf_mode, i, zip_size, false) == + DB_SUCCESS) + count++; + } + + if (count) + { + DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", + count, space->chain.start->name, + low.page_no())); + mysql_mutex_lock(&buf_pool.mutex); + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + buf_pool.stat.n_ra_pages_read_rnd+= count; + mysql_mutex_unlock(&buf_pool.mutex); + } + + space->release(); + return count; +} + +/** High-level function which reads a page from a file to buf_pool +if it is not already there. Sets the io_fix and an exclusive lock +on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@retval DB_SUCCESS if the page was read and is not corrupted +@retval DB_SUCCESS_LOCKED_REC if the page was not read +@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted +@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but +after decryption normal page checksum does not match. +@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ +dberr_t buf_read_page(const page_id_t page_id, ulint zip_size) +{ + fil_space_t *space= fil_space_t::get(page_id.space()); + if (!space) + { + ib::info() << "trying to read page " << page_id + << " in nonexisting or being-dropped tablespace"; + return DB_TABLESPACE_DELETED; + } + + buf_LRU_stat_inc_io(); /* NOT protected by buf_pool.mutex */ + return buf_read_page_low(space, true, BUF_READ_ANY_PAGE, + page_id, zip_size, false); +} + +/** High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@param[in,out] space tablespace +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 */ +void buf_read_page_background(fil_space_t *space, const page_id_t page_id, + ulint zip_size) +{ + buf_read_page_low(space, false, BUF_READ_ANY_PAGE, + page_id, zip_size, false); + + /* We do not increment number of I/O operations used for LRU policy + here (buf_LRU_stat_inc_io()). We use this in heuristics to decide + about evicting uncompressed version of compressed pages from the + buffer pool. Since this function is called from buffer pool load + these IOs are deliberate and are not part of normal workload we can + ignore these in our heuristics. */ +} + +/** Applies linear read-ahead if in the buf_pool the page is a border page of +a linear read-ahead area and all the pages in the area have been accessed. +Does not read any page if the read-ahead mechanism is not activated. Note +that the algorithm looks at the 'natural' adjacent successor and +predecessor of the page, which on the leaf level of a B-tree are the next +and previous page in the chain of leaves. To know these, the page specified +in (space, offset) must already be present in the buf_pool. Thus, the +natural way to use this function is to call it when a page in the buf_pool +is accessed the first time, calling this function just after it has been +bufferfixed. 
+NOTE 1: as this function looks at the natural predecessor and successor +fields on the page, what happens, if these are not initialized to any +sensible value? No problem, before applying read-ahead we check that the +area to read is within the span of the space, if not, read-ahead is not +applied. An uninitialized value may result in a useless read operation, but +only very improbably. +NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this +function must be written such that it cannot end up waiting for these +latches! +NOTE 3: the calling thread must want access to the page given: this rule is +set to prevent unintended read-aheads performed by ibuf routines, a situation +which could result in a deadlock if the OS does not support asynchronous io. +@param[in] page_id page id; see NOTE 3 above +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether if we are inside ibuf routine +@return number of page read requests issued */ +TRANSACTIONAL_TARGET +ulint +buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf) +{ + /* check if readahead is disabled. + Disable the read ahead logic for temporary tablespace */ + if (!srv_read_ahead_threshold || page_id.space() >= SRV_TMP_SPACE_ID) + return 0; + + if (srv_startup_is_before_trx_rollback_phase) + /* No read-ahead to avoid thread deadlocks */ + return 0; + + if (os_aio_pending_reads_approx() > + buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) + return 0; + + const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area; + const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area); + const page_id_t high_1= low + (buf_read_ahead_area - 1); + + /* We will check that almost all pages in the area have been accessed + in the desired order. */ + const bool descending= page_id != low; + + if (!descending && page_id != high_1) + /* This is not a border page of the area */ + return 0; + + if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) + /* If it is an ibuf bitmap page or trx sys hdr, we do no + read-ahead, as that could break the ibuf page access order */ + return 0; + + fil_space_t *space= fil_space_t::get(page_id.space()); + if (!space) + return 0; + + if (high_1.page_no() > space->last_page_number()) + { + /* The area is not whole. */ +fail: + space->release(); + return 0; + } + + /* How many out of order accessed pages can we ignore + when working out the access pattern for linear readahead */ + ulint count= std::min(buf_pool_t::READ_AHEAD_PAGES - + srv_read_ahead_threshold, + uint32_t{buf_pool.read_ahead_area}); + page_id_t new_low= low, new_high_1= high_1; + unsigned prev_accessed= 0; + for (page_id_t i= low; i <= high_1; ++i) + { + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold()); + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + /* It does not make sense to use transactional_lock_guard here, + because we would have many complex conditions inside the memory + transaction. */ + hash_lock.lock_shared(); + + const buf_page_t* bpage= buf_pool.page_hash.get(i, chain); + if (!bpage) + { + hash_lock.unlock_shared(); + if (i == page_id) + goto fail; +failed: + if (--count) + continue; + goto fail; + } + const unsigned accessed= bpage->is_accessed(); + if (i == page_id) + { + /* Read the natural predecessor and successor page addresses from + the page; NOTE that because the calling thread may have an x-latch + on the page, we do not acquire an s-latch on the page, this is to + prevent deadlocks. 
The hash_lock is only protecting the
+      buf_pool.page_hash for page i, not the bpage contents itself. */
+      const byte *f= bpage->frame ? bpage->frame : bpage->zip.data;
+      uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV));
+      uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT));
+      hash_lock.unlock_shared();
+      if (prev == FIL_NULL || next == FIL_NULL)
+        goto fail;
+      page_id_t id= page_id;
+      if (descending)
+      {
+        if (id == high_1)
+          ++id;
+        else if (next - 1 != page_id.page_no())
+          goto fail;
+        else
+          id.set_page_no(prev);
+      }
+      else
+      {
+        if (prev + 1 != page_id.page_no())
+          goto fail;
+        id.set_page_no(next);
+      }
+
+      new_low= id - (id.page_no() % buf_read_ahead_area);
+      new_high_1= new_low + (buf_read_ahead_area - 1);
+
+      if (id != new_low && id != new_high_1)
+        /* This is not a border page of the area: return */
+        goto fail;
+      if (new_high_1.page_no() > space->last_page_number())
+        /* The area is not whole */
+        goto fail;
+    }
+    else
+      hash_lock.unlock_shared();
+
+    if (!accessed)
+      goto failed;
+    /* Note that buf_page_t::is_accessed() returns the time of the
+    first access. If some blocks of the extent existed in the buffer
+    pool at the time of a linear access pattern, the first access
+    times may be nonmonotonic, even though the latest access times
+    were linear. The threshold (srv_read_ahead_factor) should help a
+    little against this. */
+    bool fail= prev_accessed &&
+      (descending ? prev_accessed > accessed : prev_accessed < accessed);
+    prev_accessed= accessed;
+    if (fail)
+      goto failed;
+  }
+
+  /* If we got this far, read-ahead can be sensible: do it */
+  count= 0;
+  for (ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+       new_low <= new_high_1; ++new_low)
+  {
+    if (ibuf_bitmap_page(new_low, zip_size))
+      continue;
+    if (space->is_stopping())
+      break;
+    space->reacquire();
+    if (buf_read_page_low(space, false, ibuf_mode, new_low, zip_size, false) ==
+        DB_SUCCESS)
+      count++;
+  }
+
+  if (count)
+  {
+    DBUG_PRINT("ib_buf", ("linear read-ahead %zu pages from %s: %u",
+                          count, space->chain.start->name,
+                          new_low.page_no()));
+    mysql_mutex_lock(&buf_pool.mutex);
+    /* Read ahead is considered one I/O operation for the purpose of
+    LRU policy decision. */
+    buf_LRU_stat_inc_io();
+    buf_pool.stat.n_ra_pages_read+= count;
+    mysql_mutex_unlock(&buf_pool.mutex);
+  }
+
+  space->release();
+  return count;
+}
+
+/** Schedule a page for recovery.
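
Both read-ahead functions above share the same window arithmetic: round the
page number down to a multiple of buf_pool.read_ahead_area and, for the
linear variant, trigger only on a border page of that window. A standalone
sketch (the area size of 64 is only an example value):

    #include <cstdint>

    struct window { uint32_t low, high_1; };   // inclusive bounds

    window read_ahead_window(uint32_t page_no, uint32_t area /* e.g. 64 */)
    {
      uint32_t low = page_no - (page_no % area);  // round down to the area
      return {low, low + (area - 1)};
    }

    bool is_border_page(uint32_t page_no, uint32_t area)
    {
      // Linear read-ahead considers only the first and last page of the area.
      window w = read_ahead_window(page_no, area);
      return page_no == w.low || page_no == w.high_1;
    }
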
+@param space tablespace +@param page_id page identifier +@param recs log records +@param init page initialization, or nullptr if the page needs to be read */ +void buf_read_recover(fil_space_t *space, const page_id_t page_id, + page_recv_t &recs, recv_init *init) +{ + ut_ad(space->id == page_id.space()); + space->reacquire(); + const ulint zip_size= space->zip_size(); + + if (init) + { + if (buf_page_t *bpage= buf_page_init_for_read(BUF_READ_ANY_PAGE, page_id, + zip_size, true)) + { + ut_ad(bpage->in_file()); + os_fake_read(IORequest{bpage, (buf_tmp_buffer_t*) &recs, + UT_LIST_GET_FIRST(space->chain), + IORequest::READ_ASYNC}, ptrdiff_t(init)); + } + } + else if (dberr_t err= buf_read_page_low(space, false, BUF_READ_ANY_PAGE, + page_id, zip_size, true)) + { + if (err != DB_SUCCESS_LOCKED_REC) + sql_print_error("InnoDB: Recovery failed to read page " + UINT32PF " from %s", + page_id.page_no(), space->chain.start->name); + } +} diff --git a/storage/innobase/data/data0data.cc b/storage/innobase/data/data0data.cc new file mode 100644 index 00000000..9a7eff21 --- /dev/null +++ b/storage/innobase/data/data0data.cc @@ -0,0 +1,820 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file data/data0data.cc +SQL data field and tuple + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "data0data.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "page0page.h" +#include "page0zip.h" +#include "dict0dict.h" +#include "btr0cur.h" +#include "row0upd.h" + +#ifdef UNIV_DEBUG +/** Dummy variable to catch access to uninitialized fields. In the +debug version, dtuple_create() will make all fields of dtuple_t point +to data_error. */ +ut_d(byte data_error); +#endif /* UNIV_DEBUG */ + +/** Trim the tail of an index tuple before insert or update. +After instant ADD COLUMN, if the last fields of a clustered index tuple +match the default values that were explicitly specified or implied during +ADD COLUMN, there will be no need to store them. +NOTE: A page latch in the index must be held, so that the index +may not lose 'instantness' before the trimmed tuple has been +inserted or updated. 
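
The backwards scan described here can be sketched independently of
dict_index_t: walk from the end of the tuple and drop every trailing field
whose length and bytes equal its column default, stopping at the first
mismatch or at the core-field boundary. The types below are illustrative
stand-ins; dropped-column and SQL NULL handling is omitted.

    #include <cstring>
    #include <vector>

    struct field { const void* data; std::size_t len; };

    std::size_t trim_trailing_defaults(const std::vector<field>& tuple,
                                       const std::vector<field>& defaults,
                                       std::size_t n_core)
    {
      std::size_t i = tuple.size();
      for (; i > n_core; i--)
      {
        const field& f = tuple[i - 1];
        const field& d = defaults[i - 1];
        if (f.len != d.len)
          break;                  // length differs: field must be stored
        if (f.len && f.data != d.data && std::memcmp(f.data, d.data, f.len))
          break;                  // contents differ: field must be stored
      }
      return i;                   // new field count
    }
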
+@param[in] index index possibly with instantly added columns */ +void dtuple_t::trim(const dict_index_t& index) +{ + ut_ad(n_fields >= index.n_core_fields); + ut_ad(n_fields <= index.n_fields); + ut_ad(index.is_instant()); + + ulint i = n_fields; + for (; i > index.n_core_fields; i--) { + const dfield_t* dfield = dtuple_get_nth_field(this, i - 1); + const dict_col_t* col = dict_index_get_nth_col(&index, i - 1); + + if (col->is_dropped()) { + continue; + } + + ut_ad(col->is_added()); + ulint len = dfield_get_len(dfield); + if (len != col->def_val.len) { + break; + } + + if (len != 0 && len != UNIV_SQL_NULL + && dfield->data != col->def_val.data + && memcmp(dfield->data, col->def_val.data, len)) { + break; + } + } + + n_fields = i; +} + +/*********************************************************************//** +Sets number of fields used in a tuple. Normally this is set in +dtuple_create, but if you want later to set it smaller, you can use this. */ +void +dtuple_set_n_fields( +/*================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields) /*!< in: number of fields */ +{ + tuple->n_fields = n_fields; + tuple->n_fields_cmp = n_fields; +} + +/**********************************************************//** +Checks that a data field is typed. +@return TRUE if ok */ +static +ibool +dfield_check_typed_no_assert( +/*=========================*/ + const dfield_t* field) /*!< in: data field */ +{ + if (dfield_get_type(field)->mtype > DATA_MTYPE_CURRENT_MAX + || dfield_get_type(field)->mtype < DATA_MTYPE_CURRENT_MIN) { + + ib::error() << "Data field type " + << dfield_get_type(field)->mtype + << ", len " << dfield_get_len(field); + + return(FALSE); + } + + return(TRUE); +} + +/**********************************************************//** +Checks that a data tuple is typed. +@return TRUE if ok */ +static +ibool +dtuple_check_typed_no_assert( +/*=========================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + const dfield_t* field; + ulint i; + + if (dtuple_get_n_fields(tuple) > REC_MAX_N_FIELDS) { + ib::error() << "Index entry has " + << dtuple_get_n_fields(tuple) << " fields"; +dump: + fputs("InnoDB: Tuple contents: ", stderr); + dtuple_print(stderr, tuple); + putc('\n', stderr); + + return(FALSE); + } + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + field = dtuple_get_nth_field(tuple, i); + + if (!dfield_check_typed_no_assert(field)) { + goto dump; + } + } + + return(TRUE); +} + +#ifdef UNIV_DEBUG +/**********************************************************//** +Checks that a data field is typed. Asserts an error if not. +@return TRUE if ok */ +ibool +dfield_check_typed( +/*===============*/ + const dfield_t* field) /*!< in: data field */ +{ + if (dfield_get_type(field)->mtype > DATA_MTYPE_CURRENT_MAX + || dfield_get_type(field)->mtype < DATA_MTYPE_CURRENT_MIN) { + + ib::fatal() << "Data field type " + << dfield_get_type(field)->mtype + << ", len " << dfield_get_len(field); + } + + return(TRUE); +} + +/**********************************************************//** +Checks that a data tuple is typed. Asserts an error if not. 
+@return TRUE if ok */ +ibool +dtuple_check_typed( +/*===============*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + const dfield_t* field; + ulint i; + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + field = dtuple_get_nth_field(tuple, i); + + ut_a(dfield_check_typed(field)); + } + + return(TRUE); +} + +/**********************************************************//** +Validates the consistency of a tuple which must be complete, i.e, +all fields must have been set. +@return TRUE if ok */ +ibool +dtuple_validate( +/*============*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); +#ifdef HAVE_valgrind + const ulint n_fields = dtuple_get_n_fields(tuple); + + for (ulint i = 0; i < n_fields; i++) { + const dfield_t* field = dtuple_get_nth_field(tuple, i); + + if (!dfield_is_null(field)) { + MEM_CHECK_DEFINED(dfield_get_data(field), + dfield_get_len(field)); + } + } +#endif /* HAVE_valgrind */ + ut_ad(dtuple_check_typed(tuple)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Pretty prints a dfield value according to its data type. */ +void +dfield_print( +/*=========*/ + const dfield_t* dfield) /*!< in: dfield */ +{ + const byte* data; + ulint len; + ulint i; + + len = dfield_get_len(dfield); + data = static_cast(dfield_get_data(dfield)); + + if (dfield_is_null(dfield)) { + fputs("NULL", stderr); + + return; + } + + switch (dtype_get_mtype(dfield_get_type(dfield))) { + case DATA_CHAR: + case DATA_VARCHAR: + for (i = 0; i < len; i++) { + int c = *data++; + putc(isprint(c) ? c : ' ', stderr); + } + + if (dfield_is_ext(dfield)) { + fputs("(external)", stderr); + } + break; + case DATA_INT: + ut_a(len == 4); /* only works for 32-bit integers */ + fprintf(stderr, "%d", (int) mach_read_from_4(data)); + break; + default: + ut_error; + } +} + +/*************************************************************//** +Pretty prints a dfield value according to its data type. Also the hex string +is printed if a string contains non-printable characters. 
*/
+void
+dfield_print_also_hex(
+/*==================*/
+        const dfield_t* dfield) /*!< in: dfield */
+{
+        const byte*     data;
+        ulint           len;
+        ulint           prtype;
+        ulint           i;
+        ibool           print_also_hex;
+
+        len = dfield_get_len(dfield);
+        data = static_cast<const byte*>(dfield_get_data(dfield));
+
+        if (dfield_is_null(dfield)) {
+                fputs("NULL", stderr);
+
+                return;
+        }
+
+        prtype = dtype_get_prtype(dfield_get_type(dfield));
+
+        switch (dtype_get_mtype(dfield_get_type(dfield))) {
+        ib_id_t id;
+        case DATA_INT:
+                switch (len) {
+                ulint   val;
+                case 1:
+                        val = mach_read_from_1(data);
+
+                        if (!(prtype & DATA_UNSIGNED)) {
+                                val &= ~0x80U;
+                                fprintf(stderr, "%ld", (long) val);
+                        } else {
+                                fprintf(stderr, "%lu", (ulong) val);
+                        }
+                        break;
+
+                case 2:
+                        val = mach_read_from_2(data);
+
+                        if (!(prtype & DATA_UNSIGNED)) {
+                                val &= ~0x8000U;
+                                fprintf(stderr, "%ld", (long) val);
+                        } else {
+                                fprintf(stderr, "%lu", (ulong) val);
+                        }
+                        break;
+
+                case 3:
+                        val = mach_read_from_3(data);
+
+                        if (!(prtype & DATA_UNSIGNED)) {
+                                val &= ~0x800000U;
+                                fprintf(stderr, "%ld", (long) val);
+                        } else {
+                                fprintf(stderr, "%lu", (ulong) val);
+                        }
+                        break;
+
+                case 4:
+                        val = mach_read_from_4(data);
+
+                        if (!(prtype & DATA_UNSIGNED)) {
+                                val &= ~0x80000000;
+                                fprintf(stderr, "%ld", (long) val);
+                        } else {
+                                fprintf(stderr, "%lu", (ulong) val);
+                        }
+                        break;
+
+                case 6:
+                        id = mach_read_from_6(data);
+                        fprintf(stderr, IB_ID_FMT, id);
+                        break;
+
+                case 7:
+                        id = mach_read_from_7(data);
+                        fprintf(stderr, IB_ID_FMT, id);
+                        break;
+                case 8:
+                        id = mach_read_from_8(data);
+                        fprintf(stderr, IB_ID_FMT, id);
+                        break;
+                default:
+                        goto print_hex;
+                }
+                break;
+
+        case DATA_SYS:
+                switch (prtype & DATA_SYS_PRTYPE_MASK) {
+                case DATA_TRX_ID:
+                        id = mach_read_from_6(data);
+
+                        fprintf(stderr, "trx_id " TRX_ID_FMT, id);
+                        break;
+
+                case DATA_ROLL_PTR:
+                        id = mach_read_from_7(data);
+
+                        fprintf(stderr, "roll_ptr " TRX_ID_FMT, id);
+                        break;
+
+                case DATA_ROW_ID:
+                        id = mach_read_from_6(data);
+
+                        fprintf(stderr, "row_id " TRX_ID_FMT, id);
+                        break;
+
+                default:
+                        goto print_hex;
+                }
+                break;
+
+        case DATA_CHAR:
+        case DATA_VARCHAR:
+                print_also_hex = FALSE;
+
+                for (i = 0; i < len; i++) {
+                        int c = *data++;
+
+                        if (!isprint(c)) {
+                                print_also_hex = TRUE;
+
+                                fprintf(stderr, "\\x%02x", (unsigned char) c);
+                        } else {
+                                putc(c, stderr);
+                        }
+                }
+
+                if (dfield_is_ext(dfield)) {
+                        fputs("(external)", stderr);
+                }
+
+                if (!print_also_hex) {
+                        break;
+                }
+
+                data = static_cast<const byte*>(dfield_get_data(dfield));
+                /* fall through */
+
+        case DATA_BINARY:
+        default:
+print_hex:
+                fputs(" Hex: ",stderr);
+
+                for (i = 0; i < len; i++) {
+                        fprintf(stderr, "%02x", *data++);
+                }
+
+                if (dfield_is_ext(dfield)) {
+                        fputs("(external)", stderr);
+                }
+        }
+}
+
+/*************************************************************//**
+Print a dfield value using ut_print_buf. */
+static
+void
+dfield_print_raw(
+/*=============*/
+        FILE*           f,      /*!< in: output stream */
+        const dfield_t* dfield) /*!< in: dfield */
+{
+        ulint   len = dfield_get_len(dfield);
+        if (!dfield_is_null(dfield)) {
+                ulint   print_len = ut_min(len, static_cast<ulint>(1000));
+                ut_print_buf(f, dfield_get_data(dfield), print_len);
+                if (len != print_len) {
+                        std::ostringstream str_bytes;
+                        str_bytes << ib::bytes_iec{len};
+                        fprintf(f, "(total %s%s)",
+                                str_bytes.str().c_str(),
+                                dfield_is_ext(dfield) ? ", external" : "");
+                }
+        } else {
+                fputs(" SQL NULL", f);
+        }
+}
+
+/**********************************************************//**
+The following function prints the contents of a tuple.
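
The masking above (val &= ~0x80U and friends) strips the flipped sign bit to
recover non-negative values: InnoDB stores signed integers big-endian with
the sign bit inverted, so that unsigned byte-wise comparison of the stored
form matches numeric order. A round-trip sketch for the 4-byte case:

    #include <cstdint>

    void store_int32(unsigned char* b, int32_t v)
    {
      uint32_t u = static_cast<uint32_t>(v) ^ 0x80000000U;  // flip sign bit
      b[0] = static_cast<unsigned char>(u >> 24);           // big-endian
      b[1] = static_cast<unsigned char>(u >> 16);
      b[2] = static_cast<unsigned char>(u >> 8);
      b[3] = static_cast<unsigned char>(u);
    }

    int32_t load_int32(const unsigned char* b)
    {
      uint32_t u = (uint32_t(b[0]) << 24) | (uint32_t(b[1]) << 16)
                 | (uint32_t(b[2]) << 8) | uint32_t(b[3]);
      return static_cast<int32_t>(u ^ 0x80000000U);         // flip back
    }
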
*/ +void +dtuple_print( +/*=========*/ + FILE* f, /*!< in: output stream */ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ulint n_fields; + ulint i; + + n_fields = dtuple_get_n_fields(tuple); + + fprintf(f, "DATA TUPLE: %lu fields;\n", (ulong) n_fields); + + for (i = 0; i < n_fields; i++) { + fprintf(f, " %lu:", (ulong) i); + + dfield_print_raw(f, dtuple_get_nth_field(tuple, i)); + + putc(';', f); + putc('\n', f); + } + + ut_ad(dtuple_validate(tuple)); +} + +/** Print the contents of a tuple. +@param[out] o output stream +@param[in] field array of data fields +@param[in] n number of data fields */ +void +dfield_print( + std::ostream& o, + const dfield_t* field, + ulint n) +{ + for (ulint i = 0; i < n; i++, field++) { + const void* data = dfield_get_data(field); + const ulint len = dfield_get_len(field); + + if (i) { + o << ','; + } + + if (dfield_is_null(field)) { + o << "NULL"; + } else if (dfield_is_ext(field)) { + ulint local_len = len - BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + + o << '[' + << local_len + << '+' << BTR_EXTERN_FIELD_REF_SIZE << ']'; + ut_print_buf(o, data, local_len); + ut_print_buf_hex(o, static_cast(data) + + local_len, + BTR_EXTERN_FIELD_REF_SIZE); + } else { + o << '[' << len << ']'; + ut_print_buf(o, data, len); + } + } +} + +/** Print the contents of a tuple. +@param[out] o output stream +@param[in] tuple data tuple */ +void +dtuple_print( + std::ostream& o, + const dtuple_t* tuple) +{ + const ulint n = dtuple_get_n_fields(tuple); + + o << "TUPLE (info_bits=" << dtuple_get_info_bits(tuple) + << ", " << n << " fields): {"; + + dfield_print(o, tuple->fields, n); + + o << "}"; +} + +/**************************************************************//** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. 
+@return own: created big record vector, NULL if we are not able to +shorten the entry enough, i.e., if there are too many fixed-length or +short fields in entry or the index is clustered */ +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + dict_index_t* index, /*!< in: index */ + upd_t* upd, /*!< in/out: update vector */ + dtuple_t* entry, /*!< in/out: index entry */ + ulint* n_ext) /*!< in/out: number of + externally stored columns */ +{ + mem_heap_t* heap; + big_rec_t* vector; + dfield_t* dfield; + ulint size; + ulint local_prefix_len; + + if (!dict_index_is_clust(index)) { + return(NULL); + } + + if (!index->table->space) { + return NULL; + } + + ulint local_len = index->table->get_overflow_field_local_len(); + const auto zip_size = index->table->space->zip_size(); + + ut_ad(index->n_uniq > 0); + + ut_a(dtuple_check_typed_no_assert(entry)); + + size = rec_get_converted_size(index, entry, *n_ext); + + if (UNIV_UNLIKELY(size > 1000000000)) { + ib::warn() << "Tuple size is very big: " << ib::bytes_iec{size}; + fputs("InnoDB: Tuple contents: ", stderr); + dtuple_print(stderr, entry); + putc('\n', stderr); + } + + heap = mem_heap_create(size + dtuple_get_n_fields(entry) + * sizeof(big_rec_field_t) + 1000); + + vector = big_rec_t::alloc(heap, dtuple_get_n_fields(entry)); + + /* Decide which fields to shorten: the algorithm is to look for + a variable-length field that yields the biggest savings when + stored externally */ + + ut_d(ulint n_fields = 0); + uint16_t longest_i; + ulint longest; + + const bool mblob = entry->is_alter_metadata(); + ut_ad(entry->n_fields - mblob >= index->first_user_field()); + ut_ad(entry->n_fields - mblob <= index->n_fields); + + if (mblob) { + longest_i = index->first_user_field(); + dfield = dtuple_get_nth_field(entry, longest_i); + local_len = BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(!dfield_is_ext(dfield)); + goto ext_write; + } + + if (!dict_table_has_atomic_blobs(index->table)) { + /* up to MySQL 5.1: store a 768-byte prefix locally */ + local_len = BTR_EXTERN_FIELD_REF_SIZE + + DICT_ANTELOPE_MAX_INDEX_COL_LEN; + } else { + /* new-format table: do not store any BLOB prefix locally */ + local_len = BTR_EXTERN_FIELD_REF_SIZE; + } + + while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, + *n_ext), + index->table->not_redundant(), + dict_index_get_n_fields(index), + zip_size)) { + longest_i = 0; + longest = 0; + for (uint16_t i = index->first_user_field(); + i < entry->n_fields - mblob; i++) { + ulint savings; + dfield = dtuple_get_nth_field(entry, i + mblob); + + const dict_field_t* ifield = dict_index_get_nth_field( + index, i); + + /* Skip fixed-length, NULL, externally stored, + or short columns */ + + if (ifield->fixed_len + || dfield_is_null(dfield) + || dfield_is_ext(dfield) + || dfield_get_len(dfield) <= local_len + || dfield_get_len(dfield) + <= BTR_EXTERN_LOCAL_STORED_MAX_SIZE) { + goto skip_field; + } + + savings = dfield_get_len(dfield) - local_len; + + /* Check that there would be savings */ + if (longest >= savings) { + goto skip_field; + } + + /* In DYNAMIC and COMPRESSED format, store + locally any non-BLOB columns whose maximum + length does not exceed 256 bytes. This is + because there is no room for the "external + storage" flag when the maximum length is 255 + bytes or less. This restriction trivially + holds in REDUNDANT and COMPACT format, because + there we always store locally columns whose + length is up to local_len == 788 bytes. 
+                        @see rec_init_offsets_comp_ordinary */
+                        if (!DATA_BIG_COL(ifield->col)) {
+                                goto skip_field;
+                        }
+
+                        longest_i = uint16_t(i + mblob);
+                        longest = savings;
+
+skip_field:
+                        continue;
+                }
+
+                if (!longest_i) {
+                        /* Cannot shorten more */
+
+                        mem_heap_free(heap);
+
+                        return(NULL);
+                }
+
+                /* Move data from field longest_i to big rec vector.
+
+                We store the first bytes locally to the record. Then
+                we can calculate all ordering fields in all indexes
+                from locally stored data. */
+                dfield = dtuple_get_nth_field(entry, longest_i);
+ext_write:
+                local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+                vector->append(
+                        big_rec_field_t(
+                                longest_i,
+                                dfield_get_len(dfield) - local_prefix_len,
+                                static_cast<char*>(dfield_get_data(dfield))
+                                + local_prefix_len));
+
+                /* Allocate the locally stored part of the column. */
+                byte*   data = static_cast<byte*>(
+                        mem_heap_alloc(heap, local_len));
+
+                /* Copy the local prefix. */
+                memcpy(data, dfield_get_data(dfield), local_prefix_len);
+                /* Clear the extern field reference (BLOB pointer). */
+                memset(data + local_prefix_len, 0, BTR_EXTERN_FIELD_REF_SIZE);
+
+                dfield_set_data(dfield, data, local_len);
+                dfield_set_ext(dfield);
+
+                (*n_ext)++;
+                ut_ad(++n_fields < dtuple_get_n_fields(entry));
+
+                if (upd && !upd->is_modified(longest_i)) {
+
+                        DEBUG_SYNC_C("ib_mv_nonupdated_column_offpage");
+
+                        upd_field_t     upd_field;
+                        upd_field.field_no = longest_i;
+                        upd_field.orig_len = 0;
+                        upd_field.exp = NULL;
+                        upd_field.old_v_val = NULL;
+                        dfield_copy(&upd_field.new_val,
+                                    dfield->clone(upd->heap));
+                        upd->append(upd_field);
+                        ut_ad(upd->is_modified(longest_i));
+
+                        ut_ad(upd_field.new_val.len
+                              >= BTR_EXTERN_FIELD_REF_SIZE);
+                        ut_ad(upd_field.new_val.len == local_len);
+                        ut_ad(upd_field.new_val.len == dfield_get_len(dfield));
+                }
+        }
+
+        ut_ad(n_fields == vector->n_fields);
+
+        return(vector);
+}
+
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+        dict_index_t*   index MY_ATTRIBUTE((unused)),  /*!< in: index */
+        dtuple_t*       entry,  /*!< in/out: entry whose data was put to vector */
+        big_rec_t*      vector) /*!< in, own: big rec vector; it is
+                                freed in this function */
+{
+        big_rec_field_t* b = vector->fields;
+        const big_rec_field_t* const end = b + vector->n_fields;
+
+        for (; b < end; b++) {
+                dfield_t*       dfield;
+                ulint           local_len;
+
+                dfield = dtuple_get_nth_field(entry, b->field_no);
+                local_len = dfield_get_len(dfield);
+
+                ut_ad(dfield_is_ext(dfield));
+                ut_ad(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+                local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+                /* Only in REDUNDANT and COMPACT format, we store
+                up to DICT_ANTELOPE_MAX_INDEX_COL_LEN (768) bytes
+                locally */
+                ut_ad(local_len <= DICT_ANTELOPE_MAX_INDEX_COL_LEN);
+
+                dfield_set_data(dfield,
+                                (char*) b->data - local_len,
+                                b->len + local_len);
+        }
+
+        mem_heap_free(vector->heap);
+}
+
+/** Allocate a big_rec_t object in the given memory heap, and for storing
+n_fld number of fields.
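
The selection loop in dtuple_convert_big_rec() above is greedy: while the
record is still too large, pick the variable-length field whose move off-page
saves the most bytes, keep only a local prefix plus the 20-byte extern
reference, and repeat. A compact sketch of the selection step; the column
type and some qualification checks (e.g. DATA_BIG_COL) are simplified here:

    #include <cstddef>
    #include <vector>

    struct col { std::size_t len; bool fixed, null, ext; };

    // Returns the index of the best candidate, or -1 if none qualifies.
    int pick_biggest_savings(const std::vector<col>& cols,
                             std::size_t local_len)
    {
      int best = -1;
      std::size_t best_savings = 0;
      for (std::size_t i = 0; i < cols.size(); i++)
      {
        const col& c = cols[i];
        if (c.fixed || c.null || c.ext || c.len <= local_len)
          continue;               // cannot or need not be externalized
        std::size_t savings = c.len - local_len;
        if (savings > best_savings)
        {
          best_savings = savings;
          best = int(i);
        }
      }
      return best;
    }
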
+@param[in]      heap    memory heap in which this object is allocated
+@param[in]      n_fld   maximum number of fields that can be stored in
+                        this object
+
+@return the allocated object */
+big_rec_t*
+big_rec_t::alloc(
+        mem_heap_t*     heap,
+        ulint           n_fld)
+{
+        big_rec_t*      rec = static_cast<big_rec_t*>(
+                mem_heap_alloc(heap, sizeof(big_rec_t)));
+
+        new(rec) big_rec_t(n_fld);
+
+        rec->heap = heap;
+        rec->fields = static_cast<big_rec_field_t*>(
+                mem_heap_alloc(heap,
+                               n_fld * sizeof(big_rec_field_t)));
+
+        rec->n_fields = 0;
+        return(rec);
+}
+
+/** Create a deep copy of this object.
+@param[in,out]  heap    memory heap in which the clone will be created
+@return the cloned object */
+dfield_t*
+dfield_t::clone(mem_heap_t* heap) const
+{
+        const ulint size = len == UNIV_SQL_NULL ? 0 : len;
+        dfield_t* obj = static_cast<dfield_t*>(
+                mem_heap_alloc(heap, sizeof(dfield_t) + size));
+
+        ut_ad(len != UNIV_SQL_DEFAULT);
+        obj->ext = ext;
+        obj->len = len;
+        obj->type = type;
+        obj->spatial_status = spatial_status;
+
+        if (len != UNIV_SQL_NULL) {
+                obj->data = obj + 1;
+                memcpy(obj->data, data, len);
+        } else {
+                obj->data = 0;
+        }
+
+        return(obj);
+}
diff --git a/storage/innobase/data/data0type.cc b/storage/innobase/data/data0type.cc
new file mode 100644
index 00000000..b1952bcc
--- /dev/null
+++ b/storage/innobase/data/data0type.cc
@@ -0,0 +1,212 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file data/data0type.cc
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0mem.h"
+#include "my_sys.h"
+
+/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */
+const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] = {
+        0, 0, 0, 0, 0, 0,
+        0x80, 0, 0, 0, 0, 0, 0
+};
+
+/* At the database startup we store the default-charset collation number of
+this MySQL installation to this global variable. If we have < 4.1.2 format
+column definitions, or records in the insert buffer, we use this
+charset-collation code for them. */
+
+ulint   data_mysql_default_charset_coll;
+
+/*********************************************************************//**
+Determine how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+the characters in the string occupy.
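
dfield_t::clone() above uses a single-allocation idiom: the descriptor and
its payload come from one mem_heap_alloc() call, and the data pointer is set
to the byte right past the struct (obj + 1), so the clone is freed as one
unit. A generic sketch of that layout trick, with malloc standing in for
mem_heap_alloc:

    #include <cstdlib>
    #include <cstring>

    struct blob { void* data; std::size_t len; };

    blob* clone_blob(const void* src, std::size_t len)
    {
      blob* obj = static_cast<blob*>(std::malloc(sizeof(blob) + len));
      if (!obj)
        return nullptr;
      obj->len = len;
      obj->data = obj + 1;        // payload lives right after the header
      std::memcpy(obj->data, src, len);
      return obj;                 // released with a single std::free()
    }
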
+@return length of the prefix, in bytes */ +ulint +dtype_get_at_most_n_mbchars( +/*========================*/ + ulint prtype, /*!< in: precise type */ + ulint mbminlen, /*!< in: minimum length of + a multi-byte character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of + a multi-byte character, in bytes */ + ulint prefix_len, /*!< in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /*!< in: length of str (in bytes) */ + const char* str) /*!< in: the string whose prefix + length is being determined */ +{ + ut_a(len_is_stored(data_len)); + ut_ad(!mbmaxlen || !(prefix_len % mbmaxlen) || !(prefix_len % 4)); + + if (mbminlen != mbmaxlen) { + ut_a(!(prefix_len % mbmaxlen) || !(prefix_len % 4)); + return(innobase_get_at_most_n_mbchars( + dtype_get_charset_coll(prtype), + prefix_len, data_len, str)); + } + + if (prefix_len < data_len) { + + return(prefix_len); + + } + + return(data_len); +} + +/*********************************************************************//** +Validates a data type structure. +@return TRUE if ok */ +ibool +dtype_validate( +/*===========*/ + const dtype_t* type) /*!< in: type struct to validate */ +{ + ut_a(type); + ut_a(type->mtype >= DATA_VARCHAR); + ut_a(type->mtype <= DATA_MTYPE_MAX); + + if (type->mtype == DATA_SYS) { + ut_a((type->prtype & DATA_MYSQL_TYPE_MASK) < DATA_N_SYS_COLS); + } + + ut_a(dtype_get_mbminlen(type) <= dtype_get_mbmaxlen(type)); + + return(TRUE); +} + +#ifdef UNIV_DEBUG +/** Print a data type structure. +@param[in] type data type */ +void +dtype_print(const dtype_t* type) +{ + ulint mtype; + ulint prtype; + ulint len; + + ut_a(type); + + mtype = type->mtype; + prtype = type->prtype; + + switch (mtype) { + case DATA_VARCHAR: + fputs("DATA_VARCHAR", stderr); + break; + + case DATA_CHAR: + fputs("DATA_CHAR", stderr); + break; + + case DATA_BINARY: + fputs("DATA_BINARY", stderr); + break; + + case DATA_FIXBINARY: + fputs("DATA_FIXBINARY", stderr); + break; + + case DATA_BLOB: + fputs("DATA_BLOB", stderr); + break; + + case DATA_GEOMETRY: + fputs("DATA_GEOMETRY", stderr); + break; + + case DATA_INT: + fputs("DATA_INT", stderr); + break; + + case DATA_MYSQL: + fputs("DATA_MYSQL", stderr); + break; + + case DATA_SYS: + fputs("DATA_SYS", stderr); + break; + + case DATA_FLOAT: + fputs("DATA_FLOAT", stderr); + break; + + case DATA_DOUBLE: + fputs("DATA_DOUBLE", stderr); + break; + + case DATA_DECIMAL: + fputs("DATA_DECIMAL", stderr); + break; + + case DATA_VARMYSQL: + fputs("DATA_VARMYSQL", stderr); + break; + + default: + fprintf(stderr, "type %lu", (ulong) mtype); + break; + } + + len = type->len; + + if ((type->mtype == DATA_SYS) + || (type->mtype == DATA_VARCHAR) + || (type->mtype == DATA_CHAR)) { + putc(' ', stderr); + if (prtype == DATA_ROW_ID) { + fputs("DATA_ROW_ID", stderr); + len = DATA_ROW_ID_LEN; + } else if (prtype == DATA_ROLL_PTR) { + fputs("DATA_ROLL_PTR", stderr); + len = DATA_ROLL_PTR_LEN; + } else if (prtype == DATA_TRX_ID) { + fputs("DATA_TRX_ID", stderr); + len = DATA_TRX_ID_LEN; + } else if (prtype == DATA_ENGLISH) { + fputs("DATA_ENGLISH", stderr); + } else { + fprintf(stderr, "prtype %lu", (ulong) prtype); + } + } else { + if (prtype & DATA_UNSIGNED) { + fputs(" DATA_UNSIGNED", stderr); + } + + if (prtype & DATA_BINARY_TYPE) { + fputs(" DATA_BINARY_TYPE", stderr); + } + + if (prtype & DATA_NOT_NULL) { + fputs(" DATA_NOT_NULL", stderr); + } + } + + fprintf(stderr, " len %lu", (ulong) len); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/dict/dict0boot.cc 
b/storage/innobase/dict/dict0boot.cc new file mode 100644 index 00000000..5516bce9 --- /dev/null +++ b/storage/innobase/dict/dict0boot.cc @@ -0,0 +1,440 @@ +/***************************************************************************** + +Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0boot.cc +Data dictionary creation and booting + +Created 4/18/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0boot.h" +#include "dict0crea.h" +#include "btr0btr.h" +#include "dict0load.h" +#include "trx0trx.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "os0file.h" + +/** The DICT_HDR page identifier */ +static constexpr page_id_t hdr_page_id{DICT_HDR_SPACE, DICT_HDR_PAGE_NO}; + +/** @return the DICT_HDR block, x-latched */ +static buf_block_t *dict_hdr_get(mtr_t *mtr) +{ + /* We assume that the DICT_HDR page is always readable and available. */ + return buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, nullptr, BUF_GET, mtr); +} + +/**********************************************************************//** +Returns a new table, index, or space id. */ +void +dict_hdr_get_new_id( +/*================*/ + table_id_t* table_id, /*!< out: table id + (not assigned if NULL) */ + index_id_t* index_id, /*!< out: index id + (not assigned if NULL) */ + uint32_t* space_id) /*!< out: space id + (not assigned if NULL) */ +{ + ib_id_t id; + mtr_t mtr; + + mtr.start(); + buf_block_t* dict_hdr = dict_hdr_get(&mtr); + + if (table_id) { + id = mach_read_from_8(DICT_HDR + DICT_HDR_TABLE_ID + + dict_hdr->page.frame); + id++; + mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_TABLE_ID + + dict_hdr->page.frame, id); + *table_id = id; + } + + if (index_id) { + id = mach_read_from_8(DICT_HDR + DICT_HDR_INDEX_ID + + dict_hdr->page.frame); + id++; + mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_INDEX_ID + + dict_hdr->page.frame, id); + *index_id = id; + } + + if (space_id) { + *space_id = mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID + + dict_hdr->page.frame); + if (fil_assign_new_space_id(space_id)) { + mtr.write<4>(*dict_hdr, + DICT_HDR + DICT_HDR_MAX_SPACE_ID + + dict_hdr->page.frame, *space_id); + } + } + + mtr.commit(); +} + +/** Update dict_sys.row_id in the dictionary header file page. */ +void dict_hdr_flush_row_id(row_id_t id) +{ + mtr_t mtr; + mtr.start(); + buf_block_t* d= dict_hdr_get(&mtr); + byte *row_id= DICT_HDR + DICT_HDR_ROW_ID + d->page.frame; + if (mach_read_from_8(row_id) < id) + mtr.write<8>(*d, row_id, id); + mtr.commit(); +} + +/** Create the DICT_HDR page on database initialization. 
+@return error code */ +dberr_t dict_create() +{ + ulint root_page_no; + + dberr_t err; + mtr_t mtr; + mtr.start(); + compile_time_assert(DICT_HDR_SPACE == 0); + + /* Create the dictionary header file block in a new, allocated file + segment in the system tablespace */ + buf_block_t* d = fseg_create(fil_system.sys_space, + DICT_HDR + DICT_HDR_FSEG_HEADER, &mtr, + &err); + if (!d) { + goto func_exit; + } + ut_a(d->page.id() == hdr_page_id); + + /* Start counting row, table, index, and tree ids from + DICT_HDR_FIRST_ID */ + mtr.write<8>(*d, DICT_HDR + DICT_HDR_ROW_ID + d->page.frame, + DICT_HDR_FIRST_ID); + mtr.write<8>(*d, DICT_HDR + DICT_HDR_TABLE_ID + d->page.frame, + DICT_HDR_FIRST_ID); + mtr.write<8>(*d, DICT_HDR + DICT_HDR_INDEX_ID + d->page.frame, + DICT_HDR_FIRST_ID); + + ut_ad(!mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID + + d->page.frame)); + + /* Obsolete, but we must initialize it anyway. */ + mtr.write<4>(*d, DICT_HDR + DICT_HDR_MIX_ID_LOW + d->page.frame, + DICT_HDR_FIRST_ID); + + /* Create the B-tree roots for the clustered indexes of the basic + system tables */ + + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_TABLES_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_TABLES + d->page.frame, + root_page_no); + /*--------------------------*/ + root_page_no = btr_create(DICT_UNIQUE, + fil_system.sys_space, DICT_TABLE_IDS_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_TABLE_IDS + d->page.frame, + root_page_no); + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_COLUMNS_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_COLUMNS + d->page.frame, + root_page_no); + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_INDEXES_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_INDEXES + d->page.frame, + root_page_no); + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_FIELDS_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_FIELDS + d->page.frame, + root_page_no); +func_exit: + mtr.commit(); + return err ? err : dict_boot(); +} + +/*****************************************************************//** +Initializes the data dictionary memory structures when the database is +started. This function is also called when the data dictionary is created. +@return DB_SUCCESS or error code. 
*/ +dberr_t dict_boot() +{ + dict_table_t* table; + dict_index_t* index; + mem_heap_t* heap; + mtr_t mtr; + + static_assert(DICT_NUM_COLS__SYS_TABLES == 8, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_TABLES == 10, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_TABLE_IDS == 2, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_COLUMNS == 7, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_COLUMNS == 9, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_INDEXES == 8, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_INDEXES == 10, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_FIELDS == 3, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FIELDS == 5, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_FOREIGN == 4, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FOREIGN == 6, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME == 2, + "compatibility"); + static_assert(DICT_NUM_COLS__SYS_FOREIGN_COLS == 4, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FOREIGN_COLS == 6, "compatibility"); + + mtr.start(); + /* Create the hash tables etc. */ + dict_sys.create(); + + dberr_t err; + const buf_block_t *d = buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, + nullptr, BUF_GET, &mtr, &err); + if (!d) { + mtr.commit(); + return err; + } + + heap = mem_heap_create(450); + + dict_sys.lock(SRW_LOCK_CALL); + + const byte* dict_hdr = &d->page.frame[DICT_HDR]; + + /* Because we only write new row ids to disk-based data structure + (dictionary header) when it is divisible by + DICT_HDR_ROW_ID_WRITE_MARGIN, in recovery we will not recover + the latest value of the row id counter. Therefore we advance + the counter at the database startup to avoid overlapping values. + Note that when a user after database startup first time asks for + a new row id, then because the counter is now divisible by + ..._MARGIN, it will immediately be updated to the disk-based + header. */ + + dict_sys.recover_row_id(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID)); + if (uint32_t max_space_id + = mach_read_from_4(dict_hdr + DICT_HDR_MAX_SPACE_ID)) { + max_space_id--; + fil_assign_new_space_id(&max_space_id); + } + + /* Insert into the dictionary cache the descriptions of the basic + system tables */ + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_TABLES], + fil_system.sys_space, + DICT_NUM_COLS__SYS_TABLES, 0, 0, 0); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, + MAX_FULL_NAME_LEN); + dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 8); + /* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */ + dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4); + /* The low order bit of TYPE is always set to 1. If ROW_FORMAT + is not REDUNDANT or COMPACT, this field matches table->flags. */ + dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0); + /* MIX_LEN may contain additional table flags when + ROW_FORMAT!=REDUNDANT. 
*/ + dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); + + table->id = DICT_TABLES_ID; + + dict_table_add_system_columns(table, heap); + table->add_to_cache(); + dict_sys.sys_tables = table; + mem_heap_empty(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 1); + + dict_mem_index_add_field(index, "NAME", 0); + + index->id = DICT_TABLES_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_TABLES)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + /*-------------------------*/ + index = dict_mem_index_create(table, "ID_IND", DICT_UNIQUE, 1); + dict_mem_index_add_field(index, "ID", 0); + + index->id = DICT_TABLE_IDS_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_TABLE_IDS)); + ut_a(err == DB_SUCCESS); + + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_COLUMNS], + fil_system.sys_space, + DICT_NUM_COLS__SYS_COLUMNS, 0, 0, 0); + dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4); + + table->id = DICT_COLUMNS_ID; + + dict_table_add_system_columns(table, heap); + table->add_to_cache(); + dict_sys.sys_columns = table; + mem_heap_empty(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "TABLE_ID", 0); + dict_mem_index_add_field(index, "POS", 0); + + index->id = DICT_COLUMNS_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_COLUMNS)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_INDEXES], + fil_system.sys_space, + DICT_NUM_COLS__SYS_INDEXES, 0, 0, 0); + + dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); + /* SYS_INDEXES.SPACE is only read in dict_drop_index_tree() */ + dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "MERGE_THRESHOLD", DATA_INT, 0, 4); + + table->id = DICT_INDEXES_ID; + + dict_table_add_system_columns(table, heap); + /* The column SYS_INDEXES.MERGE_THRESHOLD was "instantly" + added in MySQL 5.7 and MariaDB 10.2.2. Assign it DEFAULT NULL. + Because of file format compatibility, we must treat SYS_INDEXES + as a special case, relaxing some debug assertions + for DICT_INDEXES_ID.
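+ (Records written before the column existed carry one field less, + which is why dict_drop_index_tree() accepts SYS_INDEXES records + with either field count.)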
*/ + dict_table_get_nth_col(table, DICT_COL__SYS_INDEXES__MERGE_THRESHOLD) + ->def_val.len = UNIV_SQL_NULL; + table->add_to_cache(); + dict_sys.sys_indexes = table; + mem_heap_empty(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "TABLE_ID", 0); + dict_mem_index_add_field(index, "ID", 0); + + index->id = DICT_INDEXES_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_INDEXES)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_FIELDS], + fil_system.sys_space, + DICT_NUM_COLS__SYS_FIELDS, 0, 0, 0); + dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0); + + table->id = DICT_FIELDS_ID; + + dict_table_add_system_columns(table, heap); + table->add_to_cache(); + dict_sys.sys_fields = table; + mem_heap_free(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "INDEX_ID", 0); + dict_mem_index_add_field(index, "POS", 0); + + index->id = DICT_FIELDS_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_FIELDS)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + mtr.commit(); + + err = ibuf_init_at_db_start(); + + if (err == DB_SUCCESS || srv_force_recovery >= SRV_FORCE_NO_DDL_UNDO) { + err = DB_SUCCESS; + /* Load definitions of other indexes on system tables */ + + dict_load_sys_table(dict_sys.sys_tables); + dict_load_sys_table(dict_sys.sys_columns); + dict_load_sys_table(dict_sys.sys_indexes); + dict_load_sys_table(dict_sys.sys_fields); + dict_sys.unlock(); + dict_sys.load_sys_tables(); + } else { + dict_sys.unlock(); + } + + return err; +} diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc new file mode 100644 index 00000000..cce5f2f2 --- /dev/null +++ b/storage/innobase/dict/dict0crea.cc @@ -0,0 +1,1906 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0crea.cc +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0crea.h" +#include "btr0pcur.h" +#ifdef BTR_CUR_HASH_ADAPT +# include "btr0sea.h" +#endif /* BTR_CUR_HASH_ADAPT */ +#include "page0page.h" +#include "mach0data.h" +#include "dict0boot.h" +#include "dict0dict.h" +#include "lock0lock.h" +#include "que0que.h" +#include "row0ins.h" +#include "row0mysql.h" +#include "pars0pars.h" +#include "trx0roll.h" +#include "trx0rseg.h" +#include "trx0undo.h" +#include "ut0vec.h" +#include "fts0priv.h" +#include "srv0start.h" +#include "log.h" + +/*****************************************************************//** +Based on a table object, this function builds the entry to be inserted +in the SYS_TABLES system table. +@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_tables_tuple( +/*=========================*/ + const dict_table_t* table, /*!< in: table */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + dfield_t* dfield; + byte* ptr; + ulint type; + + ut_ad(table); + ut_ad(!table->space || table->space->id == table->space_id); + ut_ad(heap); + ut_ad(table->n_cols >= DATA_N_SYS_COLS); + + entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_tables); + + /* 0: NAME -----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__NAME); + + dfield_set_data(dfield, + table->name.m_name, strlen(table->name.m_name)); + + /* 1: DB_TRX_ID added later */ + /* 2: DB_ROLL_PTR added later */ + /* 3: ID -------------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 4: N_COLS ---------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__N_COLS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + /* If there is any virtual column, encode it in N_COLS */ + mach_write_to_4(ptr, dict_table_encode_n_col( + ulint(table->n_cols - DATA_N_SYS_COLS), + ulint(table->n_v_def)) + | (ulint(table->flags & DICT_TF_COMPACT) << 31)); + dfield_set_data(dfield, ptr, 4); + + /* 5: TYPE (table flags) -----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__TYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + /* Validate the table flags and convert them to what is saved in + SYS_TABLES.TYPE. Table flag values 0 and 1 are both written to + SYS_TABLES.TYPE as 1.
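+ For example, both a ROW_FORMAT=REDUNDANT table (flags 0) and a + ROW_FORMAT=COMPACT table (flags 1) end up with TYPE 1 on disk, + so the low-order bit of TYPE is set in every row that this + function produces.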
*/ + type = dict_tf_to_sys_tables_type(table->flags); + mach_write_to_4(ptr, type); + + dfield_set_data(dfield, ptr, 4); + + /* 6: MIX_ID (obsolete) ---------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__MIX_ID); + + ptr = static_cast<byte*>(mem_heap_zalloc(heap, 8)); + + dfield_set_data(dfield, ptr, 8); + + /* 7: MIX_LEN (additional flags) --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__MIX_LEN); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + /* Be sure all non-used bits are zero. */ + ut_a(!(table->flags2 & DICT_TF2_UNUSED_BIT_MASK)); + mach_write_to_4(ptr, table->flags2); + + dfield_set_data(dfield, ptr, 4); + + /* 8: CLUSTER_NAME ---------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__CLUSTER_ID); + dfield_set_null(dfield); /* not supported */ + + /* 9: SPACE ----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__SPACE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, table->space_id); + + dfield_set_data(dfield, ptr, 4); + /*----------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Based on a table object, this function builds the entry to be inserted +in the SYS_COLUMNS system table. +@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_columns_tuple( +/*==========================*/ + const dict_table_t* table, /*!< in: table */ + ulint i, /*!< in: column number */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + const dict_col_t* column; + dfield_t* dfield; + byte* ptr; + const char* col_name; + ulint num_base = 0; + ulint v_col_no = ULINT_UNDEFINED; + + ut_ad(table); + ut_ad(heap); + + /* Any column beyond table->n_def would be virtual columns */ + if (i >= table->n_def) { + dict_v_col_t* v_col = dict_table_get_nth_v_col( + table, i - table->n_def); + column = &v_col->m_col; + num_base = v_col->num_base; + v_col_no = column->ind; + } else { + column = dict_table_get_nth_col(table, i); + ut_ad(!column->is_virtual()); + } + + entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_columns); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__TABLE_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: POS ----------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + if (v_col_no != ULINT_UNDEFINED) { + /* encode virtual column's position in MySQL table and InnoDB + table in "POS" */ + mach_write_to_4(ptr, dict_create_v_col_pos( + i - table->n_def, v_col_no)); + } else { + mach_write_to_4(ptr, i); + } + + dfield_set_data(dfield, ptr, 4); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: NAME ---------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__NAME); + + if (i >= table->n_def) { + col_name = dict_table_get_v_col_name(table, i - table->n_def); + } else { + col_name = dict_table_get_col_name(table, i); + } + + dfield_set_data(dfield, col_name, strlen(col_name)); + + /* 5: MTYPE --------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__MTYPE); + + ptr =
static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->mtype); + + dfield_set_data(dfield, ptr, 4); + + /* 6: PRTYPE -------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PRTYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->prtype); + + dfield_set_data(dfield, ptr, 4); + + /* 7: LEN ----------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__LEN); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->len); + + dfield_set_data(dfield, ptr, 4); + + /* 8: PREC ---------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PREC); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, num_base); + + dfield_set_data(dfield, ptr, 4); + /*---------------------------------*/ + + return(entry); +} + +/** Based on a table object, this function builds the entry to be inserted +in the SYS_VIRTUAL system table. Each row maps a virtual column to one of +its base columns. +@param[in] table table +@param[in] v_col_n virtual column number +@param[in] b_col_n base column sequence num +@param[in] heap memory heap +@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_virtual_tuple( + const dict_table_t* table, + ulint v_col_n, + ulint b_col_n, + mem_heap_t* heap) +{ + dtuple_t* entry; + const dict_col_t* base_column; + dfield_t* dfield; + byte* ptr; + + ut_ad(table); + ut_ad(heap); + + ut_ad(v_col_n < table->n_v_def); + dict_v_col_t* v_col = dict_table_get_nth_v_col(table, v_col_n); + base_column = v_col->base_col[b_col_n]; + + entry = dtuple_create(heap, DICT_NUM_COLS__SYS_VIRTUAL + + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_virtual); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__TABLE_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: POS ---------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + ulint v_col_no = dict_create_v_col_pos(v_col_n, v_col->m_col.ind); + mach_write_to_4(ptr, v_col_no); + + dfield_set_data(dfield, ptr, 4); + + /* 2: BASE_POS ----------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__BASE_POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, base_column->ind); + + dfield_set_data(dfield, ptr, 4); + + /* 3: DB_TRX_ID added later */ + /* 4: DB_ROLL_PTR added later */ + + /*---------------------------------*/ + return(entry); +} + +/***************************************************************//** +Builds a table definition to insert.
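+A table identifier is assigned here, and for a file-per-table table +also a fresh tablespace identifier; the SYS_TABLES row itself is +built by dict_create_sys_tables_tuple() and queued for insertion via +ins_node_set_new_row().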
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_build_table_def_step( +/*======================*/ + que_thr_t* thr, /*!< in: query thread */ + tab_node_t* node) /*!< in: table create node */ +{ + ut_ad(dict_sys.locked()); + dict_table_t* table = node->table; + ut_ad(!table->is_temporary()); + ut_ad(!table->space); + ut_ad(table->space_id == UINT32_MAX); + dict_hdr_get_new_id(&table->id, nullptr, nullptr); + + /* Always set this bit for all new created tables */ + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME); + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + DICT_TF2_FLAG_UNSET(table, + DICT_TF2_FTS_AUX_HEX_NAME);); + + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE)) { + /* This table will need a new tablespace. */ + + ut_ad(DICT_TF_GET_ZIP_SSIZE(table->flags) == 0 + || dict_table_has_atomic_blobs(table)); + /* Get a new tablespace ID */ + dict_hdr_get_new_id(NULL, NULL, &table->space_id); + + DBUG_EXECUTE_IF( + "ib_create_table_fail_out_of_space_ids", + table->space_id = UINT32_MAX; + ); + + if (table->space_id == UINT32_MAX) { + return DB_ERROR; + } + } else { + ut_ad(dict_tf_get_rec_format(table->flags) + != REC_FORMAT_COMPRESSED); + table->space = fil_system.sys_space; + table->space_id = TRX_SYS_SPACE; + } + + ins_node_set_new_row(node->tab_def, + dict_create_sys_tables_tuple(table, node->heap)); + return DB_SUCCESS; +} + +/** Builds a SYS_VIRTUAL row definition to insert. +@param[in] node table create node */ +static +void +dict_build_v_col_def_step( + tab_node_t* node) +{ + dtuple_t* row; + + row = dict_create_sys_virtual_tuple(node->table, node->col_no, + node->base_col_no, + node->heap); + ins_node_set_new_row(node->v_col_def, row); +} + +/*****************************************************************//** +Based on an index object, this function builds the entry to be inserted +in the SYS_INDEXES system table. 
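+The row layout is (TABLE_ID, ID, NAME, N_FIELDS, TYPE, SPACE, PAGE_NO, +MERGE_THRESHOLD). PAGE_NO is written as FIL_NULL at this point and is +only filled in by dict_create_index_tree_step() once the root page has +been allocated; the NAME of a not-yet-committed index is prefixed with +TEMP_INDEX_PREFIX_STR.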
+@return the tuple which should be inserted */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dtuple_t* +dict_create_sys_indexes_tuple( +/*==========================*/ + const dict_index_t* index, /*!< in: index */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + dfield_t* dfield; + byte* ptr; + + ut_ad(dict_sys.locked()); + ut_ad(index); + ut_ad(index->table->space || !UT_LIST_GET_LEN(index->table->indexes) + || index->table->file_unreadable); + ut_ad(!index->table->space + || index->table->space->id == index->table->space_id); + ut_ad(heap); + + entry = dtuple_create( + heap, DICT_NUM_COLS__SYS_INDEXES + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_indexes); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__TABLE_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, index->table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: ID ----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, index->id); + + dfield_set_data(dfield, ptr, 8); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: NAME --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__NAME); + + if (!index->is_committed()) { + ulint len = strlen(index->name) + 1; + char* name = static_cast<char*>( + mem_heap_alloc(heap, len)); + *name = *TEMP_INDEX_PREFIX_STR; + memcpy(name + 1, index->name, len - 1); + dfield_set_data(dfield, name, len); + } else { + dfield_set_data(dfield, index->name, strlen(index->name)); + } + + /* 5: N_FIELDS ----------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__N_FIELDS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->n_fields); + + dfield_set_data(dfield, ptr, 4); + + /* 6: TYPE --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__TYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->type); + + dfield_set_data(dfield, ptr, 4); + + /* 7: SPACE --------------------------*/ + + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__SPACE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->table->space_id); + + dfield_set_data(dfield, ptr, 4); + + /* 8: PAGE_NO --------------------------*/ + + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__PAGE_NO); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, FIL_NULL); + + dfield_set_data(dfield, ptr, 4); + + /* 9: MERGE_THRESHOLD ----------------*/ + + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__MERGE_THRESHOLD); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, DICT_INDEX_MERGE_THRESHOLD_DEFAULT); + + dfield_set_data(dfield, ptr, 4); + + /*--------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Based on an index object, this function builds the entry to be inserted +in the SYS_FIELDS system table.
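+As a worked example of the POS encoding used below: if any field of the +index has a column prefix or descending order, then field number 2 with +DESC order and a 10-byte prefix is stored as +(2 << 16) | (1 << 15) | 10 = 0x2800A; otherwise POS holds the plain +field number.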
+@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_fields_tuple( +/*=========================*/ + const dict_index_t* index, /*!< in: index */ + ulint fld_no, /*!< in: field number */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + dict_field_t* field; + dfield_t* dfield; + byte* ptr; + bool wide_pos = false; + + ut_ad(index); + ut_ad(heap); + + for (unsigned j = 0; j < index->n_fields; j++) { + const dict_field_t* f = dict_index_get_nth_field(index, j); + if (f->prefix_len || f->descending) { + wide_pos = true; + break; + } + } + + field = dict_index_get_nth_field(index, fld_no); + + entry = dtuple_create(heap, 3 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_fields); + + /* 0: INDEX_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__INDEX_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, index->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: POS; FIELD NUMBER & PREFIX LENGTH -----------------------*/ + + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + if (wide_pos) { + /* If there are column prefixes or columns with + descending order in the index, then we write the + field number to the 16 most significant bits, + the DESC flag to bit 15, and the prefix length + in the 15 least significant bits. */ + mach_write_to_4(ptr, (fld_no << 16) + | (!!field->descending) << 15 + | field->prefix_len); + } else { + /* Else we store just the field number in the 2 + low-order bytes. This keeps the storage format + compatible with InnoDB versions < 4.0.14. */ + + mach_write_to_4(ptr, fld_no); + } + + dfield_set_data(dfield, ptr, 4); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: COL_NAME -------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__COL_NAME); + + dfield_set_data(dfield, field->name, strlen(field->name)); + /*---------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Creates the tuple with which the index entry is searched for writing the index +tree root page number, if such a tree is created. +@return the tuple for search */ +static +dtuple_t* +dict_create_search_tuple( +/*=====================*/ + const dtuple_t* tuple, /*!< in: the tuple inserted in the SYS_INDEXES + table */ + mem_heap_t* heap) /*!< in: memory heap from which the memory for + the built tuple is allocated */ +{ + dtuple_t* search_tuple; + const dfield_t* field1; + dfield_t* field2; + + ut_ad(tuple && heap); + + search_tuple = dtuple_create(heap, 2); + + field1 = dtuple_get_nth_field(tuple, 0); + field2 = dtuple_get_nth_field(search_tuple, 0); + + dfield_copy(field2, field1); + + field1 = dtuple_get_nth_field(tuple, 1); + field2 = dtuple_get_nth_field(search_tuple, 1); + + dfield_copy(field2, field1); + + ut_ad(dtuple_validate(search_tuple)); + + return(search_tuple); +} + +/***************************************************************//** +Builds an index definition row to insert.
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_build_index_def_step( +/*======================*/ + que_thr_t* thr, /*!< in: query thread */ + ind_node_t* node) /*!< in: index create node */ +{ + dict_table_t* table; + dict_index_t* index; + dtuple_t* row; + trx_t* trx; + + ut_ad(dict_sys.locked()); + + trx = thr_get_trx(thr); + + index = node->index; + + table = dict_table_open_on_name( + node->table_name, true, DICT_ERR_IGNORE_TABLESPACE); + + if (!table) { + return DB_TABLE_NOT_FOUND; + } + + index->table = table; + + ut_ad((UT_LIST_GET_LEN(table->indexes) > 0) + || dict_index_is_clust(index)); + + dict_hdr_get_new_id(NULL, &index->id, NULL); + + node->page_no = FIL_NULL; + row = dict_create_sys_indexes_tuple(index, node->heap); + node->ind_row = row; + + ins_node_set_new_row(node->ind_def, row); + + /* Note that the index was created by this transaction. */ + index->trx_id = trx->id; + ut_ad(table->def_trx_id <= trx->id); + table->def_trx_id = trx->id; + table->release(); + + return(DB_SUCCESS); +} + +/***************************************************************//** +Builds an index definition without updating SYSTEM TABLES. +@return DB_SUCCESS or error code */ +void +dict_build_index_def( +/*=================*/ + const dict_table_t* table, /*!< in: table */ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx) /*!< in/out: InnoDB transaction handle */ +{ + ut_ad(dict_sys.locked()); + + ut_ad((UT_LIST_GET_LEN(table->indexes) > 0) + || dict_index_is_clust(index)); + + dict_hdr_get_new_id(NULL, &index->id, NULL); + + /* Note that the index was created by this transaction. */ + index->trx_id = trx->id; +} + +/***************************************************************//** +Builds a field definition row to insert. */ +static +void +dict_build_field_def_step( +/*======================*/ + ind_node_t* node) /*!< in: index create node */ +{ + dict_index_t* index; + dtuple_t* row; + + index = node->index; + + row = dict_create_sys_fields_tuple(index, node->field_no, node->heap); + + ins_node_set_new_row(node->field_def, row); +} + +/***************************************************************//** +Creates an index tree for the index. 
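+The root page is allocated with btr_create(), and its page number is +then written to the PAGE_NO column of the SYS_INDEXES record that is +located via a (TABLE_ID, ID) search tuple; FTS indexes are skipped +because they do not use an index tree.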
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_create_index_tree_step( +/*========================*/ + ind_node_t* node) /*!< in: index create node */ +{ + mtr_t mtr; + btr_pcur_t pcur; + dict_index_t* index; + dtuple_t* search_tuple; + + ut_ad(dict_sys.locked()); + + index = node->index; + + if (index->type == DICT_FTS) { + /* FTS index does not need an index tree */ + return(DB_SUCCESS); + } + + /* Run a mini-transaction in which the index tree is allocated for + the index and its root address is written to the index entry in + sys_indexes */ + + mtr.start(); + + search_tuple = dict_create_search_tuple(node->ind_row, node->heap); + node->page_no = FIL_NULL; + pcur.btr_cur.page_cur.index = + UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes); + + dberr_t err = btr_pcur_open(search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF, + &pcur, &mtr); + + if (err != DB_SUCCESS) { +func_exit: + mtr.commit(); + return err; + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + if (UNIV_UNLIKELY(btr_pcur_is_after_last_on_page(&pcur))) { +corrupted: + err = DB_CORRUPTION; + goto func_exit; + } + + ulint len; + byte* data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur), + DICT_FLD__SYS_INDEXES__ID, + &len); + if (UNIV_UNLIKELY(len != 8 || mach_read_from_8(data) != index->id)) { + goto corrupted; + } + + data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur), + DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + if (len != 4) { + goto corrupted; + } + + if (index->is_readable()) { + index->set_modified(mtr); + + node->page_no = btr_create( + index->type, index->table->space, + index->id, index, &mtr, &err); + + DBUG_EXECUTE_IF("ib_import_create_index_failure_1", + node->page_no = FIL_NULL; + err = DB_OUT_OF_FILE_SPACE; ); + } + + mtr.write<4,mtr_t::MAYBE_NOP>(*btr_pcur_get_block(&pcur), data, + node->page_no); + goto func_exit; +} + +/***************************************************************//** +Creates an index tree for the index if it is not a member of a cluster. +Don't update SYSTEM TABLES. +@return error code */ +dberr_t +dict_create_index_tree_in_mem( +/*==========================*/ + dict_index_t* index, /*!< in/out: index */ + const trx_t* trx) /*!< in: InnoDB transaction handle */ +{ + mtr_t mtr; + + ut_ad(dict_sys.locked()); + ut_ad(!(index->type & DICT_FTS)); + + mtr_start(&mtr); + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + /* Currently this function is being used by temp-tables only. + Import/Discard of temp-table is blocked and so this assert. */ + ut_ad(index->is_readable()); + ut_ad(!(index->table->flags2 & DICT_TF2_DISCARDED)); + + dberr_t err; + index->page = btr_create(index->type, index->table->space, + index->id, index, &mtr, &err); + mtr_commit(&mtr); + + index->trx_id = trx->id; + + return err; +} + +/** Drop the index tree associated with a row in SYS_INDEXES table. 
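+The SYS_INDEXES record is validated field by field, and any length or +offset mismatch is reported as corruption. Nothing is freed when +PAGE_NO is already FIL_NULL, and for a clustered index residing in its +own tablespace the whole tablespace is dropped instead, which is why +its id is returned to the caller.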
+@param[in,out] pcur persistent cursor on rec +@param[in,out] trx dictionary transaction +@param[in,out] mtr mini-transaction +@return tablespace ID to drop (if this is the clustered index) +@retval 0 if no tablespace is to be dropped */ +uint32_t dict_drop_index_tree(btr_pcur_t *pcur, trx_t *trx, mtr_t *mtr) +{ + rec_t *rec= btr_pcur_get_rec(pcur); + + ut_ad(!trx || dict_sys.locked()); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); + btr_pcur_store_position(pcur, mtr); + + static_assert(DICT_FLD__SYS_INDEXES__TABLE_ID == 0, "compatibility"); + static_assert(DICT_FLD__SYS_INDEXES__ID == 1, "compatibility"); + + ulint len= rec_get_n_fields_old(rec); + if (len < DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD || + len > DICT_NUM_FIELDS__SYS_INDEXES) + { +rec_corrupted: + sql_print_error("InnoDB: Corrupted SYS_INDEXES record"); + return 0; + } + + if (rec_get_1byte_offs_flag(rec)) + { + if (rec_1_get_field_end_info(rec, 0) != 8 || + rec_1_get_field_end_info(rec, 1) != 8 + 8) + goto rec_corrupted; + } + else if (rec_2_get_field_end_info(rec, 0) != 8 || + rec_2_get_field_end_info(rec, 1) != 8 + 8) + goto rec_corrupted; + + const byte *p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__TYPE, &len); + if (len != 4) + goto rec_corrupted; + const uint32_t type= mach_read_from_4(p); + p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + if (len != 4) + goto rec_corrupted; + const uint32_t root_page_no= mach_read_from_4(p); + p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__SPACE, &len); + if (len != 4) + goto rec_corrupted; + + const uint32_t space_id= mach_read_from_4(p); + ut_ad(root_page_no == FIL_NULL || space_id <= SRV_SPACE_ID_UPPER_BOUND); + + if (space_id && (type & DICT_CLUSTERED)) + return space_id; + + if (root_page_no == FIL_NULL) + /* The tree has already been freed */; + else if (fil_space_t *s= fil_space_t::get(space_id)) + { + /* Ensure that the tablespace file exists + in order to avoid a crash in buf_page_get_gen(). */ + if (root_page_no < s->get_size()) + { + static_assert(FIL_NULL == 0xffffffff, "compatibility"); + static_assert(DICT_FLD__SYS_INDEXES__PAGE_NO == + DICT_FLD__SYS_INDEXES__SPACE + 1, "compatibility"); + mtr->memset(btr_pcur_get_block(pcur), page_offset(p + 4), 4, 0xff); + btr_free_if_exists(s, root_page_no, mach_read_from_8(rec + 8), mtr); + } + s->release(); + } + + return 0; +} + +/*********************************************************************//** +Creates a table create graph. +@return own: table create node */ +tab_node_t* +tab_create_graph_create( +/*====================*/ + dict_table_t* table, /*!< in: table to create, built as a memory data + structure */ + mem_heap_t* heap) /*!< in: heap where created */ +{ + tab_node_t* node; + + node = static_cast<tab_node_t*>( + mem_heap_alloc(heap, sizeof(tab_node_t))); + + node->common.type = QUE_NODE_CREATE_TABLE; + + node->table = table; + + node->state = TABLE_BUILD_TABLE_DEF; + node->heap = mem_heap_create(256); + + node->tab_def = ins_node_create(INS_DIRECT, dict_sys.sys_tables, + heap); + node->tab_def->common.parent = node; + + node->col_def = ins_node_create(INS_DIRECT, dict_sys.sys_columns, + heap); + node->col_def->common.parent = node; + + node->v_col_def = ins_node_create(INS_DIRECT, dict_sys.sys_virtual, + heap); + node->v_col_def->common.parent = node; + + return(node); +} + +/** Creates an index create graph.
+@param[in] index index to create, built as a memory data structure +@param[in] table table name +@param[in,out] heap heap where created +@param[in] mode encryption mode (for creating a table) +@param[in] key_id encryption key identifier (for creating a table) +@param[in] add_v new virtual columns added in the same clause with + add index +@return own: index create node */ +ind_node_t* +ind_create_graph_create( + dict_index_t* index, + const char* table, + mem_heap_t* heap, + fil_encryption_t mode, + uint32_t key_id, + const dict_add_v_col_t* add_v) +{ + ind_node_t* node; + + node = static_cast<ind_node_t*>( + mem_heap_alloc(heap, sizeof(ind_node_t))); + + node->common.type = QUE_NODE_CREATE_INDEX; + + node->index = index; + + node->table_name = table; + + node->key_id = key_id; + node->mode = mode; + node->add_v = add_v; + + node->state = INDEX_BUILD_INDEX_DEF; + node->page_no = FIL_NULL; + node->heap = mem_heap_create(256); + + node->ind_def = ins_node_create(INS_DIRECT, + dict_sys.sys_indexes, heap); + node->ind_def->common.parent = node; + + node->field_def = ins_node_create(INS_DIRECT, + dict_sys.sys_fields, heap); + node->field_def->common.parent = node; + + return(node); +} + +/***********************************************************//** +Creates a table. This is a high-level function used in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +dict_create_table_step( +/*===================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + tab_node_t* node; + dberr_t err = DB_ERROR; + trx_t* trx; + + ut_ad(thr); + ut_ad(dict_sys.locked()); + + trx = thr_get_trx(thr); + + node = static_cast<tab_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_TABLE); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = TABLE_BUILD_TABLE_DEF; + } + + if (node->state == TABLE_BUILD_TABLE_DEF) { + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = dict_build_table_def_step(thr, node); + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = TABLE_BUILD_COL_DEF; + node->col_no = 0; + + thr->run_node = node->tab_def; + + return(thr); + } + + if (node->state == TABLE_BUILD_COL_DEF) { + if (node->col_no + DATA_N_SYS_COLS + < (static_cast<ulint>(node->table->n_def) + + static_cast<ulint>(node->table->n_v_def))) { + + ulint i = node->col_no++; + if (i + DATA_N_SYS_COLS >= node->table->n_def) { + i += DATA_N_SYS_COLS; + } + + ins_node_set_new_row( + node->col_def, + dict_create_sys_columns_tuple(node->table, i, + node->heap)); + + thr->run_node = node->col_def; + + return(thr); + } else { + /* Move on to SYS_VIRTUAL table */ + node->col_no = 0; + node->base_col_no = 0; + node->state = TABLE_BUILD_V_COL_DEF; + } + } + + if (node->state == TABLE_BUILD_V_COL_DEF) { + + if (node->col_no < static_cast<ulint>(node->table->n_v_def)) { + dict_v_col_t* v_col = dict_table_get_nth_v_col( + node->table, node->col_no); + + /* If no base column */ + while (v_col->num_base == 0) { + node->col_no++; + if (node->col_no == static_cast<ulint>( + (node->table)->n_v_def)) { + node->state = TABLE_ADD_TO_CACHE; + break; + } + + v_col = dict_table_get_nth_v_col( + node->table, node->col_no); + node->base_col_no = 0; + } + + if (node->state != TABLE_ADD_TO_CACHE) { + ut_ad(node->col_no == v_col->v_pos); + dict_build_v_col_def_step(node); + + if (node->base_col_no + < unsigned{v_col->num_base} - 1) { + /* move on to next base column */ + node->base_col_no++; + } else { + /* move on to next virtual column */ + node->col_no++; + node->base_col_no = 0; + } + + thr->run_node =
node->v_col_def; + + return(thr); + } + } else { + node->state = TABLE_ADD_TO_CACHE; + } + } + + if (node->state == TABLE_ADD_TO_CACHE) { + node->table->can_be_evicted = !node->table->fts; + node->table->add_to_cache(); + + err = DB_SUCCESS; + } + +function_exit: + trx->error_state = err; + + if (err != DB_SUCCESS) { + return(NULL); + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +static dberr_t dict_create_index_space(const ind_node_t &node) +{ + dict_table_t *table= node.index->table; + if (table->space || (table->flags2 & DICT_TF2_DISCARDED)) + return DB_SUCCESS; + ut_ad(table->space_id); + ut_ad(table->space_id < SRV_TMP_SPACE_ID); + /* Determine the tablespace flags. */ + const bool has_data_dir= DICT_TF_HAS_DATA_DIR(table->flags); + ut_ad(!has_data_dir || table->data_dir_path); + char* filepath= fil_make_filepath(has_data_dir + ? table->data_dir_path : nullptr, + table->name, IBD, has_data_dir); + if (!filepath) + return DB_OUT_OF_MEMORY; + + /* We create a new single-table tablespace for the table. + We initially let it be 4 pages: + - page 0 is the fsp header and an extent descriptor page, + - page 1 is an ibuf bitmap page, + - page 2 is the first inode page, + - page 3 will contain the root of the clustered index of + the table we create here. */ + dberr_t err; + table->space= fil_ibd_create(table->space_id, table->name, filepath, + dict_tf_to_fsp_flags(table->flags), + FIL_IBD_FILE_INITIAL_SIZE, + node.mode, node.key_id, &err); + ut_ad((err != DB_SUCCESS) == !table->space); + ut_free(filepath); + + return err; +} + +/***********************************************************//** +Creates an index. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +dict_create_index_step( +/*===================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ind_node_t* node; + dberr_t err = DB_ERROR; + trx_t* trx; + + ut_ad(thr); + ut_ad(dict_sys.locked()); + + trx = thr_get_trx(thr); + + node = static_cast<ind_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = INDEX_BUILD_INDEX_DEF; + } + + if (node->state == INDEX_BUILD_INDEX_DEF) { + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + err = dict_build_index_def_step(thr, node); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = INDEX_BUILD_FIELD_DEF; + node->field_no = 0; + + thr->run_node = node->ind_def; + + return(thr); + } + + if (node->state == INDEX_BUILD_FIELD_DEF) { + err = dict_create_index_space(*node); + if (err != DB_SUCCESS) { + dict_mem_index_free(node->index); + node->index = nullptr; + goto function_exit; + } + + if (node->field_no < (node->index)->n_fields) { + + dict_build_field_def_step(node); + + node->field_no++; + + thr->run_node = node->field_def; + + return(thr); + } else { + node->state = INDEX_ADD_TO_CACHE; + } + } + + if (node->state == INDEX_ADD_TO_CACHE) { + err = dict_index_add_to_cache(node->index, FIL_NULL, + node->add_v); + + ut_ad(!node->index == (err != DB_SUCCESS)); + + if (!node->index) { + goto function_exit; + } + + ut_ad(!node->index->is_instant()); + ut_ad(node->index->n_core_null_bytes + == ((dict_index_is_clust(node->index) + && node->index->table->supports_instant()) + ?
dict_index_t::NO_CORE_NULL_BYTES + : UT_BITS_IN_BYTES( + unsigned(node->index->n_nullable)))); + node->index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(node->index->n_nullable))); + node->state = INDEX_CREATE_INDEX_TREE; + } + + if (node->state == INDEX_CREATE_INDEX_TREE) { + + err = dict_create_index_tree_step(node); + + DBUG_EXECUTE_IF("ib_dict_create_index_tree_fail", + err = DB_OUT_OF_MEMORY;); + + if (err != DB_SUCCESS) { + dict_table_t* table = node->index->table; + /* If this is a FTS index, we will need to remove + it from fts->cache->indexes list as well */ + if (!(node->index->type & DICT_FTS)) { + } else if (auto fts = table->fts) { + fts_index_cache_t* index_cache; + + mysql_mutex_lock(&fts->cache->init_lock); + + index_cache = (fts_index_cache_t*) + fts_find_index_cache( + fts->cache, + node->index); + + if (index_cache->words) { + rbt_free(index_cache->words); + index_cache->words = 0; + } + + ib_vector_remove( + fts->cache->indexes, + *reinterpret_cast<void**>(index_cache)); + + mysql_mutex_unlock(&fts->cache->init_lock); + } + +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!node->index->search_info->ref_count); +#endif /* BTR_CUR_HASH_ADAPT */ + dict_index_remove_from_cache(table, node->index); + node->index = NULL; + + goto function_exit; + } + + node->index->page = node->page_no; + /* These should have been set in + dict_build_index_def_step() and + dict_index_add_to_cache(). */ + ut_ad(node->index->trx_id == trx->id); + ut_ad(node->index->table->def_trx_id == trx->id); + } + +function_exit: + trx->error_state = err; + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return nullptr; + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +bool dict_sys_t::load_sys_tables() +{ + ut_ad(!srv_any_background_activity()); + bool mismatch= false; + lock(SRW_LOCK_CALL); + if (!(sys_foreign= load_table(SYS_TABLE[SYS_FOREIGN], + DICT_ERR_IGNORE_FK_NOKEY))); + else if (UT_LIST_GET_LEN(sys_foreign->indexes) == 3 && + sys_foreign->n_cols == DICT_NUM_COLS__SYS_FOREIGN + DATA_N_SYS_COLS) + prevent_eviction(sys_foreign); + else + { + sys_foreign= nullptr; + mismatch= true; + sql_print_error("InnoDB: Invalid definition of SYS_FOREIGN"); + } + if (!(sys_foreign_cols= load_table(SYS_TABLE[SYS_FOREIGN_COLS], + DICT_ERR_IGNORE_FK_NOKEY))); + else if (UT_LIST_GET_LEN(sys_foreign_cols->indexes) == 1 && + sys_foreign_cols->n_cols == + DICT_NUM_COLS__SYS_FOREIGN_COLS + DATA_N_SYS_COLS) + prevent_eviction(sys_foreign_cols); + else + { + sys_foreign_cols= nullptr; + mismatch= true; + sql_print_error("InnoDB: Invalid definition of SYS_FOREIGN_COLS"); + } + if (!(sys_virtual= load_table(SYS_TABLE[SYS_VIRTUAL], + DICT_ERR_IGNORE_FK_NOKEY))); + else if (UT_LIST_GET_LEN(sys_virtual->indexes) == 1 && + sys_virtual->n_cols == DICT_NUM_COLS__SYS_VIRTUAL + DATA_N_SYS_COLS) + prevent_eviction(sys_virtual); + else + { + sys_virtual= nullptr; + mismatch= true; + sql_print_error("InnoDB: Invalid definition of SYS_VIRTUAL"); + } + unlock(); + return mismatch; +} + +dberr_t dict_sys_t::create_or_check_sys_tables() +{ + if (sys_tables_exist()) + return DB_SUCCESS; + + if (srv_read_only_mode || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) + return DB_READ_ONLY; + + if (load_sys_tables()) + { + sql_print_information("InnoDB: Set innodb_read_only=1 " + "or innodb_force_recovery=3 to start up"); + return DB_CORRUPTION; + } + + if (sys_tables_exist()) + return DB_SUCCESS; + + trx_t *trx= trx_create(); + trx_start_for_ddl(trx); + + { + /* Do not bother with transactional memory; this is only + executed at
startup, with no conflicts present. */ + LockMutexGuard g{SRW_LOCK_CALL}; + trx->mutex_lock(); + lock_table_create(dict_sys.sys_tables, LOCK_X, trx); + lock_table_create(dict_sys.sys_columns, LOCK_X, trx); + lock_table_create(dict_sys.sys_indexes, LOCK_X, trx); + lock_table_create(dict_sys.sys_fields, LOCK_X, trx); + trx->mutex_unlock(); + } + + row_mysql_lock_data_dictionary(trx); + + /* NOTE: when designing InnoDB's foreign key support in 2001, Heikki Tuuri + made a mistake and defined table names and the foreign key id to be of type + CHAR (internally, really VARCHAR). The type should have been VARBINARY. */ + + /* System tables are always created inside the system tablespace. */ + const auto srv_file_per_table_backup= srv_file_per_table; + srv_file_per_table= 0; + dberr_t error; + span<const char> tablename; + + if (!sys_foreign) + { + error= que_eval_sql(nullptr, "PROCEDURE CREATE_FOREIGN() IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR," + " REF_NAME CHAR, N_COLS INT);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON SYS_FOREIGN (ID);\n" + "CREATE INDEX FOR_IND" + " ON SYS_FOREIGN (FOR_NAME);\n" + "CREATE INDEX REF_IND" + " ON SYS_FOREIGN (REF_NAME);\n" + "END;\n", trx); + if (UNIV_UNLIKELY(error != DB_SUCCESS)) + { + tablename= SYS_TABLE[SYS_FOREIGN]; +err_exit: + sql_print_error("InnoDB: Creation of %.*s failed: %s", + int(tablename.size()), tablename.data(), + ut_strerr(error)); + trx->rollback(); + row_mysql_unlock_data_dictionary(trx); + trx->free(); + srv_file_per_table= srv_file_per_table_backup; + return error; + } + } + if (!sys_foreign_cols) + { + error= que_eval_sql(nullptr, "PROCEDURE CREATE_FOREIGN_COLS() IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_FOREIGN_COLS(ID CHAR, POS INT," + " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON SYS_FOREIGN_COLS (ID, POS);\n" + "END;\n", trx); + if (UNIV_UNLIKELY(error != DB_SUCCESS)) + { + tablename= SYS_TABLE[SYS_FOREIGN_COLS]; + goto err_exit; + } + } + if (!sys_virtual) + { + error= que_eval_sql(nullptr, "PROCEDURE CREATE_VIRTUAL() IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_VIRTUAL(TABLE_ID BIGINT,POS INT,BASE_POS INT);\n" + "CREATE UNIQUE CLUSTERED INDEX BASE_IDX" + " ON SYS_VIRTUAL(TABLE_ID, POS, BASE_POS);\n" + "END;\n", trx); + if (UNIV_UNLIKELY(error != DB_SUCCESS)) + { + tablename= SYS_TABLE[SYS_VIRTUAL]; + goto err_exit; + } + } + + trx->commit(); + row_mysql_unlock_data_dictionary(trx); + trx->free(); + srv_file_per_table= srv_file_per_table_backup; + + lock(SRW_LOCK_CALL); + if (sys_foreign); + else if (!(sys_foreign= load_table(SYS_TABLE[SYS_FOREIGN]))) + { + tablename= SYS_TABLE[SYS_FOREIGN]; +load_fail: + unlock(); + sql_print_error("InnoDB: Failed to CREATE TABLE %.*s", + int(tablename.size()), tablename.data()); + return DB_TABLE_NOT_FOUND; + } + else + prevent_eviction(sys_foreign); + + if (sys_foreign_cols); + else if (!(sys_foreign_cols= load_table(SYS_TABLE[SYS_FOREIGN_COLS]))) + { + tablename= SYS_TABLE[SYS_FOREIGN_COLS]; + goto load_fail; + } + else + prevent_eviction(sys_foreign_cols); + + if (sys_virtual); + else if (!(sys_virtual= load_table(SYS_TABLE[SYS_VIRTUAL]))) + { + tablename= SYS_TABLE[SYS_VIRTUAL]; + goto load_fail; + } + else + prevent_eviction(sys_virtual); + + unlock(); + return DB_SUCCESS; +} + +/****************************************************************//** +Evaluate the given foreign key SQL statement.
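+DB_DUPLICATE_KEY is explained at length in dict_foreign_err_file, since +it normally means that a constraint with the same (case-insensitive) +name already exists; any other failure is logged and passed back to the +caller.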
+@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_foreign_eval_sql( +/*==================*/ + pars_info_t* info, /*!< in: info struct */ + const char* sql, /*!< in: SQL string to evaluate */ + const char* name, /*!< in: table name (for diagnostics) */ + const char* id, /*!< in: foreign key id */ + trx_t* trx) /*!< in/out: transaction */ +{ + FILE* ef = dict_foreign_err_file; + + dberr_t error = que_eval_sql(info, sql, trx); + + switch (error) { + case DB_SUCCESS: + break; + case DB_DUPLICATE_KEY: + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Error in foreign key constraint creation for table ", + ef); + ut_print_name(ef, trx, name); + fputs(".\nA foreign key constraint of name ", ef); + ut_print_name(ef, trx, id); + fputs("\nalready exists." + " (Note that internally InnoDB adds 'databasename'\n" + "in front of the user-defined constraint name.)\n" + "Note that InnoDB's FOREIGN KEY system tables store\n" + "constraint names as case-insensitive, with the\n" + "MariaDB standard latin1_swedish_ci collation. If you\n" + "create tables or databases whose names differ only in\n" + "the character case, then collisions in constraint\n" + "names can occur. Workaround: name your constraints\n" + "explicitly with unique names.\n", + ef); + goto release; + default: + sql_print_error("InnoDB: " + "Foreign key constraint creation failed: %s", + ut_strerr(error)); + + mysql_mutex_lock(&dict_foreign_err_mutex); + ut_print_timestamp(ef); + fputs(" Internal error in foreign key constraint creation" + " for table ", ef); + ut_print_name(ef, trx, name); + fputs(".\n" + "See the MariaDB .err log in the datadir" + " for more information.\n", ef); +release: + mysql_mutex_unlock(&dict_foreign_err_mutex); + } + + return error; +} + +/********************************************************************//** +Add a single foreign key field definition to the data dictionary tables in +the database. +@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_create_add_foreign_field_to_dictionary( +/*========================================*/ + ulint field_nr, /*!< in: field number */ + const char* table_name, /*!< in: table name */ + const dict_foreign_t* foreign, /*!< in: foreign */ + trx_t* trx) /*!< in/out: transaction */ +{ + DBUG_ENTER("dict_create_add_foreign_field_to_dictionary"); + + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + + pars_info_add_int4_literal(info, "pos", field_nr); + + pars_info_add_str_literal(info, "for_col_name", + foreign->foreign_col_names[field_nr]); + + pars_info_add_str_literal(info, "ref_col_name", + foreign->referenced_col_names[field_nr]); + + DBUG_RETURN(dict_foreign_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_FOREIGN_COLS VALUES" + "(:id, :pos, :for_col_name, :ref_col_name);\n" + "END;\n", + table_name, foreign->id, trx)); +} + +/********************************************************************//** +Construct foreign key constraint definition from data dictionary information.
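+For illustration, with made-up names the generated string looks like +CONSTRAINT fk1 FOREIGN KEY (a,b) REFERENCES t2 (x,y); all identifiers +are converted with innobase_convert_name() for display.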
+*/ +static +char* +dict_foreign_def_get( +/*=================*/ + dict_foreign_t* foreign,/*!< in: foreign */ + trx_t* trx) /*!< in: trx */ +{ + char* fk_def = (char *)mem_heap_alloc(foreign->heap, 4*1024); + const char* tbname; + char tablebuf[MAX_TABLE_NAME_LEN + 1] = ""; + unsigned i; + char* bufend; + + tbname = dict_remove_db_name(foreign->id); + bufend = innobase_convert_name(tablebuf, MAX_TABLE_NAME_LEN, + tbname, strlen(tbname), trx->mysql_thd); + tablebuf[bufend - tablebuf] = '\0'; + + sprintf(fk_def, + (char *)"CONSTRAINT %s FOREIGN KEY (", (char *)tablebuf); + + for(i = 0; i < foreign->n_fields; i++) { + char buf[MAX_TABLE_NAME_LEN + 1] = ""; + innobase_convert_name(buf, MAX_TABLE_NAME_LEN, + foreign->foreign_col_names[i], + strlen(foreign->foreign_col_names[i]), + trx->mysql_thd); + strcat(fk_def, buf); + if (i < static_cast<uint>(foreign->n_fields-1)) { + strcat(fk_def, (char *)","); + } + } + + strcat(fk_def,(char *)") REFERENCES "); + + bufend = innobase_convert_name(tablebuf, MAX_TABLE_NAME_LEN, + foreign->referenced_table_name, + strlen(foreign->referenced_table_name), + trx->mysql_thd); + tablebuf[bufend - tablebuf] = '\0'; + + strcat(fk_def, tablebuf); + strcat(fk_def, " ("); + + for(i = 0; i < foreign->n_fields; i++) { + char buf[MAX_TABLE_NAME_LEN + 1] = ""; + bufend = innobase_convert_name(buf, MAX_TABLE_NAME_LEN, + foreign->referenced_col_names[i], + strlen(foreign->referenced_col_names[i]), + trx->mysql_thd); + buf[bufend - buf] = '\0'; + strcat(fk_def, buf); + if (i < (uint)foreign->n_fields-1) { + strcat(fk_def, (char *)","); + } + } + strcat(fk_def, (char *)")"); + + return fk_def; +} + +/********************************************************************//** +Convert foreign key column names from data dictionary to SQL-layer. +*/ +static +void +dict_foreign_def_get_fields( +/*========================*/ + dict_foreign_t* foreign,/*!< in: foreign */ + trx_t* trx, /*!< in: trx */ + char** field, /*!< out: foreign column */ + char** field2, /*!< out: referenced column */ + ulint col_no) /*!< in: column number */ +{ + char* bufend; + char* fieldbuf = (char *)mem_heap_alloc(foreign->heap, MAX_TABLE_NAME_LEN+1); + char* fieldbuf2 = (char *)mem_heap_alloc(foreign->heap, MAX_TABLE_NAME_LEN+1); + + bufend = innobase_convert_name(fieldbuf, MAX_TABLE_NAME_LEN, + foreign->foreign_col_names[col_no], + strlen(foreign->foreign_col_names[col_no]), + trx->mysql_thd); + + fieldbuf[bufend - fieldbuf] = '\0'; + + bufend = innobase_convert_name(fieldbuf2, MAX_TABLE_NAME_LEN, + foreign->referenced_col_names[col_no], + strlen(foreign->referenced_col_names[col_no]), + trx->mysql_thd); + + fieldbuf2[bufend - fieldbuf2] = '\0'; + *field = fieldbuf; + *field2 = fieldbuf2; +} + +/********************************************************************//** +Add a foreign key definition to the data dictionary tables.
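+SYS_FOREIGN.N_COLS packs the column count into its low 24 bits and the +constraint type flags into the high 8 bits; for example, a two-column +constraint with DICT_FOREIGN_ON_DELETE_CASCADE (1) is stored as +2 | (1 << 24) = 0x1000002. One SYS_FOREIGN_COLS row is then inserted +per referencing/referenced column pair.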
+@return error code or DB_SUCCESS */
+dberr_t
+dict_create_add_foreign_to_dictionary(
+/*==================================*/
+ const char* name, /*!< in: table name */
+ const dict_foreign_t* foreign,/*!< in: foreign key */
+ trx_t* trx) /*!< in/out: dictionary transaction */
+{
+ dberr_t error;
+
+ DBUG_ENTER("dict_create_add_foreign_to_dictionary");
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+
+ pars_info_add_str_literal(info, "for_name", name);
+
+ pars_info_add_str_literal(info, "ref_name",
+ foreign->referenced_table_name);
+
+ pars_info_add_int4_literal(info, "n_cols",
+ ulint(foreign->n_fields)
+ | (ulint(foreign->type) << 24));
+
+ DBUG_PRINT("dict_create_add_foreign_to_dictionary",
+ ("'%s', '%s', '%s', %d", foreign->id, name,
+ foreign->referenced_table_name,
+ foreign->n_fields + (foreign->type << 24)));
+
+ error = dict_foreign_eval_sql(info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "INSERT INTO SYS_FOREIGN VALUES"
+ "(:id, :for_name, :ref_name, :n_cols);\n"
+ "END;\n"
+ , name, foreign->id, trx);
+
+ if (error != DB_SUCCESS) {
+
+ if (error == DB_DUPLICATE_KEY) {
+ char buf[MAX_TABLE_NAME_LEN + 1] = "";
+ char tablename[MAX_TABLE_NAME_LEN + 1] = "";
+ char* fk_def;
+
+ innobase_convert_name(tablename, MAX_TABLE_NAME_LEN,
+ name, strlen(name), trx->mysql_thd);
+
+ innobase_convert_name(buf, MAX_TABLE_NAME_LEN,
+ foreign->id, strlen(foreign->id), trx->mysql_thd);
+
+ fk_def = dict_foreign_def_get((dict_foreign_t*)foreign, trx);
+
+ ib_push_warning(trx, error,
+ "Create or Alter table %s with foreign key constraint"
+ " failed. Foreign key constraint %s"
+ " already exists on data dictionary."
+ " Foreign key constraint names need to be unique in database."
+ " Error in foreign key definition: %s.",
+ tablename, buf, fk_def);
+ }
+
+ DBUG_RETURN(error);
+ }
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ error = dict_create_add_foreign_field_to_dictionary(
+ i, name, foreign, trx);
+
+ if (error != DB_SUCCESS) {
+ char buf[MAX_TABLE_NAME_LEN + 1] = "";
+ char tablename[MAX_TABLE_NAME_LEN + 1] = "";
+ char* field=NULL;
+ char* field2=NULL;
+ char* fk_def;
+
+ innobase_convert_name(tablename, MAX_TABLE_NAME_LEN,
+ name, strlen(name), trx->mysql_thd);
+ innobase_convert_name(buf, MAX_TABLE_NAME_LEN,
+ foreign->id, strlen(foreign->id), trx->mysql_thd);
+ fk_def = dict_foreign_def_get((dict_foreign_t*)foreign, trx);
+ dict_foreign_def_get_fields((dict_foreign_t*)foreign, trx, &field, &field2, i);
+
+ ib_push_warning(trx, error,
+ "Create or Alter table %s with foreign key constraint"
+ " failed. Error adding foreign key constraint name %s"
+ " fields %s or %s to the dictionary."
+ " Error in foreign key definition: %s.",
+ tablename, buf, field, field2, fk_def);
+
+ DBUG_RETURN(error);
+ }
+ }
+
+ DBUG_RETURN(error);
+}
+
+/** Check if a foreign constraint is on the given column name.
+@param[in] col_name column name to be searched for fk constraint
+@param[in] table table to which foreign key constraint belongs
+@return true if fk constraint is present on the table, false otherwise.
*/ +static +bool +dict_foreign_base_for_stored( + const char* col_name, + const dict_table_t* table) +{ + /* Loop through each stored column and check if its base column has + the same name as the column name being checked */ + dict_s_col_list::const_iterator it; + for (it = table->s_cols->begin(); + it != table->s_cols->end(); ++it) { + dict_s_col_t s_col = *it; + + for (ulint j = 0; j < s_col.num_base; j++) { + if (strcmp(col_name, dict_table_get_col_name( + table, + s_col.base_col[j]->ind)) == 0) { + return(true); + } + } + } + + return(false); +} + +/** Check if a foreign constraint is on columns served as base columns +of any stored column. This is to prevent creating SET NULL or CASCADE +constraint on such columns +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@return true if yes, otherwise, false */ +bool +dict_foreigns_has_s_base_col( + const dict_foreign_set& local_fk_set, + const dict_table_t* table) +{ + dict_foreign_t* foreign; + + if (table->s_cols == NULL) { + return (false); + } + + for (dict_foreign_set::const_iterator it = local_fk_set.begin(); + it != local_fk_set.end(); ++it) { + + foreign = *it; + ulint type = foreign->type; + + type &= ~(DICT_FOREIGN_ON_DELETE_NO_ACTION + | DICT_FOREIGN_ON_UPDATE_NO_ACTION); + + if (type == 0) { + continue; + } + + for (ulint i = 0; i < foreign->n_fields; i++) { + /* Check if the constraint is on a column that + is a base column of any stored column */ + if (dict_foreign_base_for_stored( + foreign->foreign_col_names[i], table)) { + return(true); + } + } + } + + return(false); +} + +/** Adds the given set of foreign key objects to the dictionary tables +in the database. This function does not modify the dictionary cache. The +caller must ensure that all foreign key objects contain a valid constraint +name in foreign->id. +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@param[in,out] trx transaction +@return error code or DB_SUCCESS */ +dberr_t +dict_create_add_foreigns_to_dictionary( +/*===================================*/ + const dict_foreign_set& local_fk_set, + const dict_table_t* table, + trx_t* trx) +{ + ut_ad(dict_sys.locked()); + + if (!dict_sys.sys_foreign) + { + sql_print_error("InnoDB: Table SYS_FOREIGN not found" + " in internal data dictionary"); + return DB_ERROR; + } + + for (auto fk : local_fk_set) + if (dberr_t error= + dict_create_add_foreign_to_dictionary(table->name.m_name, fk, trx)) + return error; + + return DB_SUCCESS; +} diff --git a/storage/innobase/dict/dict0defrag_bg.cc b/storage/innobase/dict/dict0defrag_bg.cc new file mode 100644 index 00000000..bec6da8e --- /dev/null +++ b/storage/innobase/dict/dict0defrag_bg.cc @@ -0,0 +1,434 @@ +/***************************************************************************** + +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0defrag_bg.cc +Defragmentation routines. + +Created 25/08/2016 Jan Lindström +*******************************************************/ + +#include "dict0dict.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "dict0defrag_bg.h" +#include "btr0btr.h" +#include "srv0start.h" +#include "trx0trx.h" +#include "lock0lock.h" +#include "row0mysql.h" + +static mysql_mutex_t defrag_pool_mutex; + +/** Iterator type for iterating over the elements of objects of type +defrag_pool_t. */ +typedef defrag_pool_t::iterator defrag_pool_iterator_t; + +/** Pool where we store information on which tables are to be processed +by background defragmentation. */ +defrag_pool_t defrag_pool; + + +/*****************************************************************//** +Initialize the defrag pool, called once during thread initialization. */ +void +dict_defrag_pool_init(void) +/*=======================*/ +{ + ut_ad(!srv_read_only_mode); + mysql_mutex_init(0, &defrag_pool_mutex, nullptr); +} + +/*****************************************************************//** +Free the resources occupied by the defrag pool, called once during +thread de-initialization. */ +void +dict_defrag_pool_deinit(void) +/*=========================*/ +{ + ut_ad(!srv_read_only_mode); + + mysql_mutex_destroy(&defrag_pool_mutex); +} + +/*****************************************************************//** +Get an index from the auto defrag pool. The returned index id is removed +from the pool. +@return true if the pool was non-empty and "id" was set, false otherwise */ +static +bool +dict_stats_defrag_pool_get( +/*=======================*/ + table_id_t* table_id, /*!< out: table id, or unmodified if + list is empty */ + index_id_t* index_id) /*!< out: index id, or unmodified if + list is empty */ +{ + ut_ad(!srv_read_only_mode); + + mysql_mutex_lock(&defrag_pool_mutex); + + if (defrag_pool.empty()) { + mysql_mutex_unlock(&defrag_pool_mutex); + return(false); + } + + defrag_pool_item_t& item = defrag_pool.back(); + *table_id = item.table_id; + *index_id = item.index_id; + + defrag_pool.pop_back(); + + mysql_mutex_unlock(&defrag_pool_mutex); + + return(true); +} + +/*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. 
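+A sketch of the intended pairing between the producer and the background
+statistics thread:
+@code
+dict_stats_defrag_pool_add(index);                 // remember {table_id, index_id}
+// ... later, in the background thread:
+dict_defrag_process_entries_from_defrag_pool(thd); // drain pool, persist stats
+@endcode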
+*/
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+ const dict_index_t* index) /*!< in: index to add */
+{
+ defrag_pool_item_t item;
+
+ ut_ad(!srv_read_only_mode);
+
+ mysql_mutex_lock(&defrag_pool_mutex);
+
+ /* quit if already in the list */
+ for (defrag_pool_iterator_t iter = defrag_pool.begin();
+ iter != defrag_pool.end();
+ ++iter) {
+ if ((*iter).table_id == index->table->id
+ && (*iter).index_id == index->id) {
+ mysql_mutex_unlock(&defrag_pool_mutex);
+ return;
+ }
+ }
+
+ item.table_id = index->table->id;
+ item.index_id = index->id;
+ defrag_pool.push_back(item);
+ if (defrag_pool.size() == 1) {
+ /* Kick off dict stats optimizer work */
+ dict_stats_schedule_now();
+ }
+ mysql_mutex_unlock(&defrag_pool_mutex);
+}
+
+/*****************************************************************//**
+Delete a given index from the auto defrag pool. */
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table (NULL if index is given) */
+ const dict_index_t* index) /*!< in: index (NULL if table is given) */
+{
+ ut_a((table && !index) || (!table && index));
+ ut_ad(!srv_read_only_mode);
+
+ mysql_mutex_lock(&defrag_pool_mutex);
+
+ /* Search the defrag pool and delete all matching entries */
+ for (defrag_pool_iterator_t iter = defrag_pool.begin();
+ iter != defrag_pool.end();) {
+ if ((table && (*iter).table_id == table->id)
+ || (index
+ && (*iter).table_id == index->table->id
+ && (*iter).index_id == index->id)) {
+ /* erase() invalidates the iterator */
+ iter = defrag_pool.erase(iter);
+ if (index)
+ break;
+ } else {
+ iter++;
+ }
+ }
+
+ mysql_mutex_unlock(&defrag_pool_mutex);
+}
+
+/*****************************************************************//**
+Get the first index that has been added for updating persistent defrag
+stats and eventually save its stats. */
+static void dict_stats_process_entry_from_defrag_pool(THD *thd)
+{
+ table_id_t table_id;
+ index_id_t index_id;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* pop the first index from the auto defrag pool */
+ if (!dict_stats_defrag_pool_get(&table_id, &index_id))
+ /* no index in defrag pool */
+ return;
+
+ /* If the table is no longer cached, we've already lost the in
+ memory stats so there's nothing really to write to disk. */
+ MDL_ticket *mdl= nullptr;
+ if (dict_table_t *table=
+ dict_table_open_on_id(table_id, false, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED,
+ thd, &mdl))
+ {
+ if (dict_index_t *index= !table->corrupted
+ ? dict_table_find_index_on_id(table, index_id) : nullptr)
+ if (index->is_btree())
+ dict_stats_save_defrag_stats(index);
+ dict_table_close(table, false, thd, mdl);
+ }
+}
+
+/**
+Process all entries in the defrag pool: save the defragmentation
+statistics of every index that is still cached. */
+void dict_defrag_process_entries_from_defrag_pool(THD *thd)
+{
+ while (!defrag_pool.empty())
+ dict_stats_process_entry_from_defrag_pool(thd);
+}
+
+/*********************************************************************//**
+Save defragmentation result.
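+The result is written through dict_stats_save_index_stat() as a single
+persistent statistics row, conceptually:
+@code
+// stat_name='n_pages_freed', stat_value=index->stat_defrag_n_pages_freed
+@endcode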
+@return DB_SUCCESS or error code */ +dberr_t dict_stats_save_defrag_summary(dict_index_t *index, THD *thd) +{ + if (index->is_ibuf()) + return DB_SUCCESS; + + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + dict_table_t *table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared(table_stats, thd, &mdl_table); + dict_sys.unfreeze(); + } + if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) + { +release_and_exit: + if (table_stats) + dict_table_close(table_stats, false, thd, mdl_table); + return DB_STATS_DO_NOT_EXIST; + } + + dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared(index_stats, thd, &mdl_index); + dict_sys.unfreeze(); + } + if (!index_stats) + goto release_and_exit; + if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) + { + dict_table_close(index_stats, false, thd, mdl_index); + goto release_and_exit; + } + + trx_t *trx= trx_create(); + trx->mysql_thd= thd; + trx_start_internal(trx); + dberr_t ret= trx->read_only + ? DB_READ_ONLY + : lock_table_for_trx(table_stats, trx, LOCK_X); + if (ret == DB_SUCCESS) + ret= lock_table_for_trx(index_stats, trx, LOCK_X); + row_mysql_lock_data_dictionary(trx); + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, time(nullptr), "n_pages_freed", + index->stat_defrag_n_pages_freed, + nullptr, + "Number of pages freed during" + " last defragmentation run.", + trx); + if (ret == DB_SUCCESS) + trx->commit(); + else + trx->rollback(); + + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + + row_mysql_unlock_data_dictionary(trx); + trx->free(); + + return ret; +} + +/**************************************************************//** +Gets the number of reserved and used pages in a B-tree. +@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +static +ulint +btr_get_size_and_reserved( + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ + ulint dummy; + + ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK)); + ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); + + if (index->page == FIL_NULL + || dict_index_is_online_ddl(index) + || !index->is_committed() + || !index->table->space) { + return(ULINT_UNDEFINED); + } + + dberr_t err; + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err); + *used = 0; + if (!root) { + return ULINT_UNDEFINED; + } + + mtr->x_lock_space(index->table->space); + + ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->page.frame, used, mtr); + if (flag == BTR_TOTAL_SIZE) { + n += fseg_n_reserved_pages(*root, + PAGE_HEADER + PAGE_BTR_SEG_TOP + + root->page.frame, &dummy, mtr); + *used += dummy; + } + + return(n); +} + +/*********************************************************************//** +Save defragmentation stats for a given index. 
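+Three persistent statistics rows are written per call, conceptually:
+@code
+// stat_name='n_page_split'           leaf page splits since the last run
+// stat_name='n_leaf_pages_defrag'    number of leaf pages at save time
+// stat_name='n_leaf_pages_reserved'  pages reserved for the leaf segment
+@endcode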
+@return DB_SUCCESS or error code */ +dberr_t +dict_stats_save_defrag_stats( +/*============================*/ + dict_index_t* index) /*!< in: index */ +{ + if (index->is_ibuf()) + return DB_SUCCESS; + if (!index->is_readable()) + return dict_stats_report_error(index->table, true); + + const time_t now= time(nullptr); + mtr_t mtr; + ulint n_leaf_pages; + mtr.start(); + mtr_sx_lock_index(index, &mtr); + ulint n_leaf_reserved= btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, + &n_leaf_pages, &mtr); + mtr.commit(); + + if (n_leaf_reserved == ULINT_UNDEFINED) + return DB_SUCCESS; + + THD *thd= current_thd; + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + dict_table_t* table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared(table_stats, thd, &mdl_table); + dict_sys.unfreeze(); + } + if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) + { +release_and_exit: + if (table_stats) + dict_table_close(table_stats, false, thd, mdl_table); + return DB_STATS_DO_NOT_EXIST; + } + + dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared(index_stats, thd, &mdl_index); + dict_sys.unfreeze(); + } + if (!index_stats) + goto release_and_exit; + + if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) + { + dict_table_close(index_stats, false, thd, mdl_index); + goto release_and_exit; + } + + trx_t *trx= trx_create(); + trx->mysql_thd= thd; + trx_start_internal(trx); + dberr_t ret= trx->read_only + ? DB_READ_ONLY + : lock_table_for_trx(table_stats, trx, LOCK_X); + if (ret == DB_SUCCESS) + ret= lock_table_for_trx(index_stats, trx, LOCK_X); + + row_mysql_lock_data_dictionary(trx); + + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, now, "n_page_split", + index->stat_defrag_n_page_split, nullptr, + "Number of new page splits on leaves" + " since last defragmentation.", trx); + + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, now, "n_leaf_pages_defrag", + n_leaf_pages, nullptr, + "Number of leaf pages when" + " this stat is saved to disk", trx); + + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, now, "n_leaf_pages_reserved", + n_leaf_reserved, nullptr, + "Number of pages reserved for" + " this index leaves" + " when this stat is saved to disk", trx); + + if (ret == DB_SUCCESS) + trx->commit(); + else + trx->rollback(); + + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + row_mysql_unlock_data_dictionary(trx); + trx->free(); + + return ret; +} diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc new file mode 100644 index 00000000..5bc7ab6e --- /dev/null +++ b/storage/innobase/dict/dict0dict.cc @@ -0,0 +1,4859 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0dict.cc
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include <my_global.h>
+#include <string>
+
+#include "ha_prototypes.h"
+#include <mysqld.h>
+#include <strfunc.h>
+
+#include "dict0dict.h"
+#include "fts0fts.h"
+#include "fil0fil.h"
+#include <algorithm>
+#include "sql_class.h"
+#include "sql_table.h"
+#include <mysql/service_thd_mdl.h>
+
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "buf0buf.h"
+#include "data0type.h"
+#include "dict0boot.h"
+#include "dict0load.h"
+#include "dict0crea.h"
+#include "dict0mem.h"
+#include "dict0stats.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "lock0lock.h"
+#include "mach0data.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "pars0pars.h"
+#include "pars0sym.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "row0log.h"
+#include "row0merge.h"
+#include "row0mysql.h"
+#include "row0upd.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+
+#include <vector>
+#include <algorithm>
+
+/** the dictionary system */
+dict_sys_t dict_sys;
+
+/** System table names; @see dict_system_id_t */
+const span<const char> dict_sys_t::SYS_TABLE[]=
+{
+ {C_STRING_WITH_LEN("SYS_TABLES")},{C_STRING_WITH_LEN("SYS_INDEXES")},
+ {C_STRING_WITH_LEN("SYS_COLUMNS")},{C_STRING_WITH_LEN("SYS_FIELDS")},
+ {C_STRING_WITH_LEN("SYS_FOREIGN")},{C_STRING_WITH_LEN("SYS_FOREIGN_COLS")},
+ {C_STRING_WITH_LEN("SYS_VIRTUAL")}
+};
+
+/** Diagnostic message for exceeding the mutex_lock_wait() timeout */
+const char dict_sys_t::fatal_msg[]=
+ "innodb_fatal_semaphore_wait_threshold was exceeded for dict_sys.latch. "
+ "Please refer to "
+ "https://mariadb.com/kb/en/how-to-produce-a-full-stack-trace-for-mysqld/";
+
+/** Percentage of compression failures that are allowed in a single
+round */
+ulong zip_failure_threshold_pct = 5;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+ulong zip_pad_max = 50;
+
+#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
+ creating a table or index object */
+#define DICT_POOL_PER_TABLE_HASH 512 /*!< buffer pool max size per table
+ hash table fixed size in bytes */
+#define DICT_POOL_PER_VARYING 4 /*!< buffer pool max size per data
+ dictionary varying size in bytes */
+
+/** Identifies generated InnoDB foreign key names */
+static char dict_ibfk[] = "_ibfk_";
+
+/*******************************************************************//**
+Tries to find column names for the index and sets the col field of the
+index.
+@param[in] index index +@param[in] add_v new virtual columns added along with an add index call +@return whether the column names were found */ +static +bool +dict_index_find_cols( + dict_index_t* index, + const dict_add_v_col_t* add_v); +/*******************************************************************//** +Builds the internal dictionary cache representation for a clustered +index, containing also system fields not defined by the user. +@return own: the internal representation of the clustered index */ +static +dict_index_t* +dict_index_build_internal_clust( +/*============================*/ + dict_index_t* index); /*!< in: user representation of + a clustered index */ +/*******************************************************************//** +Builds the internal dictionary cache representation for a non-clustered +index, containing also system fields not defined by the user. +@return own: the internal representation of the non-clustered index */ +static +dict_index_t* +dict_index_build_internal_non_clust( +/*================================*/ + dict_index_t* index); /*!< in: user representation of + a non-clustered index */ +/**********************************************************************//** +Builds the internal dictionary cache representation for an FTS index. +@return own: the internal representation of the FTS index */ +static +dict_index_t* +dict_index_build_internal_fts( +/*==========================*/ + dict_index_t* index); /*!< in: user representation of an FTS index */ + +/**********************************************************************//** +Removes an index from the dictionary cache. */ +static +void +dict_index_remove_from_cache_low( +/*=============================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index, /*!< in, own: index */ + ibool lru_evict); /*!< in: TRUE if page being evicted + to make room in the table LRU list */ +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate the dictionary table LRU list. +@return TRUE if validate OK */ +static +ibool +dict_lru_validate(void); +/*===================*/ +#endif /* UNIV_DEBUG */ + +/* Stream for storing detailed information about the latest foreign key +and unique key errors. Only created if !srv_read_only_mode */ +FILE* dict_foreign_err_file = NULL; +/* mutex protecting the foreign and unique error buffers */ +mysql_mutex_t dict_foreign_err_mutex; + +/********************************************************************//** +Checks if the database name in two table names is the same. +@return TRUE if same db name */ +ibool +dict_tables_have_same_db( +/*=====================*/ + const char* name1, /*!< in: table name in the form + dbname '/' tablename */ + const char* name2) /*!< in: table name in the form + dbname '/' tablename */ +{ + for (; *name1 == *name2; name1++, name2++) { + if (*name1 == '/') { + return(TRUE); + } + ut_a(*name1); /* the names must contain '/' */ + } + return(FALSE); +} + +/********************************************************************//** +Return the end of table name where we have removed dbname and '/'. 
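+For example:
+@code
+dict_remove_db_name("test/t1"); // returns a pointer to "t1"
+@endcode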
+@return table name */
+const char*
+dict_remove_db_name(
+/*================*/
+ const char* name) /*!< in: table name in the form
+ dbname '/' tablename */
+{
+ const char* s = strchr(name, '/');
+ ut_a(s);
+
+ return(s + 1);
+}
+
+/** Decrement the count of open handles */
+void dict_table_close(dict_table_t *table)
+{
+ if (table->get_ref_count() == 1 &&
+ dict_stats_is_persistent_enabled(table) &&
+ strchr(table->name.m_name, '/'))
+ {
+ /* It looks like we are closing the last handle. The user could
+ have executed FLUSH TABLES in order to have the statistics reloaded
+ from the InnoDB persistent statistics tables. We must acquire
+ exclusive dict_sys.latch to prevent a race condition with another
+ thread concurrently acquiring a handle on the table. */
+ dict_sys.lock(SRW_LOCK_CALL);
+ if (table->release())
+ {
+ table->stats_mutex_lock();
+ if (table->get_ref_count() == 0)
+ dict_stats_deinit(table);
+ table->stats_mutex_unlock();
+ }
+ dict_sys.unlock();
+ }
+ else
+ table->release();
+}
+
+/** Decrements the count of open handles of a table.
+@param[in,out] table table
+@param[in] dict_locked whether dict_sys.latch is being held
+@param[in] thd thread to release MDL
+@param[in] mdl metadata lock or NULL if the thread
+ is a foreground one. */
+void
+dict_table_close(
+ dict_table_t* table,
+ bool dict_locked,
+ THD* thd,
+ MDL_ticket* mdl)
+{
+ if (!dict_locked)
+ dict_table_close(table);
+ else
+ {
+ if (table->release() && dict_stats_is_persistent_enabled(table) &&
+ strchr(table->name.m_name, '/'))
+ {
+ /* Force persistent stats re-read upon next open of the table so
+ that FLUSH TABLE can be used to forcibly fetch stats from disk if
+ they have been manually modified. */
+ table->stats_mutex_lock();
+ if (table->get_ref_count() == 0)
+ dict_stats_deinit(table);
+ table->stats_mutex_unlock();
+ }
+
+ ut_ad(dict_lru_validate());
+ ut_ad(dict_sys.find(table));
+ }
+
+ if (!thd || !mdl);
+ else if (MDL_context *mdl_context= static_cast<MDL_context*>
+ (thd_mdl_context(thd)))
+ mdl_context->release_lock(mdl);
+}
+
+/** Check if the table has a given (non_virtual) column.
+@param[in] table table object
+@param[in] col_name column name
+@param[in] col_nr column number guessed, 0 as default
+@return column number if the table has the specified column,
+otherwise table->n_def */
+ulint
+dict_table_has_column(
+ const dict_table_t* table,
+ const char* col_name,
+ ulint col_nr)
+{
+ ulint col_max = table->n_def;
+
+ ut_ad(table);
+ ut_ad(col_name);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ if (col_nr < col_max
+ && innobase_strcasecmp(
+ col_name, dict_table_get_col_name(table, col_nr)) == 0) {
+ return(col_nr);
+ }
+
+ /** The order of columns may have changed; check all the other columns */
+ for (ulint i = 0; i < col_max; i++) {
+ if (i != col_nr
+ && innobase_strcasecmp(
+ col_name, dict_table_get_col_name(table, i)) == 0) {
+
+ return(i);
+ }
+ }
+
+ return(col_max);
+}
+
+/** Retrieve the column name.
+@param[in] table the table of this column */
+const char* dict_col_t::name(const dict_table_t& table) const
+{
+ ut_ad(table.magic_n == DICT_TABLE_MAGIC_N);
+
+ size_t col_nr;
+ const char *s;
+
+ if (is_virtual()) {
+ col_nr = size_t(reinterpret_cast<const dict_v_col_t*>(this)
+ - table.v_cols);
+ ut_ad(col_nr < table.n_v_def);
+ s = table.v_col_names;
+ } else {
+ col_nr = size_t(this - table.cols);
+ ut_ad(col_nr < table.n_def);
+ s = table.col_names;
+ }
+
+ if (s) {
+ for (size_t i = 0; i < col_nr; i++) {
+ s += strlen(s) + 1;
+ }
+ }
+
+ return(s);
+}
+
+/** Returns a virtual column's name.
+@param[in] table target table
+@param[in] col_nr virtual column number (nth virtual column)
+@return column name or NULL if column number out of range. */
+const char*
+dict_table_get_v_col_name(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ const char* s;
+
+ ut_ad(table);
+ ut_ad(col_nr < table->n_v_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ if (col_nr >= table->n_v_def) {
+ return(NULL);
+ }
+
+ s = table->v_col_names;
+
+ if (s != NULL) {
+ for (ulint i = 0; i < col_nr; i++) {
+ s += strlen(s) + 1;
+ }
+ }
+
+ return(s);
+}
+
+/** Search for a virtual column's position in InnoDB according to its
+position in the original table
+@param[in] table target table
+@param[in] col_nr column number (nth column in the MySQL table)
+@return virtual column's position in InnoDB, ULINT_UNDEFINED if not found */
+static
+ulint
+dict_table_get_v_col_pos_for_mysql(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(col_nr < static_cast<ulint>(table->n_t_def));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ for (i = 0; i < table->n_v_def; i++) {
+ if (col_nr == dict_get_v_col_mysql_pos(
+ table->v_cols[i].m_col.ind)) {
+ break;
+ }
+ }
+
+ if (i == table->n_v_def) {
+ return(ULINT_UNDEFINED);
+ }
+
+ return(i);
+}
+
+/** Returns a virtual column's name according to its original
+MySQL table position.
+@param[in] table target table
+@param[in] col_nr column number (nth column in the table)
+@return column name. */
+static
+const char*
+dict_table_get_v_col_name_mysql(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ ulint i = dict_table_get_v_col_pos_for_mysql(table, col_nr);
+
+ if (i == ULINT_UNDEFINED) {
+ return(NULL);
+ }
+
+ return(dict_table_get_v_col_name(table, i));
+}
+
+/** Get nth virtual column according to its original MySQL table position
+@param[in] table target table
+@param[in] col_nr column number in MySQL Table definition
+@return dict_v_col_t ptr */
+dict_v_col_t*
+dict_table_get_nth_v_col_mysql(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ ulint i = dict_table_get_v_col_pos_for_mysql(table, col_nr);
+
+ if (i == ULINT_UNDEFINED) {
+ return(NULL);
+ }
+
+ return(dict_table_get_nth_v_col(table, i));
+}
+
+
+/** Get all the FTS indexes on a table.
+@param[in] table table
+@param[out] indexes all FTS indexes on this table
+@return number of FTS indexes */
+ulint
+dict_table_get_all_fts_indexes(
+ const dict_table_t* table,
+ ib_vector_t* indexes)
+{
+ dict_index_t* index;
+
+ ut_a(ib_vector_size(indexes) == 0);
+
+ for (index = dict_table_get_first_index(table);
+ index;
+ index = dict_table_get_next_index(index)) {
+
+ if (index->type == DICT_FTS) {
+ ib_vector_push(indexes, &index);
+ }
+ }
+
+ return(ib_vector_size(indexes));
+}
+
+/** Looks for column n in an index.
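+For example (sketch): with an index on (a(10), b), looking up column a with
+inc_prefix=false does not match the prefixed field, but its position is
+still reported:
+@code
+ulint prefix_pos;
+dict_index_get_nth_col_or_prefix_pos(index, n_of_a, false, false,
+                                     &prefix_pos);
+// returns ULINT_UNDEFINED and sets prefix_pos = 0
+@endcode
+Here n_of_a stands for the (hypothetical) table position of column a.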
+@param[in] index index +@param[in] n column number +@param[in] inc_prefix true=consider column prefixes too +@param[in] is_virtual true==virtual column +@param[out] prefix_col_pos col num if prefix +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +ulint +dict_index_get_nth_col_or_prefix_pos( + const dict_index_t* index, + ulint n, + bool inc_prefix, + bool is_virtual, + ulint* prefix_col_pos) +{ + const dict_field_t* field; + const dict_col_t* col; + ulint pos; + ulint n_fields; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + if (prefix_col_pos) { + *prefix_col_pos = ULINT_UNDEFINED; + } + + if (is_virtual) { + col = &(dict_table_get_nth_v_col(index->table, n)->m_col); + } else { + col = dict_table_get_nth_col(index->table, n); + } + + if (dict_index_is_clust(index)) { + + return(dict_col_get_clust_pos(col, index)); + } + + n_fields = dict_index_get_n_fields(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (col == field->col) { + if (prefix_col_pos) { + *prefix_col_pos = pos; + } + if (inc_prefix || field->prefix_len == 0) { + return(pos); + } + } + } + + return(ULINT_UNDEFINED); +} + +/** Check if the index contains a column or a prefix of that column. +@param[in] n column number +@param[in] is_virtual whether it is a virtual col +@return whether the index contains the column or its prefix */ +bool dict_index_t::contains_col_or_prefix(ulint n, bool is_virtual) const +{ + ut_ad(magic_n == DICT_INDEX_MAGIC_N); + + if (is_primary()) { + return(!is_virtual); + } + + const dict_col_t* col = is_virtual + ? &dict_table_get_nth_v_col(table, n)->m_col + : dict_table_get_nth_col(table, n); + + for (ulint pos = 0; pos < n_fields; pos++) { + if (col == fields[pos].col) { + return true; + } + } + + return false; +} + +/********************************************************************//** +Looks for a matching field in an index. The column has to be the same. The +column in index must be complete, or must contain a prefix longer than the +column in index2. That is, we must be able to construct the prefix in index2 +from the prefix in index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +ulint +dict_index_get_nth_field_pos( +/*=========================*/ + const dict_index_t* index, /*!< in: index from which to search */ + const dict_index_t* index2, /*!< in: index */ + ulint n) /*!< in: field number in index2 */ +{ + const dict_field_t* field; + const dict_field_t* field2; + ulint n_fields; + ulint pos; + + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + field2 = dict_index_get_nth_field(index2, n); + + n_fields = dict_index_get_n_fields(index); + + /* Are we looking for a MBR (Minimum Bound Box) field of + a spatial index */ + bool is_mbr_fld = (n == 0 && dict_index_is_spatial(index2)); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + /* The first field of a spatial index is a transformed + MBR (Minimum Bound Box) field made out of original column, + so its field->col still points to original cluster index + col, but the actual content is different. 
So we cannot
+ consider them equal if neither of them is an MBR field */
+ if (pos == 0 && dict_index_is_spatial(index) && !is_mbr_fld) {
+ continue;
+ }
+
+ if (field->col == field2->col
+ && (field->prefix_len == 0
+ || (field->prefix_len >= field2->prefix_len
+ && field2->prefix_len != 0))) {
+
+ return(pos);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Parse the table file name into table name and database name.
+@tparam dict_frozen whether the caller holds dict_sys.latch
+@param[in,out] db_name database name buffer
+@param[in,out] tbl_name table name buffer
+@param[out] db_name_len database name length
+@param[out] tbl_name_len table name length
+@return whether the table name is visible to SQL */
+template<bool dict_frozen>
+bool dict_table_t::parse_name(char (&db_name)[NAME_LEN + 1],
+ char (&tbl_name)[NAME_LEN + 1],
+ size_t *db_name_len, size_t *tbl_name_len) const
+{
+ char db_buf[MAX_DATABASE_NAME_LEN + 1];
+ char tbl_buf[MAX_TABLE_NAME_LEN + 1];
+
+ if (!dict_frozen)
+ dict_sys.freeze(SRW_LOCK_CALL); /* protect against renaming */
+ ut_ad(dict_sys.frozen());
+ const size_t db_len= name.dblen();
+ ut_ad(db_len <= MAX_DATABASE_NAME_LEN);
+
+ memcpy(db_buf, mdl_name.m_name, db_len);
+ db_buf[db_len]= 0;
+
+ size_t tbl_len= strlen(mdl_name.m_name + db_len + 1);
+ const bool is_temp= mdl_name.is_temporary();
+
+ if (is_temp);
+ else if (const char *is_part= static_cast<const char*>
+ (memchr(mdl_name.m_name + db_len + 1, '#', tbl_len)))
+ tbl_len= static_cast<size_t>(is_part - &mdl_name.m_name[db_len + 1]);
+
+ memcpy(tbl_buf, mdl_name.m_name + db_len + 1, tbl_len);
+ tbl_buf[tbl_len]= 0;
+
+ if (!dict_frozen)
+ dict_sys.unfreeze();
+
+ *db_name_len= filename_to_tablename(db_buf, db_name,
+ MAX_DATABASE_NAME_LEN + 1, true);
+
+ if (is_temp)
+ return false;
+
+ *tbl_name_len= filename_to_tablename(tbl_buf, tbl_name,
+ MAX_TABLE_NAME_LEN + 1, true);
+ return true;
+}
+
+template bool
+dict_table_t::parse_name<>(char(&)[NAME_LEN + 1], char(&)[NAME_LEN + 1],
+ size_t*, size_t*) const;
+
+/** Acquire MDL shared for the table name.
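+Background threads normally reach this function through
+dict_table_open_on_id(); a sketch based on the defragmentation code above
+(id is an assumed table_id_t):
+@code
+MDL_ticket *mdl= nullptr;
+if (dict_table_t *t= dict_table_open_on_id(id, false,
+                                           DICT_TABLE_OP_OPEN_ONLY_IF_CACHED,
+                                           thd, &mdl))
+{
+  // ... use t ...
+  dict_table_close(t, false, thd, mdl); // also releases the MDL ticket
+}
+@endcode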
+@tparam trylock whether to use non-blocking operation
+@param[in,out] table table object
+@param[in,out] thd background thread
+@param[out] mdl mdl ticket
+@param[in] table_op operation to perform when opening
+@return table object after locking MDL shared
+@retval nullptr if the table is not readable, or if trylock && MDL blocked */
+template<bool trylock>
+dict_table_t*
+dict_acquire_mdl_shared(dict_table_t *table,
+ THD *thd,
+ MDL_ticket **mdl,
+ dict_table_op_t table_op)
+{
+ if (!table || !mdl)
+ return table;
+
+ MDL_context *mdl_context= static_cast<MDL_context*>(thd_mdl_context(thd));
+ size_t db_len;
+ dict_table_t *not_found= nullptr;
+
+ if (trylock)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ db_len= dict_get_db_name_len(table->name.m_name);
+ dict_sys.unfreeze();
+ }
+ else
+ {
+ ut_ad(dict_sys.frozen_not_locked());
+ db_len= dict_get_db_name_len(table->name.m_name);
+ }
+
+ if (db_len == 0)
+ return table; /* InnoDB system tables are not covered by MDL */
+
+ if (!mdl_context)
+ return nullptr;
+
+ table_id_t table_id= table->id;
+ char db_buf[NAME_LEN + 1], db_buf1[NAME_LEN + 1];
+ char tbl_buf[NAME_LEN + 1], tbl_buf1[NAME_LEN + 1];
+ size_t tbl_len;
+ bool unaccessible= false;
+
+ if (!table->parse_name(db_buf, tbl_buf, &db_len, &tbl_len))
+ /* The name of an intermediate table starts with #sql */
+ return table;
+
+retry:
+ if (!unaccessible && (!table->is_readable() || table->corrupted))
+ {
+ if (*mdl)
+ {
+ mdl_context->release_lock(*mdl);
+ *mdl= nullptr;
+ }
+ unaccessible= true;
+ }
+
+ if (!trylock)
+ table->release();
+
+ if (unaccessible)
+ return nullptr;
+
+ if (!trylock)
+ dict_sys.unfreeze();
+
+ {
+ MDL_request request;
+ MDL_REQUEST_INIT(&request, MDL_key::TABLE, db_buf, tbl_buf, MDL_SHARED,
+ MDL_EXPLICIT);
+ if (trylock
+ ? mdl_context->try_acquire_lock(&request)
+ : mdl_context->acquire_lock(&request,
+ /* FIXME: use compatible type, and maybe
+ remove this parameter altogether! */
+ static_cast<double>(global_system_variables
+ .lock_wait_timeout)))
+ {
+ *mdl= nullptr;
+ if (trylock)
+ return nullptr;
+ }
+ else
+ {
+ *mdl= request.ticket;
+ if (trylock && !*mdl)
+ return nullptr;
+ }
+ }
+
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table= dict_sys.find_table(table_id);
+ if (table)
+ table->acquire();
+ if (!table && table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)
+ {
+ dict_sys.unfreeze();
+ dict_sys.lock(SRW_LOCK_CALL);
+ table= dict_load_table_on_id(table_id,
+ table_op == DICT_TABLE_OP_LOAD_TABLESPACE
+ ? DICT_ERR_IGNORE_RECOVER_LOCK
+ : DICT_ERR_IGNORE_FK_NOKEY);
+ if (table)
+ table->acquire();
+ dict_sys.unlock();
+ dict_sys.freeze(SRW_LOCK_CALL);
+ }
+
+ if (!table || !table->is_accessible())
+ {
+ table= nullptr;
+return_without_mdl:
+ if (trylock)
+ dict_sys.unfreeze();
+ if (*mdl)
+ {
+ mdl_context->release_lock(*mdl);
+ *mdl= nullptr;
+ }
+ return not_found;
+ }
+
+ size_t db1_len, tbl1_len;
+
+ if (!table->parse_name<true>(db_buf1, tbl_buf1, &db1_len, &tbl1_len))
+ {
+ /* The table was renamed to #sql prefix.
+ Release MDL (if any) for the old name and return. */
+ goto return_without_mdl;
+ }
+
+ if (*mdl)
+ {
+ if (db_len == db1_len && tbl_len == tbl1_len &&
+ !memcmp(db_buf, db_buf1, db_len) &&
+ !memcmp(tbl_buf, tbl_buf1, tbl_len))
+ {
+ if (trylock)
+ dict_sys.unfreeze();
+ return table;
+ }
+
+ /* The table was renamed. Release MDL for the old name and
+ try to acquire MDL for the new name. */
+ mdl_context->release_lock(*mdl);
+ *mdl= nullptr;
+ }
+
+ db_len= db1_len;
+ tbl_len= tbl1_len;
+
+ memcpy(tbl_buf, tbl_buf1, tbl_len + 1);
+ memcpy(db_buf, db_buf1, db_len + 1);
+ goto retry;
+}
+
+template dict_table_t* dict_acquire_mdl_shared<false>
+(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
+template dict_table_t* dict_acquire_mdl_shared<true>
+(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
+
+/** Look up a table by numeric identifier.
+@param[in] table_id table identifier
+@param[in] dict_locked data dictionary locked
+@param[in] table_op operation to perform when opening
+@param[in,out] thd background thread, or NULL to not acquire MDL
+@param[out] mdl mdl ticket, or NULL
+@return table, NULL if does not exist */
+dict_table_t *dict_table_open_on_id(table_id_t table_id, bool dict_locked,
+ dict_table_op_t table_op, THD *thd,
+ MDL_ticket **mdl)
+{
+ if (!dict_locked)
+ dict_sys.freeze(SRW_LOCK_CALL);
+
+ dict_table_t *table= dict_sys.find_table(table_id);
+
+ if (table)
+ {
+ table->acquire();
+ if (thd && !dict_locked)
+ table= dict_acquire_mdl_shared(table, thd, mdl, table_op);
+ }
+ else if (table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)
+ {
+ if (!dict_locked)
+ {
+ dict_sys.unfreeze();
+ dict_sys.lock(SRW_LOCK_CALL);
+ }
+ table= dict_load_table_on_id(table_id,
+ table_op == DICT_TABLE_OP_LOAD_TABLESPACE
+ ? DICT_ERR_IGNORE_RECOVER_LOCK
+ : DICT_ERR_IGNORE_FK_NOKEY);
+ if (table)
+ table->acquire();
+ if (!dict_locked)
+ {
+ dict_sys.unlock();
+ if (table && thd)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table= dict_acquire_mdl_shared(table, thd, mdl, table_op);
+ dict_sys.unfreeze();
+ }
+ return table;
+ }
+ }
+
+ if (!dict_locked)
+ dict_sys.unfreeze();
+
+ return table;
+}
+
+/********************************************************************//**
+Looks for column n position in the clustered index.
+@return position in internal representation of the clustered index */
+unsigned
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos)
+{
+ ulint pos= dict_index_get_nth_col_pos(dict_table_get_first_index(table),
+ n, prefix_col_pos);
+ DBUG_ASSERT(pos <= dict_index_t::MAX_N_FIELDS);
+ return static_cast<unsigned>(pos);
+}
+
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n) /*!< in: column number */
+{
+ const dict_index_t* index;
+ const dict_field_t* field;
+ const dict_col_t* col;
+ ulint pos;
+ ulint n_fields;
+
+ col = dict_table_get_nth_col(table, n);
+
+ index = dict_table_get_first_index(table);
+
+ n_fields = dict_index_get_n_unique(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (col == field->col) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/** Initialise the data dictionary cache.
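+After creation, dict_sys.latch is used in two modes (sketch; see the
+definitions below):
+@code
+dict_sys.freeze(SRW_LOCK_CALL); // shared: cache lookups
+dict_sys.unfreeze();
+dict_sys.lock(SRW_LOCK_CALL);   // exclusive: load, evict, rename
+dict_sys.unlock();
+@endcode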
+*/
+void dict_sys_t::create()
+{
+ ut_ad(this == &dict_sys);
+ ut_ad(!is_initialised());
+ m_initialised= true;
+ UT_LIST_INIT(table_LRU, &dict_table_t::table_LRU);
+ UT_LIST_INIT(table_non_LRU, &dict_table_t::table_LRU);
+
+ const ulint hash_size = buf_pool_get_curr_size()
+ / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE);
+
+ table_hash.create(hash_size);
+ table_id_hash.create(hash_size);
+ temp_id_hash.create(hash_size);
+
+ latch.SRW_LOCK_INIT(dict_operation_lock_key);
+
+ if (!srv_read_only_mode)
+ {
+ dict_foreign_err_file= os_file_create_tmpfile();
+ ut_a(dict_foreign_err_file);
+ }
+
+ mysql_mutex_init(dict_foreign_err_mutex_key, &dict_foreign_err_mutex,
+ nullptr);
+}
+
+
+void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line))
+{
+ ulonglong now= my_hrtime_coarse().val, old= 0;
+ if (latch_ex_wait_start.compare_exchange_strong
+ (old, now, std::memory_order_relaxed, std::memory_order_relaxed))
+ {
+ latch.wr_lock(SRW_LOCK_ARGS(file, line));
+ latch_ex_wait_start.store(0, std::memory_order_relaxed);
+ ut_ad(!latch_readers);
+ ut_ad(!latch_ex);
+ ut_d(latch_ex= pthread_self());
+ return;
+ }
+
+ ut_ad(old);
+ /* We could have old > now due to our use of my_hrtime_coarse(). */
+ ulong waited= old <= now ? static_cast<ulong>((now - old) / 1000000) : 0;
+ const ulong threshold= srv_fatal_semaphore_wait_threshold;
+
+ if (waited >= threshold)
+ ib::fatal() << fatal_msg;
+
+ if (waited > threshold / 4)
+ ib::warn() << "A long wait (" << waited
+ << " seconds) was observed for dict_sys.latch";
+ latch.wr_lock(SRW_LOCK_ARGS(file, line));
+ ut_ad(!latch_readers);
+ ut_ad(!latch_ex);
+ ut_d(latch_ex= pthread_self());
+}
+
+#ifdef UNIV_PFS_RWLOCK
+ATTRIBUTE_NOINLINE void dict_sys_t::unlock()
+{
+ ut_ad(latch_ex == pthread_self());
+ ut_ad(!latch_readers);
+ ut_d(latch_ex= 0);
+ latch.wr_unlock();
+}
+
+ATTRIBUTE_NOINLINE void dict_sys_t::freeze(const char *file, unsigned line)
+{
+ latch.rd_lock(file, line);
+ ut_ad(!latch_ex);
+ ut_d(latch_readers++);
+}
+
+ATTRIBUTE_NOINLINE void dict_sys_t::unfreeze()
+{
+ ut_ad(!latch_ex);
+ ut_ad(latch_readers--);
+ latch.rd_unlock();
+}
+#endif /* UNIV_PFS_RWLOCK */
+
+/**********************************************************************//**
+Returns a table object and increments its open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low
+is usually the appropriate function.
+@param[in] table_name Table name
+@param[in] dict_locked whether dict_sys.latch is being held exclusively
+@param[in] ignore_err error to be ignored when loading the table
+@return table
+@retval nullptr if does not exist */
+dict_table_t*
+dict_table_open_on_name(
+ const char* table_name,
+ bool dict_locked,
+ dict_err_ignore_t ignore_err)
+{
+ dict_table_t *table;
+ DBUG_ENTER("dict_table_open_on_name");
+ DBUG_PRINT("dict_table_open_on_name", ("table: '%s'", table_name));
+
+ const span<const char> name{table_name, strlen(table_name)};
+
+ if (!dict_locked)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table= dict_sys.find_table(name);
+ if (table)
+ {
+ ut_ad(table->cached);
+ if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) &&
+ !table->is_readable() && table->corrupted)
+ {
+ ulint algo = table->space->get_compression_algo();
+ if (algo <= PAGE_ALGORITHM_LAST && !fil_comp_algo_loaded(algo)) {
+ my_printf_error(ER_PROVIDER_NOT_LOADED,
+ "Table %s is compressed with %s, which is not currently loaded. "
" + "Please load the %s provider plugin to open the table", + MYF(ME_ERROR_LOG), table->name, + page_compression_algorithms[algo], page_compression_algorithms[algo]); + } else { + my_printf_error(ER_TABLE_CORRUPT, + "Table %s is corrupted. Please drop the table and recreate.", + MYF(ME_ERROR_LOG), table->name); + } + dict_sys.unfreeze(); + DBUG_RETURN(nullptr); + } + table->acquire(); + dict_sys.unfreeze(); + DBUG_RETURN(table); + } + dict_sys.unfreeze(); + dict_sys.lock(SRW_LOCK_CALL); + } + + table= dict_sys.load_table(name, ignore_err); + + if (table) + { + ut_ad(table->cached); + if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) && + !table->is_readable() && table->corrupted) + { + ib::error() << "Table " << table->name + << " is corrupted. Please drop the table and recreate."; + if (!dict_locked) + dict_sys.unlock(); + DBUG_RETURN(nullptr); + } + + table->acquire(); + } + + ut_ad(dict_lru_validate()); + if (!dict_locked) + dict_sys.unlock(); + + DBUG_RETURN(table); +} + +/**********************************************************************//** +Adds system columns to a table object. */ +void +dict_table_add_system_columns( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + mem_heap_t* heap) /*!< in: temporary heap */ +{ + ut_ad(table->n_def == table->n_cols - DATA_N_SYS_COLS); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!table->cached); + + /* NOTE: the system columns MUST be added in the following order + (so that they can be indexed by the numerical value of DATA_ROW_ID, + etc.) and as the last columns of the table memory object. + The clustered index will not always physically contain all system + columns. */ + + dict_mem_table_add_col(table, heap, "DB_ROW_ID", DATA_SYS, + DATA_ROW_ID | DATA_NOT_NULL, + DATA_ROW_ID_LEN); + + compile_time_assert(DATA_ROW_ID == 0); + dict_mem_table_add_col(table, heap, "DB_TRX_ID", DATA_SYS, + DATA_TRX_ID | DATA_NOT_NULL, + DATA_TRX_ID_LEN); + compile_time_assert(DATA_TRX_ID == 1); + dict_mem_table_add_col(table, heap, "DB_ROLL_PTR", DATA_SYS, + DATA_ROLL_PTR | DATA_NOT_NULL, + DATA_ROLL_PTR_LEN); + compile_time_assert(DATA_ROLL_PTR == 2); + + /* This check reminds that if a new system column is added to + the program, it should be dealt with here */ + compile_time_assert(DATA_N_SYS_COLS == 3); +} + +/** Add the table definition to the data dictionary cache */ +void dict_table_t::add_to_cache() +{ + cached = TRUE; + + dict_sys.add(this); +} + +/** Add a table definition to the data dictionary cache */ +inline void dict_sys_t::add(dict_table_t* table) +{ + ut_ad(!find(table)); + + ulint fold = my_crc32c(0, table->name.m_name, + strlen(table->name.m_name)); + + table->autoinc_mutex.init(); + table->lock_mutex_init(); + + /* Look for a table with the same name: error if such exists */ + { + dict_table_t* table2; + HASH_SEARCH(name_hash, &table_hash, fold, + dict_table_t*, table2, ut_ad(table2->cached), + !strcmp(table2->name.m_name, table->name.m_name)); + ut_a(table2 == NULL); + +#ifdef UNIV_DEBUG + /* Look for the same table pointer with a different name */ + HASH_SEARCH_ALL(name_hash, &table_hash, + dict_table_t*, table2, ut_ad(table2->cached), + table2 == table); + ut_ad(table2 == NULL); +#endif /* UNIV_DEBUG */ + } + HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table); + + /* Look for a table with the same id: error if such exists */ + hash_table_t* id_hash = table->is_temporary() + ? 
&temp_id_hash : &table_id_hash;
+ const ulint id_fold = ut_fold_ull(table->id);
+ {
+ dict_table_t* table2;
+ HASH_SEARCH(id_hash, id_hash, id_fold,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2->id == table->id);
+ ut_a(table2 == NULL);
+
+#ifdef UNIV_DEBUG
+ /* Look for the same table pointer with a different id */
+ HASH_SEARCH_ALL(id_hash, id_hash,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2 == table);
+ ut_ad(table2 == NULL);
+#endif /* UNIV_DEBUG */
+
+ HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table);
+ }
+
+ UT_LIST_ADD_FIRST(table->can_be_evicted ? table_LRU : table_non_LRU,
+ table);
+ ut_ad(dict_lru_validate());
+}
+
+/** Test whether a table can be evicted from dict_sys.table_LRU.
+@param table table to be considered for eviction
+@return whether the table can be evicted */
+TRANSACTIONAL_TARGET
+static bool dict_table_can_be_evicted(dict_table_t *table)
+{
+ ut_ad(dict_sys.locked());
+ ut_a(table->can_be_evicted);
+ ut_a(table->foreign_set.empty());
+ ut_a(table->referenced_set.empty());
+
+ if (table->get_ref_count() == 0) {
+ /* The transaction commit and rollback are called from
+ outside the handler interface. This means that there is
+ a window where the table->n_ref_count can be zero but
+ the table instance is in "use". */
+
+ if (lock_table_has_locks(table)) {
+ return false;
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We cannot really evict the table if adaptive hash
+ index entries are pointing to any of its indexes. */
+ for (const dict_index_t* index
+ = dict_table_get_first_index(table);
+ index; index = dict_table_get_next_index(index)) {
+ if (index->n_ahi_pages()) {
+ return false;
+ }
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ ut_ad(!table->fts);
+ return true;
+ }
+
+ return false;
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** @return a clone of this */
+dict_index_t *dict_index_t::clone() const
+{
+ ut_ad(n_fields);
+ ut_ad(is_btree());
+ ut_ad(online_status == ONLINE_INDEX_COMPLETE);
+ ut_ad(is_committed());
+ ut_ad(!is_dummy);
+ ut_ad(!parser);
+ ut_ad(!online_log);
+ ut_ad(!rtr_track);
+
+ const size_t size= sizeof *this + n_fields * sizeof(*fields) +
+#ifdef BTR_CUR_ADAPT
+ sizeof *search_info +
+#endif
+ 1 + strlen(name) +
+ n_uniq * (sizeof *stat_n_diff_key_vals +
+ sizeof *stat_n_sample_sizes +
+ sizeof *stat_n_non_null_key_vals);
+
+ mem_heap_t* heap= mem_heap_create(size);
+ dict_index_t *index= static_cast<dict_index_t*>
+ (mem_heap_alloc(heap, sizeof *this));
+ *index= *this;
+ index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key);
+ index->heap= heap;
+ index->name= mem_heap_strdup(heap, name);
+ index->fields= static_cast<dict_field_t*>
+ (mem_heap_dup(heap, fields, n_fields * sizeof *fields));
+#ifdef BTR_CUR_ADAPT
+ index->search_info= btr_search_info_create(index->heap);
+#endif /* BTR_CUR_ADAPT */
+ index->stat_n_diff_key_vals= static_cast<ib_uint64_t*>
+ (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_diff_key_vals));
+ index->stat_n_sample_sizes= static_cast<ib_uint64_t*>
+ (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_sample_sizes));
+ index->stat_n_non_null_key_vals= static_cast<ib_uint64_t*>
+ (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_non_null_key_vals));
+ new (&index->zip_pad.mutex) std::mutex();
+ return index;
+}
+
+/** Clone this index for lazy dropping of the adaptive hash.
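+(Sketch) A caller that must remove an index object while the adaptive hash
+index still references it swaps in the clone:
+@code
+index= index->clone_if_needed(); // old object parks on table->freed_indexes
+@endcode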
+@return this or a clone */
+dict_index_t *dict_index_t::clone_if_needed()
+{
+ if (!search_info->ref_count)
+ return this;
+ dict_index_t *prev= UT_LIST_GET_PREV(indexes, this);
+
+ table->autoinc_mutex.wr_lock();
+ UT_LIST_REMOVE(table->indexes, this);
+ UT_LIST_ADD_LAST(table->freed_indexes, this);
+ dict_index_t *index= clone();
+ set_freed();
+ if (prev)
+ UT_LIST_INSERT_AFTER(table->indexes, prev, index);
+ else
+ UT_LIST_ADD_FIRST(table->indexes, index);
+ table->autoinc_mutex.wr_unlock();
+ return index;
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/** Evict unused, unlocked tables from table_LRU.
+@param half whether to consider half the tables only (instead of all)
+@return number of tables evicted */
+ulint dict_sys_t::evict_table_LRU(bool half)
+{
+#ifdef MYSQL_DYNAMIC_PLUGIN
+ constexpr ulint max_tables = 400;
+#else
+ extern ulong tdc_size;
+ const ulint max_tables = tdc_size;
+#endif
+ ulint n_evicted = 0;
+
+ lock(SRW_LOCK_CALL);
+ ut_ad(dict_lru_validate());
+
+ const ulint len = UT_LIST_GET_LEN(table_LRU);
+
+ if (len < max_tables) {
+func_exit:
+ unlock();
+ return(n_evicted);
+ }
+
+ const ulint check_up_to = half ? len / 2 : 0;
+ ulint i = len;
+
+ /* Find a suitable candidate to evict from the cache. Don't scan the
+ entire LRU list; only scan down to check_up_to entries. */
+
+ for (dict_table_t *table = UT_LIST_GET_LAST(table_LRU);
+ table && i > check_up_to && (len - n_evicted) > max_tables; --i) {
+ dict_table_t* prev_table = UT_LIST_GET_PREV(table_LRU, table);
+
+ if (dict_table_can_be_evicted(table)) {
+ remove(table, true);
+ ++n_evicted;
+ }
+
+ table = prev_table;
+ }
+
+ goto func_exit;
+}
+
+/** Looks for an index with the given id given a table instance.
+@param[in] table table instance
+@param[in] id index id
+@return index or NULL */
+dict_index_t*
+dict_table_find_index_on_id(
+ const dict_table_t* table,
+ index_id_t id)
+{
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (id == index->id) {
+ /* Found */
+
+ return(index);
+ }
+ }
+
+ return(NULL);
+}
+
+/** Function object to remove a foreign key constraint from the
+referenced_set of the referenced table. The foreign key object is
+also removed from the dictionary cache. The foreign key constraint
+is not removed from the foreign_set of the table containing the
+constraint. */
+struct dict_foreign_remove_partial
+{
+ void operator()(dict_foreign_t* foreign) {
+ dict_table_t* table = foreign->referenced_table;
+ if (table != NULL) {
+ table->referenced_set.erase(foreign);
+ }
+ dict_foreign_free(foreign);
+ }
+};
+
+/** This function returns a new path name after replacing the basename
+in an old path with a new basename. The old_path is a full path
+name including the extension. The tablename is in the normal
+form "databasename/tablename". The new base name is found after
+the forward slash. Both input strings are null terminated.
+
+This function allocates memory to be returned. It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@param[in] old_path Pathname
+@param[in] tablename Contains new base name
+@return own: new full pathname */
+static char *dir_pathname(const char *old_path, span<const char> tablename)
+{
+ /* Split the tablename into its database and table name components.
+ They are separated by a '/'. */
+ const char *base_name= tablename.data();
+ for (const char *last= tablename.end(); last > tablename.data(); last--)
+ {
+ if (last[-1] == '/')
+ {
+ base_name= last;
+ break;
+ }
+ }
+ const size_t base_name_len= tablename.end() - base_name;
+
+ /* Find the offset of the last slash. We will strip off the
+ old basename.ibd which starts after that slash. */
+ const char *last_slash= strrchr(old_path, '/');
+#ifdef _WIN32
+ if (const char *last= strrchr(old_path, '\\'))
+ if (last > last_slash)
+ last_slash= last;
+#endif
+
+ size_t dir_len= last_slash
+ ? size_t(last_slash - old_path)
+ : strlen(old_path);
+
+ /* allocate a new path and move the old directory path to it. */
+ size_t new_path_len= dir_len + base_name_len + sizeof "/.ibd";
+ char *new_path= static_cast<char*>(ut_malloc_nokey(new_path_len));
+ memcpy(new_path, old_path, dir_len);
+ snprintf(new_path + dir_len, new_path_len - dir_len, "/%.*s.ibd",
+ int(base_name_len), base_name);
+ return new_path;
+}
+
+/** Rename the data file.
+@param new_name name of the table
+@param replace whether to replace the file with the new name
+ (as part of rolling back TRUNCATE) */
+dberr_t
+dict_table_t::rename_tablespace(span<const char> new_name, bool replace) const
+{
+ ut_ad(dict_table_is_file_per_table(this));
+ ut_ad(!is_temporary());
+
+ if (!space)
+ return DB_SUCCESS;
+
+ const char *old_path= UT_LIST_GET_FIRST(space->chain)->name;
+ const bool data_dir= DICT_TF_HAS_DATA_DIR(flags);
+ char *path= data_dir
+ ? dir_pathname(old_path, new_name)
+ : fil_make_filepath(nullptr, new_name, IBD, false);
+ dberr_t err;
+ if (!path)
+ err= DB_OUT_OF_MEMORY;
+ else if (!strcmp(path, old_path))
+ err= DB_SUCCESS;
+ else if (data_dir &&
+ DB_SUCCESS != RemoteDatafile::create_link_file(new_name, path))
+ err= DB_TABLESPACE_EXISTS;
+ else
+ {
+ space->x_lock();
+ err= space->rename(path, true, replace);
+ if (data_dir)
+ {
+ if (err == DB_SUCCESS)
+ new_name= {name.m_name, strlen(name.m_name)};
+ RemoteDatafile::delete_link_file(new_name);
+ }
+ space->x_unlock();
+ }
+
+ ut_free(path);
+ return err;
+}
+
+/**********************************************************************//**
+Renames a table object.
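+A typical call (sketch; the caller holds dict_sys.latch exclusively, and
+new_name is assumed to be a NUL-terminated "db/table" string):
+@code
+dberr_t err= dict_table_rename_in_cache(table,
+                                        {new_name, strlen(new_name)},
+                                        false);
+@endcode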
+@return DB_SUCCESS or error code */
+dberr_t
+dict_table_rename_in_cache(
+/*=======================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	span<const char> new_name,	/*!< in: new name */
+	bool		replace_new_file)
+					/*!< in: whether to replace the
+					file with the new name
+					(as part of rolling back TRUNCATE) */
+{
+	dict_foreign_t*	foreign;
+	char		old_name[MAX_FULL_NAME_LEN + 1];
+
+	ut_ad(dict_sys.locked());
+
+	/* store the old/current name to an automatic variable */
+	const size_t old_name_len = strlen(table->name.m_name);
+	ut_a(old_name_len < sizeof old_name);
+	strcpy(old_name, table->name.m_name);
+
+	const uint32_t fold = my_crc32c(0, new_name.data(), new_name.size());
+	ut_a(!dict_sys.find_table(new_name));
+
+	if (!dict_table_is_file_per_table(table)) {
+	} else if (dberr_t err = table->rename_tablespace(new_name,
+							  replace_new_file)) {
+		return err;
+	}
+
+	/* Remove table from the hash tables of tables */
+	HASH_DELETE(dict_table_t, name_hash, &dict_sys.table_hash,
+		    my_crc32c(0, table->name.m_name, old_name_len), table);
+
+	bool keep_mdl_name = !table->name.is_temporary();
+
+	if (!keep_mdl_name) {
+	} else if (const char* s = static_cast<const char*>(
+			   memchr(new_name.data(), '/', new_name.size()))) {
+		keep_mdl_name = new_name.end() - s >= 5
+			&& !memcmp(s, "/#sql", 5);
+	}
+
+	if (keep_mdl_name) {
+		/* Preserve the original table name for
+		dict_table_t::parse_name() and dict_acquire_mdl_shared(). */
+		table->mdl_name.m_name = mem_heap_strdup(table->heap,
+							 table->name.m_name);
+	}
+
+	if (new_name.size() > strlen(table->name.m_name)) {
+		/* We allocate MAX_FULL_NAME_LEN + 1 bytes here to avoid
+		memory fragmentation; we assume that repeated calls of
+		ut_realloc() with the same size do not cause fragmentation */
+		ut_a(new_name.size() <= MAX_FULL_NAME_LEN);
+
+		table->name.m_name = static_cast<char*>(
+			ut_realloc(table->name.m_name, MAX_FULL_NAME_LEN + 1));
+	}
+	memcpy(table->name.m_name, new_name.data(), new_name.size());
+	table->name.m_name[new_name.size()] = '\0';
+
+	if (!keep_mdl_name) {
+		table->mdl_name.m_name = table->name.m_name;
+	}
+
+	/* Add table to hash table of tables */
+	HASH_INSERT(dict_table_t, name_hash, &dict_sys.table_hash, fold,
+		    table);
+
+	if (table->name.is_temporary()) {
+		/* In ALTER TABLE we think of the rename table operation
+		in the direction table -> temporary table (#sql...)
+		as dropping the table with the old name and creating
+		a new one with the new name. Thus we kind of drop the
+		constraints from the dictionary cache here. The foreign key
+		constraints will be inherited to the new table from the
+		system tables through a call to dict_load_foreigns. */
+
+		/* Remove the foreign constraints from the cache */
+		std::for_each(table->foreign_set.begin(),
+			      table->foreign_set.end(),
+			      dict_foreign_remove_partial());
+		table->foreign_set.clear();
+
+		/* Reset table field in referencing constraints */
+		for (dict_foreign_set::iterator it
+			= table->referenced_set.begin();
+		     it != table->referenced_set.end();
+		     ++it) {
+
+			foreign = *it;
+			foreign->referenced_table = NULL;
+			foreign->referenced_index = NULL;
+		}
+
+		/* Make the set of referencing constraints empty */
+		table->referenced_set.clear();
+
+		return(DB_SUCCESS);
+	}
+
+	/* Update the table name fields in foreign constraints, and update also
+	the constraint id of new format >= 4.0.18 constraints. Note that at
+	this point we have already changed table->name to the new name.
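+	Editorial example (not part of the upstream comment): a generated
+	constraint id "db/t1_ibfk_2" becomes "db/t2_ibfk_2" when the table
+	is renamed from db/t1 to db/t2, while a user-specified id only has
+	its database name prefix replaced.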
*/ + + dict_foreign_set fk_set; + + for (;;) { + + dict_foreign_set::iterator it + = table->foreign_set.begin(); + + if (it == table->foreign_set.end()) { + break; + } + + foreign = *it; + + if (foreign->referenced_table) { + foreign->referenced_table->referenced_set.erase(foreign); + } + + if (strlen(foreign->foreign_table_name) + < strlen(table->name.m_name)) { + /* Allocate a longer name buffer; + TODO: store buf len to save memory */ + + foreign->foreign_table_name = mem_heap_strdup( + foreign->heap, table->name.m_name); + dict_mem_foreign_table_name_lookup_set(foreign, TRUE); + } else { + strcpy(foreign->foreign_table_name, + table->name.m_name); + dict_mem_foreign_table_name_lookup_set(foreign, FALSE); + } + if (strchr(foreign->id, '/')) { + /* This is a >= 4.0.18 format id */ + + ulint db_len; + char* old_id; + char old_name_cs_filename[MAX_FULL_NAME_LEN+1]; + uint errors = 0; + + /* All table names are internally stored in charset + my_charset_filename (except the temp tables and the + partition identifier suffix in partition tables). The + foreign key constraint names are internally stored + in UTF-8 charset. The variable fkid here is used + to store foreign key constraint name in charset + my_charset_filename for comparison further below. */ + char fkid[MAX_TABLE_NAME_LEN * 2 + 20]; + + /* The old table name in my_charset_filename is stored + in old_name_cs_filename */ + + strcpy(old_name_cs_filename, old_name); + old_name_cs_filename[MAX_FULL_NAME_LEN] = '\0'; + if (!dict_table_t::is_temporary_name(old_name)) { + innobase_convert_to_system_charset( + strchr(old_name_cs_filename, '/') + 1, + strchr(old_name, '/') + 1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* There has been an error to convert + old table into UTF-8. This probably + means that the old table name is + actually in UTF-8. */ + innobase_convert_to_filename_charset( + strchr(old_name_cs_filename, + '/') + 1, + strchr(old_name, '/') + 1, + MAX_TABLE_NAME_LEN); + } else { + /* Old name already in + my_charset_filename */ + strcpy(old_name_cs_filename, old_name); + old_name_cs_filename[MAX_FULL_NAME_LEN] + = '\0'; + } + } + + strncpy(fkid, foreign->id, (sizeof fkid) - 1); + fkid[(sizeof fkid) - 1] = '\0'; + + const bool on_tmp = dict_table_t::is_temporary_name( + fkid); + + if (!on_tmp) { + innobase_convert_to_filename_charset( + strchr(fkid, '/') + 1, + strchr(foreign->id, '/') + 1, + MAX_TABLE_NAME_LEN+20); + } + + old_id = mem_strdup(foreign->id); + + if (strlen(fkid) > strlen(old_name_cs_filename) + + ((sizeof dict_ibfk) - 1) + && !memcmp(fkid, old_name_cs_filename, + strlen(old_name_cs_filename)) + && !memcmp(fkid + strlen(old_name_cs_filename), + dict_ibfk, (sizeof dict_ibfk) - 1)) { + + /* This is a generated >= 4.0.18 format id */ + + char table_name[MAX_TABLE_NAME_LEN + 1]; + uint errors = 0; + + if (strlen(table->name.m_name) + > strlen(old_name)) { + foreign->id = static_cast( + mem_heap_alloc( + foreign->heap, + strlen(table->name.m_name) + + strlen(old_id) + 1)); + } + + /* Convert the table name to UTF-8 */ + strncpy(table_name, table->name.m_name, + MAX_TABLE_NAME_LEN); + table_name[MAX_TABLE_NAME_LEN] = '\0'; + innobase_convert_to_system_charset( + strchr(table_name, '/') + 1, + strchr(table->name.m_name, '/') + 1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* Table name could not be converted + from charset my_charset_filename to + UTF-8. This means that the table name + is already in UTF-8 (#mysql50#). 
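+					(Editor's note: the #mysql50#
+					prefix marks pre-5.1 names that
+					were never converted to the
+					filename-safe encoding.)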
 */
+					strncpy(table_name, table->name.m_name,
+						MAX_TABLE_NAME_LEN);
+					table_name[MAX_TABLE_NAME_LEN] = '\0';
+				}
+
+				/* Replace the prefix 'databasename/tablename'
+				with the new names */
+				strcpy(foreign->id, table_name);
+				if (on_tmp) {
+					strcat(foreign->id,
+					       old_id + strlen(old_name));
+				} else {
+					sprintf(strchr(foreign->id, '/') + 1,
+						"%s%s",
+						strchr(table_name, '/') + 1,
+						strstr(old_id, "_ibfk_"));
+				}
+
+			} else {
+				/* This is a >= 4.0.18 format id where the user
+				gave the id name */
+				db_len = dict_get_db_name_len(
+					table->name.m_name) + 1;
+
+				if (db_len - 1
+				    > dict_get_db_name_len(foreign->id)) {
+
+					foreign->id = static_cast<char*>(
+						mem_heap_alloc(
+							foreign->heap,
+							db_len
+							+ strlen(old_id) + 1));
+				}
+
+				/* Replace the database prefix in id with the
+				one from table->name */
+
+				memcpy(foreign->id,
+				       table->name.m_name, db_len);
+
+				strcpy(foreign->id + db_len,
+				       dict_remove_db_name(old_id));
+			}
+
+			ut_free(old_id);
+		}
+
+		table->foreign_set.erase(it);
+		fk_set.insert(foreign);
+
+		if (foreign->referenced_table) {
+			foreign->referenced_table->referenced_set.insert(foreign);
+		}
+	}
+
+	ut_a(table->foreign_set.empty());
+	table->foreign_set.swap(fk_set);
+
+	for (dict_foreign_set::iterator it = table->referenced_set.begin();
+	     it != table->referenced_set.end();
+	     ++it) {
+
+		foreign = *it;
+
+		if (strlen(foreign->referenced_table_name)
+		    < strlen(table->name.m_name)) {
+			/* Allocate a longer name buffer;
+			TODO: store buf len to save memory */
+
+			foreign->referenced_table_name = mem_heap_strdup(
+				foreign->heap, table->name.m_name);
+
+			dict_mem_referenced_table_name_lookup_set(
+				foreign, TRUE);
+		} else {
+			/* Use the same buffer */
+			strcpy(foreign->referenced_table_name,
+			       table->name.m_name);
+
+			dict_mem_referenced_table_name_lookup_set(
+				foreign, FALSE);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table object already in cache */
+	table_id_t	new_id)	/*!< in: new id to set */
+{
+	ut_ad(dict_sys.locked());
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(!table->is_temporary());
+
+	/* Remove the table from the hash table of id's */
+	HASH_DELETE(dict_table_t, id_hash, &dict_sys.table_id_hash,
+		    ut_fold_ull(table->id), table);
+	table->id = new_id;
+
+	/* Add the table back to the hash table */
+	HASH_INSERT(dict_table_t, id_hash, &dict_sys.table_id_hash,
+		    ut_fold_ull(table->id), table);
+}
+
+/** Evict a table definition from the InnoDB data dictionary cache.
+@param[in,out] table cached table definition to be evicted +@param[in] lru whether this is part of least-recently-used eviction +@param[in] keep whether to keep (not free) the object */ +void dict_sys_t::remove(dict_table_t* table, bool lru, bool keep) +{ + dict_foreign_t* foreign; + dict_index_t* index; + + ut_ad(dict_lru_validate()); + ut_a(table->get_ref_count() == 0); + ut_a(table->n_rec_locks == 0); + ut_ad(find(table)); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* Remove the foreign constraints from the cache */ + std::for_each(table->foreign_set.begin(), table->foreign_set.end(), + dict_foreign_remove_partial()); + table->foreign_set.clear(); + + /* Reset table field in referencing constraints */ + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + foreign->referenced_table = NULL; + foreign->referenced_index = NULL; + } + + /* Remove the indexes from the cache */ + + for (index = UT_LIST_GET_LAST(table->indexes); + index != NULL; + index = UT_LIST_GET_LAST(table->indexes)) { + + dict_index_remove_from_cache_low(table, index, lru); + } + + /* Remove table from the hash tables of tables */ + + HASH_DELETE(dict_table_t, name_hash, &table_hash, + my_crc32c(0, table->name.m_name, + strlen(table->name.m_name)), + table); + + hash_table_t* id_hash = table->is_temporary() + ? &temp_id_hash : &table_id_hash; + const ulint id_fold = ut_fold_ull(table->id); + HASH_DELETE(dict_table_t, id_hash, id_hash, id_fold, table); + + /* Remove table from LRU or non-LRU list. */ + if (table->can_be_evicted) { + UT_LIST_REMOVE(table_LRU, table); + } else { + UT_LIST_REMOVE(table_non_LRU, table); + } + + /* Free virtual column template if any */ + if (table->vc_templ != NULL) { + dict_free_vc_templ(table->vc_templ); + UT_DELETE(table->vc_templ); + } + + table->lock_mutex_destroy(); + + if (keep) { + table->autoinc_mutex.destroy(); + return; + } + +#ifdef BTR_CUR_HASH_ADAPT + if (table->fts) { + fts_optimize_remove_table(table); + table->fts->~fts_t(); + table->fts = nullptr; + } + + table->autoinc_mutex.wr_lock(); + + ulint freed = UT_LIST_GET_LEN(table->freed_indexes); + + table->vc_templ = NULL; + table->id = 0; + table->autoinc_mutex.wr_unlock(); + + if (UNIV_UNLIKELY(freed != 0)) { + return; + } +#endif /* BTR_CUR_HASH_ADAPT */ + + table->autoinc_mutex.destroy(); + dict_mem_table_free(table); +} + +/****************************************************************//** +If the given column name is reserved for InnoDB system columns, return +TRUE. +@return TRUE if name is reserved */ +ibool +dict_col_name_is_reserved( +/*======================*/ + const char* name) /*!< in: column name */ +{ + static const char* reserved_names[] = { + "DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR" + }; + + compile_time_assert(UT_ARR_SIZE(reserved_names) == DATA_N_SYS_COLS); + + for (ulint i = 0; i < UT_ARR_SIZE(reserved_names); i++) { + if (innobase_strcasecmp(name, reserved_names[i]) == 0) { + + return(TRUE); + } + } + + return(FALSE); +} + +/** Adds an index to the dictionary cache, with possible indexing newly +added column. +@param[in,out] index index; NOTE! The index memory + object is freed in this function! 
+@param[in] page_no root page number of the index +@param[in] add_v virtual columns being added along with ADD INDEX +@return DB_SUCCESS, or DB_CORRUPTION */ +dberr_t +dict_index_add_to_cache( + dict_index_t*& index, + ulint page_no, + const dict_add_v_col_t* add_v) +{ + dict_index_t* new_index; + ulint n_ord; + ulint i; + + ut_ad(dict_sys.locked()); + ut_ad(index->n_def == index->n_fields); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(!dict_index_is_ibuf(index)); + + ut_d(mem_heap_validate(index->heap)); + ut_a(!dict_index_is_clust(index) + || UT_LIST_GET_LEN(index->table->indexes) == 0); + ut_ad(dict_index_is_clust(index) || !index->table->no_rollback()); + + if (!dict_index_find_cols(index, add_v)) { + + dict_mem_index_free(index); + index = NULL; + return DB_CORRUPTION; + } + + /* Build the cache internal representation of the index, + containing also the added system fields */ + + if (dict_index_is_clust(index)) { + new_index = dict_index_build_internal_clust(index); + } else { + new_index = (index->type & DICT_FTS) + ? dict_index_build_internal_fts(index) + : dict_index_build_internal_non_clust(index); + new_index->n_core_null_bytes = static_cast( + UT_BITS_IN_BYTES(unsigned(new_index->n_nullable))); + } + + /* Set the n_fields value in new_index to the actual defined + number of fields in the cache internal representation */ + + new_index->n_fields = new_index->n_def; + new_index->trx_id = index->trx_id; + new_index->set_committed(index->is_committed()); + new_index->nulls_equal = index->nulls_equal; + + n_ord = new_index->n_uniq; + /* Flag the ordering columns and also set column max_prefix */ + + for (i = 0; i < n_ord; i++) { + const dict_field_t* field + = dict_index_get_nth_field(new_index, i); + + /* Check the column being added in the index for + the first time and flag the ordering column. */ + if (field->col->ord_part == 0 ) { + field->col->max_prefix = field->prefix_len; + field->col->ord_part = 1; + } else if (field->prefix_len == 0) { + /* Set the max_prefix for a column to 0 if + its prefix length is 0 (for this index) + even if it was a part of any other index + with some prefix length. */ + field->col->max_prefix = 0; + } else if (field->col->max_prefix != 0 + && field->prefix_len + > field->col->max_prefix) { + /* Set the max_prefix value based on the + prefix_len. 
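+				Editorial example (not part of the
+				upstream comment): a column first
+				indexed with prefix_len 4 and later
+				with prefix_len 10 ends up with
+				max_prefix 10; once any index covers
+				the full column (prefix_len 0),
+				max_prefix is reset to 0 and stays 0.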
*/ + ut_ad(field->col->is_binary() + || field->prefix_len % field->col->mbmaxlen == 0 + || field->prefix_len % 4 == 0); + field->col->max_prefix = field->prefix_len; + } + ut_ad(field->col->ord_part == 1); + } + + new_index->stat_n_diff_key_vals = + static_cast(mem_heap_zalloc( + new_index->heap, + dict_index_get_n_unique(new_index) + * sizeof(*new_index->stat_n_diff_key_vals))); + + new_index->stat_n_sample_sizes = + static_cast(mem_heap_zalloc( + new_index->heap, + dict_index_get_n_unique(new_index) + * sizeof(*new_index->stat_n_sample_sizes))); + + new_index->stat_n_non_null_key_vals = + static_cast(mem_heap_zalloc( + new_index->heap, + dict_index_get_n_unique(new_index) + * sizeof(*new_index->stat_n_non_null_key_vals))); + + new_index->stat_index_size = 1; + new_index->stat_n_leaf_pages = 1; + + new_index->stat_defrag_n_pages_freed = 0; + new_index->stat_defrag_n_page_split = 0; + + new_index->stat_defrag_sample_next_slot = 0; + memset(&new_index->stat_defrag_data_size_sample, + 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE); + + /* Add the new index as the last index for the table */ + + UT_LIST_ADD_LAST(new_index->table->indexes, new_index); +#ifdef BTR_CUR_ADAPT + new_index->search_info = btr_search_info_create(new_index->heap); +#endif /* BTR_CUR_ADAPT */ + + new_index->page = unsigned(page_no); + new_index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key); + + new_index->n_core_fields = new_index->n_fields; + + dict_mem_index_free(index); + index = new_index; + return DB_SUCCESS; +} + +/**********************************************************************//** +Removes an index from the dictionary cache. */ +TRANSACTIONAL_TARGET +static +void +dict_index_remove_from_cache_low( +/*=============================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index, /*!< in, own: index */ + ibool lru_evict) /*!< in: TRUE if index being evicted + to make room in the table LRU list */ +{ + ut_ad(table && index); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(dict_sys.locked()); + ut_ad(table->id); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!index->freed()); +#endif /* BTR_CUR_HASH_ADAPT */ + + /* No need to acquire the dict_index_t::lock here because + there can't be any active operations on this index (or table). */ + + if (index->online_log) { + row_log_free(index->online_log); + index->online_log = NULL; + } + + /* Remove the index from the list of indexes of the table */ + UT_LIST_REMOVE(table->indexes, index); + + /* The index is being dropped, remove any compression stats for it. */ + if (!lru_evict && DICT_TF_GET_ZIP_SSIZE(index->table->flags)) { + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index.erase(index->id); + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + } + + /* Remove the index from affected virtual column index list */ + index->detach_columns(); + +#ifdef BTR_CUR_HASH_ADAPT + /* We always create search info whether or not adaptive + hash index is enabled or not. */ + /* We are not allowed to free the in-memory index struct + dict_index_t until all entries in the adaptive hash index + that point to any of the page belonging to his b-tree index + are dropped. This is so because dropping of these entries + require access to dict_index_t struct. To avoid such scenario + We keep a count of number of such pages in the search_info and + only free the dict_index_t struct when this count drops to + zero. 
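+	(Editor's note: the count in question is assumed here to be
+	search_info->ref_count, which the adaptive hash index code
+	maintains as index pages are hashed and un-hashed.)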
See also: dict_table_can_be_evicted() */
+
+	if (index->n_ahi_pages()) {
+		table->autoinc_mutex.wr_lock();
+		index->set_freed();
+		UT_LIST_ADD_LAST(table->freed_indexes, index);
+		table->autoinc_mutex.wr_unlock();
+		return;
+	}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	index->lock.free();
+
+	dict_mem_index_free(index);
+}
+
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+void
+dict_index_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_index_t*	index)	/*!< in, own: index */
+{
+	dict_index_remove_from_cache_low(table, index, FALSE);
+}
+
+/** Tries to find column names for the index and sets the col field of the
+index.
+@param[in,out]	index	index
+@param[in]	add_v	new virtual columns added along with an add index call
+@return whether the column names were found */
+static
+bool
+dict_index_find_cols(
+	dict_index_t*		index,
+	const dict_add_v_col_t*	add_v)
+{
+	std::vector<ulint, ut_allocator<ulint> >	col_added;
+	std::vector<ulint, ut_allocator<ulint> >	v_col_added;
+
+	const dict_table_t*	table = index->table;
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(dict_sys.locked());
+
+	for (ulint i = 0; i < index->n_fields; i++) {
+		ulint		j;
+		dict_field_t*	field = dict_index_get_nth_field(index, i);
+
+		for (j = 0; j < table->n_cols; j++) {
+			if (!innobase_strcasecmp(
+				    dict_table_get_col_name(table, j),
+				    field->name)) {
+
+				/* Check if the same column is being
+				assigned again, which suggests that the
+				column has a duplicate name. */
+				bool	exists =
+					std::find(col_added.begin(),
+						  col_added.end(), j)
+					!= col_added.end();
+
+				if (exists) {
+					/* Duplicate column found. */
+					goto dup_err;
+				}
+
+				field->col = dict_table_get_nth_col(table, j);
+
+				col_added.push_back(j);
+
+				goto found;
+			}
+		}
+
+		/* Let's check if it is a virtual column */
+		for (j = 0; j < table->n_v_cols; j++) {
+			if (!strcmp(dict_table_get_v_col_name(table, j),
+				    field->name)) {
+
+				/* Check if the same column is being
+				assigned again, which suggests that the
+				column has a duplicate name. */
+				bool	exists =
+					std::find(v_col_added.begin(),
+						  v_col_added.end(), j)
+					!= v_col_added.end();
+
+				if (exists) {
+					/* Duplicate column found. */
+					break;
+				}
+
+				field->col = reinterpret_cast<dict_col_t*>(
+					dict_table_get_nth_v_col(table, j));
+
+				v_col_added.push_back(j);
+
+				goto found;
+			}
+		}
+
+		if (add_v) {
+			for (j = 0; j < add_v->n_v_col; j++) {
+				if (!strcmp(add_v->v_col_name[j],
+					    field->name)) {
+					field->col = const_cast<dict_col_t*>(
+						&add_v->v_col[j].m_col);
+					goto found;
+				}
+			}
+		}
+
+dup_err:
+#ifdef UNIV_DEBUG
+		/* It is an error not to find a matching column. */
+		ib::error() << "No matching column for " << field->name
+			<< " in index " << index->name
+			<< " of table " << table->name;
+#endif /* UNIV_DEBUG */
+		return(FALSE);
+
+found:
+		;
+	}
+
+	return(TRUE);
+}
+
+/** Add a column to an index.
+@param index index +@param table table +@param col column +@param prefix_len column prefix length +@param descending whether to use descending order */ +void dict_index_add_col(dict_index_t *index, const dict_table_t *table, + dict_col_t *col, ulint prefix_len, bool descending) +{ + dict_field_t* field; + const char* col_name; + + if (col->is_virtual()) { + dict_v_col_t* v_col = reinterpret_cast(col); + /* Register the index with the virtual column index list */ + v_col->v_indexes.push_front(dict_v_idx_t(index, index->n_def)); + col_name = dict_table_get_v_col_name_mysql( + table, dict_col_get_no(col)); + } else { + col_name = dict_table_get_col_name(table, dict_col_get_no(col)); + } + + dict_mem_index_add_field(index, col_name, prefix_len); + + field = dict_index_get_nth_field(index, unsigned(index->n_def) - 1); + + field->col = col; + field->fixed_len = static_cast( + dict_col_get_fixed_size( + col, dict_table_is_comp(table))) + & ((1U << 10) - 1); + + if (prefix_len && field->fixed_len > prefix_len) { + field->fixed_len = static_cast(prefix_len) + & ((1U << 10) - 1); + } + + /* Long fixed-length fields that need external storage are treated as + variable-length fields, so that the extern flag can be embedded in + the length word. */ + + if (field->fixed_len > DICT_MAX_FIXED_COL_LEN) { + field->fixed_len = 0; + } + + field->descending = descending; + + /* The comparison limit above must be constant. If it were + changed, the disk format of some fixed-length columns would + change, which would be a disaster. */ + compile_time_assert(DICT_MAX_FIXED_COL_LEN == 768); + + if (!(col->prtype & DATA_NOT_NULL)) { + index->n_nullable++; + } +} + +/*******************************************************************//** +Copies fields contained in index2 to index1. */ +static +void +dict_index_copy( +/*============*/ + dict_index_t* index1, /*!< in: index to copy to */ + const dict_index_t* index2, /*!< in: index to copy from */ + ulint start, /*!< in: first position to copy */ + ulint end) /*!< in: last position to copy */ +{ + dict_field_t* field; + ulint i; + + /* Copy fields contained in index2 */ + + for (i = start; i < end; i++) { + + field = dict_index_get_nth_field(index2, i); + + dict_index_add_col(index1, index2->table, field->col, + field->prefix_len, field->descending); + } +} + +/*******************************************************************//** +Copies types of fields contained in index to tuple. */ +void +dict_index_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_index_t* index, /*!< in: index */ + ulint n_fields) /*!< in: number of + field types to copy */ +{ + ulint i; + + if (dict_index_is_ibuf(index)) { + dtuple_set_types_binary(tuple, n_fields); + + return; + } + + for (i = 0; i < n_fields; i++) { + const dict_field_t* ifield; + dtype_t* dfield_type; + + ifield = dict_index_get_nth_field(index, i); + dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i)); + dict_col_copy_type(dict_field_get_col(ifield), dfield_type); + if (dict_index_is_spatial(index) + && DATA_GEOMETRY_MTYPE(dfield_type->mtype)) { + dfield_type->prtype |= DATA_GIS_MBR; + } + } +} + +/** Copies types of virtual columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). 
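+(Editor's sketch, not part of the upstream comment: assuming the tuple
+was created with virtual fields, e.g.
+	dtuple_t*	entry = dtuple_create_with_vcol(heap, n_f, n_v);
+	dict_table_copy_v_types(entry, table);
+dict_table_copy_types() below also finishes by calling this function.)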
+@param[in,out] tuple data tuple +@param[in] table table +*/ +void +dict_table_copy_v_types( + dtuple_t* tuple, + const dict_table_t* table) +{ + /* tuple could have more virtual columns than existing table, + if we are calling this for creating index along with adding + virtual columns */ + ulint n_fields = ut_min(dtuple_get_n_v_fields(tuple), + static_cast(table->n_v_def)); + + for (ulint i = 0; i < n_fields; i++) { + + dfield_t* dfield = dtuple_get_nth_v_field(tuple, i); + dtype_t* dtype = dfield_get_type(dfield); + + dfield_set_null(dfield); + dict_col_copy_type( + &(dict_table_get_nth_v_col(table, i)->m_col), + dtype); + } +} +/*******************************************************************//** +Copies types of columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). */ +void +dict_table_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_table_t* table) /*!< in: table */ +{ + ulint i; + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + dfield_t* dfield = dtuple_get_nth_field(tuple, i); + dtype_t* dtype = dfield_get_type(dfield); + + dfield_set_null(dfield); + dict_col_copy_type(dict_table_get_nth_col(table, i), dtype); + } + + dict_table_copy_v_types(tuple, table); +} + +/*******************************************************************//** +Builds the internal dictionary cache representation for a clustered +index, containing also system fields not defined by the user. +@return own: the internal representation of the clustered index */ +static +dict_index_t* +dict_index_build_internal_clust( +/*============================*/ + dict_index_t* index) /*!< in: user representation of + a clustered index */ +{ + dict_table_t* table = index->table; + dict_index_t* new_index; + dict_field_t* field; + ulint trx_id_pos; + ulint i; + ibool* indexed; + + ut_ad(index->is_primary()); + ut_ad(!index->has_virtual()); + + ut_ad(dict_sys.locked()); + + /* Create a new index object with certainly enough fields */ + new_index = dict_mem_index_create(index->table, index->name, + index->type, + unsigned(index->n_fields + + table->n_cols)); + + /* Copy other relevant data from the old index struct to the new + struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + + /* Copy the fields of index */ + dict_index_copy(new_index, index, 0, index->n_fields); + + if (dict_index_is_unique(index)) { + /* Only the fields defined so far are needed to identify + the index entry uniquely */ + + new_index->n_uniq = new_index->n_def; + } else { + /* Also the row id is needed to identify the entry */ + new_index->n_uniq = unsigned(new_index->n_def + 1) + & dict_index_t::MAX_N_FIELDS; + } + + new_index->trx_id_offset = 0; + + /* Add system columns, trx id first */ + + trx_id_pos = new_index->n_def; + + compile_time_assert(DATA_ROW_ID == 0); + compile_time_assert(DATA_TRX_ID == 1); + compile_time_assert(DATA_ROLL_PTR == 2); + + if (!dict_index_is_unique(index)) { + dict_index_add_col(new_index, table, + dict_table_get_sys_col( + table, DATA_ROW_ID), + 0); + trx_id_pos++; + } + + dict_index_add_col( + new_index, table, + dict_table_get_sys_col(table, DATA_TRX_ID), 0); + + for (i = 0; i < trx_id_pos; i++) { + + ulint fixed_size = dict_col_get_fixed_size( + dict_index_get_nth_col(new_index, i), + dict_table_is_comp(table)); + + if (fixed_size == 0) { + new_index->trx_id_offset = 0; + + break; + } + + 
dict_field_t* field = dict_index_get_nth_field( + new_index, i); + if (field->prefix_len > 0) { + new_index->trx_id_offset = 0; + + break; + } + + /* Add fixed_size to new_index->trx_id_offset. + Because the latter is a bit-field, an overflow + can theoretically occur. Check for it. */ + fixed_size += new_index->trx_id_offset; + + new_index->trx_id_offset = static_cast(fixed_size) + & ((1U << 12) - 1); + + if (new_index->trx_id_offset != fixed_size) { + /* Overflow. Pretend that this is a + variable-length PRIMARY KEY. */ + ut_ad(0); + new_index->trx_id_offset = 0; + break; + } + } + + dict_index_add_col( + new_index, table, + dict_table_get_sys_col(table, DATA_ROLL_PTR), 0); + + /* Remember the table columns already contained in new_index */ + indexed = static_cast( + ut_zalloc_nokey(table->n_cols * sizeof *indexed)); + + /* Mark the table columns already contained in new_index */ + for (i = 0; i < new_index->n_def; i++) { + + field = dict_index_get_nth_field(new_index, i); + + /* If there is only a prefix of the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + indexed[field->col->ind] = TRUE; + } + } + + /* Add to new_index non-system columns of table not yet included + there */ + for (i = 0; i + DATA_N_SYS_COLS < ulint(table->n_cols); i++) { + dict_col_t* col = dict_table_get_nth_col(table, i); + ut_ad(col->mtype != DATA_SYS); + + if (!indexed[col->ind]) { + dict_index_add_col(new_index, table, col, 0); + } + } + + ut_free(indexed); + + ut_ad(UT_LIST_GET_LEN(table->indexes) == 0); + + new_index->n_core_null_bytes = table->supports_instant() + ? dict_index_t::NO_CORE_NULL_BYTES + : static_cast( + UT_BITS_IN_BYTES(unsigned(new_index->n_nullable))); + new_index->cached = TRUE; + + return(new_index); +} + +/*******************************************************************//** +Builds the internal dictionary cache representation for a non-clustered +index, containing also system fields not defined by the user. 
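+(Editor's example, not part of the upstream comment: for
+CREATE TABLE t(a INT PRIMARY KEY, b INT, KEY(b)), the cache-internal
+representation of the secondary index is (b, a); the clustered index
+key fields needed to identify the row are appended.)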
+@return own: the internal representation of the non-clustered index */ +static +dict_index_t* +dict_index_build_internal_non_clust( +/*================================*/ + dict_index_t* index) /*!< in: user representation of + a non-clustered index */ +{ + dict_field_t* field; + dict_index_t* new_index; + dict_index_t* clust_index; + dict_table_t* table = index->table; + ulint i; + ibool* indexed; + + ut_ad(table && index); + ut_ad(!dict_index_is_clust(index)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_sys.locked()); + + /* The clustered index should be the first in the list of indexes */ + clust_index = UT_LIST_GET_FIRST(table->indexes); + + ut_ad(clust_index); + ut_ad(dict_index_is_clust(clust_index)); + ut_ad(!dict_index_is_ibuf(clust_index)); + + /* Create a new index */ + new_index = dict_mem_index_create( + index->table, index->name, index->type, + ulint(index->n_fields + 1 + clust_index->n_uniq)); + + /* Copy other relevant data from the old index + struct to the new struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + + /* Copy fields from index to new_index */ + dict_index_copy(new_index, index, 0, index->n_fields); + + /* Remember the table columns already contained in new_index */ + indexed = static_cast( + ut_zalloc_nokey(table->n_cols * sizeof *indexed)); + + /* Mark the table columns already contained in new_index */ + for (i = 0; i < new_index->n_def; i++) { + + field = dict_index_get_nth_field(new_index, i); + + if (field->col->is_virtual()) { + continue; + } + + /* If there is only a prefix of the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + indexed[field->col->ind] = TRUE; + } + } + + /* Add to new_index the columns necessary to determine the clustered + index entry uniquely */ + + for (i = 0; i < clust_index->n_uniq; i++) { + field = dict_index_get_nth_field(clust_index, i); + + if (!indexed[field->col->ind] || index->is_spatial()) { + dict_index_add_col(new_index, table, field->col, + field->prefix_len, + field->descending); + } + } + + ut_free(indexed); + + if (dict_index_is_unique(index)) { + new_index->n_uniq = index->n_fields; + } else { + new_index->n_uniq = new_index->n_def; + } + + /* Set the n_fields value in new_index to the actual defined + number of fields */ + + new_index->n_fields = new_index->n_def; + + new_index->cached = TRUE; + + return(new_index); +} + +/*********************************************************************** +Builds the internal dictionary cache representation for an FTS index. 
+@return own: the internal representation of the FTS index */ +static +dict_index_t* +dict_index_build_internal_fts( +/*==========================*/ + dict_index_t* index) /*!< in: user representation of an FTS index */ +{ + dict_index_t* new_index; + + ut_ad(index->type & DICT_FTS); + ut_ad(dict_sys.locked()); + + /* Create a new index */ + new_index = dict_mem_index_create(index->table, index->name, + index->type, index->n_fields); + + /* Copy other relevant data from the old index struct to the new + struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + + /* Copy fields from index to new_index */ + dict_index_copy(new_index, index, 0, index->n_fields); + + new_index->n_uniq = 0; + new_index->cached = TRUE; + + dict_table_t* table = index->table; + + if (table->fts->cache == NULL) { + table->fts->cache = fts_cache_create(table); + } + + mysql_mutex_lock(&table->fts->cache->init_lock); + /* Notify the FTS cache about this index. */ + fts_cache_index_cache_create(table, new_index); + mysql_mutex_unlock(&table->fts->cache->init_lock); + + return(new_index); +} +/*====================== FOREIGN KEY PROCESSING ========================*/ + +/**********************************************************************//** +Removes a foreign constraint struct from the dictionary cache. */ +void +dict_foreign_remove_from_cache( +/*===========================*/ + dict_foreign_t* foreign) /*!< in, own: foreign constraint */ +{ + ut_ad(dict_sys.locked()); + ut_a(foreign); + + if (foreign->referenced_table != NULL) { + foreign->referenced_table->referenced_set.erase(foreign); + } + + if (foreign->foreign_table != NULL) { + foreign->foreign_table->foreign_set.erase(foreign); + } + + dict_foreign_free(foreign); +} + +/**********************************************************************//** +Looks for the foreign constraint from the foreign and referenced lists +of a table. +@return foreign constraint */ +static +dict_foreign_t* +dict_foreign_find( +/*==============*/ + dict_table_t* table, /*!< in: table object */ + dict_foreign_t* foreign) /*!< in: foreign constraint */ +{ + ut_ad(dict_sys.frozen()); + + ut_ad(dict_foreign_set_validate(table->foreign_set)); + ut_ad(dict_foreign_set_validate(table->referenced_set)); + + dict_foreign_set::iterator it = table->foreign_set.find(foreign); + + if (it != table->foreign_set.end()) { + return(*it); + } + + it = table->referenced_set.find(foreign); + + if (it != table->referenced_set.end()) { + return(*it); + } + + return(NULL); +} + +/*********************************************************************//** +Tries to find an index whose first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. +@return matching index, NULL if not found */ +dict_index_t* +dict_foreign_find_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. 
only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error, /*!< out: error code */ + ulint* err_col_no, + /*!< out: column number where + error happened */ + dict_index_t** err_index) + /*!< out: index where error + happened */ +{ + ut_ad(dict_sys.frozen()); + + if (error) { + *error = FK_INDEX_NOT_FOUND; + } + + for (dict_index_t* index = dict_table_get_first_index(table); + index; + index = dict_table_get_next_index(index)) { + if (types_idx != index + && !index->to_be_dropped + && !dict_index_is_online_ddl(index) + && dict_foreign_qualify_index( + table, col_names, columns, n_cols, + index, types_idx, + check_charsets, check_null, + error, err_col_no, err_index)) { + if (error) { + *error = FK_SUCCESS; + } + + return(index); + } + } + + return(NULL); +} +/**********************************************************************//** +Report an error in a foreign key definition. */ +static +void +dict_foreign_error_report_low( +/*==========================*/ + FILE* file, /*!< in: output stream */ + const char* name) /*!< in: table name */ +{ + rewind(file); + ut_print_timestamp(file); + fprintf(file, " Error in foreign key constraint of table %s:\n", + name); +} + +/**********************************************************************//** +Report an error in a foreign key definition. */ +static +void +dict_foreign_error_report( +/*======================*/ + FILE* file, /*!< in: output stream */ + dict_foreign_t* fk, /*!< in: foreign key constraint */ + const char* msg) /*!< in: the error message */ +{ + std::string fk_str; + mysql_mutex_lock(&dict_foreign_err_mutex); + dict_foreign_error_report_low(file, fk->foreign_table_name); + fputs(msg, file); + fputs(" Constraint:\n", file); + fk_str = dict_print_info_on_foreign_key_in_create_format(NULL, fk, TRUE); + fputs(fk_str.c_str(), file); + putc('\n', file); + if (fk->foreign_index) { + fprintf(file, "The index in the foreign key in table is" + " %s\n%s\n", fk->foreign_index->name(), + FOREIGN_KEY_CONSTRAINTS_MSG); + } + mysql_mutex_unlock(&dict_foreign_err_mutex); +} + +/**********************************************************************//** +Adds a foreign key constraint object to the dictionary cache. May free +the object if there already is an object with the same identifier in. +At least one of the foreign table and the referenced table must already +be in the dictionary cache! 
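+(Editor's note: if an equal constraint is already in the cache, the
+object passed in is freed and the cached object is completed instead.)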
+@return DB_SUCCESS or error code */ +dberr_t +dict_foreign_add_to_cache( +/*======================*/ + dict_foreign_t* foreign, + /*!< in, own: foreign key constraint */ + const char** col_names, + /*!< in: column names, or NULL to use + foreign->foreign_table->col_names */ + bool check_charsets, + /*!< in: whether to check charset + compatibility */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored */ +{ + dict_table_t* for_table; + dict_table_t* ref_table; + dict_foreign_t* for_in_cache = NULL; + dict_index_t* index; + ibool added_to_referenced_list= FALSE; + FILE* ef = dict_foreign_err_file; + + DBUG_ENTER("dict_foreign_add_to_cache"); + DBUG_PRINT("dict_foreign_add_to_cache", ("id: %s", foreign->id)); + + ut_ad(dict_sys.locked()); + + for_table = dict_sys.find_table( + {foreign->foreign_table_name_lookup, + strlen(foreign->foreign_table_name_lookup)}); + + ref_table = dict_sys.find_table( + {foreign->referenced_table_name_lookup, + strlen(foreign->referenced_table_name_lookup)}); + ut_a(for_table || ref_table); + + if (for_table) { + for_in_cache = dict_foreign_find(for_table, foreign); + } + + if (!for_in_cache && ref_table) { + for_in_cache = dict_foreign_find(ref_table, foreign); + } + + if (for_in_cache) { + dict_foreign_free(foreign); + } else { + for_in_cache = foreign; + + } + + if (ref_table && !for_in_cache->referenced_table) { + index = dict_foreign_find_index( + ref_table, NULL, + for_in_cache->referenced_col_names, + for_in_cache->n_fields, for_in_cache->foreign_index, + check_charsets, false); + + if (index == NULL + && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) { + dict_foreign_error_report( + ef, for_in_cache, + "there is no index in referenced table" + " which would contain\n" + "the columns as the first columns," + " or the data types in the\n" + "referenced table do not match" + " the ones in table."); + + if (for_in_cache == foreign) { + dict_foreign_free(foreign); + } + + DBUG_RETURN(DB_CANNOT_ADD_CONSTRAINT); + } + + for_in_cache->referenced_table = ref_table; + for_in_cache->referenced_index = index; + + std::pair ret + = ref_table->referenced_set.insert(for_in_cache); + + ut_a(ret.second); /* second is true if the insertion + took place */ + added_to_referenced_list = TRUE; + } + + if (for_table && !for_in_cache->foreign_table) { + index = dict_foreign_find_index( + for_table, col_names, + for_in_cache->foreign_col_names, + for_in_cache->n_fields, + for_in_cache->referenced_index, check_charsets, + for_in_cache->type + & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL)); + + if (index == NULL + && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) { + dict_foreign_error_report( + ef, for_in_cache, + "there is no index in the table" + " which would contain\n" + "the columns as the first columns," + " or the data types in the\n" + "table do not match" + " the ones in the referenced table\n" + "or one of the ON ... 
SET NULL columns" + " is declared NOT NULL."); + + if (for_in_cache == foreign) { + if (added_to_referenced_list) { + const dict_foreign_set::size_type + n = ref_table->referenced_set + .erase(for_in_cache); + + ut_a(n == 1); /* the number of + elements removed must + be one */ + } + + dict_foreign_free(foreign); + } + + DBUG_RETURN(DB_CANNOT_ADD_CONSTRAINT); + } + + for_in_cache->foreign_table = for_table; + for_in_cache->foreign_index = index; + + std::pair ret + = for_table->foreign_set.insert(for_in_cache); + + ut_a(ret.second); /* second is true if the insertion + took place */ + } + + /* We need to move the table to the non-LRU end of the table LRU + list. Otherwise it will be evicted from the cache. */ + + if (ref_table != NULL) { + dict_sys.prevent_eviction(ref_table); + } + + if (for_table != NULL) { + dict_sys.prevent_eviction(for_table); + } + + ut_ad(dict_lru_validate()); + DBUG_RETURN(DB_SUCCESS); +} + +/*********************************************************************//** +Scans from pointer onwards. Stops if is at the start of a copy of +'string' where characters are compared without case sensitivity, and +only outside `` or "" quotes. Stops also at NUL. +@return scanned up to this */ +static +const char* +dict_scan_to( +/*=========*/ + const char* ptr, /*!< in: scan from */ + const char* string) /*!< in: look for this */ +{ + char quote = '\0'; + bool escape = false; + + for (; *ptr; ptr++) { + if (*ptr == quote) { + /* Closing quote character: do not look for + starting quote or the keyword. */ + + /* If the quote character is escaped by a + backslash, ignore it. */ + if (escape) { + escape = false; + } else { + quote = '\0'; + } + } else if (quote) { + /* Within quotes: do nothing. */ + if (escape) { + escape = false; + } else if (*ptr == '\\') { + escape = true; + } + } else if (*ptr == '`' || *ptr == '"' || *ptr == '\'') { + /* Starting quote: remember the quote character. */ + quote = *ptr; + } else { + /* Outside quotes: look for the keyword. */ + ulint i; + for (i = 0; string[i]; i++) { + if (toupper((int)(unsigned char)(ptr[i])) + != toupper((int)(unsigned char) + (string[i]))) { + goto nomatch; + } + } + break; +nomatch: + ; + } + } + + return(ptr); +} + +/*********************************************************************//** +Accepts a specified string. Comparisons are case-insensitive. +@return if string was accepted, the pointer is moved after that, else +ptr is returned */ +static +const char* +dict_accept( +/*========*/ + CHARSET_INFO* cs, /*!< in: the character set of ptr */ + const char* ptr, /*!< in: scan from this */ + const char* string, /*!< in: accept only this string as the next + non-whitespace string */ + ibool* success)/*!< out: TRUE if accepted */ +{ + const char* old_ptr = ptr; + const char* old_ptr2; + + *success = FALSE; + + while (my_isspace(cs, *ptr)) { + ptr++; + } + + old_ptr2 = ptr; + + ptr = dict_scan_to(ptr, string); + + if (*ptr == '\0' || old_ptr2 != ptr) { + return(old_ptr); + } + + *success = TRUE; + + return ptr + strlen(string); +} + +/*********************************************************************//** +Scans an id. For the lexical definition of an 'id', see the code below. +Strips backquotes or double quotes from around the id. 
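+(Editor's example, not part of the upstream comment: scanning `fk``1`
+yields the id fk`1; a doubled quote character inside a quoted id
+stands for a single literal occurrence.)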
+@return scanned to */ +static +const char* +dict_scan_id( +/*=========*/ + CHARSET_INFO* cs, /*!< in: the character set of ptr */ + const char* ptr, /*!< in: scanned to */ + mem_heap_t* heap, /*!< in: heap where to allocate the id + (NULL=id will not be allocated, but it + will point to string near ptr) */ + const char** id, /*!< out,own: the id; NULL if no id was + scannable */ + ibool table_id,/*!< in: TRUE=convert the allocated id + as a table name; FALSE=convert to UTF-8 */ + ibool accept_also_dot) + /*!< in: TRUE if also a dot can appear in a + non-quoted id; in a quoted id it can appear + always */ +{ + char quote = '\0'; + ulint len = 0; + const char* s; + char* str; + char* dst; + + *id = NULL; + + while (my_isspace(cs, *ptr)) { + ptr++; + } + + if (*ptr == '\0') { + + return(ptr); + } + + if (*ptr == '`' || *ptr == '"') { + quote = *ptr++; + } + + s = ptr; + + if (quote) { + for (;;) { + if (!*ptr) { + /* Syntax error */ + return(ptr); + } + if (*ptr == quote) { + ptr++; + if (*ptr != quote) { + break; + } + } + ptr++; + len++; + } + } else { + while (!my_isspace(cs, *ptr) && *ptr != '(' && *ptr != ')' + && (accept_also_dot || *ptr != '.') + && *ptr != ',' && *ptr != '\0') { + + ptr++; + } + + len = ulint(ptr - s); + } + + if (heap == NULL) { + /* no heap given: id will point to source string */ + *id = s; + return(ptr); + } + + if (quote) { + char* d; + + str = d = static_cast( + mem_heap_alloc(heap, len + 1)); + + while (len--) { + if ((*d++ = *s++) == quote) { + s++; + } + } + *d++ = 0; + len = ulint(d - str); + ut_ad(*s == quote); + ut_ad(s + 1 == ptr); + } else { + str = mem_heap_strdupl(heap, s, len); + } + + if (!table_id) { +convert_id: + /* Convert the identifier from connection character set + to UTF-8. */ + len = 3 * len + 1; + *id = dst = static_cast(mem_heap_alloc(heap, len)); + + innobase_convert_from_id(cs, dst, str, len); + } else if (!strncmp(str, srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1)) { + /* This is a pre-5.1 table name + containing chars other than [A-Za-z0-9]. + Discard the prefix and use raw UTF-8 encoding. */ + str += sizeof(srv_mysql50_table_name_prefix) - 1; + len -= sizeof(srv_mysql50_table_name_prefix) - 1; + goto convert_id; + } else { + /* Encode using filename-safe characters. */ + len = 5 * len + 1; + *id = dst = static_cast(mem_heap_alloc(heap, len)); + + innobase_convert_from_table_id(cs, dst, str, len); + } + + return(ptr); +} + +/*********************************************************************//** +Open a table from its database and table name, this is currently used by +foreign constraint parser to get the referenced table. 
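+(Editor's note: the returned name is in the internal
+"databasename/tablename" form, e.g. test/child, case-folded according
+to lower_case_table_names.)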
+@return complete table name with database and table name, allocated from +heap memory passed in */ +char* +dict_get_referenced_table( + const char* name, /*!< in: foreign key table name */ + const char* database_name, /*!< in: table db name */ + ulint database_name_len, /*!< in: db name length */ + const char* table_name, /*!< in: table name */ + ulint table_name_len, /*!< in: table name length */ + dict_table_t** table, /*!< out: table object or NULL */ + mem_heap_t* heap, /*!< in/out: heap memory */ + CHARSET_INFO* from_cs) /*!< in: table name charset */ +{ + char* ref; + char db_name[MAX_DATABASE_NAME_LEN]; + char tbl_name[MAX_TABLE_NAME_LEN]; + CHARSET_INFO* to_cs = &my_charset_filename; + uint errors; + ut_ad(database_name || name); + ut_ad(table_name); + + if (!strncmp(table_name, srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1)) { + /* This is a pre-5.1 table name + containing chars other than [A-Za-z0-9]. + Discard the prefix and use raw UTF-8 encoding. */ + table_name += sizeof(srv_mysql50_table_name_prefix) - 1; + table_name_len -= sizeof(srv_mysql50_table_name_prefix) - 1; + + to_cs = system_charset_info; + } + + table_name_len = strconvert(from_cs, table_name, table_name_len, to_cs, + tbl_name, MAX_TABLE_NAME_LEN, &errors); + table_name = tbl_name; + + if (database_name) { + to_cs = &my_charset_filename; + if (!strncmp(database_name, srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1)) { + database_name + += sizeof(srv_mysql50_table_name_prefix) - 1; + database_name_len + -= sizeof(srv_mysql50_table_name_prefix) - 1; + to_cs = system_charset_info; + } + + database_name_len = strconvert( + from_cs, database_name, database_name_len, to_cs, + db_name, MAX_DATABASE_NAME_LEN, &errors); + database_name = db_name; + } else { + /* Use the database name of the foreign key table */ + + database_name = name; + database_name_len = dict_get_db_name_len(name); + } + + /* Copy database_name, '/', table_name, '\0' */ + const size_t len = database_name_len + table_name_len + 1; + ref = static_cast(mem_heap_alloc(heap, len + 1)); + memcpy(ref, database_name, database_name_len); + ref[database_name_len] = '/'; + memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); + + /* Values; 0 = Store and compare as given; case sensitive + 1 = Store and compare in lower; case insensitive + 2 = Store as given, compare in lower; case semi-sensitive */ + if (lower_case_table_names == 2) { + innobase_casedn_str(ref); + *table = dict_sys.load_table({ref, len}); + memcpy(ref, database_name, database_name_len); + ref[database_name_len] = '/'; + memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); + + } else { +#ifndef _WIN32 + if (lower_case_table_names == 1) { + innobase_casedn_str(ref); + } +#else + innobase_casedn_str(ref); +#endif /* !_WIN32 */ + *table = dict_sys.load_table({ref, len}); + } + + return(ref); +} + +/*********************************************************************//** +Removes MySQL comments from an SQL string. A comment is either +(a) '#' to the end of the line, +(b) '--[space]' to the end of the line, or +(c) '[slash][asterisk]' till the next '[asterisk][slash]' (like the familiar +C comment syntax). +@return own: SQL string stripped from comments; the caller must free +this with ut_free()! 
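+Editorial example (not part of the upstream comment):
+  "ALTER TABLE t /* c1 */ DROP FOREIGN KEY fk1 -- c2"
+reduces (modulo whitespace) to
+  "ALTER TABLE t DROP FOREIGN KEY fk1"
+while a '#' inside a quoted string is copied through unchanged.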
*/ +static +char* +dict_strip_comments( +/*================*/ + const char* sql_string, /*!< in: SQL string */ + size_t sql_length) /*!< in: length of sql_string */ +{ + char* str; + const char* sptr; + const char* eptr = sql_string + sql_length; + char* ptr; + /* unclosed quote character (0 if none) */ + char quote = 0; + bool escape = false; + + DBUG_ENTER("dict_strip_comments"); + + DBUG_PRINT("dict_strip_comments", ("%s", sql_string)); + + str = static_cast(ut_malloc_nokey(sql_length + 1)); + + sptr = sql_string; + ptr = str; + + for (;;) { +scan_more: + if (sptr >= eptr || *sptr == '\0') { +end_of_string: + *ptr = '\0'; + + ut_a(ptr <= str + sql_length); + + DBUG_PRINT("dict_strip_comments", ("%s", str)); + DBUG_RETURN(str); + } + + if (*sptr == quote) { + /* Closing quote character: do not look for + starting quote or comments. */ + + /* If the quote character is escaped by a + backslash, ignore it. */ + if (escape) { + escape = false; + } else { + quote = 0; + } + } else if (quote) { + /* Within quotes: do not look for + starting quotes or comments. */ + if (escape) { + escape = false; + } else if (*sptr == '\\') { + escape = true; + } + } else if (*sptr == '"' || *sptr == '`' || *sptr == '\'') { + /* Starting quote: remember the quote character. */ + quote = *sptr; + } else if (*sptr == '#' + || (sptr[0] == '-' && sptr[1] == '-' + && sptr[2] == ' ')) { + for (;;) { + if (++sptr >= eptr) { + goto end_of_string; + } + + /* In Unix a newline is 0x0A while in Windows + it is 0x0D followed by 0x0A */ + + switch (*sptr) { + case (char) 0X0A: + case (char) 0x0D: + case '\0': + goto scan_more; + } + } + } else if (!quote && *sptr == '/' && *(sptr + 1) == '*') { + sptr += 2; + for (;;) { + if (sptr >= eptr) { + goto end_of_string; + } + + switch (*sptr) { + case '\0': + goto scan_more; + case '*': + if (sptr[1] == '/') { + sptr += 2; + goto scan_more; + } + } + + sptr++; + } + } + + *ptr = *sptr; + + ptr++; + sptr++; + } +} + +/*********************************************************************//** +Finds the highest [number] for foreign key constraints of the table. Looks +only at the >= 4.0.18-format id's, which are of the form +databasename/tablename_ibfk_[number]. +@return highest number, 0 if table has no new format foreign key constraints */ +ulint +dict_table_get_highest_foreign_id( +/*==============================*/ + dict_table_t* table) /*!< in: table in the dictionary memory cache */ +{ + dict_foreign_t* foreign; + char* endp; + ulint biggest_id = 0; + ulint id; + ulint len; + + DBUG_ENTER("dict_table_get_highest_foreign_id"); + + ut_a(table); + + len = strlen(table->name.m_name); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + char fkid[MAX_TABLE_NAME_LEN * 2 + 20]; + foreign = *it; + + strncpy(fkid, foreign->id, (sizeof fkid) - 1); + fkid[(sizeof fkid) - 1] = '\0'; + /* Convert foreign key identifier on dictionary memory + cache to filename charset. 
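+		(Editor's note: with ids test/t_ibfk_3 and test/t_ibfk_12
+		in the cache, the surrounding loop yields 12; ids that do
+		not match the databasename/tablename_ibfk_N pattern, or
+		whose number starts with 0, are ignored.)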
*/ + innobase_convert_to_filename_charset( + strchr(fkid, '/') + 1, + strchr(foreign->id, '/') + 1, + MAX_TABLE_NAME_LEN); + + if (strlen(fkid) > ((sizeof dict_ibfk) - 1) + len + && 0 == memcmp(fkid, table->name.m_name, len) + && 0 == memcmp(fkid + len, + dict_ibfk, (sizeof dict_ibfk) - 1) + && fkid[len + ((sizeof dict_ibfk) - 1)] != '0') { + /* It is of the >= 4.0.18 format */ + + id = strtoul(fkid + len + + ((sizeof dict_ibfk) - 1), + &endp, 10); + if (*endp == '\0') { + ut_a(id != biggest_id); + + if (id > biggest_id) { + biggest_id = id; + } + } + } + } + + DBUG_PRINT("dict_table_get_highest_foreign_id", + ("id: " ULINTPF, biggest_id)); + + DBUG_RETURN(biggest_id); +} + +/**********************************************************************//** +Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. +@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the +constraint id does not match */ +dberr_t +dict_foreign_parse_drop_constraints( +/*================================*/ + mem_heap_t* heap, /*!< in: heap from which we can + allocate memory */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table, /*!< in: table */ + ulint* n, /*!< out: number of constraints + to drop */ + const char*** constraints_to_drop) /*!< out: id's of the + constraints to drop */ +{ + ibool success; + char* str; + size_t len; + const char* ptr; + const char* ptr1; + const char* id; + CHARSET_INFO* cs; + + ut_a(trx->mysql_thd); + + cs = thd_charset(trx->mysql_thd); + + *n = 0; + + *constraints_to_drop = static_cast( + mem_heap_alloc(heap, 1000 * sizeof(char*))); + + ptr = innobase_get_stmt_unsafe(trx->mysql_thd, &len); + + str = dict_strip_comments(ptr, len); + + ptr = str; + + ut_ad(dict_sys.locked()); +loop: + ptr = dict_scan_to(ptr, "DROP"); + + if (*ptr == '\0') { + ut_free(str); + + return(DB_SUCCESS); + } + + ptr = dict_accept(cs, ptr, "DROP", &success); + + if (!my_isspace(cs, *ptr)) { + + goto loop; + } + + ptr = dict_accept(cs, ptr, "FOREIGN", &success); + + if (!success || !my_isspace(cs, *ptr)) { + + goto loop; + } + + ptr = dict_accept(cs, ptr, "KEY", &success); + + if (!success) { + + goto syntax_error; + } + + ptr1 = dict_accept(cs, ptr, "IF", &success); + + if (success && my_isspace(cs, *ptr1)) { + ptr1 = dict_accept(cs, ptr1, "EXISTS", &success); + if (success) { + ptr = ptr1; + } + } + + ptr = dict_scan_id(cs, ptr, heap, &id, FALSE, TRUE); + + if (id == NULL) { + + goto syntax_error; + } + + ut_a(*n < 1000); + (*constraints_to_drop)[*n] = id; + (*n)++; + + if (std::find_if(table->foreign_set.begin(), + table->foreign_set.end(), + dict_foreign_matches_id(id)) + == table->foreign_set.end()) { + + if (!srv_read_only_mode) { + FILE* ef = dict_foreign_err_file; + + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Error in dropping of a foreign key" + " constraint of table ", ef); + ut_print_name(ef, NULL, table->name.m_name); + fprintf(ef, ",\nin SQL command\n%s" + "\nCannot find a constraint with the" + " given id %s.\n", str, id); + mysql_mutex_unlock(&dict_foreign_err_mutex); + } + + ut_free(str); + + return(DB_CANNOT_DROP_CONSTRAINT); + } + + goto loop; + +syntax_error: + if (!srv_read_only_mode) { + FILE* ef = dict_foreign_err_file; + + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Syntax error in dropping of a" + " foreign key constraint of table ", ef); + ut_print_name(ef, NULL, table->name.m_name); + fprintf(ef, ",\n" + "close to:\n%s\n in SQL command\n%s\n", ptr, 
str); + mysql_mutex_unlock(&dict_foreign_err_mutex); + } + + ut_free(str); + + return(DB_CANNOT_DROP_CONSTRAINT); +} + +/*==================== END OF FOREIGN KEY PROCESSING ====================*/ + +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +Assumes that dict_sys.latch is already being held. +@return index, NULL if not found */ +dict_index_t* +dict_index_get_if_in_cache_low( +/*===========================*/ + index_id_t index_id) /*!< in: index id */ +{ + ut_ad(dict_sys.frozen()); + + for (dict_table_t *table= UT_LIST_GET_FIRST(dict_sys.table_LRU); + table; table= UT_LIST_GET_NEXT(table_LRU, table)) + if (dict_index_t *index= dict_table_find_index_on_id(table, index_id)) + return index; + + for (dict_table_t *table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); + table; table= UT_LIST_GET_NEXT(table_LRU, table)) + if (dict_index_t *index= dict_table_find_index_on_id(table, index_id)) + return index; + + return nullptr; +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +@return index, NULL if not found */ +dict_index_t* +dict_index_get_if_in_cache( +/*=======================*/ + index_id_t index_id) /*!< in: index id */ +{ + dict_index_t* index; + + if (!dict_sys.is_initialised()) { + return(NULL); + } + + dict_sys.freeze(SRW_LOCK_CALL); + + index = dict_index_get_if_in_cache_low(index_id); + + dict_sys.unfreeze(); + + return(index); +} + +/**********************************************************************//** +Checks that a tuple has n_fields_cmp value in a sensible range, so that +no comparison can occur with the page number field in a node pointer. +@return TRUE if ok */ +ibool +dict_index_check_search_tuple( +/*==========================*/ + const dict_index_t* index, /*!< in: index tree */ + const dtuple_t* tuple) /*!< in: tuple used in a search */ +{ + ut_ad(dtuple_get_n_fields_cmp(tuple) + <= dict_index_get_n_unique_in_tree(index)); + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Builds a node pointer out of a physical record and a page number. 
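+The tuple contains the first n_unique fields of rec followed by the child page number, and is used only on non-leaf levels of the tree.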
+@return own: node pointer */ +dtuple_t* +dict_index_build_node_ptr( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record for which to build node + pointer */ + ulint page_no,/*!< in: page number to put in node + pointer */ + mem_heap_t* heap, /*!< in: memory heap where pointer + created */ + ulint level) /*!< in: level of rec in tree: + 0 means leaf level */ +{ + dtuple_t* tuple; + dfield_t* field; + byte* buf; + ulint n_unique; + + if (dict_index_is_ibuf(index)) { + /* In a universal index tree, we take the whole record as + the node pointer if the record is on the leaf level, + on non-leaf levels we remove the last field, which + contains the page number of the child page */ + + ut_a(!dict_table_is_comp(index->table)); + n_unique = rec_get_n_fields_old(rec); + + if (level > 0) { + ut_a(n_unique > 1); + n_unique--; + } + } else { + n_unique = dict_index_get_n_unique_in_tree_nonleaf(index); + } + + tuple = dtuple_create(heap, n_unique + 1); + + /* When searching in the tree for the node pointer, we must not do + comparison on the last field, the page number field, as on upper + levels in the tree there may be identical node pointers with a + different page number; therefore, we set the n_fields_cmp to one + less: */ + + dtuple_set_n_fields_cmp(tuple, n_unique); + + dict_index_copy_types(tuple, index, n_unique); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, page_no); + + field = dtuple_get_nth_field(tuple, n_unique); + dfield_set_data(field, buf, 4); + + dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4); + + rec_copy_prefix_to_dtuple(tuple, rec, index, + level ? 0 : index->n_core_fields, + n_unique, heap); + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) + | REC_STATUS_NODE_PTR); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/** Convert a physical record into a search tuple. +@param[in] rec index record (not necessarily in an index page) +@param[in] index index +@param[in] leaf whether rec is in a leaf page +@param[in] n_fields number of data fields +@param[in,out] heap memory heap for allocation +@return own: data tuple */ +dtuple_t* +dict_index_build_data_tuple( + const rec_t* rec, + const dict_index_t* index, + bool leaf, + ulint n_fields, + mem_heap_t* heap) +{ + ut_ad(!index->is_clust()); + + dtuple_t* tuple = dtuple_create(heap, n_fields); + + dict_index_copy_types(tuple, index, n_fields); + + rec_copy_prefix_to_dtuple(tuple, rec, index, + leaf ? n_fields : 0, n_fields, heap); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/*********************************************************************//** +Calculates the minimum record length in an index. */ +ulint +dict_index_calc_min_rec_len( +/*========================*/ + const dict_index_t* index) /*!< in: index */ +{ + ulint sum = 0; + ulint i; + ulint comp = dict_table_is_comp(index->table); + + if (comp) { + ulint nullable = 0; + sum = REC_N_NEW_EXTRA_BYTES; + for (i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_col_t* col + = dict_index_get_nth_col(index, i); + ulint size = dict_col_get_fixed_size(col, comp); + sum += size; + if (!size) { + size = col->len; + sum += size < 128 ?
1 : 2; + } + if (!(col->prtype & DATA_NOT_NULL)) { + nullable++; + } + } + + /* round the NULL flags up to full bytes */ + sum += UT_BITS_IN_BYTES(nullable); + + return(sum); + } + + for (i = 0; i < dict_index_get_n_fields(index); i++) { + sum += dict_col_get_fixed_size( + dict_index_get_nth_col(index, i), comp); + } + + if (sum > 127) { + sum += 2 * dict_index_get_n_fields(index); + } else { + sum += dict_index_get_n_fields(index); + } + + sum += REC_N_OLD_EXTRA_BYTES; + + return(sum); +} + +/**********************************************************************//** +Outputs info on a foreign key of a table in a format suitable for +CREATE TABLE. */ +std::string +dict_print_info_on_foreign_key_in_create_format( +/*============================================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + ibool add_newline) /*!< in: whether to add a newline */ +{ + const char* stripped_id; + ulint i; + std::string str; + + if (strchr(foreign->id, '/')) { + /* Strip the preceding database name from the constraint id */ + stripped_id = foreign->id + 1 + + dict_get_db_name_len(foreign->id); + } else { + stripped_id = foreign->id; + } + + str.append(","); + + if (add_newline) { + /* SHOW CREATE TABLE wants constraints each printed nicely + on its own line, while error messages want no newlines + inserted. */ + str.append("\n "); + } + + str.append(" CONSTRAINT "); + + str.append(innobase_quote_identifier(trx, stripped_id)); + str.append(" FOREIGN KEY ("); + + for (i = 0;;) { + str.append(innobase_quote_identifier(trx, foreign->foreign_col_names[i])); + + if (++i < foreign->n_fields) { + str.append(", "); + } else { + break; + } + } + + str.append(") REFERENCES "); + + if (dict_tables_have_same_db(foreign->foreign_table_name_lookup, + foreign->referenced_table_name_lookup)) { + /* Do not print the database name of the referenced table */ + str.append(ut_get_name(trx, + dict_remove_db_name( + foreign->referenced_table_name))); + } else { + str.append(ut_get_name(trx, + foreign->referenced_table_name)); + } + + str.append(" ("); + + for (i = 0;;) { + str.append(innobase_quote_identifier(trx, + foreign->referenced_col_names[i])); + + if (++i < foreign->n_fields) { + str.append(", "); + } else { + break; + } + } + + str.append(")"); + + if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) { + str.append(" ON DELETE CASCADE"); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) { + str.append(" ON DELETE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + str.append(" ON DELETE NO ACTION"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + str.append(" ON UPDATE CASCADE"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + str.append(" ON UPDATE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + str.append(" ON UPDATE NO ACTION"); + } + + return str; +} + +/**********************************************************************//** +Outputs info on foreign keys of a table. 
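+Depending on create_table_format, the output is suitable either for CREATE TABLE or for SHOW TABLE STATUS.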
*/ +std::string +dict_print_info_on_foreign_keys( +/*============================*/ + ibool create_table_format, /*!< in: if TRUE then print in + a format suitable to be inserted into + a CREATE TABLE, otherwise in the format + of SHOW TABLE STATUS */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table */ +{ + dict_foreign_t* foreign; + std::string str; + + dict_sys.freeze(SRW_LOCK_CALL); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (create_table_format) { + str.append( + dict_print_info_on_foreign_key_in_create_format( + trx, foreign, TRUE)); + } else { + ulint i; + str.append("; ("); + + for (i = 0; i < foreign->n_fields; i++) { + if (i) { + str.append(" "); + } + + str.append(innobase_quote_identifier(trx, + foreign->foreign_col_names[i])); + } + + str.append(") REFER "); + str.append(ut_get_name(trx, + foreign->referenced_table_name)); + str.append("("); + + for (i = 0; i < foreign->n_fields; i++) { + if (i) { + str.append(" "); + } + str.append(innobase_quote_identifier( + trx, + foreign->referenced_col_names[i])); + } + + str.append(")"); + + if (foreign->type == DICT_FOREIGN_ON_DELETE_CASCADE) { + str.append(" ON DELETE CASCADE"); + } + + if (foreign->type == DICT_FOREIGN_ON_DELETE_SET_NULL) { + str.append(" ON DELETE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + str.append(" ON DELETE NO ACTION"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + str.append(" ON UPDATE CASCADE"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + str.append(" ON UPDATE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + str.append(" ON UPDATE NO ACTION"); + } + } + } + + dict_sys.unfreeze(); + return str; +} + +/**********************************************************************//** +Flags an index corrupted both in the data dictionary cache +and in the SYS_INDEXES */ +void dict_set_corrupted(dict_index_t *index, const char *ctx) +{ + mem_heap_t* heap; + mtr_t mtr; + dict_index_t* sys_index; + dtuple_t* tuple; + dfield_t* dfield; + byte* buf; + const char* status; + btr_cur_t cursor; + + dict_sys.lock(SRW_LOCK_CALL); + + ut_ad(!dict_table_is_comp(dict_sys.sys_tables)); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); + + /* Mark the table as corrupted only if the clustered index + is corrupted */ + if (dict_index_is_clust(index)) { + index->table->corrupted = TRUE; + goto func_exit; + } + + if (index->type & DICT_CORRUPT) { + /* The index was already flagged corrupted.
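+ There is no need to update SYS_INDEXES again; only assert consistency.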
*/ + ut_ad(!dict_index_is_clust(index) || index->table->corrupted); + goto func_exit; + } + + /* If this is read only mode, do not update SYS_INDEXES, just + mark it as corrupted in memory */ + if (high_level_read_only) { + index->type |= DICT_CORRUPT; + goto func_exit; + } + + heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t) + + sizeof(que_fork_t) + sizeof(upd_node_t) + + sizeof(upd_t) + 12)); + mtr_start(&mtr); + index->type |= DICT_CORRUPT; + + sys_index = UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes); + + /* Find the index row in SYS_INDEXES */ + tuple = dtuple_create(heap, 2); + + dfield = dtuple_get_nth_field(tuple, 0); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->table->id); + dfield_set_data(dfield, buf, 8); + + dfield = dtuple_get_nth_field(tuple, 1); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->id); + dfield_set_data(dfield, buf, 8); + + dict_index_copy_types(tuple, sys_index, 2); + cursor.page_cur.index = sys_index; + + if (cursor.search_leaf(tuple, PAGE_CUR_LE, BTR_MODIFY_LEAF, &mtr) + != DB_SUCCESS) { + goto fail; + } + + if (cursor.low_match == dtuple_get_n_fields(tuple)) { + /* UPDATE SYS_INDEXES SET TYPE=index->type + WHERE TABLE_ID=index->table->id AND INDEX_ID=index->id */ + ulint len; + byte* field = rec_get_nth_field_old( + btr_cur_get_rec(&cursor), + DICT_FLD__SYS_INDEXES__TYPE, &len); + if (len != 4) { + goto fail; + } + mtr.write<4>(*btr_cur_get_block(&cursor), field, index->type); + status = "Flagged"; + } else { +fail: + status = "Unable to flag"; + } + + mtr_commit(&mtr); + mem_heap_free(heap); + ib::error() << status << " corruption of " << index->name + << " in table " << index->table->name << " in " << ctx; + +func_exit: + dict_sys.unlock(); +} + +/** Sets merge_threshold in the SYS_INDEXES +@param[in,out] index index +@param[in] merge_threshold value to set */ +void +dict_index_set_merge_threshold( + dict_index_t* index, + ulint merge_threshold) +{ + mem_heap_t* heap; + mtr_t mtr; + dict_index_t* sys_index; + dtuple_t* tuple; + dfield_t* dfield; + byte* buf; + btr_cur_t cursor; + + ut_ad(index != NULL); + ut_ad(!dict_table_is_comp(dict_sys.sys_tables)); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); + + heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t) + + sizeof(que_fork_t) + sizeof(upd_node_t) + + sizeof(upd_t) + 12)); + + mtr.start(); + + sys_index = UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes); + + /* Find the index row in SYS_INDEXES */ + tuple = dtuple_create(heap, 2); + + dfield = dtuple_get_nth_field(tuple, 0); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->table->id); + dfield_set_data(dfield, buf, 8); + + dfield = dtuple_get_nth_field(tuple, 1); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->id); + dfield_set_data(dfield, buf, 8); + + dict_index_copy_types(tuple, sys_index, 2); + cursor.page_cur.index = sys_index; + + if (cursor.search_leaf(tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, &mtr) + != DB_SUCCESS) { + goto func_exit; + } + + if (cursor.up_match == dtuple_get_n_fields(tuple) + && rec_get_n_fields_old(btr_cur_get_rec(&cursor)) + == DICT_NUM_FIELDS__SYS_INDEXES) { + ulint len; + byte* field = rec_get_nth_field_old( + btr_cur_get_rec(&cursor), + DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD, &len); + + ut_ad(len == 4); + mtr.write<4,mtr_t::MAYBE_NOP>(*btr_cur_get_block(&cursor), + field, merge_threshold); + } + +func_exit: + mtr_commit(&mtr); + mem_heap_free(heap); +} + +#ifdef UNIV_DEBUG +/** Sets
merge_threshold for all indexes in the list of tables +@param[in] list pointer to the list of tables */ +inline +void +dict_set_merge_threshold_list_debug( + UT_LIST_BASE_NODE_T(dict_table_t)* list, + uint merge_threshold_all) +{ + for (dict_table_t* table = UT_LIST_GET_FIRST(*list); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != NULL; + index = UT_LIST_GET_NEXT(indexes, index)) { + index->lock.x_lock(SRW_LOCK_CALL); + index->merge_threshold = merge_threshold_all + & ((1U << 6) - 1); + index->lock.x_unlock(); + } + } +} + +/** Sets merge_threshold for all indexes in dictionary cache for debug. +@param[in] merge_threshold_all value to set for all indexes */ +void +dict_set_merge_threshold_all_debug( + uint merge_threshold_all) +{ + dict_sys.freeze(SRW_LOCK_CALL); + + dict_set_merge_threshold_list_debug( + &dict_sys.table_LRU, merge_threshold_all); + dict_set_merge_threshold_list_debug( + &dict_sys.table_non_LRU, merge_threshold_all); + + dict_sys.unfreeze(); +} + +#endif /* UNIV_DEBUG */ + +/** Get an index by name. +@param[in] table the table where to look for the index +@param[in] name the index name to look for +@return index, NULL if does not exist */ +dict_index_t* +dict_table_get_index_on_name(dict_table_t* table, const char* name) +{ + dict_index_t* index; + + index = dict_table_get_first_index(table); + + while (index != NULL) { + if (index->is_committed() && !strcmp(index->name, name)) { + return(index); + } + + index = dict_table_get_next_index(index); + } + + return(NULL); +} + +/**********************************************************************//** +Replace the index passed in with another equivalent index in the +foreign key lists of the table. +@return whether all replacements were found */ +bool +dict_foreign_replace_index( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const dict_index_t* index) /*!< in: index to be replaced */ +{ + bool found = true; + dict_foreign_t* foreign; + + ut_ad(index->to_be_dropped); + ut_ad(index->table == table); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + if (foreign->foreign_index == index) { + ut_ad(foreign->foreign_table == index->table); + + dict_index_t* new_index = dict_foreign_find_index( + foreign->foreign_table, col_names, + foreign->foreign_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, /*check_null=*/FALSE, + NULL, NULL, NULL); + if (new_index) { + ut_ad(new_index->table == index->table); + ut_ad(!new_index->to_be_dropped); + } else { + found = false; + } + + foreign->foreign_index = new_index; + } + } + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + if (foreign->referenced_index == index) { + ut_ad(foreign->referenced_table == index->table); + + dict_index_t* new_index = dict_foreign_find_index( + foreign->referenced_table, NULL, + foreign->referenced_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, /*check_null=*/FALSE, + NULL, NULL, NULL); + /* There must exist an alternative index, + since this must have been checked earlier. 
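+ If it cannot be found after all, return false to report the inconsistency to the caller.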
*/ + if (new_index) { + ut_ad(new_index->table == index->table); + ut_ad(!new_index->to_be_dropped); + } else { + found = false; + } + + foreign->referenced_index = new_index; + } + } + + return(found); +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Check for duplicate index entries in a table [using the index name] */ +void +dict_table_check_for_dup_indexes( +/*=============================*/ + const dict_table_t* table, /*!< in: Check for dup indexes + in this table */ + enum check_name check) /*!< in: whether and when to allow + temporary index names */ +{ + /* Check for duplicates, ignoring indexes that are marked + as to be dropped */ + + const dict_index_t* index1; + const dict_index_t* index2; + + ut_ad(dict_sys.frozen()); + + /* The primary index _must_ exist */ + ut_a(UT_LIST_GET_LEN(table->indexes) > 0); + + index1 = UT_LIST_GET_FIRST(table->indexes); + + do { + if (!index1->is_committed()) { + ut_a(!dict_index_is_clust(index1)); + + switch (check) { + case CHECK_ALL_COMPLETE: + ut_error; + case CHECK_ABORTED_OK: + switch (dict_index_get_online_status(index1)) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + ut_error; + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + break; + } + /* fall through */ + case CHECK_PARTIAL_OK: + break; + } + } + + for (index2 = UT_LIST_GET_NEXT(indexes, index1); + index2 != NULL; + index2 = UT_LIST_GET_NEXT(indexes, index2)) { + ut_ad(index1->is_committed() + != index2->is_committed() + || strcmp(index1->name, index2->name) != 0); + } + + index1 = UT_LIST_GET_NEXT(indexes, index1); + } while (index1); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Converts a database and table name from filesystem encoding +(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) into two +strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be +at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */ +void +dict_fs2utf8( +/*=========*/ + const char* db_and_table, /*!< in: database and table names, + e.g. d@i1b/a@q1b@1Kc */ + char* db_utf8, /*!< out: database name, e.g. dцb */ + size_t db_utf8_size, /*!< in: dbname_utf8 size */ + char* table_utf8, /*!< out: table name, e.g.
aюbØc */ + size_t table_utf8_size)/*!< in: table_utf8 size */ +{ + char db[MAX_DATABASE_NAME_LEN + 1]; + ulint db_len; + uint errors; + + db_len = dict_get_db_name_len(db_and_table); + + ut_a(db_len <= sizeof(db)); + + memcpy(db, db_and_table, db_len); + db[db_len] = '\0'; + + strconvert( + &my_charset_filename, db, uint(db_len), system_charset_info, + db_utf8, uint(db_utf8_size), &errors); + + /* convert each # to @0023 in table name and store the result in buf */ + const char* table = dict_remove_db_name(db_and_table); + const char* table_p; + char buf[MAX_TABLE_NAME_LEN * 5 + 1]; + char* buf_p; + for (table_p = table, buf_p = buf; table_p[0] != '\0'; table_p++) { + if (table_p[0] != '#') { + buf_p[0] = table_p[0]; + buf_p++; + } else { + buf_p[0] = '@'; + buf_p[1] = '0'; + buf_p[2] = '0'; + buf_p[3] = '2'; + buf_p[4] = '3'; + buf_p += 5; + } + ut_a((size_t) (buf_p - buf) < sizeof(buf)); + } + buf_p[0] = '\0'; + + errors = 0; + strconvert( + &my_charset_filename, buf, (uint) (buf_p - buf), + system_charset_info, + table_utf8, uint(table_utf8_size), + &errors); + + if (errors != 0) { + snprintf(table_utf8, table_utf8_size, "%s%s", + srv_mysql50_table_name_prefix, table); + } +} + +/** Resize the hash tables based on the current buffer pool size. */ +void dict_sys_t::resize() +{ + ut_ad(this == &dict_sys); + ut_ad(is_initialised()); + lock(SRW_LOCK_CALL); + + /* all table entries are in table_LRU and table_non_LRU lists */ + table_hash.free(); + table_id_hash.free(); + temp_id_hash.free(); + + const ulint hash_size = buf_pool_get_curr_size() + / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE); + table_hash.create(hash_size); + table_id_hash.create(hash_size); + temp_id_hash.create(hash_size); + + for (dict_table_t *table= UT_LIST_GET_FIRST(table_LRU); table; + table= UT_LIST_GET_NEXT(table_LRU, table)) + { + ut_ad(!table->is_temporary()); + ulint fold= my_crc32c(0, table->name.m_name, strlen(table->name.m_name)); + ulint id_fold= ut_fold_ull(table->id); + + HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table); + HASH_INSERT(dict_table_t, id_hash, &table_id_hash, id_fold, table); + } + + for (dict_table_t *table = UT_LIST_GET_FIRST(table_non_LRU); table; + table= UT_LIST_GET_NEXT(table_LRU, table)) + { + ulint fold= my_crc32c(0, table->name.m_name, strlen(table->name.m_name)); + ulint id_fold= ut_fold_ull(table->id); + + HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table); + + hash_table_t *id_hash= table->is_temporary() + ? &temp_id_hash : &table_id_hash; + + HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table); + } + + unlock(); +} + +/** Close the data dictionary cache on shutdown. */ +void dict_sys_t::close() +{ + ut_ad(this == &dict_sys); + if (!is_initialised()) return; + + lock(SRW_LOCK_CALL); + + /* Free the hash elements. We don't remove them from table_hash + because we are invoking table_hash.free() below. */ + for (ulint i= table_hash.n_cells; i--; ) + while (dict_table_t *table= static_cast<dict_table_t*> + (HASH_GET_FIRST(&table_hash, i))) + dict_sys.remove(table); + + table_hash.free(); + + /* table_id_hash contains the same elements as in table_hash, + therefore we don't delete the individual elements. */ + table_id_hash.free(); + + /* No temporary tables should exist at this point.
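+ Only the hash array itself remains to be freed.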
*/ + temp_id_hash.free(); + + unlock(); + latch.destroy(); + + mysql_mutex_destroy(&dict_foreign_err_mutex); + + if (dict_foreign_err_file) + { + my_fclose(dict_foreign_err_file, MYF(MY_WME)); + dict_foreign_err_file = NULL; + } + + m_initialised= false; +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate the dictionary table LRU list. +@return TRUE if valid */ +static +ibool +dict_lru_validate(void) +/*===================*/ +{ + dict_table_t* table; + + ut_ad(dict_sys.frozen()); + + for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + ut_a(table->can_be_evicted); + } + + for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + ut_a(!table->can_be_evicted); + } + + return(TRUE); +} +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Check an index to see whether its first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. +@return true if the index qualifies, otherwise false */ +bool +dict_foreign_qualify_index( +/*=======================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* index, /*!< in: index to check */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error, /*!< out: error code */ + ulint* err_col_no, + /*!< out: column number where + error happened */ + dict_index_t** err_index) + /*!< out: index where error + happened */ +{ + if (dict_index_get_n_fields(index) < n_cols) { + return(false); + } + + if (!index->is_btree()) { + return false; + } + + if (index->online_status >= ONLINE_INDEX_ABORTED) { + return false; + } + + for (ulint i = 0; i < n_cols; i++) { + dict_field_t* field; + const char* col_name; + ulint col_no; + + field = dict_index_get_nth_field(index, i); + col_no = dict_col_get_no(field->col); + + if (field->prefix_len != 0) { + /* We do not accept column prefix + indexes here */ + if (error && err_col_no && err_index) { + *error = FK_IS_PREFIX_INDEX; + *err_col_no = i; + *err_index = (dict_index_t*)index; + } + return(false); + } + + if (check_null + && (field->col->prtype & DATA_NOT_NULL)) { + if (error && err_col_no && err_index) { + *error = FK_COL_NOT_NULL; + *err_col_no = i; + *err_index = (dict_index_t*)index; + } + return(false); + } + + if (field->col->is_virtual()) { + col_name = ""; + for (ulint j = 0; j < table->n_v_def; j++) { + col_name = dict_table_get_v_col_name(table, j); + if (innobase_strcasecmp(field->name,col_name) == 0) { + break; + } + } + } else { + col_name = col_names + ? 
col_names[col_no] + : dict_table_get_col_name(table, col_no); + } + + if (0 != innobase_strcasecmp(columns[i], col_name)) { + return(false); + } + + if (types_idx && !cmp_cols_are_equal( + dict_index_get_nth_col(index, i), + dict_index_get_nth_col(types_idx, i), + check_charsets)) { + if (error && err_col_no && err_index) { + *error = FK_COLS_NOT_EQUAL; + *err_col_no = i; + *err_index = (dict_index_t*)index; + } + + return(false); + } + } + + return(true); +} + +/*********************************************************************//** +Update the state of compression failure padding heuristics. This is +called whenever a compression operation succeeds or fails. +The caller must be holding info->mutex */ +static +void +dict_index_zip_pad_update( +/*======================*/ + zip_pad_info_t* info, /*!< in/out: info to be updated */ + ulint zip_threshold) /*!< in: zip threshold value */ +{ + ulint total; + ulint fail_pct; + + ut_ad(info); + ut_ad(info->pad % ZIP_PAD_INCR == 0); + + total = info->success + info->failure; + + ut_ad(total > 0); + + if (zip_threshold == 0) { + /* User has just disabled the padding. */ + return; + } + + if (total < ZIP_PAD_ROUND_LEN) { + /* We are in middle of a round. Do nothing. */ + return; + } + + /* We are at a 'round' boundary. Reset the values but first + calculate fail rate for our heuristic. */ + fail_pct = (info->failure * 100) / total; + info->failure = 0; + info->success = 0; + + if (fail_pct > zip_threshold) { + /* Compression failures are more than the user-defined + threshold. Increase the pad size to reduce chances of + compression failures. + + Only do increment if it won't increase padding + beyond max pad size. */ + if (info->pad + ZIP_PAD_INCR + < (srv_page_size * zip_pad_max) / 100) { + info->pad.fetch_add(ZIP_PAD_INCR); + + MONITOR_INC(MONITOR_PAD_INCREMENTS); + } + + info->n_rounds = 0; + + } else { + /* Failure rate was OK. Another successful round + completed. */ + ++info->n_rounds; + + /* If enough successful rounds are completed with + compression failure rate in control, decrease the + padding. */ + if (info->n_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT + && info->pad > 0) { + info->pad.fetch_sub(ZIP_PAD_INCR); + + info->n_rounds = 0; + + MONITOR_INC(MONITOR_PAD_DECREMENTS); + } + } +} + +/*********************************************************************//** +This function should be called whenever a page is successfully +compressed. Updates the compression padding information. */ +void +dict_index_zip_success( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ +{ + ulint zip_threshold = zip_failure_threshold_pct; + if (!zip_threshold) { + /* Disabled by user. */ + return; + } + + index->zip_pad.mutex.lock(); + ++index->zip_pad.success; + dict_index_zip_pad_update(&index->zip_pad, zip_threshold); + index->zip_pad.mutex.unlock(); +} + +/*********************************************************************//** +This function should be called whenever a page compression attempt +fails. Updates the compression padding information. */ +void +dict_index_zip_failure( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ +{ + ulint zip_threshold = zip_failure_threshold_pct; + if (!zip_threshold) { + /* Disabled by user. */ + return; + } + + index->zip_pad.mutex.lock(); + ++index->zip_pad.failure; + dict_index_zip_pad_update(&index->zip_pad, zip_threshold); + index->zip_pad.mutex.unlock(); +} + +/*********************************************************************//** +Return the optimal page size, for which page will likely compress.
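+The estimate is srv_page_size minus the current padding, but never below the minimum size implied by zip_pad_max.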
+@return page size beyond which page might not compress */ +ulint +dict_index_zip_pad_optimal_page_size( +/*=================================*/ + dict_index_t* index) /*!< in: index for which page size + is requested */ +{ + ulint pad; + ulint min_sz; + ulint sz; + + if (!zip_failure_threshold_pct) { + /* Disabled by user. */ + return(srv_page_size); + } + + pad = index->zip_pad.pad; + + ut_ad(pad < srv_page_size); + sz = srv_page_size - pad; + + /* Min size allowed by user. */ + ut_ad(zip_pad_max < 100); + min_sz = (srv_page_size * (100 - zip_pad_max)) / 100; + + return(ut_max(sz, min_sz)); +} + +/*************************************************************//** +Convert table flag to row format string. +@return row format name. */ +const char* +dict_tf_to_row_format_string( +/*=========================*/ + ulint table_flag) /*!< in: row format setting */ +{ + switch (dict_tf_get_rec_format(table_flag)) { + case REC_FORMAT_REDUNDANT: + return("ROW_TYPE_REDUNDANT"); + case REC_FORMAT_COMPACT: + return("ROW_TYPE_COMPACT"); + case REC_FORMAT_COMPRESSED: + return("ROW_TYPE_COMPRESSED"); + case REC_FORMAT_DYNAMIC: + return("ROW_TYPE_DYNAMIC"); + } + + ut_error; + return(0); +} diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc new file mode 100644 index 00000000..f769839d --- /dev/null +++ b/storage/innobase/dict/dict0load.cc @@ -0,0 +1,3213 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0load.cc +Loads to the memory cache database object definitions +from dictionary tables + +Created 4/24/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0load.h" + +#include "log.h" +#include "btr0pcur.h" +#include "btr0btr.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "dict0dict.h" +#include "dict0mem.h" +#include "dict0stats.h" +#include "fsp0file.h" +#include "fts0priv.h" +#include "mach0data.h" +#include "page0page.h" +#include "rem0cmp.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "fts0opt.h" +#include "row0vers.h" + +/** Loads a table definition and also all its index definitions. + +Loads those foreign key constraints whose referenced table is already in +dictionary cache. If a foreign key constraint is not loaded, then the +referenced table is pushed into the output stack (fk_tables), if it is not +NULL. These tables must be subsequently loaded so that all the foreign +key constraints are loaded into memory. 
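+Callers are expected to keep loading the tables named in fk_tables until the stack is empty.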
+ +@param[in] name Table name in the db/tablename format +@param[in] ignore_err Error to be ignored when loading table + and its index definition +@param[out] fk_tables Related table names that must also be + loaded to ensure that all foreign key + constraints are loaded. +@return table, possibly with file_unreadable flag set +@retval nullptr if the table does not exist */ +static dict_table_t *dict_load_table_one(const span<const char> &name, + dict_err_ignore_t ignore_err, + dict_names_t &fk_tables); + +/** Load an index definition from a SYS_INDEXES record to dict_index_t. +@return error message +@retval NULL on success */ +static +const char* +dict_load_index_low( + byte* table_id, /*!< in/out: table id (8 bytes), + an "in" value if mtr + and "out" when !mtr */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + mem_heap_t* heap, /*!< in/out: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_INDEXES record */ + mtr_t* mtr, /*!< in/out: mini-transaction, + or nullptr if a pre-allocated + *index is to be filled in */ + dict_table_t* table, /*!< in/out: table, or NULL */ + dict_index_t** index); /*!< out,own: index, or NULL */ + +/** Load a table column definition from a SYS_COLUMNS record to dict_table_t. +@param table table, or nullptr if the output will be in column +@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED +@param heap memory heap for temporary storage +@param column pointer to output buffer, or nullptr if table!=nullptr +@param table_id table identifier +@param col_name column name +@param rec SYS_COLUMNS record +@param mtr mini-transaction +@param nth_v_col nullptr, or pointer to a counter of virtual columns +@return error message +@retval nullptr on success */ +static const char *dict_load_column_low(dict_table_t *table, + unsigned use_uncommitted, + mem_heap_t *heap, dict_col_t *column, + table_id_t *table_id, + const char **col_name, + const rec_t *rec, + mtr_t *mtr, + ulint *nth_v_col); + +/** Load a virtual column "mapping" (to base columns) information +from a SYS_VIRTUAL record +@param[in,out] table table +@param[in] uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param[in,out] column mapped base column's dict_column_t +@param[in,out] table_id table id +@param[in,out] pos virtual column position +@param[in,out] base_pos base column position +@param[in] rec SYS_VIRTUAL record +@return error message +@retval NULL on success */ +static +const char* +dict_load_virtual_low( + dict_table_t* table, + bool uncommitted, + dict_col_t** column, + table_id_t* table_id, + ulint* pos, + ulint* base_pos, + const rec_t* rec); + +/** Load an index field definition from a SYS_FIELDS record to dict_index_t.
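+If index is NULL, only the output dict_field_t and position are filled in.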
+@return error message +@retval NULL on success */ +static +const char* +dict_load_field_low( + byte* index_id, /*!< in/out: index id (8 bytes) + an "in" value if index != NULL + and "out" if index == NULL */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + dict_index_t* index, /*!< in/out: index, could be NULL + if we just populate a dict_field_t + struct with information from + a SYS_FIELDS record */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + byte* last_index_id, /*!< in: last index id */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + const rec_t* rec); /*!< in: SYS_FIELDS record */ + +#ifdef UNIV_DEBUG +/****************************************************************//** +Compare the name of an index column. +@return TRUE if the i'th column of index is 'name'. */ +static +ibool +name_of_col_is( +/*===========*/ + const dict_table_t* table, /*!< in: table */ + const dict_index_t* index, /*!< in: index */ + ulint i, /*!< in: index field offset */ + const char* name) /*!< in: name to compare to */ +{ + ulint tmp = dict_col_get_no(dict_field_get_col( + dict_index_get_nth_field( + index, i))); + + return(strcmp(name, dict_table_get_col_name(table, tmp)) == 0); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +This function gets the next system table record as it scans the table. +@return the next record if found, NULL if end of scan */ +static +const rec_t* +dict_getnext_system_low( +/*====================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor to the + record*/ + mtr_t* mtr) /*!< in: the mini-transaction */ +{ + rec_t* rec = NULL; + + while (!rec) { + btr_pcur_move_to_next_user_rec(pcur, mtr); + + rec = btr_pcur_get_rec(pcur); + + if (!btr_pcur_is_on_user_rec(pcur)) { + /* end of index */ + btr_pcur_close(pcur); + + return(NULL); + } + } + + /* Get a record, let's save the position */ + btr_pcur_store_position(pcur, mtr); + + return(rec); +} + +/********************************************************************//** +This function opens a system table, and returns the first record. +@return first record of the system table */ +const rec_t* +dict_startscan_system( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor to + the record */ + mtr_t* mtr, /*!< in: the mini-transaction */ + dict_table_t* table) /*!< in: system table */ +{ + btr_pcur_init(pcur); + if (pcur->open_leaf(true, table->indexes.start, BTR_SEARCH_LEAF, mtr) != + DB_SUCCESS) + return nullptr; + const rec_t *rec; + do + rec= dict_getnext_system_low(pcur, mtr); + while (rec && rec_get_deleted_flag(rec, 0)); + return rec; +} + +/********************************************************************//** +This function gets the next system table record as it scans the table. 
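+Delete-marked records are skipped.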
+@return the next record if found, NULL if end of scan */ +const rec_t* +dict_getnext_system( +/*================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor + to the record */ + mtr_t* mtr) /*!< in: the mini-transaction */ +{ + const rec_t *rec=nullptr; + if (pcur->restore_position(BTR_SEARCH_LEAF, mtr) != btr_pcur_t::CORRUPTED) + do + rec= dict_getnext_system_low(pcur, mtr); + while (rec && rec_get_deleted_flag(rec, 0)); + return rec; +} + +/********************************************************************//** +This function parses a SYS_INDEXES record and populates a dict_index_t +structure with the information from the record. For detailed information +about SYS_INDEXES fields, please refer to dict_boot() function. +@return error message, or NULL on success */ +const char* +dict_process_sys_indexes_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_INDEXES rec */ + dict_index_t* index, /*!< out: index to be filled */ + table_id_t* table_id) /*!< out: index table id */ +{ + byte buf[8]; + + ut_d(index->is_dummy = true); + ut_d(index->in_instant_init = false); + + /* Parse the record, and get "dict_index_t" struct filled */ + const char *err_msg= dict_load_index_low(buf, false, heap, rec, + nullptr, nullptr, &index); + *table_id= mach_read_from_8(buf); + return err_msg; +} + +/********************************************************************//** +This function parses a SYS_COLUMNS record and populates a dict_column_t +structure with the information from the record. +@return error message, or NULL on success */ +const char* +dict_process_sys_columns_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_COLUMNS rec */ + dict_col_t* column, /*!< out: dict_col_t to be filled */ + table_id_t* table_id, /*!< out: table id */ + const char** col_name, /*!< out: column name */ + ulint* nth_v_col) /*!< out: if virtual col, this is + record's sequence number */ +{ + const char* err_msg; + + /* Parse the record, and get "dict_col_t" struct filled */ + err_msg = dict_load_column_low(NULL, 0, heap, column, + table_id, col_name, rec, nullptr, + nth_v_col); + + return(err_msg); +} + +/** This function parses a SYS_VIRTUAL record and extracts virtual column +information +@param[in] rec current SYS_VIRTUAL rec +@param[in,out] table_id table id +@param[in,out] pos virtual column position +@param[in,out] base_pos base column position +@return error message, or NULL on success */ +const char* +dict_process_sys_virtual_rec( + const rec_t* rec, + table_id_t* table_id, + ulint* pos, + ulint* base_pos) +{ + return dict_load_virtual_low(nullptr, false, nullptr, table_id, + pos, base_pos, rec); +} + +/********************************************************************//** +This function parses a SYS_FIELDS record and populates a dict_field_t +structure with the information from the record.
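+The id of the index that the field belongs to is returned in index_id.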
+@return error message, or NULL on success */ +const char* +dict_process_sys_fields_rec( +/*========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FIELDS rec */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + index_id_t* index_id, /*!< out: current index id */ + index_id_t last_id) /*!< in: previous index id */ +{ + byte buf[8]; + byte last_index_id[8]; + const char* err_msg; + + mach_write_to_8(last_index_id, last_id); + + err_msg = dict_load_field_low(buf, false, nullptr, sys_field, + pos, last_index_id, heap, nullptr, rec); + + *index_id = mach_read_from_8(buf); + + return(err_msg); + +} + +/********************************************************************//** +This function parses a SYS_FOREIGN record and populates a dict_foreign_t +structure with the information from the record. For detailed information +about SYS_FOREIGN fields, please refer to dict_load_foreign() function. +@return error message, or NULL on success */ +const char* +dict_process_sys_foreign_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN rec */ + dict_foreign_t* foreign) /*!< out: dict_foreign_t struct + to be filled */ +{ + ulint len; + const byte* field; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_FOREIGN"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN) { + return("wrong number of columns in SYS_FOREIGN record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__ID, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_FOREIGN"); + } + + /* This receives a dict_foreign_t* that points to a stack variable. + So dict_foreign_free(foreign) is not used as elsewhere. + Since the heap used here is freed elsewhere, foreign->heap + is not assigned. */ + foreign->id = mem_heap_strdupl(heap, (const char*) field, len); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + /* The _lookup versions of the referenced and foreign table names + are not assigned since they are not used in this dict_foreign_t */ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + foreign->foreign_table_name = mem_heap_strdupl( + heap, (const char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + foreign->referenced_table_name = mem_heap_strdupl( + heap, (const char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len); + if (len != 4) { + goto err_len; + } + uint32_t n_fields_and_type = mach_read_from_4(field); + + foreign->type = n_fields_and_type >> 24 & ((1U << 6) - 1); + foreign->n_fields = n_fields_and_type & dict_index_t::MAX_N_FIELDS; + + return(NULL); +} + +/********************************************************************//** +This function parses a SYS_FOREIGN_COLS record and extracts the necessary +information from the record and returns it to the caller.
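+All returned strings are duplicated into the supplied heap.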
+@return error message, or NULL on success */ +const char* +dict_process_sys_foreign_col_rec( +/*=============================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */ + const char** name, /*!< out: foreign key constraint name */ + const char** for_col_name, /*!< out: referencing column name */ + const char** ref_col_name, /*!< out: referenced column name + in referenced table */ + ulint* pos) /*!< out: column position */ +{ + ulint len; + const byte* field; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_FOREIGN_COLS"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN_COLS) { + return("wrong number of columns in SYS_FOREIGN_COLS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_FOREIGN_COLS"); + } + *name = mem_heap_strdupl(heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len); + if (len != 4) { + goto err_len; + } + *pos = mach_read_from_4(field); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *for_col_name = mem_heap_strdupl(heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *ref_col_name = mem_heap_strdupl(heap, (char*) field, len); + + return(NULL); +} + +/** Check the validity of a SYS_TABLES record +Make sure the fields are the right length and that they +do not contain invalid contents. 
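+Only the record structure is validated here; the flag values themselves are checked separately.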
+@param[in] rec SYS_TABLES record +@return error message, or NULL on success */ +static +const char* +dict_sys_tables_rec_check( + const rec_t* rec) +{ + const byte* field; + ulint len; + + ut_ad(dict_sys.locked()); + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES) { + return("wrong number of columns in SYS_TABLES record"); + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_TABLES"); + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__ID, &len); + if (len != 8) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__N_COLS, &len); + if (field == NULL || len != 4) { + goto err_len; + } + + rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__TYPE, &len); + if (len != 4) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__MIX_ID, &len); + if (len != 8) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len); + if (field == NULL || len != 4) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__CLUSTER_ID, &len); + if (len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__SPACE, &len); + if (field == NULL || len != 4) { + goto err_len; + } + + return(NULL); +} + +/** Check if SYS_TABLES.TYPE is valid +@param[in] type SYS_TABLES.TYPE +@param[in] not_redundant whether ROW_FORMAT=REDUNDANT is not used +@return whether the SYS_TABLES.TYPE value is valid */ +static +bool +dict_sys_tables_type_valid(ulint type, bool not_redundant) +{ + /* The DATA_DIRECTORY flag can be assigned fully independently + of all other persistent table flags. */ + type &= ~DICT_TF_MASK_DATA_DIR; + + if (type == 1) { + return(true); /* ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT */ + } + + if (!(type & 1)) { + /* For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, + SYS_TABLES.TYPE=1. Else, it is the same as + dict_table_t::flags, and the least significant bit + would be set. So, the bit never can be 0. */ + return(false); + } + + if (!not_redundant) { + /* SYS_TABLES.TYPE must be 1 or 1|DICT_TF_MASK_NO_ROLLBACK + for ROW_FORMAT=REDUNDANT. */ + return !(type & ~(1U | DICT_TF_MASK_NO_ROLLBACK)); + } + + if (type >= 1U << DICT_TF_POS_UNUSED) { + /* Some unknown bits are set. */ + return(false); + } + + return(dict_tf_is_valid_not_redundant(type)); +} + +/** Convert SYS_TABLES.TYPE to dict_table_t::flags. +@param[in] type SYS_TABLES.TYPE +@param[in] not_redundant whether ROW_FORMAT=REDUNDANT is not used +@return table flags */ +static +uint32_t dict_sys_tables_type_to_tf(uint32_t type, bool not_redundant) +{ + ut_ad(dict_sys_tables_type_valid(type, not_redundant)); + uint32_t flags = not_redundant ? 1 : 0; + + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL are the same. 
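+ They occupy the same bit positions in SYS_TABLES.TYPE and in dict_table_t::flags, so those bits can be copied over directly.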
*/ + flags |= type & (DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_NO_ROLLBACK); + + ut_ad(dict_tf_is_valid(flags)); + return(flags); +} + +/** Outcome of dict_sys_tables_rec_read() */ +enum table_read_status { READ_OK= 0, READ_ERROR, READ_NOT_FOUND }; + +/** Read and return 5 integer fields from a SYS_TABLES record. +@param[in] rec A record of SYS_TABLES +@param[in] uncommitted true=use READ UNCOMMITTED, false=READ COMMITTED +@param[in] mtr mini-transaction +@param[out] table_id Pointer to the table_id for this table +@param[out] space_id Pointer to the space_id for this table +@param[out] n_cols Pointer to number of columns for this table. +@param[out] flags Pointer to table flags +@param[out] flags2 Pointer to table flags2 +@param[out] trx_id DB_TRX_ID of the committed SYS_TABLES record, + or nullptr to perform READ UNCOMMITTED +@return whether the record was read correctly */ +MY_ATTRIBUTE((warn_unused_result)) +static +table_read_status +dict_sys_tables_rec_read( + const rec_t* rec, + bool uncommitted, + mtr_t* mtr, + table_id_t* table_id, + uint32_t* space_id, + uint32_t* n_cols, + uint32_t* flags, + uint32_t* flags2, + trx_id_t* trx_id) +{ + const byte* field; + ulint len; + mem_heap_t* heap = nullptr; + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len); + ut_ad(len == 6 || len == UNIV_SQL_NULL); + trx_id_t id = len == 6 ? trx_read_trx_id(field) : 0; + if (id && !uncommitted && trx_sys.find(nullptr, id, false)) { + const auto savepoint = mtr->get_savepoint(); + heap = mem_heap_create(1024); + dict_index_t* index = UT_LIST_GET_FIRST( + dict_sys.sys_tables->indexes); + rec_offs* offsets = rec_get_offsets( + rec, index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!rec) { + mem_heap_free(heap); + return READ_NOT_FOUND; + } + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len); + if (UNIV_UNLIKELY(len != 6)) { + mem_heap_free(heap); + return READ_ERROR; + } + id = trx_read_trx_id(field); + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(id); + if (trx_id) { + return READ_NOT_FOUND; + } + } + + if (trx_id) { + *trx_id = id; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__ID, &len); + ut_ad(len == 8); + *table_id = static_cast<table_id_t>(mach_read_from_8(field)); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__SPACE, &len); + ut_ad(len == 4); + *space_id = mach_read_from_4(field); + + /* Read the 4 byte flags from the TYPE field */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__TYPE, &len); + ut_a(len == 4); + uint32_t type = mach_read_from_4(field); + + /* Handle MDEV-12873 InnoDB SYS_TABLES.TYPE incompatibility + for PAGE_COMPRESSED=YES in MariaDB 10.2.2 to 10.2.6. + + MariaDB 10.2.2 introduced the SHARED_SPACE flag from MySQL 5.7, + shifting the flags PAGE_COMPRESSION, PAGE_COMPRESSION_LEVEL, + ATOMIC_WRITES (repurposed to NO_ROLLBACK in 10.3.1) by one bit. + The SHARED_SPACE flag would always + be written as 0 by MariaDB, because MariaDB does not support + CREATE TABLESPACE or CREATE TABLE...TABLESPACE for InnoDB. + + So, instead of the bits AALLLLCxxxxxxx we would have + AALLLLC0xxxxxxx if the table was created with MariaDB 10.2.2 + to 10.2.6.
(AA=ATOMIC_WRITES, LLLL=PAGE_COMPRESSION_LEVEL, + C=PAGE_COMPRESSED, xxxxxxx=7 bits that were not moved.) + + The case LLLLC=00000 is not a problem. The problem is the case + AALLLL10DB00001 where D is the (mostly ignored) DATA_DIRECTORY + flag and B is the ATOMIC_BLOBS flag (1 for ROW_FORMAT=DYNAMIC + and 0 for ROW_FORMAT=COMPACT in this case). Other low-order + bits must be so, because PAGE_COMPRESSED=YES is only allowed + for ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPACT, not for + ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPRESSED. + + Starting with MariaDB 10.2.4, the flags would be + 00LLLL10DB00001, because ATOMIC_WRITES is always written as 0. + + We will concentrate on the PAGE_COMPRESSION_LEVEL and + PAGE_COMPRESSED=YES. PAGE_COMPRESSED=NO implies + PAGE_COMPRESSION_LEVEL=0, and in that case all the affected + bits will be 0. For PAGE_COMPRESSED=YES, the values 1..9 are + allowed for PAGE_COMPRESSION_LEVEL. That is, we must interpret + the bits AALLLL10DB00001 as AALLLL1DB00001. + + If someone created a table in MariaDB 10.2.2 or 10.2.3 with + the attribute ATOMIC_WRITES=OFF (value 2) and without + PAGE_COMPRESSED=YES or PAGE_COMPRESSION_LEVEL, that should be + rejected. The value ATOMIC_WRITES=ON (1) would look like + ATOMIC_WRITES=OFF, but it would be ignored starting with + MariaDB 10.2.4. */ + compile_time_assert(DICT_TF_POS_PAGE_COMPRESSION == 7); + compile_time_assert(DICT_TF_POS_UNUSED == 14); + + if ((type & 0x19f) != 0x101) { + /* The table cannot have been created with MariaDB + 10.2.2 to 10.2.6, because they would write the + low-order bits of SYS_TABLES.TYPE as 0b10xx00001 for + PAGE_COMPRESSED=YES. No adjustment is applicable. */ + } else if (type >= 3 << 13) { + /* 10.2.2 and 10.2.3 write ATOMIC_WRITES less than 3, + and no other flags above that can be set for the + SYS_TABLES.TYPE to be in the 10.2.2..10.2.6 format. + This would in any case be an invalid format for 10.2 and + earlier releases. */ + ut_ad(!dict_sys_tables_type_valid(type, true)); + } else { + /* SYS_TABLES.TYPE is of the form AALLLL10DB00001. We + must still validate that the LLLL bits are between 1 + and 9 before we can discard the extraneous 0 bit. */ + ut_ad(!DICT_TF_GET_PAGE_COMPRESSION(type)); + + if ((((type >> 9) & 0xf) - 1) < 9) { + ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) & 1); + + type = (type & 0x7fU) | (type >> 1 & ~0x7fU); + + ut_ad(DICT_TF_GET_PAGE_COMPRESSION(type)); + ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) >= 1); + ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) <= 9); + } else { + ut_ad(!dict_sys_tables_type_valid(type, true)); + } + } + + /* The low order bit of SYS_TABLES.TYPE is always set to 1. But in + dict_table_t::flags the low order bit is used to determine if the + ROW_FORMAT=REDUNDANT (0) or anything else (1). + Read the 4 byte N_COLS field and look at the high order bit. It + should be set for COMPACT and later. It should not be set for + REDUNDANT. */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__N_COLS, &len); + ut_a(len == 4); + *n_cols = mach_read_from_4(field); + + const bool not_redundant = 0 != (*n_cols & DICT_N_COLS_COMPACT); + + if (!dict_sys_tables_type_valid(type, not_redundant)) { + sql_print_error("InnoDB: Table %.*s in InnoDB" + " data dictionary contains invalid flags."
+ " SYS_TABLES.TYPE=" UINT32PF + " SYS_TABLES.N_COLS=" UINT32PF, + int(rec_get_field_start_offs(rec, 1)), rec, + type, *n_cols); +err_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return READ_ERROR; + } + + *flags = dict_sys_tables_type_to_tf(type, not_redundant); + + /* For tables created before MySQL 4.1, there may be + garbage in SYS_TABLES.MIX_LEN where flags2 are found. Such tables + would always be in ROW_FORMAT=REDUNDANT which do not have the + high bit set in n_cols, and flags would be zero. + MySQL 4.1 was the first version to support innodb_file_per_table, + that is, *space_id != 0. */ + if (not_redundant || *space_id != 0 || *n_cols & DICT_N_COLS_COMPACT + || fil_system.sys_space->full_crc32()) { + + /* Get flags2 from SYS_TABLES.MIX_LEN */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len); + *flags2 = mach_read_from_4(field); + + if (!dict_tf2_is_valid(*flags, *flags2)) { + sql_print_error("InnoDB: Table %.*s in InnoDB" + " data dictionary" + " contains invalid flags." + " SYS_TABLES.TYPE=" UINT32PF + " SYS_TABLES.MIX_LEN=" UINT32PF, + int(rec_get_field_start_offs(rec, 1)), + rec, + type, *flags2); + goto err_exit; + } + + /* DICT_TF2_FTS will be set when indexes are being loaded */ + *flags2 &= ~DICT_TF2_FTS; + + /* Now that we have used this bit, unset it. */ + *n_cols &= ~DICT_N_COLS_COMPACT; + } else { + *flags2 = 0; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return READ_OK; +} + +/** Check each tablespace found in the data dictionary. +Then look at each table defined in SYS_TABLES that has a space_id > 0 +to find all the file-per-table tablespaces. + +In a crash recovery we already have some tablespace objects created from +processing the REDO log. We will compare the +space_id information in the data dictionary to what we find in the +tablespace file. In addition, more validation will be done if recovery +was needed and force_recovery is not set. + +We also scan the biggest space id, and store it to fil_system. */ +void dict_check_tablespaces_and_store_max_id() +{ + uint32_t max_space_id = 0; + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("dict_check_tablespaces_and_store_max_id"); + + mtr.start(); + + dict_sys.lock(SRW_LOCK_CALL); + + for (const rec_t *rec = dict_startscan_system(&pcur, &mtr, + dict_sys.sys_tables); + rec; rec = dict_getnext_system_low(&pcur, &mtr)) { + ulint len; + table_id_t table_id; + uint32_t space_id; + uint32_t n_cols; + uint32_t flags; + uint32_t flags2; + + /* If a table record is not useable, ignore it and continue + on to the next record. Error messages were logged. */ + if (dict_sys_tables_rec_check(rec)) { + continue; + } + + const char *field = reinterpret_cast( + rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__NAME, + &len)); + + DBUG_PRINT("dict_check_sys_tables", + ("name: %*.s", static_cast(len), field)); + + if (dict_sys_tables_rec_read(rec, false, + &mtr, &table_id, &space_id, + &n_cols, &flags, &flags2, nullptr) + != READ_OK + || space_id == TRX_SYS_SPACE) { + continue; + } + + if (flags2 & DICT_TF2_DISCARDED) { + sql_print_information("InnoDB: Ignoring tablespace" + " for %.*s because " + "the DISCARD flag is set", + static_cast(len), field); + continue; + } + + /* For tables or partitions using .ibd files, the flag + DICT_TF2_USE_FILE_PER_TABLE was not set in MIX_LEN + before MySQL 5.6.5. The flag should not have been + introduced in persistent storage. 
+ +/** Check each tablespace found in the data dictionary. +Then look at each table defined in SYS_TABLES that has a space_id > 0 +to find all the file-per-table tablespaces. + +In a crash recovery we already have some tablespace objects created from +processing the REDO log. We will compare the +space_id information in the data dictionary to what we find in the +tablespace file. In addition, more validation will be done if recovery +was needed and force_recovery is not set. + +We also scan the biggest space id, and store it to fil_system. */ +void dict_check_tablespaces_and_store_max_id() +{ + uint32_t max_space_id = 0; + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("dict_check_tablespaces_and_store_max_id"); + + mtr.start(); + + dict_sys.lock(SRW_LOCK_CALL); + + for (const rec_t *rec = dict_startscan_system(&pcur, &mtr, + dict_sys.sys_tables); + rec; rec = dict_getnext_system_low(&pcur, &mtr)) { + ulint len; + table_id_t table_id; + uint32_t space_id; + uint32_t n_cols; + uint32_t flags; + uint32_t flags2; + + /* If a table record is not usable, ignore it and continue + on to the next record. Error messages were logged. */ + if (dict_sys_tables_rec_check(rec)) { + continue; + } + + const char *field = reinterpret_cast<const char*>( + rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__NAME, + &len)); + + DBUG_PRINT("dict_check_sys_tables", + ("name: %.*s", static_cast<int>(len), field)); + + if (dict_sys_tables_rec_read(rec, false, + &mtr, &table_id, &space_id, + &n_cols, &flags, &flags2, nullptr) + != READ_OK + || space_id == TRX_SYS_SPACE) { + continue; + } + + if (flags2 & DICT_TF2_DISCARDED) { + sql_print_information("InnoDB: Ignoring tablespace" + " for %.*s because " + "the DISCARD flag is set", + static_cast<int>(len), field); + continue; + } + + /* For tables or partitions using .ibd files, the flag + DICT_TF2_USE_FILE_PER_TABLE was not set in MIX_LEN + before MySQL 5.6.5. The flag should not have been + introduced in persistent storage. MariaDB will keep + setting the flag when writing SYS_TABLES entries for + newly created or rebuilt tables or partitions, but + will otherwise ignore the flag. */ + + if (fil_space_for_table_exists_in_mem(space_id, flags)) { + continue; + } + + const span<const char> name{field, len}; + + char* filepath = fil_make_filepath(nullptr, name, + IBD, false); + + const bool not_dropped{!rec_get_deleted_flag(rec, 0)}; + + /* Check that the .ibd file exists. */ + if (fil_ibd_open(not_dropped, FIL_TYPE_TABLESPACE, + space_id, dict_tf_to_fsp_flags(flags), + name, filepath)) { + } else if (!not_dropped) { + } else if (srv_operation == SRV_OPERATION_NORMAL + && srv_start_after_restore + && srv_force_recovery < SRV_FORCE_NO_BACKGROUND + && dict_table_t::is_temporary_name(filepath)) { + /* Mariabackup will not copy files whose + names start with #sql-. This table ought to + be dropped by drop_garbage_tables_after_restore() + a little later. */ + } else { + sql_print_warning("InnoDB: Ignoring tablespace for" + " %.*s because it" + " could not be opened.", + static_cast<int>(len), field); + } + + max_space_id = ut_max(max_space_id, space_id); + + ut_free(filepath); + } + + mtr.commit(); + + fil_set_max_space_id_if_bigger(max_space_id); + + dict_sys.unlock(); + + DBUG_VOID_RETURN; +} + +/** Error message for a delete-marked record in dict_load_column_low() */ +static const char *dict_load_column_del= "delete-marked record in SYS_COLUMNS"; +/** Error message for a missing record in dict_load_column_low() */ +static const char *dict_load_column_none= "SYS_COLUMNS record not found"; +/** Message for incomplete instant ADD/DROP in dict_load_column_low() */ +static const char *dict_load_column_instant= "incomplete instant ADD/DROP"; + +/** Load a table column definition from a SYS_COLUMNS record to dict_table_t.
+@param table table, or nullptr if the output will be in column +@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED +@param heap memory heap for temporary storage +@param column pointer to output buffer, or nullptr if table!=nullptr +@param table_id table identifier +@param col_name column name +@param rec SYS_COLUMNS record +@param mtr mini-transaction +@param nth_v_col nullptr, or pointer to a counter of virtual columns +@return error message +@retval nullptr on success */ +static const char *dict_load_column_low(dict_table_t *table, + unsigned use_uncommitted, + mem_heap_t *heap, dict_col_t *column, + table_id_t *table_id, + const char **col_name, + const rec_t *rec, + mtr_t *mtr, + ulint *nth_v_col) +{ + char* name; + const byte* field; + ulint len; + ulint mtype; + ulint prtype; + ulint col_len; + ulint pos; + ulint num_base; + + ut_ad(!table == !!column); + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_COLUMNS) { + return("wrong number of columns in SYS_COLUMNS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_COLUMNS"); + } + + if (table_id) { + *table_id = mach_read_from_8(field); + } else if (table->id != mach_read_from_8(field)) { + return dict_load_column_none; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__POS, &len); + if (len != 4) { + goto err_len; + } + + pos = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + + if (trx_id && mtr && use_uncommitted < 2 + && trx_sys.find(nullptr, trx_id, false)) { + if (use_uncommitted) { + return dict_load_column_instant; + } + const auto savepoint = mtr->get_savepoint(); + dict_index_t* index = UT_LIST_GET_FIRST( + dict_sys.sys_columns->indexes); + rec_offs* offsets = rec_get_offsets( + rec, index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!old_vers) { + return dict_load_column_none; + } + ut_ad(!rec_get_deleted_flag(rec, 0)); + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(trx_id); + return dict_load_column_del; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + + *col_name = name = mem_heap_strdupl(heap, (const char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__MTYPE, &len); + if (len != 4) { + goto err_len; + } + + mtype = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__PRTYPE, &len); + if (len != 4) { + goto err_len; + } + prtype = mach_read_from_4(field); + + if (dtype_get_charset_coll(prtype) == 0 + && dtype_is_string_type(mtype)) { + /* The table was created with < 4.1.2. */ + + if (dtype_is_binary_string_type(mtype, prtype)) { + /* Use the binary collation for + string columns of binary type. */ + + prtype = dtype_form_prtype( + prtype, + DATA_MYSQL_BINARY_CHARSET_COLL); + } else { + /* Use the default charset for + other than binary columns. 
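+ + (For illustration: a non-binary VARCHAR column written by a + pre-4.1.2 server carries no collation in its PRTYPE, so it is + loaded as if it had been created with + data_mysql_default_charset_coll; in a stock build that is the + latin1 default collation.)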
*/ + + prtype = dtype_form_prtype( + prtype, + data_mysql_default_charset_coll); + } + } + + if (table && table->n_def != pos && !(prtype & DATA_VIRTUAL)) { + return("SYS_COLUMNS.POS mismatch"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__LEN, &len); + if (len != 4) { + goto err_len; + } + col_len = mach_read_from_4(field); + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__PREC, &len); + if (len != 4) { + goto err_len; + } + num_base = mach_read_from_4(field); + + if (table) { + if (prtype & DATA_VIRTUAL) { +#ifdef UNIV_DEBUG + dict_v_col_t* vcol = +#endif + dict_mem_table_add_v_col( + table, heap, name, mtype, + prtype, col_len, + dict_get_v_col_mysql_pos(pos), num_base); + ut_ad(vcol->v_pos == dict_get_v_col_pos(pos)); + } else { + ut_ad(num_base == 0); + dict_mem_table_add_col(table, heap, name, mtype, + prtype, col_len); + } + + if (trx_id > table->def_trx_id) { + table->def_trx_id = trx_id; + } + } else { + dict_mem_fill_column_struct(column, pos, mtype, + prtype, col_len); + } + + /* Report the virtual column number */ + if ((prtype & DATA_VIRTUAL) && nth_v_col != NULL) { + *nth_v_col = dict_get_v_col_pos(pos); + } + + return(NULL); +} + +/** Error message for a delete-marked record in dict_load_virtual_low() */ +static const char *dict_load_virtual_del= "delete-marked record in SYS_VIRTUAL"; +static const char *dict_load_virtual_none= "SYS_VIRTUAL record not found"; + +/** Load a virtual column "mapping" (to base columns) information +from a SYS_VIRTUAL record +@param[in,out] table table +@param[in] uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param[in,out] column mapped base column's dict_column_t +@param[in,out] table_id table id +@param[in,out] pos virtual column position +@param[in,out] base_pos base column position +@param[in] rec SYS_VIRTUAL record +@return error message +@retval NULL on success */ +static +const char* +dict_load_virtual_low( + dict_table_t* table, + bool uncommitted, + dict_col_t** column, + table_id_t* table_id, + ulint* pos, + ulint* base_pos, + const rec_t* rec) +{ + const byte* field; + ulint len; + ulint base; + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_VIRTUAL) { + return("wrong number of columns in SYS_VIRTUAL record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__TABLE_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_VIRTUAL"); + } + + if (table_id != NULL) { + *table_id = mach_read_from_8(field); + } else if (table->id != mach_read_from_8(field)) { + return dict_load_virtual_none; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__POS, &len); + if (len != 4) { + goto err_len; + } + + if (pos != NULL) { + *pos = mach_read_from_4(field); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__BASE_POS, &len); + if (len != 4) { + goto err_len; + } + + base = mach_read_from_4(field); + + if (base_pos != NULL) { + *base_pos = base; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_VIRTUAL__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + + if (trx_id && column && !uncommitted + && trx_sys.find(nullptr, trx_id, false)) { + if (!rec_get_deleted_flag(rec, 0)) { + return dict_load_virtual_none; + } + } else if (rec_get_deleted_flag(rec, 0)) { + 
ut_ad(trx_id != 0); + return dict_load_virtual_del; + } + + if (column != NULL) { + *column = dict_table_get_nth_col(table, base); + } + + return(NULL); +} + +/** Load the definitions for table columns. +@param table table +@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED +@param heap memory heap for temporary storage +@return error code +@retval DB_SUCCESS on success +@retval DB_SUCCESS_LOCKED_REC on success if use_uncommitted=1 +and instant ADD/DROP/reorder was detected */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static dberr_t dict_load_columns(dict_table_t *table, unsigned use_uncommitted, + mem_heap_t *heap) +{ + btr_pcur_t pcur; + mtr_t mtr; + ulint n_skipped = 0; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + dict_index_t* sys_index = dict_sys.sys_columns->indexes.start; + ut_ad(!dict_sys.sys_columns->not_redundant()); + + ut_ad(name_of_col_is(dict_sys.sys_columns, sys_index, + DICT_FLD__SYS_COLUMNS__NAME, "NAME")); + ut_ad(name_of_col_is(dict_sys.sys_columns, sys_index, + DICT_FLD__SYS_COLUMNS__PREC, "PREC")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + byte table_id[8]; + mach_write_to_8(table_id, table->id); + dfield_set_data(&dfield, table_id, 8); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + + ut_ad(table->n_t_cols == static_cast<ulint>( + table->n_cols) + static_cast<ulint>(table->n_v_cols)); + + for (ulint i = 0; + i + DATA_N_SYS_COLS < table->n_t_cols + n_skipped; + i++) { + const char* err_msg; + const char* name = NULL; + ulint nth_v_col = ULINT_UNDEFINED; + const rec_t* rec = btr_pcur_get_rec(&pcur); + + err_msg = btr_pcur_is_on_user_rec(&pcur) + ? dict_load_column_low(table, use_uncommitted, + heap, NULL, NULL, + &name, rec, &mtr, &nth_v_col) + : dict_load_column_none; + + if (!err_msg) { + } else if (err_msg == dict_load_column_del) { + n_skipped++; + goto next_rec; + } else if (err_msg == dict_load_column_instant) { + err = DB_SUCCESS_LOCKED_REC; + goto func_exit; + } else if (err_msg == dict_load_column_none + && strstr(table->name.m_name, + "/" TEMP_FILE_PREFIX_INNODB)) { + break; + } else { + ib::error() << err_msg << " for table " << table->name; + err = DB_CORRUPTION; + goto func_exit; + } + + /* Note: Currently we have one DOC_ID column that is + shared by all FTS indexes on a table, and only a non-virtual + column can be used for a FULLTEXT index */ + if (innobase_strcasecmp(name, + FTS_DOC_ID_COL_NAME) == 0 + && nth_v_col == ULINT_UNDEFINED) { + dict_col_t* col; + /* As part of normal loading of tables the + table->flag is not set for tables with FTS + till after the FTS indexes are loaded. So we + create the fts_t instance here if there isn't + one already created. + + This case does not arise for table create as + the flag is set before the table is created.
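+ + (Illustration: for CREATE TABLE t (a TEXT, FTS_DOC_ID BIGINT + UNSIGNED NOT NULL), the loaded column's PRTYPE carries + DATA_FTS_DOC_ID, so the code below sets + DICT_TF2_FTS_HAS_DOC_ID and records the column position in + table->fts->doc_col.)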
*/ + if (table->fts == NULL) { + table->fts = fts_create(table); + table->fts->cache = fts_cache_create(table); + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME); + } + + ut_a(table->fts->doc_col == ULINT_UNDEFINED); + + col = dict_table_get_nth_col(table, i - n_skipped); + + ut_ad(col->len == sizeof(doc_id_t)); + + if (col->prtype & DATA_FTS_DOC_ID) { + DICT_TF2_FLAG_SET( + table, DICT_TF2_FTS_HAS_DOC_ID); + DICT_TF2_FLAG_UNSET( + table, DICT_TF2_FTS_ADD_DOC_ID); + } + + table->fts->doc_col = i - n_skipped; + } +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + +func_exit: + mtr.commit(); + return err; +} + +/** Loads SYS_VIRTUAL info for one virtual column +@param table table definition +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param nth_v_col virtual column position */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +dberr_t +dict_load_virtual_col(dict_table_t *table, bool uncommitted, ulint nth_v_col) +{ + const dict_v_col_t* v_col = dict_table_get_nth_v_col(table, nth_v_col); + + if (v_col->num_base == 0) { + return DB_SUCCESS; + } + + dict_index_t* sys_virtual_index; + btr_pcur_t pcur; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + sys_virtual_index = dict_sys.sys_virtual->indexes.start; + ut_ad(!dict_sys.sys_virtual->not_redundant()); + + ut_ad(name_of_col_is(dict_sys.sys_virtual, sys_virtual_index, + DICT_FLD__SYS_VIRTUAL__POS, "POS")); + + dfield_t dfield[2]; + dtuple_t tuple{ + 0,2,2,dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + byte table_id[8], vcol_pos[4]; + mach_write_to_8(table_id, table->id); + dfield_set_data(&dfield[0], table_id, 8); + mach_write_to_4(vcol_pos, + dict_create_v_col_pos(nth_v_col, v_col->m_col.ind)); + dfield_set_data(&dfield[1], vcol_pos, 4); + + dict_index_copy_types(&tuple, sys_virtual_index, 2); + pcur.btr_cur.page_cur.index = sys_virtual_index; + + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + + for (ulint i = 0, skipped = 0; + i < unsigned{v_col->num_base} + skipped; i++) { + ulint pos; + const char* err_msg + = btr_pcur_is_on_user_rec(&pcur) + ? dict_load_virtual_low(table, uncommitted, + &v_col->base_col[i - skipped], + NULL, + &pos, NULL, + btr_pcur_get_rec(&pcur)) + : dict_load_virtual_none; + + if (!err_msg) { + ut_ad(pos == mach_read_from_4(vcol_pos)); + } else if (err_msg == dict_load_virtual_del) { + skipped++; + } else if (err_msg == dict_load_virtual_none + && strstr(table->name.m_name, + "/" TEMP_FILE_PREFIX_INNODB)) { + break; + } else { + ib::error() << err_msg << " for table " << table->name; + err = DB_CORRUPTION; + break; + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + +func_exit: + mtr.commit(); + return err; +} + +/** Loads info from SYS_VIRTUAL for virtual columns. +@param table table definition +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static dberr_t dict_load_virtual(dict_table_t *table, bool uncommitted) +{ + for (ulint i= 0; i < table->n_v_cols; i++) + if (dberr_t err= dict_load_virtual_col(table, uncommitted, i)) + return err; + return DB_SUCCESS; +} + +/** Error message for a delete-marked record in dict_load_field_low() */ +static const char *dict_load_field_del= "delete-marked record in SYS_FIELDS"; + +static const char *dict_load_field_none= "SYS_FIELDS record not found"; + +/** Load an index field definition from a SYS_FIELDS record to dict_index_t. 
+@return error message +@retval NULL on success */ +static +const char* +dict_load_field_low( + byte* index_id, /*!< in/out: index id (8 bytes) + an "in" value if index != NULL + and "out" if index == NULL */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + dict_index_t* index, /*!< in/out: index, could be NULL + if we just populate a dict_field_t + struct with information from + a SYS_FIELDS record */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + byte* last_index_id, /*!< in: last index id */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + const rec_t* rec) /*!< in: SYS_FIELDS record */ +{ + const byte* field; + ulint len; + unsigned pos_and_prefix_len; + unsigned prefix_len; + bool descending; + bool first_field; + ulint position; + + /* Either index or sys_field is supplied, not both */ + ut_ad((!index) != (!sys_field)); + ut_ad((!index) == !mtr); + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FIELDS) { + return("wrong number of columns in SYS_FIELDS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__INDEX_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_FIELDS"); + } + + if (!index) { + ut_a(last_index_id); + memcpy(index_id, (const char*) field, 8); + first_field = memcmp(index_id, last_index_id, 8); + } else { + first_field = (index->n_def == 0); + if (memcmp(field, index_id, 8)) { + return dict_load_field_none; + } + } + + /* The next field stores the field position in the index and a + possible column prefix length if the index field does not + contain the whole column. The storage format is like this: if + there is at least one prefix field in the index, then the HIGH + 2 bytes contain the field number (index->n_def) and the low 2 + bytes the prefix length for the field. Otherwise the field + number (index->n_def) is contained in the 2 LOW bytes. 
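+ + For example (illustrative values): when the index contains a + prefix field, a field at position 2 defined as col(10) DESC is + stored as POS=0x0002800a: position 2 in the high 2 bytes, the + descending flag in bit 15, and the prefix length 10 in the low + 15 bits. An ascending full-column field in an index with no + prefix fields is stored simply as POS=2.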
*/ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__POS, &len); + if (len != 4) { + goto err_len; + } + + pos_and_prefix_len = mach_read_from_4(field); + + if (index && UNIV_UNLIKELY + ((pos_and_prefix_len & 0xFFFFUL) != index->n_def + && (pos_and_prefix_len >> 16 & 0xFFFF) != index->n_def)) { + return("SYS_FIELDS.POS mismatch"); + } + + if (first_field || pos_and_prefix_len > 0xFFFFUL) { + prefix_len = pos_and_prefix_len & 0x7FFFUL; + descending = (pos_and_prefix_len & 0x8000UL); + position = (pos_and_prefix_len & 0xFFFF0000UL) >> 16; + } else { + prefix_len = 0; + descending = false; + position = pos_and_prefix_len & 0xFFFFUL; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FIELDS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + + if (!trx_id) { + ut_ad(!rec_get_deleted_flag(rec, 0)); + } else if (!mtr || uncommitted) { + } else if (trx_sys.find(nullptr, trx_id, false)) { + const auto savepoint = mtr->get_savepoint(); + dict_index_t* sys_field = UT_LIST_GET_FIRST( + dict_sys.sys_fields->indexes); + rec_offs* offsets = rec_get_offsets( + rec, sys_field, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, sys_field, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!old_vers || rec_get_deleted_flag(rec, 0)) { + return dict_load_field_none; + } + } + + if (rec_get_deleted_flag(rec, 0)) { + return(dict_load_field_del); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + + if (index) { + dict_mem_index_add_field( + index, mem_heap_strdupl(heap, (const char*) field, len), + prefix_len, descending); + } else { + sys_field->name = mem_heap_strdupl( + heap, (const char*) field, len); + sys_field->prefix_len = prefix_len & ((1U << 12) - 1); + sys_field->descending = descending; + *pos = position; + } + + return(NULL); +} + +/** +Load definitions for index fields. +@param index index whose fields are to be loaded +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param heap memory heap for temporary storage +@return error code +@return DB_SUCCESS if the fields were loaded successfully */ +static dberr_t dict_load_fields(dict_index_t *index, bool uncommitted, + mem_heap_t *heap) +{ + btr_pcur_t pcur; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + dict_index_t* sys_index = dict_sys.sys_fields->indexes.start; + ut_ad(!dict_sys.sys_fields->not_redundant()); + ut_ad(name_of_col_is(dict_sys.sys_fields, sys_index, + DICT_FLD__SYS_FIELDS__COL_NAME, "COL_NAME")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + byte index_id[8]; + mach_write_to_8(index_id, index->id); + dfield_set_data(&dfield, index_id, 8); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + dberr_t error = btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, + &pcur, &mtr); + if (error != DB_SUCCESS) { + goto func_exit; + } + + for (ulint i = 0; i < index->n_fields; i++) { + const char *err_msg = btr_pcur_is_on_user_rec(&pcur) + ? 
dict_load_field_low(index_id, uncommitted, index, + nullptr, nullptr, nullptr, + heap, &mtr, + btr_pcur_get_rec(&pcur)) + : dict_load_field_none; + + if (!err_msg) { + } else if (err_msg == dict_load_field_del) { + /* There could be delete marked records in + SYS_FIELDS because SYS_FIELDS.INDEX_ID can be + updated by ALTER TABLE ADD INDEX. */ + } else { + if (err_msg != dict_load_field_none + || strstr(index->table->name.m_name, + "/" TEMP_FILE_PREFIX_INNODB)) { + ib::error() << err_msg << " for index " + << index->name + << " of table " + << index->table->name; + } + error = DB_CORRUPTION; + break; + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + +func_exit: + mtr.commit(); + return error; +} + +/** Error message for a delete-marked record in dict_load_index_low() */ +static const char *dict_load_index_del= "delete-marked record in SYS_INDEXES"; +/** Error message for table->id mismatch in dict_load_index_low() */ +static const char *dict_load_index_none= "SYS_INDEXES record not found"; +/** Error message for SYS_TABLES flags mismatch in dict_load_table_low() */ +static const char *dict_load_table_flags= "incorrect flags in SYS_TABLES"; + +/** Load an index definition from a SYS_INDEXES record to dict_index_t. +@return error message +@retval NULL on success */ +static +const char* +dict_load_index_low( + byte* table_id, /*!< in/out: table id (8 bytes), + an "in" value if mtr + and "out" when !mtr */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + mem_heap_t* heap, /*!< in/out: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_INDEXES record */ + mtr_t* mtr, /*!< in/out: mini-transaction, + or nullptr if a pre-allocated + *index is to be filled in */ + dict_table_t* table, /*!< in/out: table, or NULL */ + dict_index_t** index) /*!< out,own: index, or NULL */ +{ + const byte* field; + ulint len; + index_id_t id; + ulint n_fields; + ulint type; + unsigned merge_threshold; + + if (mtr) { + *index = NULL; + } + + if (rec_get_n_fields_old(rec) == DICT_NUM_FIELDS__SYS_INDEXES) { + /* MERGE_THRESHOLD exists */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD, &len); + switch (len) { + case 4: + merge_threshold = mach_read_from_4(field); + break; + case UNIV_SQL_NULL: + merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; + break; + default: + return("incorrect MERGE_THRESHOLD length" + " in SYS_INDEXES"); + } + } else if (rec_get_n_fields_old(rec) + == DICT_NUM_FIELDS__SYS_INDEXES - 1) { + /* MERGE_THRESHOLD doesn't exist */ + + merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; + } else { + return("wrong number of columns in SYS_INDEXES record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_INDEXES"); + } + + if (!mtr) { + /* We are reading a SYS_INDEXES record. 
Copy the table_id */ + memcpy(table_id, (const char*) field, 8); + } else if (memcmp(field, table_id, 8)) { + /* Caller supplied table_id, verify it is the same + id as on the index record */ + return dict_load_index_none; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__ID, &len); + if (len != 8) { + goto err_len; + } + + id = mach_read_from_8(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_INDEXES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + if (!trx_id) { + ut_ad(!rec_get_deleted_flag(rec, 0)); + } else if (!mtr || uncommitted) { + } else if (trx_sys.find(nullptr, trx_id, false)) { + const auto savepoint = mtr->get_savepoint(); + dict_index_t* sys_index = UT_LIST_GET_FIRST( + dict_sys.sys_indexes->indexes); + rec_offs* offsets = rec_get_offsets( + rec, sys_index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, sys_index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!old_vers || rec_get_deleted_flag(rec, 0)) { + return dict_load_index_none; + } + } else if (rec_get_deleted_flag(rec, 0) + && rec[8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] + != static_cast<byte>(*TEMP_INDEX_PREFIX_STR) + && table->def_trx_id < trx_id) { + table->def_trx_id = trx_id; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__N_FIELDS, &len); + if (len != 4) { + goto err_len; + } + n_fields = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__TYPE, &len); + if (len != 4) { + goto err_len; + } + type = mach_read_from_4(field); + if (type & (~0U << DICT_IT_BITS)) { + return("unknown SYS_INDEXES.TYPE bits"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + if (len != 4) { + goto err_len; + } + + ut_d(const auto name_offs =) + rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_INDEXES__NAME, &len); + ut_ad(name_offs == 8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + + if (rec_get_deleted_flag(rec, 0)) { + return dict_load_index_del; + } + + char* name = mem_heap_strdupl(heap, reinterpret_cast<const char*>(rec) + + (8 + 8 + DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN), + len); + + if (mtr) { + *index = dict_mem_index_create(table, name, type, n_fields); + } else { + dict_mem_fill_index_struct(*index, nullptr, name, + type, n_fields); + } + + (*index)->id = id; + (*index)->page = mach_read_from_4(field); + ut_ad((*index)->page); + (*index)->merge_threshold = merge_threshold & ((1U << 6) - 1); + + return(NULL); +} + +/** Load definitions for table indexes. Adds them to the data dictionary cache.
+@param table table definition +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param heap memory heap for temporary storage +@param ignore_err errors to be ignored when loading the index definition +@return error code +@retval DB_SUCCESS if all indexes were successfully loaded +@retval DB_CORRUPTION if corruption of dictionary table +@retval DB_UNSUPPORTED if table has unknown index type */ +static MY_ATTRIBUTE((nonnull)) +dberr_t dict_load_indexes(dict_table_t *table, bool uncommitted, + mem_heap_t *heap, dict_err_ignore_t ignore_err) +{ + dict_index_t* sys_index; + btr_pcur_t pcur; + byte table_id[8]; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + sys_index = dict_sys.sys_indexes->indexes.start; + ut_ad(!dict_sys.sys_indexes->not_redundant()); + ut_ad(name_of_col_is(dict_sys.sys_indexes, sys_index, + DICT_FLD__SYS_INDEXES__NAME, "NAME")); + ut_ad(name_of_col_is(dict_sys.sys_indexes, sys_index, + DICT_FLD__SYS_INDEXES__PAGE_NO, "PAGE_NO")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + mach_write_to_8(table_id, table->id); + dfield_set_data(&dfield, table_id, 8); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + dberr_t error = btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, + &pcur, &mtr); + if (error != DB_SUCCESS) { + goto func_exit; + } + + while (btr_pcur_is_on_user_rec(&pcur)) { + dict_index_t* index = NULL; + const char* err_msg; + const rec_t* rec = btr_pcur_get_rec(&pcur); + if ((ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK) + && (rec_get_n_fields_old(rec) + == DICT_NUM_FIELDS__SYS_INDEXES + /* a record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. */ + || rec_get_n_fields_old(rec) + == DICT_NUM_FIELDS__SYS_INDEXES - 1)) { + const byte* field; + ulint len; + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__NAME, &len); + + if (len != UNIV_SQL_NULL + && static_cast<char>(*field) + == static_cast<char>(*TEMP_INDEX_PREFIX_STR)) { + /* Skip indexes whose name starts with + TEMP_INDEX_PREFIX_STR, because they will + be dropped by row_merge_drop_temp_indexes() + during crash recovery. */ + goto next_rec; + } + } + + err_msg = dict_load_index_low(table_id, uncommitted, heap, rec, + &mtr, table, &index); + ut_ad(!index == !!err_msg); + + if (err_msg == dict_load_index_none) { + /* We have run out of index definitions for + the table. */ + break; + } + + if (err_msg == dict_load_index_del) { + goto next_rec; + } else if (err_msg) { + ib::error() << err_msg; + if (ignore_err & DICT_ERR_IGNORE_INDEX) { + goto next_rec; + } + error = DB_CORRUPTION; + goto func_exit; + } else if (rec[8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] + == static_cast<byte>(*TEMP_INDEX_PREFIX_STR)) { + dict_mem_index_free(index); + goto next_rec; + } else { + const trx_id_t id = trx_read_trx_id(rec + 8 + 8); + if (id > table->def_trx_id) { + table->def_trx_id = id; + } + } + + ut_ad(index); + ut_ad(!dict_index_is_online_ddl(index)); + + /* Check whether the index is corrupted */ + if (ignore_err != DICT_ERR_IGNORE_DROP + && index->is_corrupted() && index->is_clust()) { + dict_mem_index_free(index); + error = DB_TABLE_CORRUPT; + goto func_exit; + } + + if (index->type & DICT_FTS + && !dict_table_has_fts_index(table)) { + /* This should have been created by now.
*/ + ut_a(table->fts != NULL); + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS); + } + + /* We check for unsupported types first, so that the + subsequent checks are relevant for the supported types. */ + if (index->type & ~(DICT_CLUSTERED | DICT_UNIQUE + | DICT_CORRUPT | DICT_FTS + | DICT_SPATIAL | DICT_VIRTUAL)) { + + ib::error() << "Unknown type " << index->type + << " of index " << index->name + << " of table " << table->name; + + error = DB_UNSUPPORTED; + dict_mem_index_free(index); + goto func_exit; + } else if (index->page == FIL_NULL + && table->is_readable() + && (!(index->type & DICT_FTS))) { + if (!uncommitted + && ignore_err != DICT_ERR_IGNORE_DROP) { + ib::error_or_warn(!(ignore_err + & DICT_ERR_IGNORE_INDEX)) + << "Index " << index->name + << " for table " << table->name + << " has been freed!"; + } + + if (!(ignore_err & DICT_ERR_IGNORE_INDEX)) { +corrupted: + dict_mem_index_free(index); + error = DB_CORRUPTION; + goto func_exit; + } + /* If caller can tolerate this error, + we will continue to load the index and + let caller deal with this error. However + mark the index and table corrupted. We + only need to mark such in the index + dictionary cache for such metadata corruption, + since we would always be able to set it + when loading the dictionary cache */ + if (index->is_clust()) { + index->table->corrupted = true; + index->table->file_unreadable = true; + } + index->type |= DICT_CORRUPT; + } else if (!dict_index_is_clust(index) + && NULL == dict_table_get_first_index(table)) { + + ib::error() << "Trying to load index " << index->name + << " for table " << table->name + << ", but the first index is not clustered!"; + + goto corrupted; + } else if (dict_is_sys_table(table->id) + && (dict_index_is_clust(index) + || ((table == dict_sys.sys_tables) + && !strcmp("ID_IND", index->name)))) { + + /* The index was created in memory already at booting + of the database server */ + dict_mem_index_free(index); + } else { + error = dict_load_fields(index, uncommitted, heap); + if (error != DB_SUCCESS) { + goto func_exit; + } + + /* The data dictionary tables should never contain + invalid index definitions. If we ignored this error + and simply did not load this index definition, the + .frm file would disagree with the index definitions + inside InnoDB. */ + if ((error = dict_index_add_to_cache(index, + index->page)) + != DB_SUCCESS) { + goto func_exit; + } + +#ifdef UNIV_DEBUG + // The following assertion doesn't hold for FTS indexes + // as it may have prefix_len=1 with any charset + if (index->type != DICT_FTS) { + for (uint i = 0; i < index->n_fields; i++) { + dict_field_t &f = index->fields[i]; + ut_ad(f.col->mbmaxlen == 0 + || f.prefix_len + % f.col->mbmaxlen == 0); + } + } +#endif /* UNIV_DEBUG */ + } +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + if (!dict_table_get_first_index(table) + && !(ignore_err & DICT_ERR_IGNORE_INDEX)) { + ib::warn() << "No indexes found for table " << table->name; + error = DB_CORRUPTION; + goto func_exit; + } + + ut_ad(table->fts_doc_id_index == NULL); + + if (table->fts != NULL) { + dict_index_t *idx = dict_table_get_index_on_name( + table, FTS_DOC_ID_INDEX_NAME); + if (idx && dict_index_is_unique(idx)) { + table->fts_doc_id_index = idx; + } + } + + /* If the table contains FTS indexes, populate table->fts->indexes */ + if (dict_table_has_fts_index(table)) { + ut_ad(table->fts_doc_id_index != NULL); + /* table->fts->indexes should have been created. 
*/ + ut_a(table->fts->indexes != NULL); + dict_table_get_all_fts_indexes(table, table->fts->indexes); + } + +func_exit: + mtr.commit(); + return error; +} + +/** Load a table definition from a SYS_TABLES record to dict_table_t. +Do not load any columns or indexes. +@param[in,out] mtr mini-transaction +@param[in] uncommitted whether to use READ UNCOMMITTED isolation level +@param[in] rec SYS_TABLES record +@param[out,own] table table, or nullptr +@return error message +@retval nullptr on success */ +const char *dict_load_table_low(mtr_t *mtr, bool uncommitted, + const rec_t *rec, dict_table_t **table) +{ + table_id_t table_id; + uint32_t space_id, t_num, flags, flags2; + ulint n_cols, n_v_col; + trx_id_t trx_id; + + if (const char* error_text = dict_sys_tables_rec_check(rec)) { + *table = NULL; + return(error_text); + } + + if (auto r = dict_sys_tables_rec_read(rec, uncommitted, mtr, + &table_id, &space_id, + &t_num, &flags, &flags2, + &trx_id)) { + *table = NULL; + return r == READ_ERROR ? dict_load_table_flags : nullptr; + } + + dict_table_decode_n_col(t_num, &n_cols, &n_v_col); + + *table = dict_table_t::create( + span<const char>(reinterpret_cast<const char*>(rec), + rec_get_field_start_offs(rec, 1)), + nullptr, n_cols + n_v_col, n_v_col, flags, flags2); + (*table)->space_id = space_id; + (*table)->id = table_id; + (*table)->file_unreadable = !!(flags2 & DICT_TF2_DISCARDED); + (*table)->def_trx_id = trx_id; + return(NULL); +} + +/** Make sure the data_file_name is saved in dict_table_t if needed. +@param[in,out] table Table object */ +void dict_get_and_save_data_dir_path(dict_table_t *table) +{ + ut_ad(!table->is_temporary()); + ut_ad(!table->space || table->space->id == table->space_id); + + if (!table->data_dir_path && table->space_id && table->space) + { + const char *filepath= table->space->chain.start->name; + if (strncmp(fil_path_to_mysql_datadir, filepath, + strlen(fil_path_to_mysql_datadir))) + { + table->lock_mutex_lock(); + table->flags|= 1 << DICT_TF_POS_DATA_DIR & ((1U << DICT_TF_BITS) - 1); + table->data_dir_path= mem_heap_strdup(table->heap, filepath); + os_file_make_data_dir_path(table->data_dir_path); + table->lock_mutex_unlock(); + } + } +} + +/** Opens a tablespace for dict_load_table_one() +@param[in,out] table A table that refers to the tablespace to open +@param[in] ignore_err Whether to ignore an error. */ +UNIV_INLINE +void +dict_load_tablespace( + dict_table_t* table, + dict_err_ignore_t ignore_err) +{ + ut_ad(!table->is_temporary()); + ut_ad(!table->space); + ut_ad(table->space_id < SRV_SPACE_ID_UPPER_BOUND); + ut_ad(fil_system.sys_space); + + if (table->space_id == TRX_SYS_SPACE) { + table->space = fil_system.sys_space; + return; + } + + if (table->flags2 & DICT_TF2_DISCARDED) { + ib::warn() << "Tablespace for table " << table->name + << " is set as discarded."; + table->file_unreadable = true; + return; + } + + /* The tablespace may already be open. */ + table->space = fil_space_for_table_exists_in_mem(table->space_id, + table->flags); + if (table->space) { + return; + } + + if (ignore_err >= DICT_ERR_IGNORE_TABLESPACE) { + table->file_unreadable = true; + return; + } + + if (!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)) { + ib::error() << "Failed to find tablespace for table " + << table->name << " in the cache. Attempting" + " to load the tablespace with space id " + << table->space_id; + } + + /* Use the remote filepath if needed. This parameter is optional + in the call to fil_ibd_open(). If not supplied, it will be built + from the table->name.
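+ For example (illustrative paths): a table db1/t1 created with + DATA DIRECTORY='/ssd' resolves to the remote path + /ssd/db1/t1.ibd, while without the attribute the default + datadir-relative path db1/t1.ibd is used.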
*/ + char* filepath = NULL; + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + /* This will set table->data_dir_path from fil_system */ + dict_get_and_save_data_dir_path(table); + + if (table->data_dir_path) { + filepath = fil_make_filepath( + table->data_dir_path, table->name, IBD, true); + } + } + + table->space = fil_ibd_open( + 2, FIL_TYPE_TABLESPACE, table->space_id, + dict_tf_to_fsp_flags(table->flags), + {table->name.m_name, strlen(table->name.m_name)}, filepath); + + if (!table->space) { + /* We failed to find a sensible tablespace file */ + table->file_unreadable = true; + } + + ut_free(filepath); +} + +/** Loads a table definition and also all its index definitions. + +Loads those foreign key constraints whose referenced table is already in +dictionary cache. If a foreign key constraint is not loaded, then the +referenced table is pushed into the output stack (fk_tables), if it is not +NULL. These tables must be subsequently loaded so that all the foreign +key constraints are loaded into memory. + +@param[in] name Table name in the db/tablename format +@param[in] ignore_err Error to be ignored when loading table + and its index definition +@param[out] fk_tables Related table names that must also be + loaded to ensure that all foreign key + constraints are loaded. +@return table, possibly with file_unreadable flag set +@retval nullptr if the table does not exist */ +static dict_table_t *dict_load_table_one(const span<const char> &name, + dict_err_ignore_t ignore_err, + dict_names_t &fk_tables) +{ + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("dict_load_table_one"); + DBUG_PRINT("dict_load_table_one", + ("table: %.*s", int(name.size()), name.data())); + + ut_ad(dict_sys.locked()); + + dict_index_t *sys_index = dict_sys.sys_tables->indexes.start; + ut_ad(!dict_sys.sys_tables->not_redundant()); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__ID, "ID")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__N_COLS, "N_COLS")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__TYPE, "TYPE")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__MIX_LEN, "MIX_LEN")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__SPACE, "SPACE")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + dfield_set_data(&dfield, name.data(), name.size()); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + bool uncommitted = false; +reload: + mtr.start(); + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + + if (err != DB_SUCCESS || !btr_pcur_is_on_user_rec(&pcur)) { + /* Not found */ +err_exit: + mtr.commit(); + DBUG_RETURN(nullptr); + } + + const rec_t* rec = btr_pcur_get_rec(&pcur); + + /* Check if the table name in record is the searched one */ + if (rec_get_field_start_offs(rec, 1) != name.size() + || memcmp(name.data(), rec, name.size())) { + goto err_exit; + } + + dict_table_t* table; + if (const char* err_msg = + dict_load_table_low(&mtr, uncommitted, rec, &table)) { + if (err_msg != dict_load_table_flags) { + ib::error() << err_msg; + } + goto err_exit; + } + if (!table) { + goto err_exit; + } + + const unsigned use_uncommitted = uncommitted + ? 
2 + : table->id == mach_read_from_8( + rec + rec_get_field_start_offs( + rec, DICT_FLD__SYS_TABLES__ID)); + + mtr.commit(); + + mem_heap_t* heap = mem_heap_create(32000); + + dict_load_tablespace(table, ignore_err); + + switch (dict_load_columns(table, use_uncommitted, heap)) { + case DB_SUCCESS_LOCKED_REC: + ut_ad(!uncommitted); + uncommitted = true; + dict_mem_table_free(table); + mem_heap_free(heap); + goto reload; + case DB_SUCCESS: + if (!dict_load_virtual(table, uncommitted)) { + break; + } + /* fall through */ + default: + dict_mem_table_free(table); + mem_heap_free(heap); + DBUG_RETURN(nullptr); + } + + dict_table_add_system_columns(table, heap); + + table->can_be_evicted = true; + table->add_to_cache(); + + mem_heap_empty(heap); + + ut_ad(dict_tf2_is_valid(table->flags, table->flags2)); + + /* If there is no tablespace for the table then we only need to + load the index definitions. So that we can IMPORT the tablespace + later. When recovering table locks for resurrected incomplete + transactions, the tablespace should exist, because DDL operations + were not allowed while the table is being locked by a transaction. */ + dict_err_ignore_t index_load_err = + !(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK) + && !table->is_readable() + ? DICT_ERR_IGNORE_ALL + : ignore_err; + + err = dict_load_indexes(table, uncommitted, heap, index_load_err); + + if (err == DB_TABLE_CORRUPT) { + /* Refuse to load the table if the table has a corrupted + cluster index */ + ut_ad(index_load_err != DICT_ERR_IGNORE_DROP); + ib::error() << "Refusing to load corrupted table " + << table->name; +evict: + dict_sys.remove(table); + mem_heap_free(heap); + DBUG_RETURN(nullptr); + } + + if (err != DB_SUCCESS || !table->is_readable()) { + } else if (dict_index_t* pk = dict_table_get_first_index(table)) { + ut_ad(pk->is_primary()); + if (pk->is_corrupted() + || pk->page >= table->space->get_size()) { +corrupted: + table->corrupted = true; + table->file_unreadable = true; + err = DB_TABLE_CORRUPT; + } else if (table->space->id + && ignore_err == DICT_ERR_IGNORE_DROP) { + /* Do not bother to load data from .ibd files + only to delete the .ibd files. */ + goto corrupted; + } else { + const page_id_t page_id{table->space->id, pk->page}; + mtr.start(); + buf_block_t* block = buf_page_get( + page_id, table->space->zip_size(), + RW_S_LATCH, &mtr); + const bool corrupted = !block + || page_get_space_id(block->page.frame) + != page_id.space() + || page_get_page_no(block->page.frame) + != page_id.page_no() + || (mach_read_from_2(FIL_PAGE_TYPE + + block->page.frame) + != FIL_PAGE_INDEX + && mach_read_from_2(FIL_PAGE_TYPE + + block->page.frame) + != FIL_PAGE_TYPE_INSTANT); + mtr.commit(); + if (corrupted) { + goto corrupted; + } + + if (table->supports_instant()) { + err = btr_cur_instant_init(table); + } + } + } else { + ut_ad(ignore_err & DICT_ERR_IGNORE_INDEX); + if (ignore_err != DICT_ERR_IGNORE_DROP) { + err = DB_CORRUPTION; + goto evict; + } + } + + /* Initialize table foreign_child value. Its value could be + changed when dict_load_foreigns() is called below */ + table->fk_max_recusive_level = 0; + + /* We will load the foreign key information only if + all indexes were loaded. */ + if (!table->is_readable()) { + /* Don't attempt to load the indexes from disk. */ + } else if (err == DB_SUCCESS) { + err = dict_load_foreigns(table->name.m_name, nullptr, + 0, true, ignore_err, fk_tables); + + if (err != DB_SUCCESS) { + ib::warn() << "Load table " << table->name + << " failed, the table has missing" + " foreign key indexes. 
Turn off" + " 'foreign_key_checks' and try again."; + goto evict; + } else { + dict_mem_table_fill_foreign_vcol_set(table); + table->fk_max_recusive_level = 0; + } + } + + mem_heap_free(heap); + + ut_ad(!table + || (ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) + || !table->is_readable() + || !table->corrupted); + + if (table && table->fts) { + if (!(dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID))) { + /* the table->fts could be created in dict_load_column + when a user defined FTS_DOC_ID is present, but no + FTS */ + table->fts->~fts_t(); + table->fts = nullptr; + } else if (fts_optimize_wq) { + fts_optimize_add_table(table); + } else if (table->can_be_evicted) { + /* fts_optimize_thread is not started yet. + So make the table as non-evictable from cache. */ + dict_sys.prevent_eviction(table); + } + } + + ut_ad(err != DB_SUCCESS || dict_foreign_set_validate(*table)); + + DBUG_RETURN(table); +} + +dict_table_t *dict_sys_t::load_table(const span &name, + dict_err_ignore_t ignore) +{ + if (dict_table_t *table= find_table(name)) + return table; + dict_names_t fk_list; + dict_table_t *table= dict_load_table_one(name, ignore, fk_list); + while (!fk_list.empty()) + { + const char *f= fk_list.front(); + const span name{f, strlen(f)}; + if (!find_table(name)) + dict_load_table_one(name, ignore, fk_list); + fk_list.pop_front(); + } + + return table; +} + +/***********************************************************************//** +Loads a table object based on the table id. +@return table; NULL if table does not exist */ +dict_table_t* +dict_load_table_on_id( +/*==================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err) /*!< in: errors to ignore + when loading the table */ +{ + byte id_buf[8]; + btr_pcur_t pcur; + const byte* field; + ulint len; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + /* NOTE that the operation of this function is protected by + dict_sys.latch, and therefore no deadlocks can occur + with other dictionary operations. 
+ +/***********************************************************************//** +Loads a table object based on the table id. +@return table; NULL if table does not exist */ +dict_table_t* +dict_load_table_on_id( +/*==================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err) /*!< in: errors to ignore + when loading the table */ +{ + byte id_buf[8]; + btr_pcur_t pcur; + const byte* field; + ulint len; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + /* NOTE that the operation of this function is protected by + dict_sys.latch, and therefore no deadlocks can occur + with other dictionary operations. */ + mtr.start(); + /*---------------------------------------------------*/ + /* Get the secondary index based on ID for table SYS_TABLES */ + dict_index_t *sys_table_ids = + dict_sys.sys_tables->indexes.start->indexes.next; + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + + /* Write the table id in byte format to id_buf */ + mach_write_to_8(id_buf, table_id); + dfield_set_data(&dfield, id_buf, 8); + dict_index_copy_types(&tuple, sys_table_ids, 1); + pcur.btr_cur.page_cur.index = sys_table_ids; + + dict_table_t* table = nullptr; + + if (btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, &pcur, &mtr) + == DB_SUCCESS + && btr_pcur_is_on_user_rec(&pcur)) { + /*---------------------------------------------------*/ + /* Now we have the record in the secondary index + containing the table ID and NAME */ + const rec_t* rec = btr_pcur_get_rec(&pcur); +check_rec: + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLE_IDS__ID, &len); + ut_ad(len == 8); + + /* Check if the table id in record is the one searched for */ + if (table_id == mach_read_from_8(field)) { + field = rec_get_nth_field_old(rec, + DICT_FLD__SYS_TABLE_IDS__NAME, &len); + table = dict_sys.load_table( + {reinterpret_cast<const char*>(field), + len}, ignore_err); + if (table && table->id != table_id) { + ut_ad(rec_get_deleted_flag(rec, 0)); + table = nullptr; + } + if (!table) { + while (btr_pcur_move_to_next(&pcur, &mtr)) { + rec = btr_pcur_get_rec(&pcur); + + if (page_rec_is_user_rec(rec)) { + goto check_rec; + } + } + } + } + } + + mtr.commit(); + return table; +} + +/********************************************************************//** +This function is called when the database is booted. Loads system table +index definitions except for the clustered index which is added to the +dictionary cache at booting before calling this function. */ +void +dict_load_sys_table( +/*================*/ + dict_table_t* table) /*!< in: system table */ +{ + mem_heap_t* heap; + + ut_ad(dict_sys.locked()); + + heap = mem_heap_create(1000); + + dict_load_indexes(table, false, heap, DICT_ERR_IGNORE_NONE); + + mem_heap_free(heap); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/********************************************************************//** +Loads foreign key constraint col names (also for the referenced table).
+Members that must be set (and valid) in foreign: +foreign->heap +foreign->n_fields +foreign->id ('\0'-terminated) +Members that will be created and set by this function: +foreign->foreign_col_names[i] +foreign->referenced_col_names[i] +(for i=0..foreign->n_fields-1) */ +static dberr_t dict_load_foreign_cols(dict_foreign_t *foreign, trx_id_t trx_id) +{ + btr_pcur_t pcur; + mtr_t mtr; + size_t id_len; + + ut_ad(dict_sys.locked()); + + id_len = strlen(foreign->id); + + foreign->foreign_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, + foreign->n_fields * sizeof(void*))); + + foreign->referenced_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, + foreign->n_fields * sizeof(void*))); + + mtr.start(); + + dict_index_t* sys_index = dict_sys.sys_foreign_cols->indexes.start; + ut_ad(!dict_sys.sys_foreign_cols->not_redundant()); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + + dfield_set_data(&dfield, foreign->id, id_len); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + mem_heap_t* heap = nullptr; + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + for (ulint i = 0; i < foreign->n_fields; i++) { + ut_a(btr_pcur_is_on_user_rec(&pcur)); + + const rec_t* rec = btr_pcur_get_rec(&pcur); + ulint len; + const byte* field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len); + ut_a(len == DATA_TRX_ID_LEN); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_empty(heap); + } + + const trx_id_t id = trx_read_trx_id(field); + if (!id) { + } else if (id != trx_id && trx_sys.find(nullptr, id, false)) { + const auto savepoint = mtr.get_savepoint(); + rec_offs* offsets = rec_get_offsets( + rec, sys_index, nullptr, true, ULINT_UNDEFINED, + &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, &mtr, sys_index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr.rollback_to_savepoint(savepoint); + rec = old_vers; + if (!rec || rec_get_deleted_flag(rec, 0)) { + goto next; + } + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(id); + goto next; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len); + + if (len != id_len || memcmp(foreign->id, field, len)) { + const rec_t* pos; + ulint pos_len; + const rec_t* for_col_name; + ulint for_col_name_len; + const rec_t* ref_col_name; + ulint ref_col_name_len; + + pos = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, + &pos_len); + + for_col_name = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, + &for_col_name_len); + + ref_col_name = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, + &ref_col_name_len); + + ib::error sout; + + sout << "Unable to load column names for foreign" + " key '" << foreign->id + << "' because it was not found in" + " InnoDB internal table SYS_FOREIGN_COLS. 
The" + " closest entry we found is:" + " (ID='"; + sout.write(field, len); + sout << "', POS=" << mach_read_from_4(pos) + << ", FOR_COL_NAME='"; + sout.write(for_col_name, for_col_name_len); + sout << "', REF_COL_NAME='"; + sout.write(ref_col_name, ref_col_name_len); + sout << "')"; + + err = DB_CORRUPTION; + break; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len); + ut_a(len == 4); + ut_a(i == mach_read_from_4(field)); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len); + foreign->foreign_col_names[i] = mem_heap_strdupl( + foreign->heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len); + foreign->referenced_col_names[i] = mem_heap_strdupl( + foreign->heap, (char*) field, len); + +next: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } +func_exit: + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return err; +} + +/***********************************************************************//** +Loads a foreign key constraint to the dictionary cache. If the referenced +table is not yet loaded, it is added in the output parameter (fk_tables). +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +dict_load_foreign( +/*==============*/ + const char* table_name, /*!< in: table name */ + bool uncommitted, /*!< in: use READ UNCOMMITTED + transaction isolation level */ + const char** col_names, + /*!< in: column names, or NULL + to use foreign->foreign_table->col_names */ + trx_id_t trx_id, + /*!< in: current transaction id, or 0 */ + bool check_recursive, + /*!< in: whether to record the foreign table + parent count to avoid unlimited recursive + load of chained foreign tables */ + bool check_charsets, + /*!< in: whether to check charset + compatibility */ + span id, + /*!< in: foreign constraint id */ + dict_err_ignore_t ignore_err, + /*!< in: error to be ignored */ + dict_names_t& fk_tables) + /*!< out: the foreign key constraint is added + to the dictionary cache only if the referenced + table is already in cache. Otherwise, the + foreign key constraint is not added to cache, + and the referenced table is added to this + stack. 
*/ +{ + dict_foreign_t* foreign; + btr_pcur_t pcur; + const byte* field; + ulint len; + mtr_t mtr; + dict_table_t* for_table; + dict_table_t* ref_table; + + DBUG_ENTER("dict_load_foreign"); + DBUG_PRINT("dict_load_foreign", + ("id: '%.*s', check_recursive: %d", + int(id.size()), id.data(), check_recursive)); + + ut_ad(dict_sys.locked()); + + dict_index_t* sys_index = dict_sys.sys_foreign->indexes.start; + ut_ad(!dict_sys.sys_foreign->not_redundant()); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + dfield_set_data(&dfield, id.data(), id.size()); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + mtr.start(); + + mem_heap_t* heap = nullptr; + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + goto err_exit; + } + + if (!btr_pcur_is_on_user_rec(&pcur)) { +not_found: + err = DB_NOT_FOUND; +err_exit: + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + DBUG_RETURN(err); + } + + const rec_t* rec = btr_pcur_get_rec(&pcur); + static_assert(DICT_FLD__SYS_FOREIGN__ID == 0, "compatibility"); + field = rec_get_nth_field_old(rec, DICT_FLD__SYS_FOREIGN__ID, &len); + + /* Check if the id in record is the searched one */ + if (len != id.size() || memcmp(id.data(), field, id.size())) { + goto not_found; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len); + ut_a(len == DATA_TRX_ID_LEN); + + const trx_id_t tid = trx_read_trx_id(field); + + if (tid && tid != trx_id && !uncommitted + && trx_sys.find(nullptr, tid, false)) { + const auto savepoint = mtr.get_savepoint(); + rec_offs* offsets = rec_get_offsets( + rec, sys_index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, &mtr, sys_index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr.rollback_to_savepoint(savepoint); + rec = old_vers; + if (!rec) { + goto not_found; + } + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(tid); + goto not_found; + } + + /* Read the table names and the number of columns associated + with the constraint */ + + foreign = dict_mem_foreign_create(); + + uint32_t n_fields_and_type = mach_read_from_4( + rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len)); + + ut_a(len == 4); + + /* We store the type in the bits 24..29 of n_fields_and_type. 
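+ An illustrative decoding (example values, not taken from any
+ particular table): a two-column constraint defined with
+ ON DELETE CASCADE would have SYS_FOREIGN.N_COLS ==
+ (DICT_FOREIGN_ON_DELETE_CASCADE << 24) | 2, and the two assignments
+ below would recover that type and n_fields == 2.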
*/ + + foreign->type = (n_fields_and_type >> 24) & ((1U << 6) - 1); + foreign->n_fields = n_fields_and_type & dict_index_t::MAX_N_FIELDS; + + foreign->id = mem_heap_strdupl(foreign->heap, id.data(), id.size()); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len); + + foreign->foreign_table_name = mem_heap_strdupl( + foreign->heap, (char*) field, len); + dict_mem_foreign_table_name_lookup_set(foreign, TRUE); + + const size_t foreign_table_name_len = len; + const size_t table_name_len = strlen(table_name); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len); + + if (!my_charset_latin1.strnncoll(table_name, table_name_len, + foreign->foreign_table_name, + foreign_table_name_len)) { + } else if (!check_recursive + && !my_charset_latin1.strnncoll(table_name, table_name_len, + (const char*) field, len)) { + } else { + dict_foreign_free(foreign); + goto not_found; + } + + foreign->referenced_table_name = mem_heap_strdupl( + foreign->heap, (const char*) field, len); + dict_mem_referenced_table_name_lookup_set(foreign, TRUE); + + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + err = dict_load_foreign_cols(foreign, trx_id); + if (err != DB_SUCCESS) { + goto load_error; + } + + ref_table = dict_sys.find_table( + {foreign->referenced_table_name_lookup, + strlen(foreign->referenced_table_name_lookup)}); + for_table = dict_sys.find_table( + {foreign->foreign_table_name_lookup, + strlen(foreign->foreign_table_name_lookup)}); + + if (!for_table) { + /* To avoid recursively loading the tables related through + the foreign key constraints, the child table name is saved + here. The child table will be loaded later, along with its + foreign key constraint. */ + + ut_a(ref_table != NULL); + fk_tables.push_back( + mem_heap_strdupl(ref_table->heap, + foreign->foreign_table_name_lookup, + foreign_table_name_len)); +load_error: + dict_foreign_remove_from_cache(foreign); + DBUG_RETURN(err); + } + + ut_a(for_table || ref_table); + + /* Note that there may already be a foreign constraint object in + the dictionary cache for this constraint: then the following + call only sets the pointers in it to point to the appropriate table + and index objects and frees the newly created object foreign. + Adding to the cache should always succeed since we are not creating + a new foreign key constraint but loading one from the data + dictionary. */ + + DBUG_RETURN(dict_foreign_add_to_cache(foreign, col_names, + check_charsets, + ignore_err)); +} + +/***********************************************************************//** +Loads foreign key constraints where the table is either the foreign key +holder or where the table is referenced by a foreign key. Adds these +constraints to the data dictionary. + +The foreign key constraint is loaded only if the referenced table is also +in the dictionary cache. If the referenced table is not in dictionary +cache, then it is added to the output parameter (fk_tables). 
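+
+The implementation scans two secondary indexes of SYS_FOREIGN: first the
+index on FOR_NAME and then the index on REF_NAME, so that constraints are
+found both where this table is the child and where it is the parent.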
+ +@return DB_SUCCESS or error code */ +dberr_t +dict_load_foreigns( + const char* table_name, /*!< in: table name */ + const char** col_names, /*!< in: column names, or NULL + to use table->col_names */ + trx_id_t trx_id, /*!< in: DDL transaction id, + or 0 to check + recursive load of tables + chained by FK */ + bool check_charsets, /*!< in: whether to check + charset compatibility */ + dict_err_ignore_t ignore_err, /*!< in: error to be ignored */ + dict_names_t& fk_tables) + /*!< out: stack of table + names which must be loaded + subsequently to load all the + foreign key constraints. */ +{ + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("dict_load_foreigns"); + + ut_ad(dict_sys.locked()); + + if (!dict_sys.sys_foreign || !dict_sys.sys_foreign_cols) { + if (ignore_err & DICT_ERR_IGNORE_FK_NOKEY) { + DBUG_RETURN(DB_SUCCESS); + } + sql_print_information("InnoDB: No foreign key system tables" + " in the database"); + DBUG_RETURN(DB_ERROR); + } + + ut_ad(!dict_sys.sys_foreign->not_redundant()); + + dict_index_t *sec_index = dict_table_get_next_index( + dict_table_get_first_index(dict_sys.sys_foreign)); + ut_ad(!strcmp(sec_index->fields[0].name, "FOR_NAME")); + bool check_recursive = !trx_id; + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + +start_load: + mtr.start(); + dfield_set_data(&dfield, table_name, strlen(table_name)); + dict_index_copy_types(&tuple, sec_index, 1); + pcur.btr_cur.page_cur.index = sec_index; + + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + DBUG_RETURN(err); + } +loop: + const rec_t* rec = btr_pcur_get_rec(&pcur); + const byte* field; + const auto maybe_deleted = rec_get_deleted_flag(rec, 0); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* End of index */ + + goto load_next_index; + } + + /* Now we have the record in the secondary index containing a table + name and a foreign constraint ID */ + + ulint len; + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME, &len); + + /* Check if the table name in the record is the one searched for; the + following call does the comparison in the latin1_swedish_ci + charset-collation, in a case-insensitive way. */ + + if (cmp_data(dfield_get_type(&dfield)->mtype, + dfield_get_type(&dfield)->prtype, + false, + reinterpret_cast(table_name), + dfield_get_len(&dfield), + field, len)) { + goto load_next_index; + } + + /* Since table names in SYS_FOREIGN are stored in a case-insensitive + order, we have to check that the table name matches also in a binary + string comparison. On Unix, MySQL allows table names that only differ + in character case. If lower_case_table_names=2 then what is stored + may not be the same case, but the previous comparison showed that they + match with no-case. */ + + if (lower_case_table_names != 2 && memcmp(field, table_name, len)) { + goto next_rec; + } + + /* Now we get a foreign key constraint id */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__ID, &len); + + /* Copy the string because the page may be modified or evicted + after mtr.commit() below. */ + char fk_id[MAX_TABLE_NAME_LEN + NAME_LEN]; + err = DB_SUCCESS; + if (UNIV_LIKELY(len < sizeof fk_id)) { + memcpy(fk_id, field, len); + } + + btr_pcur_store_position(&pcur, &mtr); + + mtr.commit(); + + /* Load the foreign constraint definition to the dictionary cache */ + + err = len < sizeof fk_id + ? 
dict_load_foreign(table_name, false, col_names, trx_id, + check_recursive, check_charsets, + {fk_id, len}, ignore_err, fk_tables) + : DB_CORRUPTION; + + switch (err) { + case DB_SUCCESS: + break; + case DB_NOT_FOUND: + if (maybe_deleted) { + break; + } + sql_print_error("InnoDB: Cannot load foreign constraint %.*s:" + " could not find the relevant record in " + "SYS_FOREIGN", int(len), fk_id); + /* fall through */ + default: +corrupted: + ut_free(pcur.old_rec_buf); + DBUG_RETURN(err); + } + + mtr.start(); + if (pcur.restore_position(BTR_SEARCH_LEAF, &mtr) + == btr_pcur_t::CORRUPTED) { + mtr.commit(); + goto corrupted; + } +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + goto loop; + +load_next_index: + mtr.commit(); + + if ((sec_index = dict_table_get_next_index(sec_index))) { + /* Switch to scan index on REF_NAME, fk_max_recusive_level + already been updated when scanning FOR_NAME index, no need to + update again */ + check_recursive = false; + goto start_load; + } + + ut_free(pcur.old_rec_buf); + DBUG_RETURN(DB_SUCCESS); +} diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc new file mode 100644 index 00000000..b8b2d583 --- /dev/null +++ b/storage/innobase/dict/dict0mem.cc @@ -0,0 +1,1379 @@ +/***************************************************************************** + +Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0mem.cc
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "ha_prototypes.h"
+#include <mysql_com.h>
+
+#include "dict0mem.h"
+#include "rem0rec.h"
+#include "data0type.h"
+#include "mach0data.h"
+#include "dict0dict.h"
+#include "fts0priv.h"
+#include "lock0lock.h"
+#include "row0row.h"
+#include "sql_string.h"
+#include <iostream>
+
+#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
+ creating a table or index object */
+
+/** System databases */
+static const char* innobase_system_databases[] = {
+ "mysql/",
+ "information_schema/",
+ "performance_schema/",
+ NullS
+};
+
+/** Determine if a table belongs to innobase_system_databases[]
+@param[in] name database_name/table_name
+@return whether the database_name is in innobase_system_databases[] */
+static bool dict_mem_table_is_system(const char *name)
+{
+ /* the name has the format database/table;
+ some system tables are of the form SYS_* */
+ if (!strchr(name, '/')) {
+ return true;
+ }
+ size_t table_len = strlen(name);
+ const char *system_db;
+ int i = 0;
+ while ((system_db = innobase_system_databases[i++])
+ && (system_db != NullS)) {
+ size_t len = strlen(system_db);
+ if (table_len > len && !strncmp(name, system_db, len)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/** The start of the table basename suffix for partitioned tables */
+const char table_name_t::part_suffix[4]
+#ifdef _WIN32
+= "#p#";
+#else
+= "#P#";
+#endif
+
+/** Display an identifier.
+@param[in,out] s output stream
+@param[in] id_name SQL identifier (other than table name)
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& s,
+ const id_name_t& id_name)
+{
+ const char q = '`';
+ const char* c = id_name;
+ s << q;
+ for (; *c != 0; c++) {
+ if (*c == q) {
+ s << *c;
+ }
+ s << *c;
+ }
+ s << q;
+ return(s);
+}
+
+/** Display a table name.
+@param[in,out] s output stream
+@param[in] table_name table name
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& s,
+ const table_name_t& table_name)
+{
+ return(s << ut_get_name(NULL, table_name.m_name));
+}
+
+bool dict_col_t::same_encoding(uint16_t a, uint16_t b)
+{
+ if (const CHARSET_INFO *acs= get_charset(a, MYF(MY_WME)))
+ if (const CHARSET_INFO *bcs= get_charset(b, MYF(MY_WME)))
+ return Charset(bcs).encoding_allows_reinterpret_as(acs);
+ return false;
+}
+
+/** Create metadata.
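+A hypothetical invocation (the names and counts here are invented for
+illustration only): a table with two user columns and one virtual column
+could be created as
+dict_table_t::create({C_STRING_WITH_LEN("db1/t1")}, space, 3, 1, 0, 0);
+note that n_cols counts both virtual and non-virtual columns.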
+@param name table name
+@param space tablespace
+@param n_cols total number of columns (both virtual and non-virtual)
+@param n_v_cols number of virtual columns
+@param flags table flags
+@param flags2 table flags2
+@return newly allocated table object */
+dict_table_t *dict_table_t::create(const span<const char> &name,
+ fil_space_t *space,
+ ulint n_cols, ulint n_v_cols, ulint flags,
+ ulint flags2)
+{
+ ut_ad(!space || space->purpose == FIL_TYPE_TABLESPACE ||
+ space->purpose == FIL_TYPE_TEMPORARY ||
+ space->purpose == FIL_TYPE_IMPORT);
+ ut_a(dict_tf2_is_valid(flags, flags2));
+ ut_a(!(flags2 & DICT_TF2_UNUSED_BIT_MASK));
+
+ mem_heap_t *heap= mem_heap_create(DICT_HEAP_SIZE);
+
+ dict_table_t *table= static_cast<dict_table_t*>
+ (mem_heap_zalloc(heap, sizeof(*table)));
+
+ lock_table_lock_list_init(&table->locks);
+ UT_LIST_INIT(table->indexes, &dict_index_t::indexes);
+#ifdef BTR_CUR_HASH_ADAPT
+ UT_LIST_INIT(table->freed_indexes, &dict_index_t::indexes);
+#endif /* BTR_CUR_HASH_ADAPT */
+ table->heap= heap;
+
+ ut_d(table->magic_n= DICT_TABLE_MAGIC_N);
+
+ table->flags= static_cast<unsigned>(flags) & ((1U << DICT_TF_BITS) - 1);
+ table->flags2= static_cast<unsigned>(flags2) & ((1U << DICT_TF2_BITS) - 1);
+ table->name.m_name= mem_strdupl(name.data(), name.size());
+ table->mdl_name.m_name= table->name.m_name;
+ table->is_system_db= dict_mem_table_is_system(table->name.m_name);
+ table->space= space;
+ table->space_id= space ? space->id : UINT32_MAX;
+ table->n_t_cols= static_cast<unsigned>(n_cols + DATA_N_SYS_COLS) &
+ dict_index_t::MAX_N_FIELDS;
+ table->n_v_cols= static_cast<unsigned>(n_v_cols) &
+ dict_index_t::MAX_N_FIELDS;
+ table->n_cols= static_cast<unsigned>(table->n_t_cols - table->n_v_cols) &
+ dict_index_t::MAX_N_FIELDS;
+ table->cols= static_cast<dict_col_t*>
+ (mem_heap_alloc(heap, table->n_cols * sizeof *table->cols));
+ table->v_cols= static_cast<dict_v_col_t*>
+ (mem_heap_alloc(heap, n_v_cols * sizeof *table->v_cols));
+ for (ulint i = n_v_cols; i--; )
+ new (&table->v_cols[i]) dict_v_col_t();
+ table->autoinc_lock= static_cast<lock_t*>
+ (mem_heap_alloc(heap, sizeof *table->autoinc_lock));
+ /* If the table has an FTS index or we are in the process
+ of building one, create the table->fts */
+ if (dict_table_has_fts_index(table) ||
+ DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID |
+ DICT_TF2_FTS_ADD_DOC_ID))
+ {
+ table->fts= fts_create(table);
+ table->fts->cache= fts_cache_create(table);
+ }
+
+ new (&table->foreign_set) dict_foreign_set();
+ new (&table->referenced_set) dict_foreign_set();
+
+ return table;
+}
+
+/****************************************************************//**
+Free a table memory object.
*/ +void +dict_mem_table_free( +/*================*/ + dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(UT_LIST_GET_LEN(table->indexes) == 0); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(UT_LIST_GET_LEN(table->freed_indexes) == 0); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(table->cached = FALSE); + + if (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { + if (table->fts) { + table->fts->~fts_t(); + } + } + + dict_mem_table_free_foreign_vcol_set(table); + + table->foreign_set.~dict_foreign_set(); + table->referenced_set.~dict_foreign_set(); + + ut_free(table->name.m_name); + + /* Clean up virtual index info structures that are registered + with virtual columns */ + for (ulint i = 0; i < table->n_v_def; i++) { + dict_table_get_nth_v_col(table, i)->~dict_v_col_t(); + } + + UT_DELETE(table->s_cols); + + mem_heap_free(table->heap); +} + +/****************************************************************//** +Append 'name' to 'col_names'. @see dict_table_t::col_names +@return new column names array */ +static +const char* +dict_add_col_name( +/*==============*/ + const char* col_names, /*!< in: existing column names, or + NULL */ + ulint cols, /*!< in: number of existing columns */ + const char* name, /*!< in: new column name */ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint old_len; + ulint new_len; + ulint total_len; + char* res; + + ut_ad(!cols == !col_names); + + /* Find out length of existing array. */ + if (col_names) { + const char* s = col_names; + ulint i; + + for (i = 0; i < cols; i++) { + s += strlen(s) + 1; + } + + old_len = unsigned(s - col_names); + } else { + old_len = 0; + } + + new_len = strlen(name) + 1; + total_len = old_len + new_len; + + res = static_cast(mem_heap_alloc(heap, total_len)); + + if (old_len > 0) { + memcpy(res, col_names, old_len); + } + + memcpy(res + old_len, name, new_len); + + return(res); +} + +/**********************************************************************//** +Adds a column definition to a table. */ +void +dict_mem_table_add_col( +/*===================*/ + dict_table_t* table, /*!< in: table */ + mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */ + const char* name, /*!< in: column name, or NULL */ + ulint mtype, /*!< in: main datatype */ + ulint prtype, /*!< in: precise type */ + ulint len) /*!< in: precision */ +{ + dict_col_t* col; + unsigned i; + + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!heap == !name); + + ut_ad(!(prtype & DATA_VIRTUAL)); + + i = table->n_def++; + + table->n_t_def++; + + if (name) { + if (table->n_def == table->n_cols) { + heap = table->heap; + } + if (i && !table->col_names) { + /* All preceding column names are empty. */ + char* s = static_cast( + mem_heap_zalloc(heap, table->n_def)); + + table->col_names = s; + } + + table->col_names = dict_add_col_name(table->col_names, + i, name, heap); + } + + col = dict_table_get_nth_col(table, i); + + dict_mem_fill_column_struct(col, i, mtype, prtype, len); + + switch (prtype & DATA_VERSIONED) { + case DATA_VERS_START: + ut_ad(!table->vers_start); + table->vers_start = i & dict_index_t::MAX_N_FIELDS; + break; + case DATA_VERS_END: + ut_ad(!table->vers_end); + table->vers_end = i & dict_index_t::MAX_N_FIELDS; + } +} + +/** Adds a virtual column definition to a table. +@param[in,out] table table +@param[in,out] heap temporary memory heap, or NULL. 
It is + used to store name when we have not finished + adding all columns. When all columns are + added, the whole name will copy to memory from + table->heap +@param[in] name column name +@param[in] mtype main datatype +@param[in] prtype precise type +@param[in] len length +@param[in] pos position in a table +@param[in] num_base number of base columns +@return the virtual column definition */ +dict_v_col_t* +dict_mem_table_add_v_col( + dict_table_t* table, + mem_heap_t* heap, + const char* name, + ulint mtype, + ulint prtype, + ulint len, + ulint pos, + ulint num_base) +{ + dict_v_col_t* v_col; + + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!heap == !name); + + ut_ad(prtype & DATA_VIRTUAL); + + unsigned i = table->n_v_def++; + + table->n_t_def++; + + if (name != NULL) { + if (table->n_v_def == table->n_v_cols) { + heap = table->heap; + } + + if (i && !table->v_col_names) { + /* All preceding column names are empty. */ + char* s = static_cast( + mem_heap_zalloc(heap, table->n_v_def)); + + table->v_col_names = s; + } + + table->v_col_names = dict_add_col_name(table->v_col_names, + i, name, heap); + } + + v_col = &table->v_cols[i]; + + dict_mem_fill_column_struct(&v_col->m_col, pos, mtype, prtype, len); + v_col->v_pos = i & dict_index_t::MAX_N_FIELDS; + + if (num_base != 0) { + v_col->base_col = static_cast(mem_heap_zalloc( + table->heap, num_base * sizeof( + *v_col->base_col))); + } else { + v_col->base_col = NULL; + } + + v_col->num_base = static_cast(num_base) + & dict_index_t::MAX_N_FIELDS; + + /* Initialize the index list for virtual columns */ + ut_ad(v_col->v_indexes.empty()); + + return(v_col); +} + +/** Adds a stored column definition to a table. +@param[in] table table +@param[in] num_base number of base columns. */ +void +dict_mem_table_add_s_col( + dict_table_t* table, + ulint num_base) +{ + unsigned i = unsigned(table->n_def) - 1; + dict_col_t* col = dict_table_get_nth_col(table, i); + dict_s_col_t s_col; + + ut_ad(col != NULL); + + if (table->s_cols == NULL) { + table->s_cols = UT_NEW_NOKEY(dict_s_col_list()); + } + + s_col.m_col = col; + s_col.s_pos = i + table->n_v_def; + + if (num_base != 0) { + s_col.base_col = static_cast(mem_heap_zalloc( + table->heap, num_base * sizeof(dict_col_t*))); + } else { + s_col.base_col = NULL; + } + + s_col.num_base = num_base; + table->s_cols->push_front(s_col); +} + +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +static MY_ATTRIBUTE((nonnull)) +void +dict_mem_table_col_rename_low( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + unsigned i, /*!< in: column offset corresponding to s */ + const char* to, /*!< in: new column name */ + const char* s, /*!< in: pointer to table->col_names */ + bool is_virtual) + /*!< in: if this is a virtual column */ +{ + char* t_col_names = const_cast( + is_virtual ? table->v_col_names : table->col_names); + ulint n_col = is_virtual ? table->n_v_def : table->n_def; + + size_t from_len = strlen(s), to_len = strlen(to); + + ut_ad(i < table->n_def || is_virtual); + ut_ad(i < table->n_v_def || !is_virtual); + + ut_ad(from_len <= NAME_LEN); + ut_ad(to_len <= NAME_LEN); + + char from[NAME_LEN + 1]; + strncpy(from, s, sizeof from - 1); + from[sizeof from - 1] = '\0'; + + if (from_len == to_len) { + /* The easy case: simply replace the column name in + table->col_names. 
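+ (For example, renaming a column "c1" to "d1" takes this branch:
+ both names occupy strlen + 1 == 3 bytes in the array, so the name
+ can be overwritten in place without moving anything else.)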
*/ + strcpy(const_cast(s), to); + } else { + /* We need to adjust all affected index->field + pointers, as in dict_index_add_col(). First, copy + table->col_names. */ + ulint prefix_len = ulint(s - t_col_names); + + for (; i < n_col; i++) { + s += strlen(s) + 1; + } + + ulint full_len = ulint(s - t_col_names); + char* col_names; + + if (to_len > from_len) { + col_names = static_cast( + mem_heap_alloc( + table->heap, + full_len + to_len - from_len)); + + memcpy(col_names, t_col_names, prefix_len); + } else { + col_names = const_cast(t_col_names); + } + + memcpy(col_names + prefix_len, to, to_len); + memmove(col_names + prefix_len + to_len, + t_col_names + (prefix_len + from_len), + full_len - (prefix_len + from_len)); + + /* Replace the field names in every index. */ + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + ulint n_fields = dict_index_get_n_fields(index); + + for (ulint i = 0; i < n_fields; i++) { + dict_field_t* field + = dict_index_get_nth_field( + index, i); + + ut_ad(!field->name + == field->col->is_dropped()); + if (!field->name) { + /* dropped columns lack a name */ + ut_ad(index->is_instant()); + continue; + } + + /* if is_virtual and that in field->col does + not match, continue */ + if ((!is_virtual) != + (!field->col->is_virtual())) { + continue; + } + + ulint name_ofs + = ulint(field->name - t_col_names); + if (name_ofs <= prefix_len) { + field->name = col_names + name_ofs; + } else { + ut_a(name_ofs < full_len); + field->name = col_names + + name_ofs + to_len - from_len; + } + } + } + + if (is_virtual) { + table->v_col_names = col_names; + } else { + table->col_names = col_names; + } + } + + /* Virtual columns are not allowed for foreign key */ + if (is_virtual) { + return; + } + + dict_foreign_t* foreign; + + /* Replace the field names in every foreign key constraint. */ + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (foreign->foreign_index == NULL) { + /* We may go here when we set foreign_key_checks to 0, + and then try to rename a column and modify the + corresponding foreign key constraint. The index + would have been dropped, we have to find an equivalent + one */ + for (unsigned f = 0; f < foreign->n_fields; f++) { + if (strcmp(foreign->foreign_col_names[f], from) + == 0) { + + char** rc = const_cast( + foreign->foreign_col_names + + f); + + if (to_len <= strlen(*rc)) { + memcpy(*rc, to, to_len + 1); + } else { + *rc = static_cast( + mem_heap_dup( + foreign->heap, + to, + to_len + 1)); + } + } + } + + /* New index can be null if InnoDB already dropped + the foreign index when FOREIGN_KEY_CHECKS is + disabled */ + foreign->foreign_index = dict_foreign_find_index( + foreign->foreign_table, NULL, + foreign->foreign_col_names, + foreign->n_fields, NULL, true, false, + NULL, NULL, NULL); + + } else { + + for (unsigned f = 0; f < foreign->n_fields; f++) { + /* These can point straight to + table->col_names, because the foreign key + constraints will be freed at the same time + when the table object is freed. */ + foreign->foreign_col_names[f] + = dict_index_get_nth_field( + foreign->foreign_index, + f)->name; + } + } + } + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + + if (!foreign->referenced_index) { + /* Referenced index could have been dropped + when foreign_key_checks is disabled. 
In that case, + rename the corresponding referenced_col_names and + find the equivalent referenced index also */ + for (unsigned f = 0; f < foreign->n_fields; f++) { + + const char*& rc = + foreign->referenced_col_names[f]; + if (strcmp(rc, from)) { + continue; + } + + if (to_len <= strlen(rc)) { + memcpy(const_cast(rc), to, + to_len + 1); + } else { + rc = static_cast( + mem_heap_dup( + foreign->heap, + to, to_len + 1)); + } + } + + /* New index can be null if InnoDB already dropped + the referenced index when FOREIGN_KEY_CHECKS is + disabled */ + foreign->referenced_index = dict_foreign_find_index( + foreign->referenced_table, NULL, + foreign->referenced_col_names, + foreign->n_fields, NULL, true, false, + NULL, NULL, NULL); + return; + } + + + for (unsigned f = 0; f < foreign->n_fields; f++) { + /* foreign->referenced_col_names[] need to be + copies, because the constraint may become + orphan when foreign_key_checks=0 and the + parent table is dropped. */ + + const char* col_name = dict_index_get_nth_field( + foreign->referenced_index, f)->name; + + if (strcmp(foreign->referenced_col_names[f], + col_name)) { + char** rc = const_cast( + foreign->referenced_col_names + f); + size_t col_name_len_1 = strlen(col_name) + 1; + + if (col_name_len_1 <= strlen(*rc) + 1) { + memcpy(*rc, col_name, col_name_len_1); + } else { + *rc = static_cast( + mem_heap_dup( + foreign->heap, + col_name, + col_name_len_1)); + } + } + } + } +} + +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +void +dict_mem_table_col_rename( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ulint nth_col,/*!< in: column index */ + const char* from, /*!< in: old column name */ + const char* to, /*!< in: new column name */ + bool is_virtual) + /*!< in: if this is a virtual column */ +{ + const char* s = is_virtual ? table->v_col_names : table->col_names; + + ut_ad((!is_virtual && nth_col < table->n_def) + || (is_virtual && nth_col < table->n_v_def)); + + for (ulint i = 0; i < nth_col; i++) { + size_t len = strlen(s); + ut_ad(len > 0); + s += len + 1; + } + + ut_ad(!my_strcasecmp(system_charset_info, from, s)); + + dict_mem_table_col_rename_low(table, static_cast(nth_col), + to, s, is_virtual); +} + +/**********************************************************************//** +This function populates a dict_col_t memory structure with +supplied information. */ +void +dict_mem_fill_column_struct( +/*========================*/ + dict_col_t* column, /*!< out: column struct to be + filled */ + ulint col_pos, /*!< in: column position */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint col_len) /*!< in: column length */ +{ + unsigned mbminlen, mbmaxlen; + + column->ind = static_cast(col_pos) + & dict_index_t::MAX_N_FIELDS; + column->ord_part = 0; + column->max_prefix = 0; + column->mtype = static_cast(mtype); + column->prtype = static_cast(prtype); + column->len = static_cast(col_len); + dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen); + column->mbminlen = mbminlen & 7; + column->mbmaxlen = mbmaxlen & 7; + column->def_val.data = NULL; + column->def_val.len = UNIV_SQL_DEFAULT; + ut_ad(!column->is_dropped()); +} + +/**********************************************************************//** +Creates an index memory object. 
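+A hypothetical call (illustrative values only):
+dict_mem_index_create(table, "idx_a", DICT_UNIQUE, 1) allocates a
+one-field unique index object named "idx_a" for the given table.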
+@return own: index object */ +dict_index_t* +dict_mem_index_create( +/*==================*/ + dict_table_t* table, /*!< in: table */ + const char* index_name, /*!< in: index name */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields) /*!< in: number of fields */ +{ + dict_index_t* index; + mem_heap_t* heap; + + ut_ad(!table || table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(index_name); + + heap = mem_heap_create(DICT_HEAP_SIZE); + + index = static_cast( + mem_heap_zalloc(heap, sizeof(*index))); + index->table = table; + + dict_mem_fill_index_struct(index, heap, index_name, type, n_fields); + + new (&index->zip_pad.mutex) std::mutex(); + + if (type & DICT_SPATIAL) { + index->rtr_track = new + (mem_heap_alloc(heap, sizeof *index->rtr_track)) + rtr_info_track_t(); + mysql_mutex_init(rtr_active_mutex_key, + &index->rtr_track->rtr_active_mutex, nullptr); + } + + return(index); +} + +/**********************************************************************//** +Creates and initializes a foreign constraint memory object. +@return own: foreign constraint struct */ +dict_foreign_t* +dict_mem_foreign_create(void) +/*=========================*/ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + DBUG_ENTER("dict_mem_foreign_create"); + + heap = mem_heap_create(100); + + foreign = static_cast( + mem_heap_zalloc(heap, sizeof(dict_foreign_t))); + + foreign->heap = heap; + + foreign->v_cols = NULL; + + DBUG_PRINT("dict_mem_foreign_create", ("heap: %p", heap)); + + DBUG_RETURN(foreign); +} + +/**********************************************************************//** +Sets the foreign_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup +will point to foreign_table_name. If 2, then another string is +allocated from foreign->heap and set to lower case. */ +void +dict_mem_foreign_table_name_lookup_set( +/*===================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc) /*!< in: is an alloc needed */ +{ + if (lower_case_table_names == 2) { + if (do_alloc) { + ulint len; + + len = strlen(foreign->foreign_table_name) + 1; + + foreign->foreign_table_name_lookup = + static_cast( + mem_heap_alloc(foreign->heap, len)); + } + strcpy(foreign->foreign_table_name_lookup, + foreign->foreign_table_name); + innobase_casedn_str(foreign->foreign_table_name_lookup); + } else { + foreign->foreign_table_name_lookup + = foreign->foreign_table_name; + } +} + +/**********************************************************************//** +Sets the referenced_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup +will point to referenced_table_name. If 2, then another string is +allocated from foreign->heap and set to lower case. 
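+For example (illustrative name): with lower_case_table_names=2, a
+referenced table name "db1/Parent" yields the lookup name "db1/parent";
+with 0 or 1, the lookup pointer simply aliases referenced_table_name.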
*/ +void +dict_mem_referenced_table_name_lookup_set( +/*======================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc) /*!< in: is an alloc needed */ +{ + if (lower_case_table_names == 2) { + if (do_alloc) { + ulint len; + + len = strlen(foreign->referenced_table_name) + 1; + + foreign->referenced_table_name_lookup = + static_cast( + mem_heap_alloc(foreign->heap, len)); + } + strcpy(foreign->referenced_table_name_lookup, + foreign->referenced_table_name); + innobase_casedn_str(foreign->referenced_table_name_lookup); + } else { + foreign->referenced_table_name_lookup + = foreign->referenced_table_name; + } +} + +/** Fill the virtual column set with virtual column information +present in the given virtual index. +@param[in] index virtual index +@param[out] v_cols virtual column set. */ +static +void +dict_mem_fill_vcol_has_index( + const dict_index_t* index, + dict_vcol_set** v_cols) +{ + for (ulint i = 0; i < index->table->n_v_cols; i++) { + dict_v_col_t* v_col = dict_table_get_nth_v_col( + index->table, i); + if (!v_col->m_col.ord_part) { + continue; + } + + for (const auto& v_idx : v_col->v_indexes) { + if (v_idx.index != index) { + continue; + } + + if (*v_cols == NULL) { + *v_cols = UT_NEW_NOKEY(dict_vcol_set()); + } + + (*v_cols)->insert(v_col); + } + } +} + +/** Fill the virtual column set with the virtual column of the index +if the index contains given column name. +@param[in] col_name column name +@param[in] table innodb table object +@param[out] v_cols set of virtual column information. */ +static +void +dict_mem_fill_vcol_from_v_indexes( + const char* col_name, + const dict_table_t* table, + dict_vcol_set** v_cols) +{ + /* virtual column can't be Primary Key, so start with + secondary index */ + for (dict_index_t* index = dict_table_get_next_index( + dict_table_get_first_index(table)); + index; + index = dict_table_get_next_index(index)) { + + /* Skip if the index have newly added + virtual column because field name is NULL. + Later virtual column set will be + refreshed during loading of table. */ + if (!dict_index_has_virtual(index) + || index->has_new_v_col()) { + continue; + } + + for (ulint i = 0; i < index->n_fields; i++) { + dict_field_t* field = + dict_index_get_nth_field(index, i); + + if (strcmp(field->name, col_name) == 0) { + dict_mem_fill_vcol_has_index( + index, v_cols); + } + } + } +} + +/** Fill the virtual column set with virtual columns which have base columns +as the given col_name +@param[in] col_name column name +@param[in] table table object +@param[out] v_cols set of virtual columns. */ +static +void +dict_mem_fill_vcol_set_for_base_col( + const char* col_name, + const dict_table_t* table, + dict_vcol_set** v_cols) +{ + for (ulint i = 0; i < table->n_v_cols; i++) { + dict_v_col_t* v_col = dict_table_get_nth_v_col(table, i); + + if (!v_col->m_col.ord_part) { + continue; + } + + for (ulint j = 0; j < unsigned{v_col->num_base}; j++) { + if (strcmp(col_name, dict_table_get_col_name( + table, + v_col->base_col[j]->ind)) == 0) { + + if (*v_cols == NULL) { + *v_cols = UT_NEW_NOKEY(dict_vcol_set()); + } + + (*v_cols)->insert(v_col); + } + } + } +} + +/** Fills the dependent virtual columns in a set. +Reason for being dependent are +1) FK can be present on base column of virtual columns +2) FK can be present on column which is a part of virtual index +@param[in,out] foreign foreign key information. 
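+An illustrative schema for case 1) (hypothetical, not from any test):
+CREATE TABLE t (a INT, v INT AS (a) VIRTUAL, KEY(v),
+FOREIGN KEY (a) REFERENCES p(a)) ENGINE=InnoDB;
+here the indexed virtual column v depends on the FK column a through its
+base column, so v is collected into foreign->v_cols.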
*/ +void +dict_mem_foreign_fill_vcol_set( + dict_foreign_t* foreign) +{ + ulint type = foreign->type; + + if (type == 0) { + return; + } + + for (ulint i = 0; i < foreign->n_fields; i++) { + /** FK can be present on base columns + of virtual columns. */ + dict_mem_fill_vcol_set_for_base_col( + foreign->foreign_col_names[i], + foreign->foreign_table, + &foreign->v_cols); + + /** FK can be present on the columns + which can be a part of virtual index. */ + dict_mem_fill_vcol_from_v_indexes( + foreign->foreign_col_names[i], + foreign->foreign_table, + &foreign->v_cols); + } +} + +/** Fill virtual columns set in each fk constraint present in the table. +@param[in,out] table innodb table object. */ +void +dict_mem_table_fill_foreign_vcol_set( + dict_table_t* table) +{ + dict_foreign_set fk_set = table->foreign_set; + dict_foreign_t* foreign; + + dict_foreign_set::iterator it; + for (it = fk_set.begin(); it != fk_set.end(); ++it) { + foreign = *it; + + dict_mem_foreign_fill_vcol_set(foreign); + } +} + +/** Free the vcol_set from all foreign key constraint on the table. +@param[in,out] table innodb table object. */ +void +dict_mem_table_free_foreign_vcol_set( + dict_table_t* table) +{ + dict_foreign_set fk_set = table->foreign_set; + dict_foreign_t* foreign; + + dict_foreign_set::iterator it; + for (it = fk_set.begin(); it != fk_set.end(); ++it) { + + foreign = *it; + + if (foreign->v_cols != NULL) { + UT_DELETE(foreign->v_cols); + foreign->v_cols = NULL; + } + } +} + +/**********************************************************************//** +Frees an index memory object. */ +void +dict_mem_index_free( +/*================*/ + dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + index->zip_pad.mutex.~mutex(); + + if (dict_index_is_spatial(index)) { + for (auto& rtr_info : index->rtr_track->rtr_active) { + rtr_info->index = NULL; + } + + mysql_mutex_destroy(&index->rtr_track->rtr_active_mutex); + index->rtr_track->~rtr_info_track_t(); + } + + index->detach_columns(); + mem_heap_free(index->heap); +} + +/** Create a temporary tablename like "#sql-ibNNN". +@param[in] heap A memory heap +@param[in] dbtab Table name in the form database/table name +@param[in] id Table id +@return A unique temporary tablename suitable for InnoDB use */ +char* +dict_mem_create_temporary_tablename( + mem_heap_t* heap, + const char* dbtab, + table_id_t id) +{ + size_t size; + char* name; + const char* dbend = strchr(dbtab, '/'); + ut_ad(dbend); + size_t dblen = size_t(dbend - dbtab) + 1; + + size = dblen + (sizeof(TEMP_FILE_PREFIX_INNODB) + 20); + name = static_cast(mem_heap_alloc(heap, size)); + memcpy(name, dbtab, dblen); + snprintf(name + dblen, size - dblen, + TEMP_FILE_PREFIX_INNODB UINT64PF, id); + + return(name); +} + +/** Validate the search order in the foreign key set. +@param[in] fk_set the foreign key set to be validated +@return true if search order is fine in the set, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_foreign_set& fk_set) +{ + dict_foreign_not_exists not_exists(fk_set); + + dict_foreign_set::const_iterator it = std::find_if( + fk_set.begin(), fk_set.end(), not_exists); + + if (it == fk_set.end()) { + return(true); + } + + dict_foreign_t* foreign = *it; + std::cerr << "Foreign key lookup failed: " << *foreign; + std::cerr << fk_set; + ut_ad(0); + return(false); +} + +/** Validate the search order in the foreign key sets of the table +(foreign_set and referenced_set). 
+@param[in] table table whose foreign key sets are to be validated +@return true if foreign key sets are fine, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_table_t& table) +{ + return(dict_foreign_set_validate(table.foreign_set) + && dict_foreign_set_validate(table.referenced_set)); +} + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_t& foreign) +{ + out << "[dict_foreign_t: id='" << foreign.id << "'"; + + if (foreign.foreign_table_name != NULL) { + out << ",for: '" << foreign.foreign_table_name << "'"; + } + + out << "]"; + return(out); +} + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_set& fk_set) +{ + out << "[dict_foreign_set:"; + std::for_each(fk_set.begin(), fk_set.end(), dict_foreign_print(out)); + out << "]" << std::endl; + return(out); +} + +/** Check whether fulltext index gets affected by foreign +key constraint. */ +bool dict_foreign_t::affects_fulltext() const +{ + if (foreign_table == referenced_table || !foreign_table->fts) + return false; + + for (ulint i= 0; i < n_fields; i++) + { + const dict_col_t *col= dict_index_get_nth_col(foreign_index, i); + if (dict_table_is_fts_column(foreign_table->fts->indexes, col->ind, + col->is_virtual()) != ULINT_UNDEFINED) + return true; + } + + return false; +} + +/** Reconstruct the clustered index fields. +@return whether metadata is incorrect */ +inline bool dict_index_t::reconstruct_fields() +{ + DBUG_ASSERT(is_primary()); + + const auto old_n_fields = n_fields; + + n_fields = (n_fields + table->instant->n_dropped) + & dict_index_t::MAX_N_FIELDS; + n_def = (n_def + table->instant->n_dropped) + & dict_index_t::MAX_N_FIELDS; + + const unsigned n_first = first_user_field(); + + dict_field_t* tfields = static_cast( + mem_heap_zalloc(heap, n_fields * sizeof *fields)); + + memcpy(tfields, fields, n_first * sizeof *fields); + + n_nullable = 0; + ulint n_core_null = 0; + const bool comp = dict_table_is_comp(table); + const auto* field_map_it = table->instant->field_map; + for (unsigned i = n_first, j = 0; i < n_fields; ) { + dict_field_t& f = tfields[i++]; + auto c = *field_map_it++; + if (c.is_dropped()) { + f.col = &table->instant->dropped[j++]; + DBUG_ASSERT(f.col->is_dropped()); + f.fixed_len = dict_col_get_fixed_size(f.col, comp) + & ((1U << 10) - 1); + } else { + DBUG_ASSERT(!c.is_not_null()); + const auto old = std::find_if( + fields + n_first, fields + old_n_fields, + [c](const dict_field_t& o) + { return o.col->ind == c.ind(); }); + + if (old >= fields + old_n_fields + || old->prefix_len + || old->col != &table->cols[c.ind()]) { + return true; + } + + ut_ad(old >= &fields[n_first]); + f = *old; + } + + f.col->clear_instant(); + if (f.col->is_nullable()) { + n_nullable++; + n_core_null += i <= n_core_fields; + } + } + + fields = tfields; + n_core_null_bytes = static_cast(UT_BITS_IN_BYTES(n_core_null)); + + return false; +} + +/** Reconstruct dropped or reordered columns. 
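+As implied by the parser below, the metadata consists of a 4-byte count
+of non-PK fields followed by one 2-byte field_map_element_t per such
+field; both integers are big-endian, as read by mach_read_from_4() and
+mach_read_from_2().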
+@param[in] metadata data from serialise_columns() +@param[in] len length of the metadata, in bytes +@return whether parsing the metadata failed */ +bool dict_table_t::deserialise_columns(const byte* metadata, ulint len) +{ + DBUG_ASSERT(!instant); + + unsigned num_non_pk_fields = mach_read_from_4(metadata); + metadata += 4; + + if (num_non_pk_fields >= REC_MAX_N_FIELDS - 3) { + return true; + } + + dict_index_t* index = UT_LIST_GET_FIRST(indexes); + + if (num_non_pk_fields < unsigned(index->n_fields) + - index->first_user_field()) { + return true; + } + + field_map_element_t* field_map = static_cast( + mem_heap_alloc(heap, + num_non_pk_fields * sizeof *field_map)); + + unsigned n_dropped_cols = 0; + + for (unsigned i = 0; i < num_non_pk_fields; i++) { + auto c = field_map[i] = mach_read_from_2(metadata); + metadata += 2; + + if (field_map[i].is_dropped()) { + if (c.ind() > DICT_MAX_FIXED_COL_LEN + 1) { + return true; + } + n_dropped_cols++; + } else if (c >= n_cols) { + return true; + } + } + + dict_col_t* dropped_cols = static_cast(mem_heap_zalloc( + heap, n_dropped_cols * sizeof(dict_col_t))); + instant = new (mem_heap_alloc(heap, sizeof *instant)) dict_instant_t(); + instant->n_dropped = n_dropped_cols; + instant->dropped = dropped_cols; + instant->field_map = field_map; + + dict_col_t* col = dropped_cols; + for (unsigned i = 0; i < num_non_pk_fields; i++) { + if (field_map[i].is_dropped()) { + auto fixed_len = field_map[i].ind(); + DBUG_ASSERT(fixed_len <= DICT_MAX_FIXED_COL_LEN + 1); + (col++)->set_dropped(field_map[i].is_not_null(), + fixed_len == 1, + fixed_len > 1 ? fixed_len - 1 + : 0); + } + } + DBUG_ASSERT(col == &dropped_cols[n_dropped_cols]); + + return UT_LIST_GET_FIRST(indexes)->reconstruct_fields(); +} + +/** Check if record in clustered index is historical row. +@param[in] rec clustered row +@param[in] offsets offsets +@return true if row is historical */ +bool +dict_index_t::vers_history_row( + const rec_t* rec, + const rec_offs* offsets) +{ + ut_ad(is_primary()); + + ulint len; + dict_col_t& col= table->cols[table->vers_end]; + ut_ad(col.vers_sys_end()); + ulint nfield = dict_col_get_clust_pos(&col, this); + const byte *data = rec_get_nth_field(rec, offsets, nfield, &len); + if (col.vers_native()) { + ut_ad(len == sizeof trx_id_max_bytes); + return 0 != memcmp(data, trx_id_max_bytes, len); + } + ut_ad(len == sizeof timestamp_max_bytes); + return 0 != memcmp(data, timestamp_max_bytes, len); +} + +/** Check if record in secondary index is historical row. +@param[in] rec record in a secondary index +@param[out] history_row true if row is historical +@return true on error */ +bool +dict_index_t::vers_history_row( + const rec_t* rec, + bool &history_row) +{ + ut_ad(!is_primary()); + + /* + Get row_end from clustered index + + TODO (optimization): row_end can be taken from unique secondary index + as well. For that dict_index_t::vers_end member should be added and + updated at index init (dict_index_build_internal_non_clust()). 
+
+ Test case:
+
+ create or replace table t1 (x int unique, y int unique,
+ foreign key r (y) references t1 (x))
+ with system versioning engine innodb;
+ insert into t1 values (1, 1);
+ */
+ bool error = false;
+ mem_heap_t* heap = NULL;
+ dict_index_t* clust_index = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ mtr_t mtr;
+ mtr.start();
+
+ rec_t* clust_rec =
+ row_get_clust_rec(BTR_SEARCH_LEAF, rec, this, &clust_index, &mtr);
+ if (clust_rec) {
+ offsets = rec_get_offsets(clust_rec, clust_index, offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ history_row = clust_index->vers_history_row(clust_rec, offsets);
+ } else {
+ ib::error() << "foreign constraints: secondary index is out of "
+ "sync";
+ ut_ad("secondary index is out of sync" == 0);
+ error = true;
+ }
+ mtr.commit();
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(error);
+}
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
new file mode 100644
index 00000000..40969335
--- /dev/null
+++ b/storage/innobase/dict/dict0stats.cc
@@ -0,0 +1,4724 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats.cc
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#include "dict0stats.h"
+#include "dyn0buf.h"
+#include "row0sel.h"
+#include "trx0trx.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+#include <mysql_com.h>
+#include "log.h"
+#include "btr0btr.h"
+#include "que0que.h"
+#include "scope.h"
+#include "debug_sync.h"
+
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <thread>
+
+/* Sampling algorithm description @{
+
+The algorithm is controlled by one number - N_SAMPLE_PAGES(index),
+let it be A, which is the number of leaf pages to analyze for a given index
+for each n-prefix (if the index is on 3 columns, then 3*A leaf pages will be
+analyzed).
+
+Let the total number of leaf pages in the table be T.
+Level 0 - leaf pages, level H - root.
+
+Definition: an N-prefix-boring record is a record on a non-leaf page that
+equals the next (to the right, crossing page boundaries, skipping the
+supremum and infimum) record on the same level when looking at the first
+n-prefix columns. The last (user) record on a level is not boring (it does
+not match the non-existent user record to the right). We call the records
+boring because all the records on the page below a boring record are equal
+to that boring record.
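+
+For example (illustrative values): if the 1-prefix values on a level are
+5,5,5,9 then the first two records are 1-prefix-boring (each equals its
+right neighbour on the first column) and the last two are not.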
+
+We avoid diving below boring records when searching for a leaf page to
+estimate the number of distinct records because we know that such a leaf
+page will have number of distinct records == 1.
+
+For each n-prefix: start from the root level and fully scan subsequent
+lower levels until a level that contains at least A*10 distinct records
+is found. Let's call this level LA.
+As an optimization the search is canceled if it has reached level 1 (never
+descend to the level 0 (leaf)) and also if the next level to be scanned
+would contain more than A pages. The latter is because the user has asked
+to analyze A leaf pages and it does not make sense to scan much more than
+A non-leaf pages with the sole purpose of finding a good sample of A leaf
+pages.
+
+After finding the appropriate level LA with >A*10 distinct records (or less
+in the exceptions described above), divide it into groups of equal records
+and pick A such groups. Then pick the last record from each group. For
+example, let the level be:
+
+index: 0,1,2,3,4,5,6,7,8,9,10
+record: 1,1,1,2,2,7,7,7,7,7,9
+
+There are 4 groups of distinct records and if A=2 random ones are selected,
+e.g. 1,1,1 and 7,7,7,7,7, then records with indexes 2 and 9 will be selected.
+
+After selecting A records as described above, dive below them to find A leaf
+pages and analyze them, finding the total number of distinct records. The
+dive to the leaf level is performed by selecting a non-boring record from
+each page and diving below it.
+
+This way, a total of A leaf pages are analyzed for the given n-prefix.
+
+Let the number of different key values found in each leaf page i be Pi (i=1..A).
+Let N_DIFF_AVG_LEAF be (P1 + P2 + ... + PA) / A.
+Let the number of different key values on level LA be N_DIFF_LA.
+Let the total number of records on level LA be TOTAL_LA.
+Let R be N_DIFF_LA / TOTAL_LA, we assume this ratio is the same on the
+leaf level.
+Let the number of leaf pages be N.
+Then the total number of different key values on the leaf level is:
+N * R * N_DIFF_AVG_LEAF.
+See REF01 for the implementation.
+
+The above describes how to calculate the cardinality of an index.
+This algorithm is executed for each n-prefix of a multi-column index
+where n=1..n_uniq.
+@} */
+
+/* names of the tables from the persistent statistics storage */
+#define TABLE_STATS_NAME_PRINT "mysql.innodb_table_stats"
+#define INDEX_STATS_NAME_PRINT "mysql.innodb_index_stats"
+
+#ifdef UNIV_STATS_DEBUG
+#define DEBUG_PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__)
+#else /* UNIV_STATS_DEBUG */
+#define DEBUG_PRINTF(fmt, ...) /* noop */
+#endif /* UNIV_STATS_DEBUG */
+
+/* Gets the number of leaf pages to sample in persistent stats estimation */
+#define N_SAMPLE_PAGES(index) \
+ static_cast<ib_uint64_t>( \
+ (index)->table->stats_sample_pages != 0 \
+ ? (index)->table->stats_sample_pages \
+ : srv_stats_persistent_sample_pages)
+
+/* number of distinct records on a given level that are required to stop
+descending to lower levels and fetch N_SAMPLE_PAGES(index) records
+from that level */
+#define N_DIFF_REQUIRED(index) (N_SAMPLE_PAGES(index) * 10)
+
+/* A dynamic array where we store the boundaries of each distinct group
+of keys. For example if a btree level is:
+index: 0,1,2,3,4,5,6,7,8,9,10,11,12
+data: b,b,b,b,b,b,g,g,j,j,j, x, y
+then we would store 5,7,10,11,12 in the array. */
+typedef std::vector<ib_uint64_t, ut_allocator<ib_uint64_t> > boundaries_t;
+
+/** Allocator type used for index_map_t. */
+typedef ut_allocator<std::pair<const char* const, dict_index_t*> >
+ index_map_t_allocator;
+
+/** Auxiliary map used for sorting indexes by name in dict_stats_save(). */
+typedef std::map<const char*, dict_index_t*, ut_strcmp_functor,
+ index_map_t_allocator> index_map_t;
+
+bool dict_table_t::is_stats_table() const
+{
+ return !strcmp(name.m_name, TABLE_STATS_NAME) ||
+ !strcmp(name.m_name, INDEX_STATS_NAME);
+}
+
+bool trx_t::has_stats_table_lock() const
+{
+ for (const lock_t *l : lock.table_locks)
+ if (l && l->un_member.tab_lock.table->is_stats_table())
+ return true;
+ return false;
+}
+
+/*********************************************************************//**
+Checks whether an index should be ignored in stats manipulations:
+* stats fetch
+* stats recalc
+* stats save
+@return true if the index should be ignored */
+UNIV_INLINE
+bool
+dict_stats_should_ignore_index(
+/*===========================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ return !index->is_btree() || index->to_be_dropped || !index->is_committed();
+}
+
+
+/** expected column definition */
+struct dict_col_meta_t
+{
+ /** column name */
+ const char *name;
+ /** main type */
+ unsigned mtype;
+ /** prtype mask; all these bits have to be set in prtype */
+ unsigned prtype_mask;
+ /** column length in bytes */
+ unsigned len;
+};
+
+/** For checking whether a table exists and has a predefined schema */
+struct dict_table_schema_t
+{
+ /** table name */
+ span<const char> table_name;
+ /** table name in SQL */
+ const char *table_name_sql;
+ /** number of columns */
+ unsigned n_cols;
+ /** columns */
+ const dict_col_meta_t columns[8];
+};
+
+static const dict_table_schema_t table_stats_schema =
+{
+ {C_STRING_WITH_LEN(TABLE_STATS_NAME)}, TABLE_STATS_NAME_PRINT, 6,
+ {
+ {"database_name", DATA_VARMYSQL, DATA_NOT_NULL, 192},
+ {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 597},
+ /*
+ Don't check the DATA_UNSIGNED flag in last_update.
+ It is present if the server is running in a pure MariaDB installation,
+ because MariaDB's Field_timestampf::flags has UNSIGNED_FLAG.
+ But DATA_UNSIGNED is missing when the server starts on a MySQL-5.7
+ data directory (during a migration), because MySQL's
+ Field_timestampf::flags does not have UNSIGNED_FLAG.
+ It is fine not to check DATA_UNSIGNED, because Field_timestampf
+ in both MariaDB and MySQL supports only non-negative time_t values.
+ */
+ {"last_update", DATA_INT, DATA_NOT_NULL, 4},
+ {"n_rows", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ {"clustered_index_size", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ {"sum_of_other_index_sizes", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ }
+};
+
+static const dict_table_schema_t index_stats_schema =
+{
+ {C_STRING_WITH_LEN(INDEX_STATS_NAME)}, INDEX_STATS_NAME_PRINT, 8,
+ {
+ {"database_name", DATA_VARMYSQL, DATA_NOT_NULL, 192},
+ {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 597},
+ {"index_name", DATA_VARMYSQL, DATA_NOT_NULL, 192},
+ /*
+ Don't check the DATA_UNSIGNED flag in last_update.
+ See comments about last_update in table_stats_schema above.
+ */
+ {"last_update", DATA_INT, DATA_NOT_NULL, 4},
+ {"stat_name", DATA_VARMYSQL, DATA_NOT_NULL, 64*3},
+ {"stat_value", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ {"sample_size", DATA_INT, DATA_UNSIGNED, 8},
+ {"stat_description", DATA_VARMYSQL, DATA_NOT_NULL, 1024*3}
+ }
+};
+
+/** Construct the type's SQL name (e.g.
BIGINT UNSIGNED) +@param mtype InnoDB main type +@param prtype InnoDB precise type +@param len length of the column +@param name the SQL name +@param name_sz size of the name buffer +@return number of bytes written (excluding the terminating NUL byte) */ +static int dtype_sql_name(unsigned mtype, unsigned prtype, unsigned len, + char *name, size_t name_sz) +{ + const char *Unsigned= ""; + const char *Main= "UNKNOWN"; + + switch (mtype) { + case DATA_INT: + switch (len) { + case 1: + Main= "TINYINT"; + break; + case 2: + Main= "SMALLINT"; + break; + case 3: + Main= "MEDIUMINT"; + break; + case 4: + Main= "INT"; + break; + case 8: + Main= "BIGINT"; + break; + } + + append_unsigned: + if (prtype & DATA_UNSIGNED) + Unsigned= " UNSIGNED"; + len= 0; + break; + case DATA_FLOAT: + Main= "FLOAT"; + goto append_unsigned; + case DATA_DOUBLE: + Main= "DOUBLE"; + goto append_unsigned; + case DATA_FIXBINARY: + Main= "BINARY"; + break; + case DATA_CHAR: + case DATA_MYSQL: + Main= "CHAR"; + break; + case DATA_VARCHAR: + case DATA_VARMYSQL: + Main= "VARCHAR"; + break; + case DATA_BINARY: + Main= "VARBINARY"; + break; + case DATA_GEOMETRY: + Main= "GEOMETRY"; + len= 0; + break; + case DATA_BLOB: + switch (len) { + case 9: + Main= "TINYBLOB"; + break; + case 10: + Main= "BLOB"; + break; + case 11: + Main= "MEDIUMBLOB"; + break; + case 12: + Main= "LONGBLOB"; + break; + } + len= 0; + } + + const char* Not_null= (prtype & DATA_NOT_NULL) ? " NOT NULL" : ""; + if (len) + return snprintf(name, name_sz, "%s(%u)%s%s", Main, len, Unsigned, + Not_null); + else + return snprintf(name, name_sz, "%s%s%s", Main, Unsigned, Not_null); +} + +static bool innodb_table_stats_not_found; +static bool innodb_index_stats_not_found; +static bool innodb_table_stats_not_found_reported; +static bool innodb_index_stats_not_found_reported; + +/*********************************************************************//** +Checks whether a table exists and whether it has the given structure. +The table must have the same number of columns with the same names and +types. The order of the columns does not matter. 
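+For example (hypothetical mismatch): if n_rows had been created as INT
+NOT NULL instead of BIGINT UNSIGNED NOT NULL, this check would fail with
+an error text like "Column n_rows in table mysql.innodb_table_stats is
+INT NOT NULL but should be BIGINT UNSIGNED NOT NULL".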
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The table must have the same number of columns with the same names and
+types. The order of the columns does not matter.
+dict_table_schema_check() @{
+@return DB_SUCCESS if the table exists and contains the necessary columns */
+static
+dberr_t
+dict_table_schema_check(
+/*====================*/
+ const dict_table_schema_t* req_schema, /*!< in: required table
+ schema */
+ char* errstr, /*!< out: human readable error
+ message if != DB_SUCCESS is
+ returned */
+ size_t errstr_sz) /*!< in: errstr size */
+{
+ const dict_table_t* table= dict_sys.load_table(req_schema->table_name);
+
+ if (!table) {
+ if (opt_bootstrap)
+ return DB_TABLE_NOT_FOUND;
+ if (req_schema == &table_stats_schema) {
+ if (innodb_table_stats_not_found_reported) {
+ return DB_STATS_DO_NOT_EXIST;
+ }
+ innodb_table_stats_not_found = true;
+ innodb_table_stats_not_found_reported = true;
+ } else {
+ ut_ad(req_schema == &index_stats_schema);
+ if (innodb_index_stats_not_found_reported) {
+ return DB_STATS_DO_NOT_EXIST;
+ }
+ innodb_index_stats_not_found = true;
+ innodb_index_stats_not_found_reported = true;
+ }
+
+ snprintf(errstr, errstr_sz, "Table %s not found.",
+ req_schema->table_name_sql);
+ return DB_TABLE_NOT_FOUND;
+ }
+
+ if (!table->is_readable() && !table->space) {
+ /* missing tablespace */
+ snprintf(errstr, errstr_sz,
+ "Tablespace for table %s is missing.",
+ req_schema->table_name_sql);
+ return DB_TABLE_NOT_FOUND;
+ }
+
+ if (unsigned(table->n_def - DATA_N_SYS_COLS) != req_schema->n_cols) {
+ /* the table has a different number of columns than required */
+ snprintf(errstr, errstr_sz,
+ "%s has %d columns but should have %u.",
+ req_schema->table_name_sql,
+ table->n_def - DATA_N_SYS_COLS,
+ req_schema->n_cols);
+ return DB_ERROR;
+ }
+
+ /* For each column from req_schema->columns[] search
+ whether it is present in table->cols[].
+ The following algorithm is O(n_cols^2), but is optimized to
+ be O(n_cols) if the columns are in the same order in both arrays. */
+
+ for (unsigned i = 0; i < req_schema->n_cols; i++) {
+ ulint j = dict_table_has_column(
+ table, req_schema->columns[i].name, i);
+
+ if (j == table->n_def) {
+ snprintf(errstr, errstr_sz,
+ "required column %s"
+ " not found in table %s.",
+ req_schema->columns[i].name,
+ req_schema->table_name_sql);
+
+ return(DB_ERROR);
+ }
+
+ /* we found a column with the same name at the j'th
+ position, compare column types and flags */
+
+ /* check length for exact match */
+ if (req_schema->columns[i].len != table->cols[j].len) {
+ sql_print_warning("InnoDB: Table %s has"
+ " a length mismatch in the"
+ " column %s."
+ " Please run mariadb-upgrade",
+ req_schema->table_name_sql,
+ req_schema->columns[i].name);
+ }
+
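+ /* For example, a table_name column created as VARCHAR(64)
+ by an older server version is 192 bytes long instead of the
+ expected 597 bytes, which triggers the warning above until
+ mariadb-upgrade widens the column. */
+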
+ /*
+ Check mtype for an exact match.
+ This check is relaxed to allow us to use TIMESTAMP
+ (i.e. INT) for last_update instead of DATA_BINARY.
+ We have to test for both values as the innodb_table_stats
+ table may come from MySQL and have the old type.
+ */
+ if (req_schema->columns[i].mtype != table->cols[j].mtype &&
+ !(req_schema->columns[i].mtype == DATA_INT &&
+ table->cols[j].mtype == DATA_FIXBINARY)) {
+ } else if ((~table->cols[j].prtype
+ & req_schema->columns[i].prtype_mask)) {
+ } else {
+ continue;
+ }
+
+ int s = snprintf(errstr, errstr_sz,
+ "Column %s in table %s is ",
+ req_schema->columns[i].name,
+ req_schema->table_name_sql);
+ if (s < 0 || static_cast<size_t>(s) >= errstr_sz) {
+ return DB_ERROR;
+ }
+ errstr += s;
+ errstr_sz -= s;
+ s = dtype_sql_name(table->cols[j].mtype, table->cols[j].prtype,
+ table->cols[j].len, errstr, errstr_sz);
+ if (s < 0 || static_cast<size_t>(s) + sizeof " but should be "
+ >= errstr_sz) {
+ return DB_ERROR;
+ }
+ errstr += s;
+ memcpy(errstr, " but should be ", sizeof " but should be ");
+ errstr += (sizeof " but should be ") - 1;
+ errstr_sz -= s + (sizeof " but should be ") - 1;
+ s = dtype_sql_name(req_schema->columns[i].mtype,
+ req_schema->columns[i].prtype_mask,
+ req_schema->columns[i].len,
+ errstr, errstr_sz);
+ return DB_ERROR;
+ }
+
+ if (size_t n_foreign = table->foreign_set.size()) {
+ snprintf(errstr, errstr_sz,
+ "Table %s has %zu foreign key(s) pointing"
+ " to other tables, but it must have 0.",
+ req_schema->table_name_sql, n_foreign);
+ return DB_ERROR;
+ }
+
+ if (size_t n_referenced = table->referenced_set.size()) {
+ snprintf(errstr, errstr_sz,
+ "There are %zu foreign key(s) pointing to %s, "
+ "but there must be 0.", n_referenced,
+ req_schema->table_name_sql);
+ return DB_ERROR;
+ }
+
+ return DB_SUCCESS;
+}
+
+/*********************************************************************//**
+Checks whether the persistent statistics storage exists and that all
+tables have the proper structure.
+@return true if exists and all tables are ok */
+static bool dict_stats_persistent_storage_check(bool dict_already_locked)
+{
+ char errstr[512];
+ dberr_t ret;
+
+ if (!dict_already_locked) {
+ dict_sys.lock(SRW_LOCK_CALL);
+ }
+
+ ut_ad(dict_sys.locked());
+
+ /* first check table_stats */
+ ret = dict_table_schema_check(&table_stats_schema, errstr,
+ sizeof(errstr));
+ if (ret == DB_SUCCESS) {
+ /* if it is ok, then check index_stats */
+ ret = dict_table_schema_check(&index_stats_schema, errstr,
+ sizeof(errstr));
+ }
+
+ if (!dict_already_locked) {
+ dict_sys.unlock();
+ }
+
+ switch (ret) {
+ case DB_SUCCESS:
+ return true;
+ default:
+ if (!opt_bootstrap) {
+ ib::error() << errstr;
+ }
+ /* fall through */
+ case DB_STATS_DO_NOT_EXIST:
+ return false;
+ }
+}
+
+/** Executes a given SQL statement using the InnoDB internal SQL parser.
+This function will free the pinfo object.
+@param[in,out] pinfo pinfo to pass to que_eval_sql() must already
+have any literals bound to it
+@param[in] sql SQL string to execute
+@param[in,out] trx transaction
+@return DB_SUCCESS or error code */
+static
+dberr_t dict_stats_exec_sql(pars_info_t *pinfo, const char* sql, trx_t *trx)
+{
+ ut_ad(dict_sys.locked());
+
+ if (!dict_stats_persistent_storage_check(true))
+ {
+ pars_info_free(pinfo);
+ return DB_STATS_DO_NOT_EXIST;
+ }
+
+ return que_eval_sql(pinfo, sql, trx);
+}
+
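+/* A hypothetical caller sketch (not from this file), showing the
+expected usage: bind all literals to pinfo first, then let
+dict_stats_exec_sql() verify the stats tables and run the statement.
+
+ pars_info_t* pinfo = pars_info_create();
+ pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+ dberr_t err = dict_stats_exec_sql(
+  pinfo,
+  "PROCEDURE P () IS\n"
+  "BEGIN\n"
+  "DELETE FROM \"mysql\".\"innodb_table_stats\"\n"
+  "WHERE database_name = :database_name;\n"
+  "END;", trx);
+
+Here db_utf8 and trx are placeholders for the caller's own variables. */
+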
+/*********************************************************************//**
+Duplicate a table object and its indexes.
+This function creates a dummy dict_table_t object and initializes the
+following table and index members:
+dict_table_t::id (copied)
+dict_table_t::heap (newly created)
+dict_table_t::name (copied)
+dict_table_t::corrupted (copied)
+dict_table_t::indexes<> (newly created)
+dict_table_t::magic_n
+for each entry in dict_table_t::indexes, the following are initialized:
+(indexes that have DICT_FTS set in index->type are skipped)
+dict_index_t::id (copied)
+dict_index_t::name (copied)
+dict_index_t::table_name (points to the copied table name)
+dict_index_t::table (points to the above semi-initialized object)
+dict_index_t::type (copied)
+dict_index_t::to_be_dropped (copied)
+dict_index_t::online_status (copied)
+dict_index_t::n_uniq (copied)
+dict_index_t::fields[] (newly created, only first n_uniq, only fields[i].name)
+dict_index_t::indexes<> (newly created)
+dict_index_t::stat_n_diff_key_vals[] (only allocated, left uninitialized)
+dict_index_t::stat_n_sample_sizes[] (only allocated, left uninitialized)
+dict_index_t::stat_n_non_null_key_vals[] (only allocated, left uninitialized)
+dict_index_t::magic_n
+The returned object should be freed with dict_stats_table_clone_free()
+when no longer needed.
+@return incomplete table object */
+static
+dict_table_t*
+dict_stats_table_clone_create(
+/*==========================*/
+ const dict_table_t* table) /*!< in: table whose stats to copy */
+{
+ size_t heap_size;
+ dict_index_t* index;
+
+ /* Estimate the size needed for the table and all of its indexes */
+
+ heap_size = 0;
+ heap_size += sizeof(dict_table_t);
+ heap_size += strlen(table->name.m_name) + 1;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ ulint n_uniq = dict_index_get_n_unique(index);
+
+ heap_size += sizeof(dict_index_t);
+ heap_size += strlen(index->name) + 1;
+ heap_size += n_uniq * sizeof(index->fields[0]);
+ for (ulint i = 0; i < n_uniq; i++) {
+ heap_size += strlen(index->fields[i].name) + 1;
+ }
+ heap_size += n_uniq * sizeof(index->stat_n_diff_key_vals[0]);
+ heap_size += n_uniq * sizeof(index->stat_n_sample_sizes[0]);
+ heap_size += n_uniq * sizeof(index->stat_n_non_null_key_vals[0]);
+ }
+
+ /* Allocate the memory and copy the members */
+
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(heap_size);
+
+ dict_table_t* t;
+
+ t = (dict_table_t*) mem_heap_zalloc(heap, sizeof(*t));
+
+ t->stats_mutex_init();
+
+ MEM_CHECK_DEFINED(&table->id, sizeof(table->id));
+ t->id = table->id;
+
+ t->heap = heap;
+
+ t->name.m_name = mem_heap_strdup(heap, table->name.m_name);
+ t->mdl_name.m_name = t->name.m_name;
+
+ t->corrupted = table->corrupted;
+
+ UT_LIST_INIT(t->indexes, &dict_index_t::indexes);
+#ifdef BTR_CUR_HASH_ADAPT
+ UT_LIST_INIT(t->freed_indexes, &dict_index_t::indexes);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ dict_index_t* idx;
+
+ idx = (dict_index_t*) mem_heap_zalloc(heap, sizeof(*idx));
+
+ MEM_CHECK_DEFINED(&index->id, sizeof(index->id));
+ idx->id = index->id;
+
+ idx->name = mem_heap_strdup(heap, index->name);
+
+ idx->table = t;
+
+ idx->type = index->type;
+
+ idx->to_be_dropped = 0;
+
+ idx->online_status = ONLINE_INDEX_COMPLETE;
+ idx->set_committed(true);
+
+ idx->n_uniq = index->n_uniq;
+
+ idx->fields = (dict_field_t*) mem_heap_zalloc(
+ heap, idx->n_uniq * sizeof(idx->fields[0]));
+
+ for (ulint i = 0; i < idx->n_uniq; i++) {
+ idx->fields[i].name = mem_heap_strdup(
+ heap, index->fields[i].name);
+ }
+
+ /* hook idx into t->indexes */
+ UT_LIST_ADD_LAST(t->indexes, idx);
+
+ idx->stat_n_diff_key_vals = (ib_uint64_t*) mem_heap_zalloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_diff_key_vals[0]));
+
+ idx->stat_n_sample_sizes = (ib_uint64_t*) mem_heap_zalloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_sample_sizes[0]));
+
+ idx->stat_n_non_null_key_vals = (ib_uint64_t*) mem_heap_zalloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
+ ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
+
+ idx->stat_defrag_n_page_split = 0;
+ idx->stat_defrag_n_pages_freed = 0;
+ }
+
+ ut_d(t->magic_n = DICT_TABLE_MAGIC_N);
+
+ return(t);
+}
+
+/*********************************************************************//**
+Free the resources occupied by an object returned by
+dict_stats_table_clone_create(). */
+static
+void
+dict_stats_table_clone_free(
+/*========================*/
+ dict_table_t* t) /*!< in: dummy table object to free */
+{
+ t->stats_mutex_destroy();
+ mem_heap_free(t->heap);
+}
+
+/*********************************************************************//**
+Write all zeros (or 1 where it makes sense) into an index's
+statistics members. The resulting stats correspond to an empty index. */
+static
+void
+dict_stats_empty_index(
+/*===================*/
+ dict_index_t* index, /*!< in/out: index */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
+{
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(index->table->stats_mutex_is_owner());
+
+ ulint n_uniq = index->n_uniq;
+
+ for (ulint i = 0; i < n_uniq; i++) {
+ index->stat_n_diff_key_vals[i] = 0;
+ index->stat_n_sample_sizes[i] = 1;
+ index->stat_n_non_null_key_vals[i] = 0;
+ }
+
+ index->stat_index_size = 1;
+ index->stat_n_leaf_pages = 1;
+
+ if (empty_defrag_stats) {
+ dict_stats_empty_defrag_stats(index);
+ dict_stats_empty_defrag_summary(index);
+ }
+}
+
+/*********************************************************************//**
+Write all zeros (or 1 where it makes sense) into a table's and its indexes'
+statistics members. The resulting stats correspond to an empty table. */
+static
+void
+dict_stats_empty_table(
+/*===================*/
+ dict_table_t* table, /*!< in/out: table */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
+{
+ /* Initializing table and index level stats is now protected
+ by the table level stats mutex. */
+ table->stats_mutex_lock();
+
+ /* Zero the stats members */
+ table->stat_n_rows = 0;
+ table->stat_clustered_index_size = 1;
+ /* 1 page for each index, not counting the clustered */
+ table->stat_sum_of_other_index_sizes
+ = UT_LIST_GET_LEN(table->indexes) - 1;
+ table->stat_modified_counter = 0;
+
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ dict_stats_empty_index(index, empty_defrag_stats);
+ }
+
+ table->stat_initialized = TRUE;
+ table->stats_mutex_unlock();
+}
+
+/*********************************************************************//**
+Check whether index's stats are initialized (assert if they are not).
*/ +static +void +dict_stats_assert_initialized_index( +/*================================*/ + const dict_index_t* index) /*!< in: index */ +{ + MEM_CHECK_DEFINED( + index->stat_n_diff_key_vals, + index->n_uniq * sizeof(index->stat_n_diff_key_vals[0])); + + MEM_CHECK_DEFINED( + index->stat_n_sample_sizes, + index->n_uniq * sizeof(index->stat_n_sample_sizes[0])); + + MEM_CHECK_DEFINED( + index->stat_n_non_null_key_vals, + index->n_uniq * sizeof(index->stat_n_non_null_key_vals[0])); + + MEM_CHECK_DEFINED( + &index->stat_index_size, + sizeof(index->stat_index_size)); + + MEM_CHECK_DEFINED( + &index->stat_n_leaf_pages, + sizeof(index->stat_n_leaf_pages)); +} + +/*********************************************************************//** +Check whether table's stats are initialized (assert if they are not). */ +static +void +dict_stats_assert_initialized( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_a(table->stat_initialized); + + MEM_CHECK_DEFINED(&table->stats_last_recalc, + sizeof table->stats_last_recalc); + + MEM_CHECK_DEFINED(&table->stat_persistent, + sizeof table->stat_persistent); + + MEM_CHECK_DEFINED(&table->stats_auto_recalc, + sizeof table->stats_auto_recalc); + + MEM_CHECK_DEFINED(&table->stats_sample_pages, + sizeof table->stats_sample_pages); + + MEM_CHECK_DEFINED(&table->stat_n_rows, + sizeof table->stat_n_rows); + + MEM_CHECK_DEFINED(&table->stat_clustered_index_size, + sizeof table->stat_clustered_index_size); + + MEM_CHECK_DEFINED(&table->stat_sum_of_other_index_sizes, + sizeof table->stat_sum_of_other_index_sizes); + + MEM_CHECK_DEFINED(&table->stat_modified_counter, + sizeof table->stat_modified_counter); + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (!dict_stats_should_ignore_index(index)) { + dict_stats_assert_initialized_index(index); + } + } +} + +#define INDEX_EQ(i1, i2) \ + ((i1) != NULL \ + && (i2) != NULL \ + && (i1)->id == (i2)->id \ + && strcmp((i1)->name, (i2)->name) == 0) + +/*********************************************************************//** +Copy table and index statistics from one table to another, including index +stats. Extra indexes in src are ignored and extra indexes in dst are +initialized to correspond to an empty index. 
*/ +static +void +dict_stats_copy( +/*============*/ + dict_table_t* dst, /*!< in/out: destination table */ + const dict_table_t* src, /*!< in: source table */ + bool reset_ignored_indexes) /*!< in: if true, set ignored indexes + to have the same statistics as if + the table was empty */ +{ + ut_ad(src->stats_mutex_is_owner()); + ut_ad(dst->stats_mutex_is_owner()); + + dst->stats_last_recalc = src->stats_last_recalc; + dst->stat_n_rows = src->stat_n_rows; + dst->stat_clustered_index_size = src->stat_clustered_index_size; + dst->stat_sum_of_other_index_sizes = src->stat_sum_of_other_index_sizes; + dst->stat_modified_counter = src->stat_modified_counter; + + dict_index_t* dst_idx; + dict_index_t* src_idx; + + for (dst_idx = dict_table_get_first_index(dst), + src_idx = dict_table_get_first_index(src); + dst_idx != NULL; + dst_idx = dict_table_get_next_index(dst_idx), + (src_idx != NULL + && (src_idx = dict_table_get_next_index(src_idx)))) { + + if (dict_stats_should_ignore_index(dst_idx)) { + if (reset_ignored_indexes) { + /* Reset index statistics for all ignored indexes, + unless they are FT indexes (these have no statistics)*/ + if (dst_idx->type & DICT_FTS) { + continue; + } + dict_stats_empty_index(dst_idx, true); + } else { + continue; + } + } + + ut_ad(!dict_index_is_ibuf(dst_idx)); + + if (!INDEX_EQ(src_idx, dst_idx)) { + for (src_idx = dict_table_get_first_index(src); + src_idx != NULL; + src_idx = dict_table_get_next_index(src_idx)) { + + if (INDEX_EQ(src_idx, dst_idx)) { + break; + } + } + } + + if (!INDEX_EQ(src_idx, dst_idx)) { + dict_stats_empty_index(dst_idx, true); + continue; + } + + ulint n_copy_el; + + if (dst_idx->n_uniq > src_idx->n_uniq) { + n_copy_el = src_idx->n_uniq; + /* Since src is smaller some elements in dst + will remain untouched by the following memmove(), + thus we init all of them here. */ + dict_stats_empty_index(dst_idx, true); + } else { + n_copy_el = dst_idx->n_uniq; + } + + memmove(dst_idx->stat_n_diff_key_vals, + src_idx->stat_n_diff_key_vals, + n_copy_el * sizeof(dst_idx->stat_n_diff_key_vals[0])); + + memmove(dst_idx->stat_n_sample_sizes, + src_idx->stat_n_sample_sizes, + n_copy_el * sizeof(dst_idx->stat_n_sample_sizes[0])); + + memmove(dst_idx->stat_n_non_null_key_vals, + src_idx->stat_n_non_null_key_vals, + n_copy_el * sizeof(dst_idx->stat_n_non_null_key_vals[0])); + + dst_idx->stat_index_size = src_idx->stat_index_size; + + dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages; + + dst_idx->stat_defrag_modified_counter = + src_idx->stat_defrag_modified_counter; + dst_idx->stat_defrag_n_pages_freed = + src_idx->stat_defrag_n_pages_freed; + dst_idx->stat_defrag_n_page_split = + src_idx->stat_defrag_n_page_split; + } + + dst->stat_initialized = TRUE; +} + +/** Duplicate the stats of a table and its indexes. +This function creates a dummy dict_table_t object and copies the input +table's stats into it. The returned table object is not in the dictionary +cache and cannot be accessed by any other threads. 
In addition to the +members copied in dict_stats_table_clone_create() this function initializes +the following: +dict_table_t::stat_initialized +dict_table_t::stat_persistent +dict_table_t::stat_n_rows +dict_table_t::stat_clustered_index_size +dict_table_t::stat_sum_of_other_index_sizes +dict_table_t::stat_modified_counter +dict_index_t::stat_n_diff_key_vals[] +dict_index_t::stat_n_sample_sizes[] +dict_index_t::stat_n_non_null_key_vals[] +dict_index_t::stat_index_size +dict_index_t::stat_n_leaf_pages +dict_index_t::stat_defrag_modified_counter +dict_index_t::stat_defrag_n_pages_freed +dict_index_t::stat_defrag_n_page_split +The returned object should be freed with dict_stats_snapshot_free() +when no longer needed. +@param[in] table table whose stats to copy +@return incomplete table object */ +static +dict_table_t* +dict_stats_snapshot_create( + dict_table_t* table) +{ + dict_sys.lock(SRW_LOCK_CALL); + + dict_stats_assert_initialized(table); + + dict_table_t* t; + + t = dict_stats_table_clone_create(table); + + table->stats_mutex_lock(); + ut_d(t->stats_mutex_lock()); + + dict_stats_copy(t, table, false); + + ut_d(t->stats_mutex_unlock()); + table->stats_mutex_unlock(); + + t->stat_persistent = table->stat_persistent; + t->stats_auto_recalc = table->stats_auto_recalc; + t->stats_sample_pages = table->stats_sample_pages; + + dict_sys.unlock(); + + return(t); +} + +/*********************************************************************//** +Free the resources occupied by an object returned by +dict_stats_snapshot_create(). */ +static +void +dict_stats_snapshot_free( +/*=====================*/ + dict_table_t* t) /*!< in: dummy table object to free */ +{ + dict_stats_table_clone_free(t); +} + +/** Statistics for one field of an index. */ +struct index_field_stats_t +{ + ib_uint64_t n_diff_key_vals; + ib_uint64_t n_sample_sizes; + ib_uint64_t n_non_null_key_vals; + + index_field_stats_t(ib_uint64_t n_diff_key_vals= 0, + ib_uint64_t n_sample_sizes= 0, + ib_uint64_t n_non_null_key_vals= 0) + : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes), + n_non_null_key_vals(n_non_null_key_vals) + { + } + + bool is_bulk_operation() const + { + return n_diff_key_vals == UINT64_MAX && + n_sample_sizes == UINT64_MAX && n_non_null_key_vals == UINT64_MAX; + } +}; + +/*******************************************************************//** +Record the number of non_null key values in a given index for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are eventually stored in the array: +index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. 
 */
+static
+void
+btr_record_not_null_field_in_rec(
+/*=============================*/
+ ulint n_unique, /*!< in: dict_index_get_n_unique(index),
+ number of columns uniquely determine
+ an index entry */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
+ its size could be for all fields or
+ that of "n_unique" */
+ ib_uint64_t* n_not_null) /*!< in/out: array to record number of
+ not null rows for n-column prefix */
+{
+ ulint i;
+
+ ut_ad(rec_offs_n_fields(offsets) >= n_unique);
+
+ if (n_not_null == NULL) {
+ return;
+ }
+
+ for (i = 0; i < n_unique; i++) {
+ if (rec_offs_nth_sql_null(offsets, i)) {
+ break;
+ }
+
+ n_not_null[i]++;
+ }
+}
+
+inline dberr_t
+btr_cur_t::open_random_leaf(rec_offs *&offsets, mem_heap_t *&heap, mtr_t &mtr)
+{
+ ut_ad(!index()->is_spatial());
+ ut_ad(!mtr.get_savepoint());
+
+ mtr_s_lock_index(index(), &mtr);
+
+ if (index()->page == FIL_NULL)
+ return DB_CORRUPTION;
+
+ dberr_t err;
+ auto offset= index()->page;
+ bool merge= false;
+ ulint height= ULINT_UNDEFINED;
+
+ while (buf_block_t *block=
+ btr_block_get(*index(), offset, RW_S_LATCH, merge, &mtr, &err))
+ {
+ page_cur.block= block;
+
+ if (height == ULINT_UNDEFINED)
+ {
+ height= btr_page_get_level(block->page.frame);
+ if (height > BTR_MAX_LEVELS)
+ return DB_CORRUPTION;
+
+ if (height == 0)
+ goto got_leaf;
+ }
+
+ if (height == 0)
+ {
+ mtr.rollback_to_savepoint(0, mtr.get_savepoint() - 1);
+ got_leaf:
+ page_cur.rec= page_get_infimum_rec(block->page.frame);
+ return DB_SUCCESS;
+ }
+
+ if (!--height)
+ merge= !index()->is_clust();
+
+ page_cur_open_on_rnd_user_rec(&page_cur);
+
+ offsets= rec_get_offsets(page_cur.rec, page_cur.index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ /* Go to the child node */
+ offset= btr_node_ptr_get_child_page_no(page_cur.rec, offsets);
+ }
+
+ return err;
+}
+
+/** Estimate table level stats from a sampled value.
+@param value sampled stats
+@param index index being sampled
+@param sample number of sampled rows
+@param ext_size external stored data size
+@param not_empty table not empty
+@return estimated table wide stats from sampled value */
+#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
+ (((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
+ + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
+
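+/* Worked example: if value=50 key-value borders were seen while
+sampling sample=10 leaf pages of an index with stat_n_leaf_pages=1000,
+ext_size=0 and not_empty=1, the scaled estimate is
+(50*1000 + 10 - 1 + 0 + 1) / (10 + 0) = 5001 distinct values. */
+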
+/** Estimates the number of different key values in a given index, for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+result.n_sample_sizes[].
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and store the estimates in
+the array result.n_non_null_key_vals.
+@param index B-tree index
+@param bulk_trx_id the value of index->table->bulk_trx_id at the start
+@return vector with statistics information;
+empty vector if the index is unavailable. */
+static
+std::vector<index_field_stats_t>
+btr_estimate_number_of_different_key_vals(dict_index_t* index,
+ trx_id_t bulk_trx_id)
+{
+ page_t* page;
+ rec_t* rec;
+ ulint n_cols;
+ ib_uint64_t* n_diff;
+ ib_uint64_t* n_not_null;
+ ibool stats_null_not_equal;
+ uintmax_t n_sample_pages=1; /* number of pages to sample */
+ ulint not_empty_flag = 0;
+ ulint total_external_size = 0;
+ uintmax_t add_on;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ rec_offs* offsets_rec = NULL;
+ rec_offs* offsets_next_rec = NULL;
+
+ std::vector<index_field_stats_t> result;
+
+ ut_ad(index->is_btree());
+
+ n_cols = dict_index_get_n_unique(index);
+
+ heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
+ * n_cols
+ + dict_index_get_n_fields(index)
+ * (sizeof *offsets_rec
+ + sizeof *offsets_next_rec));
+
+ n_diff = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof(n_diff[0]));
+
+ n_not_null = NULL;
+
+ /* Check the srv_innodb_stats_method setting, and decide whether we
+ need to record non-null values, and also decide whether NULLs are
+ considered equal (by setting the stats_null_not_equal value) */
+ switch (srv_innodb_stats_method) {
+ case SRV_STATS_NULLS_IGNORED:
+ n_not_null = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof *n_not_null);
+ /* fall through */
+
+ case SRV_STATS_NULLS_UNEQUAL:
+ /* for both the SRV_STATS_NULLS_IGNORED and
+ SRV_STATS_NULLS_UNEQUAL cases, we will treat NULLs as
+ unequal values */
+ stats_null_not_equal = TRUE;
+ break;
+
+ case SRV_STATS_NULLS_EQUAL:
+ stats_null_not_equal = FALSE;
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (srv_stats_sample_traditional) {
+ /* It makes no sense to test more pages than are contained
+ in the index, thus we lower the number if it is too high */
+ if (srv_stats_transient_sample_pages > index->stat_index_size) {
+ if (index->stat_index_size > 0) {
+ n_sample_pages = index->stat_index_size;
+ }
+ } else {
+ n_sample_pages = srv_stats_transient_sample_pages;
+ }
+ } else {
+ /* New logarithmic number of pages that are estimated.
+ The number of pages estimated should be between 1 and
+ index->stat_index_size.
+
+ If we have only 0 or 1 index pages then we can only take 1
+ sample. We have already initialized n_sample_pages to 1.
+
+ So taking index size as I and sample as S and log(I)*S as L
+
+ requirement 1) we want the result of the expression to not exceed I;
+ requirement 2) we want the ideal pages to be at least S;
+ so the current expression is min(I, max(min(S,I), L))
+
+ looking for simplifications:
+
+ case 1: assume S < I
+ min(I, max(min(S,I), L)) -> min(I, max(S, L))
+
+ but since L = log2(I)*S and log2(I) >= 1, L > S always,
+ so max(S,L) = L.
+
+ so we have: min(I, L)
+
+ case 2: assume I < S
+ min(I, max(min(S,I), L)) -> min(I, max(I, L))
+
+ case 2a: L > I
+ min(I, max(I, L)) -> min(I, L) -> I
+
+ case 2b: when L < I
+ min(I, max(I, L)) -> min(I, I) -> I
+
+ so all case 2 paths yield I, and our expression becomes:
+ n_pages = S < I ? min(I,L) : I
+ */
+ if (index->stat_index_size > 1) {
+ n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
+ ? ut_min(index->stat_index_size,
+ static_cast<ulint>(
+ log2(double(index->stat_index_size))
+ * double(srv_stats_transient_sample_pages)))
+ : index->stat_index_size;
+ }
+ }
+
+ /* Sanity check */
+ ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
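+
+ /* Worked example of the logarithmic branch: with
+ index->stat_index_size = 10000 and
+ srv_stats_transient_sample_pages = 8, L = log2(10000) * 8 =
+ 13.29 * 8 = 106 (truncated), so n_sample_pages =
+ min(10000, 106) = 106 leaf pages are sampled. */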
+
+ /* We sample some pages in the index to get an estimate */
+ btr_cur_t cursor;
+ cursor.page_cur.index = index;
+
+ for (ulint i = 0; i < n_sample_pages; i++) {
+ mtr.start();
+
+ if (cursor.open_random_leaf(offsets_rec, heap, mtr) !=
+ DB_SUCCESS
+ || index->table->bulk_trx_id != bulk_trx_id) {
+ mtr.commit();
+ goto exit_loop;
+ }
+
+ /* Count the number of different key values for each prefix of
+ the key on this index page. If the prefix does not determine
+ the index record uniquely in the B-tree, then we subtract one
+ because otherwise our algorithm would give a wrong estimate
+ for an index where there is just one key value. */
+
+ page = btr_cur_get_page(&cursor);
+
+ rec = page_rec_get_next(cursor.page_cur.rec);
+ const ulint n_core = index->n_core_fields;
+
+ if (rec && !page_rec_is_supremum(rec)) {
+ not_empty_flag = 1;
+ offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+ n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_rec, n_not_null);
+ }
+ }
+
+ while (!page_rec_is_supremum(rec)) {
+ ulint matched_fields;
+ rec_t* next_rec = page_rec_get_next(rec);
+ if (!next_rec || page_rec_is_supremum(next_rec)) {
+ total_external_size +=
+ btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ break;
+ }
+
+ offsets_next_rec = rec_get_offsets(next_rec, index,
+ offsets_next_rec,
+ n_core,
+ ULINT_UNDEFINED,
+ &heap);
+
+ cmp_rec_rec(rec, next_rec,
+ offsets_rec, offsets_next_rec,
+ index, stats_null_not_equal,
+ &matched_fields);
+
+ for (ulint j = matched_fields; j < n_cols; j++) {
+ /* We add one if this index record has
+ a different prefix from the previous */
+
+ n_diff[j]++;
+ }
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_next_rec, n_not_null);
+ }
+
+ total_external_size
+ += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+
+ rec = next_rec;
+ /* Initialize offsets_rec for the next round
+ and assign the old offsets_rec buffer to
+ offsets_next_rec. */
+ {
+ rec_offs* offsets_tmp = offsets_rec;
+ offsets_rec = offsets_next_rec;
+ offsets_next_rec = offsets_tmp;
+ }
+ }
+
+ if (n_cols == dict_index_get_n_unique_in_tree(index)
+ && page_has_siblings(page)) {
+
+ /* If there is more than one leaf page in the tree,
+ we add one because we know that the first record
+ on the page certainly had a different prefix than the
+ last record on the previous index page in the
+ alphabetical order. Before this fix, if there was
+ just one big record on each clustered index page, the
+ algorithm grossly underestimated the number of rows
+ in the table. */
+
+ n_diff[n_cols - 1]++;
+ }
+
+ mtr.commit();
+ }
+
+exit_loop:
+ /* If we saw k borders between different key values on
+ n_sample_pages leaf pages, we can estimate how many
+ there will be in index->stat_n_leaf_pages */
+
+ /* We must take into account that our sample actually represents
+ also the pages used for external storage of fields (those pages are
+ included in index->stat_n_leaf_pages) */
+
+ result.reserve(n_cols);
+
+ for (ulint j = 0; j < n_cols; j++) {
+ index_field_stats_t stat;
+
+ stat.n_diff_key_vals
+ = BTR_TABLE_STATS_FROM_SAMPLE(
+ n_diff[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+
+ /* If the tree is small, smaller than
+ 10 * n_sample_pages + total_external_size, then
+ the above estimate is ok. For bigger trees it is common that we
+ do not see any borders between key values in the few pages
+ we pick.
+ But still there may be n_sample_pages
+ different key values, or even more. Let us try to approximate
+ that:
+ */
+
+ add_on = index->stat_n_leaf_pages
+ / (10 * (n_sample_pages
+ + total_external_size));
+
+ if (add_on > n_sample_pages) {
+ add_on = n_sample_pages;
+ }
+
+ stat.n_diff_key_vals += add_on;
+
+ stat.n_sample_sizes = n_sample_pages;
+
+ if (n_not_null != NULL) {
+ stat.n_non_null_key_vals =
+ BTR_TABLE_STATS_FROM_SAMPLE(
+ n_not_null[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+ }
+
+ result.push_back(stat);
+ }
+
+ mem_heap_free(heap);
+ return result;
+}
+
+/*********************************************************************//**
+Calculates new estimates for index statistics. This function is
+relatively quick and is used to calculate transient statistics that
+are not saved on disk. This was the only way to calculate statistics
+before the Persistent Statistics feature was introduced.
+This function doesn't update the defragmentation related stats.
+Only persistent statistics support defragmentation stats.
+@return error code
+@retval DB_SUCCESS_LOCKED_REC if the table is under a bulk insert operation */
+static
+dberr_t
+dict_stats_update_transient_for_index(
+/*==================================*/
+ dict_index_t* index) /*!< in/out: index */
+{
+ dberr_t err = DB_SUCCESS;
+ if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ && (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO
+ || !dict_index_is_clust(index))) {
+ /* If we have set a high innodb_force_recovery
+ level, do not calculate statistics, as a badly
+ corrupted index can cause a crash in it.
+ Initialize some bogus index cardinality
+ statistics, so that the data can be queried in
+ various means, also via secondary indexes. */
+dummy_empty:
+ index->table->stats_mutex_lock();
+ dict_stats_empty_index(index, false);
+ index->table->stats_mutex_unlock();
+ return err;
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ } else if (ibuf_debug && !dict_index_is_clust(index)) {
+ goto dummy_empty;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+ } else if (dict_index_is_online_ddl(index) || !index->is_committed()
+ || !index->table->space) {
+ goto dummy_empty;
+ } else {
+ mtr_t mtr;
+
+ mtr.start();
+ mtr_sx_lock_index(index, &mtr);
+
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH,
+ &mtr, &err);
+ if (!root) {
+invalid:
+ mtr.commit();
+ goto dummy_empty;
+ }
+
+ const auto bulk_trx_id = index->table->bulk_trx_id;
+ if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) {
+ err= DB_SUCCESS_LOCKED_REC;
+ goto invalid;
+ }
+
+ mtr.x_lock_space(index->table->space);
+
+ ulint dummy, size;
+ index->stat_index_size
+ = fseg_n_reserved_pages(*root, PAGE_HEADER
+ + PAGE_BTR_SEG_LEAF
+ + root->page.frame, &size,
+ &mtr)
+ + fseg_n_reserved_pages(*root, PAGE_HEADER
+ + PAGE_BTR_SEG_TOP
+ + root->page.frame, &dummy,
+ &mtr);
+
+ mtr.commit();
+
+ index->stat_n_leaf_pages = size ? size : 1;
+
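+ /* stat_index_size above is the sum of the pages reserved for
+ the leaf (PAGE_BTR_SEG_LEAF) and non-leaf (PAGE_BTR_SEG_TOP)
+ file segments of the B-tree, while stat_n_leaf_pages is the
+ number of pages actually used in the leaf segment (at least 1);
+ e.g. 3 pages reserved for the non-leaf segment and 120 for the
+ leaf segment give stat_index_size = 123. */
+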
+ /* Do not continue if table decryption has failed or
+ table is already marked as corrupted. */
+ if (index->is_readable()) {
+ std::vector<index_field_stats_t> stats
+ = btr_estimate_number_of_different_key_vals(
+ index, bulk_trx_id);
+
+ if (!stats.empty()) {
+ index->table->stats_mutex_lock();
+ for (size_t i = 0; i < stats.size(); ++i) {
+ index->stat_n_diff_key_vals[i]
+ = stats[i].n_diff_key_vals;
+ index->stat_n_sample_sizes[i]
+ = stats[i].n_sample_sizes;
+ index->stat_n_non_null_key_vals[i]
+ = stats[i].n_non_null_key_vals;
+ }
+ index->table->stats_mutex_unlock();
+ }
+ }
+ }
+
+ return err;
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively quick and is used to calculate transient statistics that
+are not saved on disk.
+This was the only way to calculate statistics before the
+Persistent Statistics feature was introduced.
+@return error code
+@retval DB_SUCCESS_LOCKED_REC if the table is under a bulk insert operation */
+static
+dberr_t
+dict_stats_update_transient(
+/*========================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(!table->stats_mutex_is_owner());
+
+ dict_index_t* index;
+ ulint sum_of_index_sizes = 0;
+ dberr_t err = DB_SUCCESS;
+
+ /* Find out the sizes of the indexes and how many different values
+ for the key they approximately have */
+
+ index = dict_table_get_first_index(table);
+
+ if (!table->space) {
+ /* Nothing to do. */
+empty_table:
+ dict_stats_empty_table(table, true);
+ return err;
+ } else if (index == NULL) {
+ /* Table definition is corrupt */
+
+ ib::warn() << "Table " << table->name
+ << " has no indexes. Cannot calculate statistics.";
+ goto empty_table;
+ }
+
+ for (; index != NULL; index = dict_table_get_next_index(index)) {
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ if (!index->is_btree()) {
+ continue;
+ }
+
+ if (dict_stats_should_ignore_index(index)
+ || !index->is_readable()
+ || err == DB_SUCCESS_LOCKED_REC) {
+ index->table->stats_mutex_lock();
+ dict_stats_empty_index(index, false);
+ index->table->stats_mutex_unlock();
+ continue;
+ }
+
+ err = dict_stats_update_transient_for_index(index);
+
+ sum_of_index_sizes += index->stat_index_size;
+ }
+
+ table->stats_mutex_lock();
+
+ index = dict_table_get_first_index(table);
+
+ table->stat_n_rows = index->stat_n_diff_key_vals[
+ dict_index_get_n_unique(index) - 1];
+
+ table->stat_clustered_index_size = index->stat_index_size;
+
+ table->stat_sum_of_other_index_sizes = sum_of_index_sizes
+ - index->stat_index_size;
+
+ table->stats_last_recalc = time(NULL);
+
+ table->stat_modified_counter = 0;
+
+ table->stat_initialized = TRUE;
+
+ table->stats_mutex_unlock();
+
+ return err;
+}
+
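+/* Call flow sketch for the transient path (for orientation only):
+dict_stats_update_transient(table)
+ -> dict_stats_update_transient_for_index(index), for each B-tree index
+ -> btr_estimate_number_of_different_key_vals(index, bulk_trx_id)
+after which table->stat_n_rows is taken from the clustered index's
+stat_n_diff_key_vals[] entry for its full unique prefix. */
+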
+/** Open a cursor at the first page in a tree level.
+@param page_cur cursor
+@param level level to search for (0=leaf)
+@param mtr mini-transaction */
+static dberr_t page_cur_open_level(page_cur_t *page_cur, ulint level,
+ mtr_t *mtr)
+{
+ mem_heap_t *heap= nullptr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ dberr_t err;
+
+ dict_index_t *const index= page_cur->index;
+
+ rec_offs_init(offsets_);
+ ut_ad(level != ULINT_UNDEFINED);
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->get_savepoint() == 1);
+
+ uint32_t page= index->page;
+
+ for (ulint height = ULINT_UNDEFINED;; height--)
+ {
+ buf_block_t* block=
+ btr_block_get(*index, page, RW_S_LATCH,
+ !height && !index->is_clust(), mtr, &err);
+ if (!block)
+ break;
+
+ const uint32_t l= btr_page_get_level(block->page.frame);
+
+ if (height == ULINT_UNDEFINED)
+ {
+ ut_ad(!heap);
+ /* We are in the root node */
+ height= l;
+ if (UNIV_UNLIKELY(height < level))
+ return DB_CORRUPTION;
+ }
+ else if (UNIV_UNLIKELY(height != l) || page_has_prev(block->page.frame))
+ {
+ err= DB_CORRUPTION;
+ break;
+ }
+
+ page_cur_set_before_first(block, page_cur);
+
+ if (height == level)
+ break;
+
+ ut_ad(height);
+
+ if (!page_cur_move_to_next(page_cur))
+ {
+ err= DB_CORRUPTION;
+ break;
+ }
+
+ offsets= rec_get_offsets(page_cur->rec, index, offsets, 0, ULINT_UNDEFINED,
+ &heap);
+ page= btr_node_ptr_get_child_page_no(page_cur->rec, offsets);
+ }
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+
+ /* Release all page latches except the one on the desired page. */
+ const auto end= mtr->get_savepoint();
+ if (end > 1)
+ mtr->rollback_to_savepoint(1, end - 1);
+
+ return err;
+}
+
+/** Open a persistent cursor at the first page in a tree level.
+@param pcur persistent cursor
+@param level level to search for (0=leaf)
+@param mtr mini-transaction
+@param index index tree */
+static dberr_t btr_pcur_open_level(btr_pcur_t *pcur, ulint level, mtr_t *mtr,
+ dict_index_t *index)
+{
+ pcur->latch_mode= BTR_SEARCH_LEAF;
+ pcur->search_mode= PAGE_CUR_G;
+ pcur->pos_state= BTR_PCUR_IS_POSITIONED;
+ pcur->btr_cur.page_cur.index= index;
+ return page_cur_open_level(&pcur->btr_cur.page_cur, level, mtr);
+}
+
+
+/* @{ Pseudo code about the relation between the following functions
+
+let N = N_SAMPLE_PAGES(index)
+
+dict_stats_analyze_index()
+ for each n_prefix
+ search for good enough level:
+ dict_stats_analyze_index_level() // only called if level has <= N pages
+ // full scan of the level in one mtr
+ collect statistics about the given level
+ if we are not satisfied with the level, search next lower level
+ we have found a good enough level here
+ dict_stats_analyze_index_for_n_prefix(that level, stats collected above)
+ // full scan of the level in one mtr
+ dive below some records and analyze the leaf page there:
+ dict_stats_analyze_index_below_cur()
+@} */
+
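+/* For example, with N = 20 the descent stops at the first level that
+fits in at most 20 pages; that level is scanned in full by
+dict_stats_analyze_index_level(), and the leaf-page dives of
+dict_stats_analyze_index_for_n_prefix() are then spread evenly across
+the groups of equal keys found there. */
+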
+/*********************************************************************//**
+Find the total number and the number of distinct keys on a given level in
+an index. Each of the 1..n_uniq prefixes is looked up and the results are
+saved in the array n_diff[0] .. n_diff[n_uniq - 1]. The total number of
+records on the level is saved in total_recs.
+Also, the index of the last record in each group of equal records is saved
+in n_diff_boundaries[0..n_uniq - 1]; record indexing starts from the
+leftmost record on the level and continues across page boundaries,
+counting from 0. */
+static
+void
+dict_stats_analyze_index_level(
+/*===========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level */
+ ib_uint64_t* n_diff, /*!< out: array for number of
+ distinct keys for all prefixes */
+ ib_uint64_t* total_recs, /*!< out: total number of records */
+ ib_uint64_t* total_pages, /*!< out: total number of pages */
+ boundaries_t* n_diff_boundaries,/*!< out: boundaries of the groups
+ of distinct keys */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint n_uniq;
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ const page_t* page;
+ const rec_t* rec;
+ const rec_t* prev_rec;
+ bool prev_rec_is_copied;
+ byte* prev_rec_buf = NULL;
+ ulint prev_rec_buf_size = 0;
+ rec_offs* rec_offsets;
+ rec_offs* prev_rec_offsets;
+ ulint i;
+
+ DEBUG_PRINTF(" %s(table=%s, index=%s, level=" ULINTPF ")\n",
+ __func__, index->table->name, index->name, level);
+
+ *total_recs = 0;
+ *total_pages = 0;
+
+ n_uniq = dict_index_get_n_unique(index);
+
+ /* elements in the n_diff array are 0..n_uniq-1 (inclusive) */
+ memset(n_diff, 0x0, n_uniq * sizeof(n_diff[0]));
+
+ /* Allocate space for the offsets header (the allocation size at
+ offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_uniq + 1,
+ so that this will never be less than the size calculated in
+ rec_get_offsets_func(). */
+ i = (REC_OFFS_HEADER_SIZE + 1 + 1) + n_uniq;
+
+ heap = mem_heap_create((2 * sizeof *rec_offsets) * i);
+ rec_offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof *rec_offsets));
+ prev_rec_offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof *prev_rec_offsets));
+ rec_offs_set_n_alloc(rec_offsets, i);
+ rec_offs_set_n_alloc(prev_rec_offsets, i);
+
+ /* reset the dynamic arrays n_diff_boundaries[0..n_uniq-1] */
+ if (n_diff_boundaries != NULL) {
+ for (i = 0; i < n_uniq; i++) {
+ n_diff_boundaries[i].erase(
+ n_diff_boundaries[i].begin(),
+ n_diff_boundaries[i].end());
+ }
+ }
+
+ /* Position pcur on the leftmost record on the leftmost page
+ on the desired level. */
+
+ if (btr_pcur_open_level(&pcur, level, mtr, index) != DB_SUCCESS
+ || !btr_pcur_move_to_next_on_page(&pcur)) {
+ goto func_exit;
+ }
+
+ page = btr_pcur_get_page(&pcur);
+
+ /* The page must not be empty, except when
+ it is the root page (and the whole index is empty). */
+ ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page));
+
+ prev_rec = NULL;
+ prev_rec_is_copied = false;
+
+ if (REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ btr_pcur_get_rec(&pcur), page_is_comp(page))) {
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+ if (level == 0) {
+ /* Skip the metadata pseudo-record */
+ ut_ad(index->is_instant());
+ btr_pcur_move_to_next_user_rec(&pcur, mtr);
+ }
+ } else if (UNIV_UNLIKELY(level != 0)) {
+ /* The first record on the leftmost page must be
+ marked as such on each level except the leaf level. */
+ goto func_exit;
+ }
+
+ /* iterate over all user records on this level
+ and compare each two adjacent ones, even the last on page
+ X and the first on page X+1 */
+ for (;
+ btr_pcur_is_on_user_rec(&pcur);
+ btr_pcur_move_to_next_user_rec(&pcur, mtr)) {
+
+ bool rec_is_last_on_page;
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* If rec and prev_rec are on different pages, then prev_rec
+ must have been copied, because we hold a latch only on the page
+ where rec resides.
*/ + if (prev_rec != NULL + && page_align(rec) != page_align(prev_rec)) { + + ut_a(prev_rec_is_copied); + } + + rec_is_last_on_page = + page_rec_is_supremum(page_rec_get_next_const(rec)); + + /* increment the pages counter at the end of each page */ + if (rec_is_last_on_page) { + + (*total_pages)++; + } + + /* Skip delete-marked records on the leaf level. If we + do not skip them, then ANALYZE quickly after DELETE + could count them or not (purge may have already wiped + them away) which brings non-determinism. We skip only + leaf-level delete marks because delete marks on + non-leaf level do not make sense. */ + + if (level == 0 + && !srv_stats_include_delete_marked + && rec_get_deleted_flag(rec, page_rec_is_comp(rec))) { + if (rec_is_last_on_page + && !prev_rec_is_copied + && prev_rec != NULL) { + /* copy prev_rec */ + + prev_rec_offsets = rec_get_offsets( + prev_rec, index, prev_rec_offsets, + index->n_core_fields, + n_uniq, &heap); + + prev_rec = rec_copy_prefix_to_buf( + prev_rec, index, n_uniq, + &prev_rec_buf, &prev_rec_buf_size); + + prev_rec_is_copied = true; + } + + continue; + } + rec_offsets = rec_get_offsets(rec, index, rec_offsets, + level ? 0 : index->n_core_fields, + n_uniq, &heap); + + (*total_recs)++; + + if (prev_rec != NULL) { + ulint matched_fields; + + prev_rec_offsets = rec_get_offsets( + prev_rec, index, prev_rec_offsets, + level ? 0 : index->n_core_fields, + n_uniq, &heap); + + cmp_rec_rec(prev_rec, rec, + prev_rec_offsets, rec_offsets, index, + false, &matched_fields); + + for (i = matched_fields; i < n_uniq; i++) { + + if (n_diff_boundaries != NULL) { + /* push the index of the previous + record, that is - the last one from + a group of equal keys */ + + ib_uint64_t idx; + + /* the index of the current record + is total_recs - 1, the index of the + previous record is total_recs - 2; + we know that idx is not going to + become negative here because if we + are in this branch then there is a + previous record and thus + total_recs >= 2 */ + idx = *total_recs - 2; + + n_diff_boundaries[i].push_back(idx); + } + + /* increment the number of different keys + for n_prefix=i+1 (e.g. 
if i=0 then we increment
+ for n_prefix=1 which is stored in n_diff[0]) */
+ n_diff[i]++;
+ }
+ } else {
+ /* this is the first non-delete marked record */
+ for (i = 0; i < n_uniq; i++) {
+ n_diff[i] = 1;
+ }
+ }
+
+ if (rec_is_last_on_page) {
+ /* end of a page has been reached */
+
+ /* we need to copy the record instead of assigning
+ like prev_rec = rec; because when we traverse the
+ records on this level at some point we will jump from
+ one page to the next and then rec and prev_rec will
+ be on different pages and
+ btr_pcur_move_to_next_user_rec() will release the
+ latch on the page that prev_rec is on */
+ prev_rec = rec_copy_prefix_to_buf(
+ rec, index, n_uniq,
+ &prev_rec_buf, &prev_rec_buf_size);
+ prev_rec_is_copied = true;
+
+ } else {
+ /* still on the same page, the next call to
+ btr_pcur_move_to_next_user_rec() will not jump
+ on the next page, we can simply assign pointers
+ instead of copying the records like above */
+
+ prev_rec = rec;
+ prev_rec_is_copied = false;
+ }
+ }
+
+ /* if *total_pages is left untouched then the above loop was not
+ entered at all and there is one page in the whole tree which is
+ empty or the loop was entered but this is level 0, contains one page
+ and all records are delete-marked */
+ if (*total_pages == 0) {
+
+ ut_ad(level == 0);
+ ut_ad(*total_recs == 0);
+
+ *total_pages = 1;
+ }
+
+ /* if there are records on this level and boundaries
+ should be saved */
+ if (*total_recs > 0 && n_diff_boundaries != NULL) {
+
+ /* remember the index of the last record on the level as the
+ last one from the last group of equal keys; this holds for
+ all possible prefixes */
+ for (i = 0; i < n_uniq; i++) {
+ ib_uint64_t idx;
+
+ idx = *total_recs - 1;
+
+ n_diff_boundaries[i].push_back(idx);
+ }
+ }
+
+ /* now in n_diff_boundaries[i] there are exactly n_diff[i] integers,
+ for i=0..n_uniq-1 */
+
+#ifdef UNIV_STATS_DEBUG
+ for (i = 0; i < n_uniq; i++) {
+
+ DEBUG_PRINTF(" %s(): total recs: " UINT64PF
+ ", total pages: " UINT64PF
+ ", n_diff[" ULINTPF "]: " UINT64PF "\n",
+ __func__, *total_recs,
+ *total_pages,
+ i, n_diff[i]);
+
+#if 0
+ if (n_diff_boundaries != NULL) {
+ ib_uint64_t j;
+
+ DEBUG_PRINTF(" %s(): boundaries[%lu]: ",
+ __func__, i);
+
+ for (j = 0; j < n_diff[i]; j++) {
+ ib_uint64_t idx;
+
+ idx = n_diff_boundaries[i][j];
+
+ DEBUG_PRINTF(UINT64PF "=" UINT64PF ", ",
+ j, idx);
+ }
+ DEBUG_PRINTF("\n");
+ }
+#endif
+ }
+#endif /* UNIV_STATS_DEBUG */
+
+func_exit:
+ ut_free(prev_rec_buf);
+ mem_heap_free(heap);
+}
+
+
+/************************************************************//**
+Gets the pointer to the next non delete-marked record on the page.
+If all subsequent records are delete-marked, then this function
+will return the supremum record.
+@return pointer to next non delete-marked record or pointer to supremum */
+static
+const rec_t*
+page_rec_get_next_non_del_marked(
+/*=============================*/
+ const rec_t* rec) /*!< in: pointer to record */
+{
+ const page_t *const page= page_align(rec);
+
+ if (page_is_comp(page))
+ {
+ for (rec= page_rec_get_next_low(rec, TRUE);
+ rec && rec_get_deleted_flag(rec, TRUE);
+ rec= page_rec_get_next_low(rec, TRUE));
+ return rec ? rec : page + PAGE_NEW_SUPREMUM;
+ }
+ else
+ {
+ for (rec= page_rec_get_next_low(rec, FALSE);
+ rec && rec_get_deleted_flag(rec, FALSE);
+ rec= page_rec_get_next_low(rec, FALSE));
+ return rec ?
rec : page + PAGE_OLD_SUPREMUM; + } +} + +/** Scan a page, reading records from left to right and counting the number +of distinct records (looking only at the first n_prefix +columns) and the number of external pages pointed by records from this page. +If scan_method is QUIT_ON_FIRST_NON_BORING then the function +will return as soon as it finds a record that does not match its neighbor +to the right, which means that in the case of QUIT_ON_FIRST_NON_BORING the +returned n_diff can either be 0 (empty page), 1 (the whole page has all keys +equal) or 2 (the function found a non-boring record and returned). +@param[out] out_rec record, or NULL +@param[out] offsets1 rec_get_offsets() working space (must +be big enough) +@param[out] offsets2 rec_get_offsets() working space (must +be big enough) +@param[in] index index of the page +@param[in] page the page to scan +@param[in] n_prefix look at the first n_prefix columns +@param[in] n_core 0, or index->n_core_fields for leaf +@param[out] n_diff number of distinct records encountered +@param[out] n_external_pages if this is non-NULL then it will be set +to the number of externally stored pages which were encountered +@return offsets1 or offsets2 (the offsets of *out_rec), +or NULL if the page is empty and does not contain user records. */ +UNIV_INLINE +rec_offs* +dict_stats_scan_page( + const rec_t** out_rec, + rec_offs* offsets1, + rec_offs* offsets2, + const dict_index_t* index, + const page_t* page, + ulint n_prefix, + ulint n_core, + ib_uint64_t* n_diff, + ib_uint64_t* n_external_pages) +{ + rec_offs* offsets_rec = offsets1; + rec_offs* offsets_next_rec = offsets2; + const rec_t* rec; + const rec_t* next_rec; + /* A dummy heap, to be passed to rec_get_offsets(). + Because offsets1,offsets2 should be big enough, + this memory heap should never be used. */ + mem_heap_t* heap = NULL; + ut_ad(!!n_core == page_is_leaf(page)); + const rec_t* (*get_next)(const rec_t*) + = !n_core || srv_stats_include_delete_marked + ? page_rec_get_next_const + : page_rec_get_next_non_del_marked; + + const bool should_count_external_pages = n_external_pages != NULL; + + if (should_count_external_pages) { + *n_external_pages = 0; + } + + rec = get_next(page_get_infimum_rec(page)); + + if (!rec || page_rec_is_supremum(rec)) { + /* the page is empty or contains only delete-marked records */ + *n_diff = 0; + *out_rec = NULL; + return(NULL); + } + + offsets_rec = rec_get_offsets(rec, index, offsets_rec, n_core, + ULINT_UNDEFINED, &heap); + + if (should_count_external_pages) { + *n_external_pages += btr_rec_get_externally_stored_len( + rec, offsets_rec); + } + + next_rec = get_next(rec); + + *n_diff = 1; + + while (next_rec && !page_rec_is_supremum(next_rec)) { + + ulint matched_fields; + + offsets_next_rec = rec_get_offsets(next_rec, index, + offsets_next_rec, n_core, + ULINT_UNDEFINED, + &heap); + + /* check whether rec != next_rec when looking at + the first n_prefix fields */ + cmp_rec_rec(rec, next_rec, offsets_rec, offsets_next_rec, + index, false, &matched_fields); + + if (matched_fields < n_prefix) { + /* rec != next_rec, => rec is non-boring */ + + (*n_diff)++; + + if (!n_core) { + break; + } + } + + rec = next_rec; + /* Assign offsets_rec = offsets_next_rec so that + offsets_rec matches with rec which was just assigned + rec = next_rec above. 
Also need to point
+ offsets_next_rec to the place where offsets_rec was
+ pointing before because we have just 2 placeholders
+ where data is actually stored: offsets1 and offsets2
+ and we are using them in circular fashion
+ (offsets[_next]_rec are just pointers to those
+ placeholders). */
+ std::swap(offsets_rec, offsets_next_rec);
+
+ if (should_count_external_pages) {
+ *n_external_pages += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ }
+
+ next_rec = get_next(next_rec);
+ }
+
+ /* offsets1,offsets2 should have been big enough */
+ ut_a(heap == NULL);
+ *out_rec = rec;
+ return(offsets_rec);
+}
+
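+/* For example, scanning a leaf page with keys (2,2,3,3,5) and
+n_prefix=1 yields n_diff=3 (the distinct prefixes 2, 3 and 5), while
+with n_core=0 (non-leaf, QUIT_ON_FIRST_NON_BORING behaviour) the scan
+stops at the first 3 and returns n_diff=2. */
+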
+/** Dive below the current position of a cursor and calculate the number of
+distinct records on the leaf page, when looking at the first n_prefix
+columns. Also calculate the number of external pages pointed by records
+on the leaf page.
+@param[in] cur cursor
+@param[in] n_prefix look at the first n_prefix columns
+when comparing records
+@param[out] n_diff number of distinct records
+@param[out] n_external_pages number of external pages */
+static
+void
+dict_stats_analyze_index_below_cur(
+ const btr_cur_t* cur,
+ ulint n_prefix,
+ ib_uint64_t* n_diff,
+ ib_uint64_t* n_external_pages)
+{
+ dict_index_t* index;
+ buf_block_t* block;
+ const page_t* page;
+ mem_heap_t* heap;
+ const rec_t* rec;
+ rec_offs* offsets1;
+ rec_offs* offsets2;
+ rec_offs* offsets_rec;
+ ulint size;
+ mtr_t mtr;
+
+ index = btr_cur_get_index(cur);
+
+ /* Allocate offsets for the record and the node pointer, for
+ node pointer records. In a secondary index, the node pointer
+ record will consist of all index fields followed by a child
+ page number.
+ Allocate space for the offsets header (the allocation size at
+ offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1,
+ so that this will never be less than the size calculated in
+ rec_get_offsets_func(). */
+ size = (1 + REC_OFFS_HEADER_SIZE) + 1 + dict_index_get_n_fields(index);
+
+ heap = mem_heap_create(size * (sizeof *offsets1 + sizeof *offsets2));
+
+ offsets1 = static_cast<rec_offs*>(mem_heap_alloc(
+ heap, size * sizeof *offsets1));
+
+ offsets2 = static_cast<rec_offs*>(mem_heap_alloc(
+ heap, size * sizeof *offsets2));
+
+ rec_offs_set_n_alloc(offsets1, size);
+ rec_offs_set_n_alloc(offsets2, size);
+
+ rec = btr_cur_get_rec(cur);
+ page = page_align(rec);
+ ut_ad(!page_rec_is_leaf(rec));
+
+ offsets_rec = rec_get_offsets(rec, index, offsets1, 0,
+ ULINT_UNDEFINED, &heap);
+
+ page_id_t page_id(index->table->space_id,
+ btr_node_ptr_get_child_page_no(
+ rec, offsets_rec));
+ const ulint zip_size = index->table->space->zip_size();
+
+ /* assume no external pages by default - in case we quit from this
+ function without analyzing any leaf pages */
+ *n_external_pages = 0;
+
+ mtr_start(&mtr);
+
+ /* descend to the leaf level on the B-tree */
+ for (;;) {
+ dberr_t err;
+
+ block = buf_page_get_gen(page_id, zip_size,
+ RW_S_LATCH, NULL, BUF_GET,
+ &mtr, &err,
+ !index->is_clust()
+ && 1 == btr_page_get_level(page));
+ if (!block) {
+ goto func_exit;
+ }
+
+ page = block->page.frame;
+
+ if (page_is_leaf(page)) {
+ /* leaf level */
+ break;
+ }
+ /* else */
+
+ /* search for the first non-boring record on the page */
+ offsets_rec = dict_stats_scan_page(
+ &rec, offsets1, offsets2, index, page, n_prefix,
+ 0, n_diff, NULL);
+
+ /* pages on level > 0 are not allowed to be empty */
+ ut_a(offsets_rec != NULL);
+ /* if page is not empty (offsets_rec != NULL) then n_diff must
+ be > 0, otherwise there is a bug in dict_stats_scan_page() */
+ ut_a(*n_diff > 0);
+
+ if (*n_diff == 1) {
+ mtr_commit(&mtr);
+
+ /* page has all keys equal and the end of the page
+ was reached by dict_stats_scan_page(), no need to
+ descend to the leaf level */
+ mem_heap_free(heap);
+ /* can't get an estimate for n_external_pages here
+ because we do not dive to the leaf level, assume no
+ external pages (*n_external_pages was assigned to 0
+ above). */
+ return;
+ }
+ /* else */
+
+ /* when we instruct dict_stats_scan_page() to quit on the
+ first non-boring record it finds, then the returned n_diff
+ can either be 0 (empty page), 1 (page has all keys equal) or
+ 2 (non-boring record was found) */
+ ut_a(*n_diff == 2);
+
+ /* we have a non-boring record in rec, descend below it */
+
+ page_id.set_page_no(
+ btr_node_ptr_get_child_page_no(rec, offsets_rec));
+ }
+
+ /* make sure we got a leaf page as a result from the above loop */
+ ut_ad(page_is_leaf(page));
+
+ /* scan the leaf page and find the number of distinct keys,
+ when looking only at the first n_prefix columns; also estimate
+ the number of externally stored pages pointed by records on this
+ page */
+
+ offsets_rec = dict_stats_scan_page(
+ &rec, offsets1, offsets2, index, page, n_prefix,
+ index->n_core_fields, n_diff,
+ n_external_pages);
+
+#if 0
+ DEBUG_PRINTF(" %s(): n_diff below page_no=%lu: " UINT64PF "\n",
+ __func__, page_no, n_diff);
+#endif
+
+func_exit:
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+}
+
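+/* For example, while diving for n_prefix=1, a non-leaf page whose
+records all carry the same first-column value is "all boring": every
+record below it is equal on that prefix, so the dive above reports
+n_diff=1 without reading any leaf page. */
+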
This is >= 1 because we + avoid scanning the leaf level because it may contain too many + pages and doing so is useless when combined with the random dives - + if we are to scan the leaf level, this means a full scan and we can + simply do that instead of fiddling with picking random records higher + in the tree and diving below them. At the start of the analysis + we may decide to do a full scan of the leaf level, but then this + structure is not used in that code path. */ + ulint level; + + /** Number of records on the level where the descent through the btree + stopped. When we scan the btree from the root, we stop at some mid + level, choose some records from it and dive below them towards a leaf + page to analyze. */ + ib_uint64_t n_recs_on_level; + + /** Number of different key values that were found on the mid level. */ + ib_uint64_t n_diff_on_level; + + /** Number of leaf pages that are analyzed. This is also the same as + the number of records that we pick from the mid level and dive below + them. */ + ib_uint64_t n_leaf_pages_to_analyze; + + /** Cumulative sum of the number of different key values that were + found on all analyzed pages. */ + ib_uint64_t n_diff_all_analyzed_pages; + + /** Cumulative sum of the number of external pages (stored outside of + the btree but in the same file segment). */ + ib_uint64_t n_external_pages_sum; +}; + +/** Estimate the number of different key values in an index when looking at +the first n_prefix columns. For a given level in an index select +n_diff_data->n_leaf_pages_to_analyze records from that level and dive below +them to the corresponding leaf pages, then scan those leaf pages and save the +sampling results in n_diff_data->n_diff_all_analyzed_pages. +@param[in] index index +@param[in] n_prefix look at first 'n_prefix' columns when +comparing records +@param[in] boundaries a vector that contains +n_diff_data->n_diff_on_level integers each of which represents the index (on +level 'level', counting from left/smallest to right/biggest from 0) of the +last record from each group of distinct keys +@param[in,out] n_diff_data n_diff_all_analyzed_pages and +n_external_pages_sum in this structure will be set by this function. The +members level, n_diff_on_level and n_leaf_pages_to_analyze must be set by the +caller in advance - they are used by some calculations inside this function +@param[in,out] mtr mini-transaction */ +static +void +dict_stats_analyze_index_for_n_prefix( + dict_index_t* index, + ulint n_prefix, + const boundaries_t* boundaries, + n_diff_data_t* n_diff_data, + mtr_t* mtr) +{ + btr_pcur_t pcur; + const page_t* page; + ib_uint64_t rec_idx; + ib_uint64_t i; + +#if 0 + DEBUG_PRINTF(" %s(table=%s, index=%s, level=%lu, n_prefix=%lu," + " n_diff_on_level=" UINT64PF ")\n", + __func__, index->table->name, index->name, level, + n_prefix, n_diff_data->n_diff_on_level); +#endif + + ut_ad(n_diff_data->level); + + /* Position pcur on the leftmost record on the leftmost page + on the desired level. */ + + n_diff_data->n_diff_all_analyzed_pages = 0; + n_diff_data->n_external_pages_sum = 0; + + if (btr_pcur_open_level(&pcur, n_diff_data->level, mtr, index) + != DB_SUCCESS + || !btr_pcur_move_to_next_on_page(&pcur)) { + return; + } + + page = btr_pcur_get_page(&pcur); + + const rec_t* first_rec = btr_pcur_get_rec(&pcur); + + /* The page must not be empty, except when + it is the root page (and the whole index is empty).
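+ Note for the checks below: the first user record on the leftmost + page of each level above the leaf is expected to carry + REC_INFO_MIN_REC_FLAG in its info bits; if the flag is absent, pcur + is not positioned on the leftmost record of the desired level.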
*/ + if (page_has_prev(page) + || !btr_pcur_is_on_user_rec(&pcur) + || btr_page_get_level(page) != n_diff_data->level + || first_rec != page_rec_get_next_const(page_get_infimum_rec(page)) + || !(rec_get_info_bits(first_rec, page_is_comp(page)) + & REC_INFO_MIN_REC_FLAG)) { + return; + } + + const ib_uint64_t last_idx_on_level = boundaries->at( + static_cast<unsigned>(n_diff_data->n_diff_on_level - 1)); + + rec_idx = 0; + + for (i = 0; i < n_diff_data->n_leaf_pages_to_analyze; i++) { + /* there are n_diff_on_level elements + in 'boundaries' and we divide those elements + into n_leaf_pages_to_analyze segments, for example: + + let n_diff_on_level=100, n_leaf_pages_to_analyze=4, then: + segment i=0: [0, 24] + segment i=1: [25, 49] + segment i=2: [50, 74] + segment i=3: [75, 99] or + + let n_diff_on_level=1, n_leaf_pages_to_analyze=1, then: + segment i=0: [0, 0] or + + let n_diff_on_level=2, n_leaf_pages_to_analyze=2, then: + segment i=0: [0, 0] + segment i=1: [1, 1] or + + let n_diff_on_level=13, n_leaf_pages_to_analyze=7, then: + segment i=0: [0, 0] + segment i=1: [1, 2] + segment i=2: [3, 4] + segment i=3: [5, 6] + segment i=4: [7, 8] + segment i=5: [9, 10] + segment i=6: [11, 12] + + then we select a random record from each segment and dive + below it */ + const ib_uint64_t n_diff = n_diff_data->n_diff_on_level; + const ib_uint64_t n_pick + = n_diff_data->n_leaf_pages_to_analyze; + + const ib_uint64_t left = n_diff * i / n_pick; + const ib_uint64_t right = n_diff * (i + 1) / n_pick - 1; + + ut_a(left <= right); + ut_a(right <= last_idx_on_level); + + const ulint rnd = ut_rnd_interval( + static_cast<ulint>(right - left)); + + const ib_uint64_t dive_below_idx + = boundaries->at(static_cast<unsigned>(left + rnd)); + +#if 0 + DEBUG_PRINTF(" %s(): dive below record with index=" + UINT64PF "\n", __func__, dive_below_idx); +#endif + + /* seek to the record with index dive_below_idx */ + while (rec_idx < dive_below_idx + && btr_pcur_is_on_user_rec(&pcur)) { + + btr_pcur_move_to_next_user_rec(&pcur, mtr); + rec_idx++; + } + + /* if the level has finished before the record we are + searching for, this means that the B-tree has changed in + the meantime, quit our sampling and use whatever stats + we have collected so far */ + if (rec_idx < dive_below_idx) { + + ut_ad(!btr_pcur_is_on_user_rec(&pcur)); + break; + } + + /* it could be that the tree has changed in such a way that + the record under dive_below_idx is the supremum record, in + this case rec_idx == dive_below_idx and pcur is positioned + on the supremum, we do not want to dive below it */ + if (!btr_pcur_is_on_user_rec(&pcur)) { + break; + } + + ut_a(rec_idx == dive_below_idx); + + ib_uint64_t n_diff_on_leaf_page; + ib_uint64_t n_external_pages; + + dict_stats_analyze_index_below_cur(btr_pcur_get_btr_cur(&pcur), + n_prefix, + &n_diff_on_leaf_page, + &n_external_pages); + + /* We adjust n_diff_on_leaf_page here to avoid counting + one value twice - once as the last on some page and once + as the first on another page. Consider the following example: + Leaf level: + page: (2,2,2,2,3,3) + ... many pages like (3,3,3,3,3,3) ... + page: (3,3,3,3,5,5) + ... many pages like (5,5,5,5,5,5) ... + page: (5,5,5,5,8,8) + page: (8,8,8,8,9,9) + our algo would (correctly) get an estimate that there are + 2 distinct records per page (average). Having 4 pages below + non-boring records, it would (wrongly) estimate the number + of distinct records to 8.
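+ With the adjustment below each sampled page contributes one + distinct value instead of two, so the same example yields an + estimate of 4 - much closer to the actual 5 distinct values + (2, 3, 5, 8, 9).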
*/ + if (n_diff_on_leaf_page > 0) { + n_diff_on_leaf_page--; + } + + n_diff_data->n_diff_all_analyzed_pages += n_diff_on_leaf_page; + + n_diff_data->n_external_pages_sum += n_external_pages; + } +} + +/** statistics for an index */ +struct index_stats_t +{ + std::vector<index_field_stats_t> stats; + ulint index_size; + ulint n_leaf_pages; + + index_stats_t(ulint n_uniq) : index_size(1), n_leaf_pages(1) + { + stats.reserve(n_uniq); + for (ulint i= 0; i < n_uniq; ++i) + stats.push_back(index_field_stats_t{0, 1, 0}); + } + + void set_bulk_operation() + { + memset((void*) &stats[0], 0xff, stats.size() * sizeof stats[0]); + } + + bool is_bulk_operation() const + { + for (auto &s : stats) + if (!s.is_bulk_operation()) + return false; + return true; + } +}; + +/** Set dict_index_t::stat_n_diff_key_vals[] and stat_n_sample_sizes[]. +@param[in] n_diff_data input data to use to derive the results +@param[in,out] index_stats index stats to set */ +UNIV_INLINE +void +dict_stats_index_set_n_diff( + const n_diff_data_t* n_diff_data, + index_stats_t& index_stats) +{ + for (ulint n_prefix = index_stats.stats.size(); + n_prefix >= 1; + n_prefix--) { + /* n_diff_all_analyzed_pages can be 0 here if + all the leaf pages sampled contained only + delete-marked records. In this case we should assign + 0 to index->stat_n_diff_key_vals[n_prefix - 1], which + the formula below does. */ + + const n_diff_data_t* data = &n_diff_data[n_prefix - 1]; + + ut_ad(data->n_leaf_pages_to_analyze > 0); + ut_ad(data->n_recs_on_level > 0); + + ib_uint64_t n_ordinary_leaf_pages; + + if (data->level == 1) { + /* If we know the number of records on level 1, then + this number is the same as the number of pages on + level 0 (leaf). */ + n_ordinary_leaf_pages = data->n_recs_on_level; + } else { + /* If we analyzed D ordinary leaf pages and found E + external pages in total linked from those D ordinary + leaf pages, then this means that the ratio + ordinary/external is D/E. Then the ratio ordinary/total + is D / (D + E). Knowing that the total number of pages + is T (including ordinary and external) then we estimate + that the total number of ordinary leaf pages is + T * D / (D + E). */ + n_ordinary_leaf_pages + = index_stats.n_leaf_pages + * data->n_leaf_pages_to_analyze + / (data->n_leaf_pages_to_analyze + + data->n_external_pages_sum); + } + + /* See REF01 for an explanation of the algorithm */ + index_stats.stats[n_prefix - 1].n_diff_key_vals + = n_ordinary_leaf_pages + + * data->n_diff_on_level + / data->n_recs_on_level + + * data->n_diff_all_analyzed_pages + / data->n_leaf_pages_to_analyze; + + index_stats.stats[n_prefix - 1].n_sample_sizes + = data->n_leaf_pages_to_analyze; + + DEBUG_PRINTF(" %s(): n_diff=" UINT64PF + " for n_prefix=" ULINTPF + " (" ULINTPF + " * " UINT64PF " / " UINT64PF + " * " UINT64PF " / " UINT64PF ")\n", + __func__, + index_stats.stats[n_prefix - 1].n_diff_key_vals, + n_prefix, + index_stats.n_leaf_pages, + data->n_diff_on_level, + data->n_recs_on_level, + data->n_diff_all_analyzed_pages, + data->n_leaf_pages_to_analyze); + } +} + +/** Calculates new statistics for a given index and saves them to the index +members stat_n_diff_key_vals[], stat_n_sample_sizes[], stat_index_size and +stat_n_leaf_pages. This function can be slow.
+@param[in] index index to analyze +@return index stats */ +static index_stats_t dict_stats_analyze_index(dict_index_t* index) +{ + bool level_is_analyzed; + ulint n_uniq; + ulint n_prefix; + ib_uint64_t total_recs; + ib_uint64_t total_pages; + mtr_t mtr; + index_stats_t result(index->n_uniq); + DBUG_ENTER("dict_stats_analyze_index"); + + DBUG_PRINT("info", ("index: %s, online status: %d", index->name(), + dict_index_get_online_status(index))); + + ut_ad(!index->table->stats_mutex_is_owner()); + ut_ad(index->table->get_ref_count()); + + if (!index->is_btree()) { + DBUG_RETURN(result); + } + + DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name()); + + mtr.start(); + mtr_sx_lock_index(index, &mtr); + dberr_t err; + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err); + if (!root) { +empty_index: + mtr.commit(); + dict_stats_assert_initialized_index(index); + DBUG_RETURN(result); + } + + uint16_t root_level = btr_page_get_level(root->page.frame); + mtr.x_lock_space(index->table->space); + ulint dummy, size; + result.index_size + = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->page.frame, &size, &mtr) + + fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP + + root->page.frame, &dummy, &mtr); + result.n_leaf_pages = size ? size : 1; + + const auto bulk_trx_id = index->table->bulk_trx_id; + if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) { + result.set_bulk_operation(); + goto empty_index; + } + + mtr.commit(); + + mtr.start(); + mtr_sx_lock_index(index, &mtr); + + n_uniq = dict_index_get_n_unique(index); + + /* If the tree has just one level (and one page) or if the user + has requested to sample too many pages then do full scan. + + For each n-column prefix (for n=1..n_uniq) N_SAMPLE_PAGES(index) + will be sampled, so in total N_SAMPLE_PAGES(index) * n_uniq leaf + pages will be sampled. If that number is bigger than the total + number of leaf pages then do full scan of the leaf level instead + since it will be faster and will give better results. */ + + if (root_level == 0 + || N_SAMPLE_PAGES(index) * n_uniq > result.n_leaf_pages) { + + if (root_level == 0) { + DEBUG_PRINTF(" %s(): just one page," + " doing full scan\n", __func__); + } else { + DEBUG_PRINTF(" %s(): too many pages requested for" + " sampling, doing full scan\n", __func__); + } + + /* do full scan of level 0; save results directly + into the index */ + + dict_stats_analyze_index_level(index, + 0 /* leaf level */, + index->stat_n_diff_key_vals, + &total_recs, + &total_pages, + NULL /* boundaries not needed */, + &mtr); + + mtr.commit(); + + index->table->stats_mutex_lock(); + for (ulint i = 0; i < n_uniq; i++) { + result.stats[i].n_diff_key_vals = index->stat_n_diff_key_vals[i]; + result.stats[i].n_sample_sizes = total_pages; + result.stats[i].n_non_null_key_vals = index->stat_n_non_null_key_vals[i]; + } + result.n_leaf_pages = index->stat_n_leaf_pages; + index->table->stats_mutex_unlock(); + + DBUG_RETURN(result); + } + + /* For each level that is being scanned in the btree, this contains the + number of different key values for all possible n-column prefixes. */ + ib_uint64_t* n_diff_on_level = UT_NEW_ARRAY( + ib_uint64_t, n_uniq, mem_key_dict_stats_n_diff_on_level); + + /* For each level that is being scanned in the btree, this contains the + index of the last record from each group of equal records (when + comparing only the first n columns, n=1..n_uniq). 
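+ For example (hypothetical keys): if a level contains the keys + (1, 1, 2, 2, 2, 3), then for n=1 there are 3 groups and the recorded + boundaries are 1, 4 and 5 - the 0-based indexes of the last record in + each group.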
*/ + boundaries_t* n_diff_boundaries = UT_NEW_ARRAY_NOKEY(boundaries_t, + n_uniq); + + /* For each n-column prefix this array contains the input data that is + used to calculate dict_index_t::stat_n_diff_key_vals[]. */ + n_diff_data_t* n_diff_data = UT_NEW_ARRAY_NOKEY(n_diff_data_t, n_uniq); + + /* total_recs is also used to estimate the number of pages on one + level below, so at the start we have 1 page (the root) */ + total_recs = 1; + + /* Here we use the following optimization: + If we find that level L is the first one (searching from the + root) that contains at least D distinct keys when looking at + the first n_prefix columns, then: + if we look at the first n_prefix-1 columns then the first + level that contains D distinct keys will be either L or a + lower one. + So if we find that the first level containing D distinct + keys (on n_prefix columns) is L, we continue from L when + searching for D distinct keys on n_prefix-1 columns. */ + auto level = root_level; + level_is_analyzed = false; + + for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) { + + DEBUG_PRINTF(" %s(): searching level with >=%llu " + "distinct records, n_prefix=" ULINTPF "\n", + __func__, N_DIFF_REQUIRED(index), n_prefix); + + /* Commit the mtr to release the tree S lock to allow + other threads to do some work too. */ + mtr.commit(); + mtr.start(); + mtr_sx_lock_index(index, &mtr); + ut_ad(mtr.get_savepoint() == 1); + buf_block_t *root = btr_root_block_get(index, RW_S_LATCH, + &mtr, &err); + if (!root || root_level != btr_page_get_level(root->page.frame) + || index->table->bulk_trx_id != bulk_trx_id) { + /* Just quit if the tree has changed beyond + recognition here. The old stats from previous + runs will remain in the values that we have + not calculated yet. Initially when the index + object is created the stats members are given + some sensible values so leaving them untouched + here even the first time will not cause us to + read uninitialized memory later. */ + break; + } + + mtr.rollback_to_savepoint(1); + + /* check whether we should pick the current level; + we pick level 1 even if it does not have enough + distinct records because we do not want to scan the + leaf level because it may contain too many records */ + if (level_is_analyzed + && (n_diff_on_level[n_prefix - 1] >= N_DIFF_REQUIRED(index) + || level == 1)) { + + goto found_level; + } + + /* search for a level that contains enough distinct records */ + + if (level_is_analyzed && level > 1) { + + /* if this does not hold we should be on + "found_level" instead of here */ + ut_ad(n_diff_on_level[n_prefix - 1] + < N_DIFF_REQUIRED(index)); + + level--; + level_is_analyzed = false; + } + + /* descend into the tree, searching for "good enough" level */ + for (;;) { + + /* make sure we do not scan the leaf level + accidentally, it may contain too many pages */ + ut_ad(level > 0); + + /* scanning the same level twice is an optimization + bug */ + ut_ad(!level_is_analyzed); + + /* Do not scan if this would read too many pages. + Here we use the following fact: + the number of pages on level L equals the number + of records on level L+1, thus we deduce that the + following call would scan total_recs pages, because + total_recs is left from the previous iteration when + we scanned one level upper or we have not scanned any + levels yet in which case total_recs is 1. 
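+ For example (hypothetical numbers): if the previously scanned level + held 1000 records, the level below it consists of 1000 pages, so + scanning that level would read about 1000 pages.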
*/ + if (total_recs > N_SAMPLE_PAGES(index)) { + + /* if the above cond is true then we are + not at the root level since on the root + level total_recs == 1 (set before we + enter the n-prefix loop) and cannot + be > N_SAMPLE_PAGES(index) */ + ut_a(level != root_level); + + /* step one level back and be satisfied with + whatever it contains */ + level++; + level_is_analyzed = true; + + break; + } + + mtr.rollback_to_savepoint(1); + dict_stats_analyze_index_level(index, + level, + n_diff_on_level, + &total_recs, + &total_pages, + n_diff_boundaries, + &mtr); + mtr.rollback_to_savepoint(1); + level_is_analyzed = true; + + if (level == 1 + || n_diff_on_level[n_prefix - 1] + >= N_DIFF_REQUIRED(index)) { + /* we have reached the last level we could scan + or we found a good level with many distinct + records */ + break; + } + + level--; + level_is_analyzed = false; + } +found_level: + + DEBUG_PRINTF(" %s(): found level " ULINTPF + " that has " UINT64PF + " distinct records for n_prefix=" ULINTPF "\n", + __func__, level, n_diff_on_level[n_prefix - 1], + n_prefix); + /* here we are either on level 1 or the level that we are on + contains >= N_DIFF_REQUIRED distinct keys or we did not scan + deeper levels because they would contain too many pages */ + + ut_ad(level > 0); + + ut_ad(level_is_analyzed); + + /* if any of these is 0 then there is exactly one page in the + B-tree and it is empty and we should have done full scan and + should not be here */ + ut_ad(total_recs > 0); + ut_ad(n_diff_on_level[n_prefix - 1] > 0); + + ut_ad(N_SAMPLE_PAGES(index) > 0); + + n_diff_data_t* data = &n_diff_data[n_prefix - 1]; + + data->level = level; + + data->n_recs_on_level = total_recs; + + data->n_diff_on_level = n_diff_on_level[n_prefix - 1]; + + data->n_leaf_pages_to_analyze = std::min( + N_SAMPLE_PAGES(index), + n_diff_on_level[n_prefix - 1]); + + /* pick some records from this level and dive below them for + the given n_prefix */ + + dict_stats_analyze_index_for_n_prefix( + index, n_prefix, &n_diff_boundaries[n_prefix - 1], + data, &mtr); + } + + mtr.commit(); + + UT_DELETE_ARRAY(n_diff_boundaries); + + UT_DELETE_ARRAY(n_diff_on_level); + + /* n_prefix == 0 means that the above loop did not end up prematurely + due to tree being changed and so n_diff_data[] is set up. */ + if (n_prefix == 0) { + dict_stats_index_set_n_diff(n_diff_data, result); + } + + UT_DELETE_ARRAY(n_diff_data); + + DBUG_RETURN(result); +} + +/*********************************************************************//** +Calculates new estimates for table and index statistics. This function +is relatively slow and is used to calculate persistent statistics that +will be saved on disk. 
+@return DB_SUCCESS or error code +@retval DB_SUCCESS_LOCKED_REC if the table is under a bulk insert operation */ +static +dberr_t +dict_stats_update_persistent( +/*=========================*/ + dict_table_t* table) /*!< in/out: table */ +{ + dict_index_t* index; + + DEBUG_PRINTF("%s(table=%s)\n", __func__, table->name); + + DEBUG_SYNC_C("dict_stats_update_persistent"); + + /* analyze the clustered index first */ + + index = dict_table_get_first_index(table); + + if (index == NULL + || index->is_corrupted() + || (index->type | DICT_UNIQUE) != (DICT_CLUSTERED | DICT_UNIQUE)) { + + /* Table definition is corrupt */ + dict_stats_empty_table(table, true); + + return(DB_CORRUPTION); + } + + ut_ad(!dict_index_is_ibuf(index)); + table->stats_mutex_lock(); + dict_stats_empty_index(index, false); + table->stats_mutex_unlock(); + + index_stats_t stats = dict_stats_analyze_index(index); + + if (stats.is_bulk_operation()) { + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } + + table->stats_mutex_lock(); + index->stat_index_size = stats.index_size; + index->stat_n_leaf_pages = stats.n_leaf_pages; + for (size_t i = 0; i < stats.stats.size(); ++i) { + index->stat_n_diff_key_vals[i] = stats.stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i] = stats.stats[i].n_sample_sizes; + index->stat_n_non_null_key_vals[i] = stats.stats[i].n_non_null_key_vals; + } + + ulint n_unique = dict_index_get_n_unique(index); + + table->stat_n_rows = index->stat_n_diff_key_vals[n_unique - 1]; + + table->stat_clustered_index_size = index->stat_index_size; + + /* analyze other indexes from the table, if any */ + + table->stat_sum_of_other_index_sizes = 0; + + for (index = dict_table_get_next_index(index); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (!index->is_btree()) { + continue; + } + + dict_stats_empty_index(index, false); + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + table->stats_mutex_unlock(); + stats = dict_stats_analyze_index(index); + table->stats_mutex_lock(); + + if (stats.is_bulk_operation()) { + table->stats_mutex_unlock(); + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } + + index->stat_index_size = stats.index_size; + index->stat_n_leaf_pages = stats.n_leaf_pages; + + for (size_t i = 0; i < stats.stats.size(); ++i) { + index->stat_n_diff_key_vals[i] + = stats.stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i] + = stats.stats[i].n_sample_sizes; + index->stat_n_non_null_key_vals[i] + = stats.stats[i].n_non_null_key_vals; + } + + table->stat_sum_of_other_index_sizes + += index->stat_index_size; + } + + table->stats_last_recalc = time(NULL); + + table->stat_modified_counter = 0; + + table->stat_initialized = TRUE; + + dict_stats_assert_initialized(table); + + table->stats_mutex_unlock(); + + return(DB_SUCCESS); +} + +#include "mysql_com.h" +/** Save an individual index's statistic into the persistent statistics +storage.
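+The stat_name is one of the per-index statistic names that the callers +below persist, e.g. "size", "n_leaf_pages" or "n_diff_pfx01".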
+@param[in] index index to be updated +@param[in] last_update timestamp of the stat +@param[in] stat_name name of the stat +@param[in] stat_value value of the stat +@param[in] sample_size n pages sampled or NULL +@param[in] stat_description description of the stat +@param[in,out] trx transaction +@return DB_SUCCESS or error code */ +dberr_t +dict_stats_save_index_stat( + dict_index_t* index, + time_t last_update, + const char* stat_name, + ib_uint64_t stat_value, + ib_uint64_t* sample_size, + const char* stat_description, + trx_t* trx) +{ + dberr_t ret; + pars_info_t* pinfo; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + ut_ad(dict_sys.locked()); + + dict_fs2utf8(index->table->name.m_name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + pinfo = pars_info_create(); + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + pars_info_add_str_literal(pinfo, "index_name", index->name); + MEM_CHECK_DEFINED(&last_update, 4); + pars_info_add_int4_literal(pinfo, "last_update", uint32(last_update)); + MEM_CHECK_DEFINED(stat_name, strlen(stat_name)); + pars_info_add_str_literal(pinfo, "stat_name", stat_name); + MEM_CHECK_DEFINED(&stat_value, 8); + pars_info_add_ull_literal(pinfo, "stat_value", stat_value); + if (sample_size != NULL) { + MEM_CHECK_DEFINED(sample_size, 8); + pars_info_add_ull_literal(pinfo, "sample_size", *sample_size); + } else { + pars_info_add_literal(pinfo, "sample_size", NULL, + UNIV_SQL_NULL, DATA_FIXBINARY, 0); + } + pars_info_add_str_literal(pinfo, "stat_description", + stat_description); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE INDEX_STATS_SAVE () IS\n" + "BEGIN\n" + + "DELETE FROM \"" INDEX_STATS_NAME "\"\n" + "WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name AND\n" + "index_name = :index_name AND\n" + "stat_name = :stat_name;\n" + + "INSERT INTO \"" INDEX_STATS_NAME "\"\n" + "VALUES\n" + "(\n" + ":database_name,\n" + ":table_name,\n" + ":index_name,\n" + ":last_update,\n" + ":stat_name,\n" + ":stat_value,\n" + ":sample_size,\n" + ":stat_description\n" + ");\n" + "END;", trx); + + if (UNIV_UNLIKELY(ret != DB_SUCCESS)) { + if (innodb_index_stats_not_found == false && + index->stats_error_printed == false) { + ib::error() << "Cannot save index statistics for table " + << index->table->name + << ", index " << index->name + << ", stat name \"" << stat_name << "\": " + << ret; + index->stats_error_printed = true; + } + } + + return(ret); +} + +/** Report an error if updating table statistics failed because +.ibd file is missing, table decryption failed or table is corrupted. +@param[in,out] table Table +@param[in] defragment true if statistics is for defragment +@retval DB_DECRYPTION_FAILED if decryption of the table failed +@retval DB_TABLESPACE_DELETED if .ibd file is missing +@retval DB_CORRUPTION if table is marked as corrupted */ +dberr_t +dict_stats_report_error(dict_table_t* table, bool defragment) +{ + dberr_t err; + + const char* df = defragment ? " defragment" : ""; + + if (!table->space) { + ib::warn() << "Cannot save" << df << " statistics for table " + << table->name + << " because the .ibd file is missing. " + << TROUBLESHOOTING_MSG; + err = DB_TABLESPACE_DELETED; + } else { + ib::warn() << "Cannot save" << df << " statistics for table " + << table->name + << " because file " + << table->space->chain.start->name + << (table->corrupted + ? " is corrupted." + : " cannot be decrypted."); + err = table->corrupted ? 
DB_CORRUPTION : DB_DECRYPTION_FAILED; + } + + dict_stats_empty_table(table, defragment); + return err; +} + +/** Save the table's statistics into the persistent statistics storage. +@param[in] table_orig table whose stats to save +@param[in] only_for_index if this is non-NULL, then stats for indexes +that are not equal to it will not be saved, if NULL, then all indexes' stats +are saved +@return DB_SUCCESS or error code */ +static +dberr_t +dict_stats_save( + dict_table_t* table_orig, + const index_id_t* only_for_index) +{ + pars_info_t* pinfo; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("dict_stats_save_exit_notify", + SCOPE_EXIT([] { + debug_sync_set_action(current_thd, + STRING_WITH_LEN("now SIGNAL dict_stats_save_finished")); + }); + ); +#endif /* ENABLED_DEBUG_SYNC */ + + if (high_level_read_only) { + return DB_READ_ONLY; + } + + if (!table_orig->is_readable()) { + return (dict_stats_report_error(table_orig)); + } + + THD* thd = current_thd; + MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + dict_table_t* table_stats = dict_table_open_on_name( + TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (table_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats = dict_acquire_mdl_shared(table_stats, thd, + &mdl_table); + dict_sys.unfreeze(); + } + if (!table_stats + || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) { +release_and_exit: + if (table_stats) { + dict_table_close(table_stats, false, thd, mdl_table); + } + return DB_STATS_DO_NOT_EXIST; + } + + dict_table_t* index_stats = dict_table_open_on_name( + INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (index_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats = dict_acquire_mdl_shared(index_stats, thd, + &mdl_index); + dict_sys.unfreeze(); + } + if (!index_stats) { + goto release_and_exit; + } + if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { + dict_table_close(index_stats, false, thd, mdl_index); + goto release_and_exit; + } + + dict_table_t* table = dict_stats_snapshot_create(table_orig); + + dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + const time_t now = time(NULL); + trx_t* trx = trx_create(); + trx->mysql_thd = thd; + trx_start_internal(trx); + dberr_t ret = trx->read_only + ? 
DB_READ_ONLY + : lock_table_for_trx(table_stats, trx, LOCK_X); + if (ret == DB_SUCCESS) { + ret = lock_table_for_trx(index_stats, trx, LOCK_X); + } + if (ret != DB_SUCCESS) { + if (trx->state != TRX_STATE_NOT_STARTED) { + trx->commit(); + } + goto unlocked_free_and_exit; + } + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + pars_info_add_int4_literal(pinfo, "last_update", uint32(now)); + pars_info_add_ull_literal(pinfo, "n_rows", table->stat_n_rows); + pars_info_add_ull_literal(pinfo, "clustered_index_size", + table->stat_clustered_index_size); + pars_info_add_ull_literal(pinfo, "sum_of_other_index_sizes", + table->stat_sum_of_other_index_sizes); + + dict_sys.lock(SRW_LOCK_CALL); + trx->dict_operation_lock_mode = true; + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE TABLE_STATS_SAVE () IS\n" + "BEGIN\n" + + "DELETE FROM \"" TABLE_STATS_NAME "\"\n" + "WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + + "INSERT INTO \"" TABLE_STATS_NAME "\"\n" + "VALUES\n" + "(\n" + ":database_name,\n" + ":table_name,\n" + ":last_update,\n" + ":n_rows,\n" + ":clustered_index_size,\n" + ":sum_of_other_index_sizes\n" + ");\n" + "END;", trx); + + if (UNIV_UNLIKELY(ret != DB_SUCCESS)) { + ib::error() << "Cannot save table statistics for table " + << table->name << ": " << ret; +rollback_and_exit: + trx->rollback(); +free_and_exit: + trx->dict_operation_lock_mode = false; + dict_sys.unlock(); +unlocked_free_and_exit: + trx->free(); + dict_stats_snapshot_free(table); + dict_table_close(table_stats, false, thd, mdl_table); + dict_table_close(index_stats, false, thd, mdl_index); + return ret; + } + + dict_index_t* index; + index_map_t indexes( + (ut_strcmp_functor()), + index_map_t_allocator(mem_key_dict_stats_index_map_t)); + + /* Below we do all the modifications in innodb_index_stats in a single + transaction for performance reasons. Modifying more than one row in a + single transaction may deadlock with other transactions if they + lock the rows in a different order. Such another transaction could be, + for example, the + DELETE FROM innodb_index_stats WHERE database_name = '...' + AND table_name = '...'; that is executed when we DROP a table; it + affects more than one row. To + prevent deadlocks we always lock the rows in the same order - the + order of the PK, which is (database_name, table_name, index_name, + stat_name). This is why below we sort the indexes by name and then, + for each index, do the modifications ordered by stat_name.
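+ For example (index names hypothetical): with indexes PRIMARY and k1 + the rows are modified in the order (PRIMARY, n_diff_pfx01), ..., + (PRIMARY, n_leaf_pages), (PRIMARY, size), (k1, n_diff_pfx01), ... - + this follows the PK order because "n_diff_pfx.." < "n_leaf_pages" + < "size" in the stat_name ordering.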
*/ + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + indexes[index->name] = index; + } + + index_map_t::const_iterator it; + + for (it = indexes.begin(); it != indexes.end(); ++it) { + + index = it->second; + + if (only_for_index != NULL && index->id != *only_for_index) { + continue; + } + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + ut_ad(!dict_index_is_ibuf(index)); + + for (unsigned i = 0; i < index->n_uniq; i++) { + + char stat_name[16]; + char stat_description[1024]; + + snprintf(stat_name, sizeof(stat_name), + "n_diff_pfx%02u", i + 1); + + /* craft a string that contains the column names */ + snprintf(stat_description, sizeof(stat_description), + "%s", index->fields[0].name()); + for (unsigned j = 1; j <= i; j++) { + size_t len; + + len = strlen(stat_description); + + snprintf(stat_description + len, + sizeof(stat_description) - len, + ",%s", index->fields[j].name()); + } + + ret = dict_stats_save_index_stat( + index, now, stat_name, + index->stat_n_diff_key_vals[i], + &index->stat_n_sample_sizes[i], + stat_description, trx); + + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + } + + ret = dict_stats_save_index_stat(index, now, "n_leaf_pages", + index->stat_n_leaf_pages, + NULL, + "Number of leaf pages " + "in the index", trx); + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + + ret = dict_stats_save_index_stat(index, now, "size", + index->stat_index_size, + NULL, + "Number of pages " + "in the index", trx); + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + } + + ret= trx->bulk_insert_apply(); + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + + trx->commit(); + goto free_and_exit; +} + +/*********************************************************************//** +Called for the row that is selected by +SELECT ... FROM mysql.innodb_table_stats WHERE table='...' +The second argument is a pointer to the table and the fetched stats are +written to it. 
+@return non-NULL dummy */ +static +ibool +dict_stats_fetch_table_stats_step( +/*==============================*/ + void* node_void, /*!< in: select node */ + void* table_void) /*!< out: table */ +{ + sel_node_t* node = (sel_node_t*) node_void; + dict_table_t* table = (dict_table_t*) table_void; + que_common_t* cnode; + int i; + + /* this should loop exactly 3 times - for + n_rows,clustered_index_size,sum_of_other_index_sizes */ + for (cnode = static_cast<que_common_t*>(node->select_list), i = 0; + cnode != NULL; + cnode = static_cast<que_common_t*>(que_node_get_next(cnode)), + i++) { + + const byte* data; + dfield_t* dfield = que_node_get_val(cnode); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + data = static_cast<const byte*>(dfield_get_data(dfield)); + + switch (i) { + case 0: /* mysql.innodb_table_stats.n_rows */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_n_rows = mach_read_from_8(data); + + break; + + case 1: /* mysql.innodb_table_stats.clustered_index_size */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_clustered_index_size + = (ulint) mach_read_from_8(data); + + break; + + case 2: /* mysql.innodb_table_stats.sum_of_other_index_sizes */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_sum_of_other_index_sizes + = (ulint) mach_read_from_8(data); + + break; + + default: + + /* someone changed SELECT + n_rows,clustered_index_size,sum_of_other_index_sizes + to select more columns from innodb_table_stats without + adjusting here */ + ut_error; + } + } + + /* if i < 3 this means someone changed the + SELECT n_rows,clustered_index_size,sum_of_other_index_sizes + to select fewer columns from innodb_table_stats without adjusting here; + if i > 3 we would have ut_error'ed earlier */ + ut_a(i == 3 /*n_rows,clustered_index_size,sum_of_other_index_sizes*/); + + /* XXX this is not used but returning non-NULL is necessary */ + return(TRUE); +} + +/** Aux struct used to pass a table and a boolean to +dict_stats_fetch_index_stats_step(). */ +struct index_fetch_t { + dict_table_t* table; /*!< table whose indexes are to be modified */ + bool stats_were_modified; /*!< will be set to true if at + least one index's stats were modified */ +}; + +/*********************************************************************//** +Called for the rows that are selected by +SELECT ... FROM mysql.innodb_index_stats WHERE table='...' +The second argument is a pointer to the table and the fetched stats are +written to its indexes. +Suppose a table has N indexes and each index has Ui unique columns for +i=1..N; then mysql.innodb_index_stats will have SUM(Ui) (i=1..N) rows for +that table. +So this function will be called SUM(Ui) times where SUM(Ui) is of magnitude +N*AVG(Ui). In each call it searches for the currently fetched index in +table->indexes linearly, assuming this list is not sorted. Thus, overall, +fetching all indexes' stats from mysql.innodb_index_stats is O(N^2) where N +is the number of indexes. +This can be improved if we sort table->indexes in a temporary area just once +and then search in that sorted list. Then the complexity will be O(N*log(N)). +We assume a table will not have more than 100 indexes, so we go with the +simpler N^2 algorithm.
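+For example, under the stated assumption of at most N=100 indexes with, +say, an average of 2 unique columns each, about 200 rows are fetched and +each row triggers a scan of up to 100 index names, i.e. on the order of +20000 name comparisons in total.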
+@return non-NULL dummy */ +static +ibool +dict_stats_fetch_index_stats_step( +/*==============================*/ + void* node_void, /*!< in: select node */ + void* arg_void) /*!< out: table + a flag that tells if we + modified anything */ +{ + sel_node_t* node = (sel_node_t*) node_void; + index_fetch_t* arg = (index_fetch_t*) arg_void; + dict_table_t* table = arg->table; + dict_index_t* index = NULL; + que_common_t* cnode; + const char* stat_name = NULL; + ulint stat_name_len = ULINT_UNDEFINED; + ib_uint64_t stat_value = UINT64_UNDEFINED; + ib_uint64_t sample_size = UINT64_UNDEFINED; + int i; + + /* this should loop exactly 4 times - for the columns that + were selected: index_name,stat_name,stat_value,sample_size */ + for (cnode = static_cast<que_common_t*>(node->select_list), i = 0; + cnode != NULL; + cnode = static_cast<que_common_t*>(que_node_get_next(cnode)), + i++) { + + const byte* data; + dfield_t* dfield = que_node_get_val(cnode); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + data = static_cast<const byte*>(dfield_get_data(dfield)); + + switch (i) { + case 0: /* mysql.innodb_index_stats.index_name */ + + ut_a(dtype_get_mtype(type) == DATA_VARMYSQL); + + /* search for index in table's indexes whose name + matches data; the fetched index name is in data, + has no terminating '\0' and has length len */ + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (index->is_committed() + && strlen(index->name) == len + && memcmp(index->name, data, len) == 0) { + /* the corresponding index was found */ + break; + } + } + + /* if index is NULL here this means that + mysql.innodb_index_stats contains more rows than the + number of indexes in the table; this is ok, we just + return ignoring those extra rows; in other words + dict_stats_fetch_index_stats_step() has been called + for a row from index_stats with unknown index_name + column */ + if (index == NULL) { + + return(TRUE); + } + + break; + + case 1: /* mysql.innodb_index_stats.stat_name */ + + ut_a(dtype_get_mtype(type) == DATA_VARMYSQL); + + ut_a(index != NULL); + + stat_name = (const char*) data; + stat_name_len = len; + + break; + + case 2: /* mysql.innodb_index_stats.stat_value */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + + stat_value = mach_read_from_8(data); + + break; + + case 3: /* mysql.innodb_index_stats.sample_size */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8 || len == UNIV_SQL_NULL); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + ut_a(stat_value != UINT64_UNDEFINED); + + if (len == UNIV_SQL_NULL) { + break; + } + /* else */ + + sample_size = mach_read_from_8(data); + + break; + + default: + + /* someone changed + SELECT index_name,stat_name,stat_value,sample_size + to select more columns from innodb_index_stats without + adjusting here */ + ut_error; + } + } + + /* if i < 4 this means someone changed the + SELECT index_name,stat_name,stat_value,sample_size + to select fewer columns from innodb_index_stats without adjusting here; + if i > 4 we would have ut_error'ed earlier */ + ut_a(i == 4 /* index_name,stat_name,stat_value,sample_size */); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + ut_a(stat_value != UINT64_UNDEFINED); + /* sample_size could be UINT64_UNDEFINED here, if it is NULL */ + +#define PFX "n_diff_pfx" +#define PFX_LEN 10
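+ +/* A stat_name of the form "n_diff_pfxNN" (e.g. "n_diff_pfx03") refers +to the first NN columns of the index; the parsing further below relies on +exactly two decimal digits following the PFX prefix. */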
+ + if (stat_name_len == 4 /* strlen("size") */ + && strncasecmp("size", stat_name, stat_name_len) == 0) { + index->stat_index_size = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 12 /* strlen("n_leaf_pages") */ + && strncasecmp("n_leaf_pages", stat_name, stat_name_len) + == 0) { + index->stat_n_leaf_pages = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 12 /* strlen("n_page_split") */ + && strncasecmp("n_page_split", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_page_split = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 13 /* strlen("n_pages_freed") */ + && strncasecmp("n_pages_freed", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_pages_freed = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */ + && strncasecmp(PFX, stat_name, PFX_LEN) == 0) { + + const char* num_ptr; + unsigned long n_pfx; + + /* point num_ptr into "1" from "n_diff_pfx12..." */ + num_ptr = stat_name + PFX_LEN; + + /* stat_name should have exactly 2 chars appended to PFX + and they should be digits */ + if (stat_name_len != PFX_LEN + 2 + || num_ptr[0] < '0' || num_ptr[0] > '9' + || num_ptr[1] < '0' || num_ptr[1] > '9') { + + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(table->name.m_name, + db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + ib::info out; + out << "Ignoring strange row from " + << INDEX_STATS_NAME_PRINT << " WHERE" + " database_name = '" << db_utf8 + << "' AND table_name = '" << table_utf8 + << "' AND index_name = '" << index->name() + << "' AND stat_name = '"; + out.write(stat_name, stat_name_len); + out << "'; because stat_name is malformed"; + return(TRUE); + } + /* else */ + + /* extract 12 from "n_diff_pfx12..." into n_pfx + note that stat_name does not have a terminating '\0' */ + n_pfx = ulong(num_ptr[0] - '0') * 10 + ulong(num_ptr[1] - '0'); + + ulint n_uniq = index->n_uniq; + + if (n_pfx == 0 || n_pfx > n_uniq) { + + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(table->name.m_name, + db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + ib::info out; + out << "Ignoring strange row from " + << INDEX_STATS_NAME_PRINT << " WHERE" + " database_name = '" << db_utf8 + << "' AND table_name = '" << table_utf8 + << "' AND index_name = '" << index->name() + << "' AND stat_name = '"; + out.write(stat_name, stat_name_len); + out << "'; because stat_name is out of range, the index" + " has " << n_uniq << " unique columns"; + + return(TRUE); + } + /* else */ + + index->stat_n_diff_key_vals[n_pfx - 1] = stat_value; + + if (sample_size != UINT64_UNDEFINED) { + index->stat_n_sample_sizes[n_pfx - 1] = sample_size; + } else { + /* hmm, strange... the user must have UPDATEd the + table manually and SET sample_size = NULL */ + index->stat_n_sample_sizes[n_pfx - 1] = 0; + } + + index->stat_n_non_null_key_vals[n_pfx - 1] = 0; + + arg->stats_were_modified = true; + } else { + /* silently ignore rows with unknown stat_name, the + user may have developed her own stats */ + } + + /* XXX this is not used but returning non-NULL is necessary */ + return(TRUE); +} + +/*********************************************************************//** +Read table's statistics from the persistent statistics storage. 
+@return DB_SUCCESS or error code */ +static +dberr_t +dict_stats_fetch_from_ps( +/*=====================*/ + dict_table_t* table) /*!< in/out: table */ +{ + index_fetch_t index_fetch_arg; + trx_t* trx; + pars_info_t* pinfo; + dberr_t ret; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + /* Initialize all stats to dummy values before fetching because if + the persistent storage contains incomplete stats (e.g. missing stats + for some index) then we would end up with (partially) uninitialized + stats. */ + dict_stats_empty_table(table, true); + + THD* thd = current_thd; + MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + dict_table_t* table_stats = dict_table_open_on_name( + TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (table_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats = dict_acquire_mdl_shared(table_stats, thd, + &mdl_table); + dict_sys.unfreeze(); + } + if (!table_stats + || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) { +release_and_exit: + if (table_stats) { + dict_table_close(table_stats, false, thd, mdl_table); + } + return DB_STATS_DO_NOT_EXIST; + } + + dict_table_t* index_stats = dict_table_open_on_name( + INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (index_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats = dict_acquire_mdl_shared(index_stats, thd, + &mdl_index); + dict_sys.unfreeze(); + } + if (!index_stats) { + goto release_and_exit; + } + if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { + dict_table_close(index_stats, false, thd, mdl_index); + goto release_and_exit; + } + + trx = trx_create(); + + trx_start_internal_read_only(trx); + + dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + + pars_info_bind_function(pinfo, + "fetch_table_stats_step", + dict_stats_fetch_table_stats_step, + table); + + index_fetch_arg.table = table; + index_fetch_arg.stats_were_modified = false; + pars_info_bind_function(pinfo, + "fetch_index_stats_step", + dict_stats_fetch_index_stats_step, + &index_fetch_arg); + dict_sys.lock(SRW_LOCK_CALL); /* FIXME: remove this */ + ret = que_eval_sql(pinfo, + "PROCEDURE FETCH_STATS () IS\n" + "found INT;\n" + "DECLARE FUNCTION fetch_table_stats_step;\n" + "DECLARE FUNCTION fetch_index_stats_step;\n" + "DECLARE CURSOR table_stats_cur IS\n" + " SELECT\n" + /* if you change the selected fields, be + sure to adjust + dict_stats_fetch_table_stats_step() */ + " n_rows,\n" + " clustered_index_size,\n" + " sum_of_other_index_sizes\n" + " FROM \"" TABLE_STATS_NAME "\"\n" + " WHERE\n" + " database_name = :database_name AND\n" + " table_name = :table_name;\n" + "DECLARE CURSOR index_stats_cur IS\n" + " SELECT\n" + /* if you change the selected fields, be + sure to adjust + dict_stats_fetch_index_stats_step() */ + " index_name,\n" + " stat_name,\n" + " stat_value,\n" + " sample_size\n" + " FROM \"" INDEX_STATS_NAME "\"\n" + " WHERE\n" + " database_name = :database_name AND\n" + " table_name = :table_name;\n" + + "BEGIN\n" + + "OPEN table_stats_cur;\n" + "FETCH table_stats_cur INTO\n" + " fetch_table_stats_step();\n" + "IF (SQL % NOTFOUND) THEN\n" + " CLOSE table_stats_cur;\n" + " RETURN;\n" + "END IF;\n" + "CLOSE table_stats_cur;\n" + + "OPEN index_stats_cur;\n" + "found := 1;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_stats_cur INTO\n" + " fetch_index_stats_step();\n" + " IF (SQL % 
NOTFOUND) THEN\n" + " found := 0;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_stats_cur;\n" + + "END;", trx); + /* pinfo is freed by que_eval_sql() */ + dict_sys.unlock(); + + dict_table_close(table_stats, false, thd, mdl_table); + dict_table_close(index_stats, false, thd, mdl_index); + + trx_commit_for_mysql(trx); + + trx->free(); + + if (!index_fetch_arg.stats_were_modified) { + return(DB_STATS_DO_NOT_EXIST); + } + + return(ret); +} + +/*********************************************************************//** +Clear defragmentation stats modified counter for all indices in table. */ +static +void +dict_stats_empty_defrag_modified_counter( + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + ut_a(table); + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** +Fetches or calculates new estimates for index statistics. */ +void +dict_stats_update_for_index( +/*========================*/ + dict_index_t* index) /*!< in/out: index */ +{ + DBUG_ENTER("dict_stats_update_for_index"); + + if (dict_stats_is_persistent_enabled(index->table)) { + + if (dict_stats_persistent_storage_check(false)) { + index_stats_t stats = dict_stats_analyze_index(index); + index->table->stats_mutex_lock(); + index->stat_index_size = stats.index_size; + index->stat_n_leaf_pages = stats.n_leaf_pages; + for (size_t i = 0; i < stats.stats.size(); ++i) { + index->stat_n_diff_key_vals[i] + = stats.stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i] + = stats.stats[i].n_sample_sizes; + index->stat_n_non_null_key_vals[i] + = stats.stats[i].n_non_null_key_vals; + } + index->table->stat_sum_of_other_index_sizes + += index->stat_index_size; + index->table->stats_mutex_unlock(); + + dict_stats_save(index->table, &index->id); + DBUG_VOID_RETURN; + } + /* else */ + + if (innodb_index_stats_not_found == false && + index->stats_error_printed == false) { + /* Fall back to transient stats since the persistent + storage is not present or is corrupted */ + + ib::info() << "Recalculation of persistent statistics" + " requested for table " << index->table->name + << " index " << index->name + << " but the required" + " persistent statistics storage is not present or is" + " corrupted. Using transient stats instead."; + index->stats_error_printed = true; + } + } + + dict_stats_update_transient_for_index(index); + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. +@return DB_SUCCESS or error code +@retval DB_SUCCESS_LOCKED_REC if the table is under a bulk insert operation */ +dberr_t +dict_stats_update( +/*==============*/ + dict_table_t* table, /*!< in/out: table */ + dict_stats_upd_option_t stats_upd_option) + /*!< in: whether to (re) calc + the stats or to fetch them from + the persistent statistics + storage */ +{ + ut_ad(!table->stats_mutex_is_owner()); + + if (!table->is_readable()) { + return (dict_stats_report_error(table)); + } else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { + /* If we have set a high innodb_force_recovery level, do + not calculate statistics, as a badly corrupted index can + cause a crash in it.
*/ + dict_stats_empty_table(table, false); + return(DB_SUCCESS); + } + + if (trx_id_t bulk_trx_id = table->bulk_trx_id) { + if (trx_sys.find(nullptr, bulk_trx_id, false)) { + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } + } + + switch (stats_upd_option) { + case DICT_STATS_RECALC_PERSISTENT: + + if (srv_read_only_mode) { + goto transient; + } + + /* Persistent recalculation requested, called from + 1) ANALYZE TABLE, or + 2) the auto recalculation background thread, or + 3) open table if stats do not exist on disk and auto recalc + is enabled */ + + /* InnoDB internal tables (e.g. SYS_TABLES) cannot have + persistent stats enabled */ + ut_a(strchr(table->name.m_name, '/') != NULL); + + /* check if the persistent statistics storage exists + before calling the potentially slow function + dict_stats_update_persistent(); that is a + prerequisite for dict_stats_save() succeeding */ + if (dict_stats_persistent_storage_check(false)) { + + dberr_t err; + + err = dict_stats_update_persistent(table); + + if (err != DB_SUCCESS) { + return(err); + } + + err = dict_stats_save(table, NULL); + + return(err); + } + + /* Fall back to transient stats since the persistent + storage is not present or is corrupted */ + + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false) { + ib::warn() << "Recalculation of persistent statistics" + " requested for table " + << table->name + << " but the required persistent" + " statistics storage is not present or is corrupted." + " Using transient stats instead."; + table->stats_error_printed = true; + } + + goto transient; + + case DICT_STATS_RECALC_TRANSIENT: + + goto transient; + + case DICT_STATS_EMPTY_TABLE: + + dict_stats_empty_table(table, true); + + /* If table is using persistent stats, + then save the stats on disk */ + + if (dict_stats_is_persistent_enabled(table)) { + + if (dict_stats_persistent_storage_check(false)) { + + return(dict_stats_save(table, NULL)); + } + + return(DB_STATS_DO_NOT_EXIST); + } + + return(DB_SUCCESS); + + case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY: + + /* fetch requested, either fetch from persistent statistics + storage or use the old method */ + + if (table->stat_initialized) { + return(DB_SUCCESS); + } + + /* InnoDB internal tables (e.g. SYS_TABLES) cannot have + persistent stats enabled */ + ut_a(strchr(table->name.m_name, '/') != NULL); + + if (!dict_stats_persistent_storage_check(false)) { + /* persistent statistics storage does not exist + or is corrupted, calculate the transient stats */ + + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false && + !opt_bootstrap) { + ib::error() << "Fetch of persistent statistics" + " requested for table " + << table->name + << " but the required system tables " + << TABLE_STATS_NAME_PRINT + << " and " << INDEX_STATS_NAME_PRINT + << " are not present or have unexpected" + " structure. Using transient stats instead."; + table->stats_error_printed = true; + } + + goto transient; + } + + dict_table_t* t; + + /* Create a dummy table object with the same name and + indexes, suitable for fetching the stats into it. 
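+ Fetching into a separate object means that a failed or partial fetch + cannot leave the live table object with a mixture of old and new + values; the clone is copied into the real object only in the + DB_SUCCESS case below.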
*/ + t = dict_stats_table_clone_create(table); + + dberr_t err = dict_stats_fetch_from_ps(t); + + t->stats_last_recalc = table->stats_last_recalc; + t->stat_modified_counter = 0; + dict_stats_empty_defrag_modified_counter(t); + + switch (err) { + case DB_SUCCESS: + + table->stats_mutex_lock(); + /* t is localized to this thread so no need to + take stats mutex lock (limiting it to debug only) */ + ut_d(t->stats_mutex_lock()); + + /* Pass reset_ignored_indexes=true as parameter + to dict_stats_copy. This will cause statistics + for corrupted indexes to be set to empty values */ + dict_stats_copy(table, t, true); + + dict_stats_assert_initialized(table); + + ut_d(t->stats_mutex_unlock()); + table->stats_mutex_unlock(); + + dict_stats_table_clone_free(t); + + return(DB_SUCCESS); + case DB_STATS_DO_NOT_EXIST: + + dict_stats_table_clone_free(t); + + if (srv_read_only_mode) { + goto transient; + } + + if (dict_stats_auto_recalc_is_enabled(table)) { + return(dict_stats_update( + table, + DICT_STATS_RECALC_PERSISTENT)); + } + + ib::info() << "Trying to use table " << table->name + << " which has persistent statistics enabled," + " but auto recalculation turned off and the" + " statistics do not exist in " + TABLE_STATS_NAME_PRINT + " and " INDEX_STATS_NAME_PRINT + ". Please either run \"ANALYZE TABLE " + << table->name << ";\" manually or enable the" + " auto recalculation with \"ALTER TABLE " + << table->name << " STATS_AUTO_RECALC=1;\"." + " InnoDB will now use transient statistics for " + << table->name << "."; + + goto transient; + default: + + dict_stats_table_clone_free(t); + + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false) { + ib::error() << "Error fetching persistent statistics" + " for table " + << table->name + << " from " TABLE_STATS_NAME_PRINT " and " + INDEX_STATS_NAME_PRINT ": " << err + << ".
Using transient stats method instead."; + } + + goto transient; + } + /* no "default:" in order to produce a compilation warning + about unhandled enumeration value */ + } + +transient: + return dict_stats_update_transient(table); +} + +/** Execute DELETE FROM mysql.innodb_table_stats +@param database_name database name +@param table_name table name +@param trx transaction (nullptr=start and commit a new one) +@return DB_SUCCESS or error code */ +dberr_t dict_stats_delete_from_table_stats(const char *database_name, + const char *table_name, trx_t *trx) +{ + pars_info_t* pinfo; + + ut_ad(dict_sys.locked()); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + + return dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_TABLE_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + "END;\n", trx); +} + +/** Execute DELETE FROM mysql.innodb_index_stats +@param database_name database name +@param table_name table name +@param trx transaction +@return DB_SUCCESS or error code */ +dberr_t dict_stats_delete_from_index_stats(const char *database_name, + const char *table_name, trx_t *trx) +{ + pars_info_t* pinfo; + + ut_ad(dict_sys.locked()); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + + return dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_INDEX_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + "END;\n", trx); +} + +/** Execute DELETE FROM mysql.innodb_index_stats +@param database_name database name +@param table_name table name +@param index_name name of the index +@param trx transaction +@return DB_SUCCESS or error code */ +dberr_t dict_stats_delete_from_index_stats(const char *database_name, + const char *table_name, + const char *index_name, trx_t *trx) +{ + pars_info_t* pinfo; + + ut_ad(dict_sys.locked()); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + pars_info_add_str_literal(pinfo, "index_name", index_name); + + return dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_INDEX_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name AND\n" + "index_name = :index_name;\n" + "END;\n", trx); +} + +/** Rename a table in InnoDB persistent stats storage. 
+@param old_name old table name
+@param new_name new table name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_table(const char *old_name, const char *new_name,
+                                trx_t *trx)
+{
+  /* skip the statistics tables themselves */
+  if (!strcmp(old_name, TABLE_STATS_NAME) ||
+      !strcmp(old_name, INDEX_STATS_NAME) ||
+      !strcmp(new_name, TABLE_STATS_NAME) ||
+      !strcmp(new_name, INDEX_STATS_NAME))
+    return DB_SUCCESS;
+
+  char old_db[MAX_DB_UTF8_LEN];
+  char new_db[MAX_DB_UTF8_LEN];
+  char old_table[MAX_TABLE_UTF8_LEN];
+  char new_table[MAX_TABLE_UTF8_LEN];
+
+  dict_fs2utf8(old_name, old_db, sizeof old_db, old_table, sizeof old_table);
+  dict_fs2utf8(new_name, new_db, sizeof new_db, new_table, sizeof new_table);
+
+  if (dict_table_t::is_temporary_name(old_name) ||
+      dict_table_t::is_temporary_name(new_name))
+  {
+    if (dberr_t e= dict_stats_delete_from_table_stats(old_db, old_table, trx))
+      return e;
+    return dict_stats_delete_from_index_stats(old_db, old_table, trx);
+  }
+
+  pars_info_t *pinfo= pars_info_create();
+  pars_info_add_str_literal(pinfo, "old_db", old_db);
+  pars_info_add_str_literal(pinfo, "old_table", old_table);
+  pars_info_add_str_literal(pinfo, "new_db", new_db);
+  pars_info_add_str_literal(pinfo, "new_table", new_table);
+
+  static const char sql[]=
+    "PROCEDURE RENAME_TABLE_IN_STATS() IS\n"
+    "BEGIN\n"
+    "UPDATE \"" TABLE_STATS_NAME "\" SET\n"
+    "database_name=:new_db, table_name=:new_table\n"
+    "WHERE database_name=:old_db AND table_name=:old_table;\n"
+    "UPDATE \"" INDEX_STATS_NAME "\" SET\n"
+    "database_name=:new_db, table_name=:new_table\n"
+    "WHERE database_name=:old_db AND table_name=:old_table;\n"
+    "END;\n";
+
+  return dict_stats_exec_sql(pinfo, sql, trx);
+}
+
+/** Rename an index in InnoDB persistent statistics.
+@param db database name
+@param table table name
+@param old_name old index name
+@param new_name new index name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_index(const char *db, const char *table,
+                                const char *old_name, const char *new_name,
+                                trx_t *trx)
+{
+  if (!dict_stats_persistent_storage_check(true))
+    return DB_STATS_DO_NOT_EXIST;
+  pars_info_t *pinfo= pars_info_create();
+
+  pars_info_add_str_literal(pinfo, "db", db);
+  pars_info_add_str_literal(pinfo, "table", table);
+  pars_info_add_str_literal(pinfo, "old", old_name);
+  pars_info_add_str_literal(pinfo, "new", new_name);
+
+  static const char sql[]=
+    "PROCEDURE RENAME_INDEX_IN_STATS() IS\n"
+    "BEGIN\n"
+    "UPDATE \"" INDEX_STATS_NAME "\" SET index_name=:new\n"
+    "WHERE database_name=:db AND table_name=:table AND index_name=:old;\n"
+    "END;\n";
+
+  return dict_stats_exec_sql(pinfo, sql, trx);
+}
+
+/** Delete all persistent statistics for a database.
+@param db database name +@param trx transaction +@return DB_SUCCESS or error code */ +dberr_t dict_stats_delete(const char *db, trx_t *trx) +{ + static const char sql[] = + "PROCEDURE DROP_DATABASE_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE database_name=:db;\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE database_name=:db;\n" + "END;\n"; + + pars_info_t *pinfo= pars_info_create(); + pars_info_add_str_literal(pinfo, "db", db); + return dict_stats_exec_sql(pinfo, sql, trx); +} + +/* tests @{ */ +#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS +/* save/fetch aux macros @{ */ +#define TEST_DATABASE_NAME "foobardb" +#define TEST_TABLE_NAME "test_dict_stats" + +#define TEST_N_ROWS 111 +#define TEST_CLUSTERED_INDEX_SIZE 222 +#define TEST_SUM_OF_OTHER_INDEX_SIZES 333 + +#define TEST_IDX1_NAME "tidx1" +#define TEST_IDX1_COL1_NAME "tidx1_col1" +#define TEST_IDX1_INDEX_SIZE 123 +#define TEST_IDX1_N_LEAF_PAGES 234 +#define TEST_IDX1_N_DIFF1 50 +#define TEST_IDX1_N_DIFF1_SAMPLE_SIZE 500 + +#define TEST_IDX2_NAME "tidx2" +#define TEST_IDX2_COL1_NAME "tidx2_col1" +#define TEST_IDX2_COL2_NAME "tidx2_col2" +#define TEST_IDX2_COL3_NAME "tidx2_col3" +#define TEST_IDX2_COL4_NAME "tidx2_col4" +#define TEST_IDX2_INDEX_SIZE 321 +#define TEST_IDX2_N_LEAF_PAGES 432 +#define TEST_IDX2_N_DIFF1 60 +#define TEST_IDX2_N_DIFF1_SAMPLE_SIZE 600 +#define TEST_IDX2_N_DIFF2 61 +#define TEST_IDX2_N_DIFF2_SAMPLE_SIZE 610 +#define TEST_IDX2_N_DIFF3 62 +#define TEST_IDX2_N_DIFF3_SAMPLE_SIZE 620 +#define TEST_IDX2_N_DIFF4 63 +#define TEST_IDX2_N_DIFF4_SAMPLE_SIZE 630 +/* @} */ + +/* test_dict_stats_save() @{ */ +void +test_dict_stats_save() +{ + dict_table_t table; + dict_index_t index1; + dict_field_t index1_fields[1]; + ib_uint64_t index1_stat_n_diff_key_vals[1]; + ib_uint64_t index1_stat_n_sample_sizes[1]; + dict_index_t index2; + dict_field_t index2_fields[4]; + ib_uint64_t index2_stat_n_diff_key_vals[4]; + ib_uint64_t index2_stat_n_sample_sizes[4]; + dberr_t ret; + + /* craft a dummy dict_table_t */ + table.name.m_name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME); + table.stat_n_rows = TEST_N_ROWS; + table.stat_clustered_index_size = TEST_CLUSTERED_INDEX_SIZE; + table.stat_sum_of_other_index_sizes = TEST_SUM_OF_OTHER_INDEX_SIZES; + UT_LIST_INIT(table.indexes, &dict_index_t::indexes); +#ifdef BTR_CUR_HASH_ADAPT + UT_LIST_INIT(table.freed_indexes, &dict_index_t::indexes); +#endif /* BTR_CUR_HASH_ADAPT */ + UT_LIST_ADD_LAST(table.indexes, &index1); + UT_LIST_ADD_LAST(table.indexes, &index2); + ut_d(table.magic_n = DICT_TABLE_MAGIC_N); + ut_d(index1.magic_n = DICT_INDEX_MAGIC_N); + + index1.name = TEST_IDX1_NAME; + index1.table = &table; + index1.cached = 1; + index1.n_uniq = 1; + index1.fields = index1_fields; + index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals; + index1.stat_n_sample_sizes = index1_stat_n_sample_sizes; + index1.stat_index_size = TEST_IDX1_INDEX_SIZE; + index1.stat_n_leaf_pages = TEST_IDX1_N_LEAF_PAGES; + index1_fields[0].name = TEST_IDX1_COL1_NAME; + index1_stat_n_diff_key_vals[0] = TEST_IDX1_N_DIFF1; + index1_stat_n_sample_sizes[0] = TEST_IDX1_N_DIFF1_SAMPLE_SIZE; + + ut_d(index2.magic_n = DICT_INDEX_MAGIC_N); + index2.name = TEST_IDX2_NAME; + index2.table = &table; + index2.cached = 1; + index2.n_uniq = 4; + index2.fields = index2_fields; + index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals; + index2.stat_n_sample_sizes = index2_stat_n_sample_sizes; + index2.stat_index_size = TEST_IDX2_INDEX_SIZE; + index2.stat_n_leaf_pages = TEST_IDX2_N_LEAF_PAGES; + 
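	/* Note: with n_uniq = 4, dict_stats_save() is expected to write
+	four rows n_diff_pfx01 .. n_diff_pfx04 into the index stats
+	table for this index, one per key prefix; the SELECTs printed
+	below check exactly those rows. */
+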
index2_fields[0].name = TEST_IDX2_COL1_NAME; + index2_fields[1].name = TEST_IDX2_COL2_NAME; + index2_fields[2].name = TEST_IDX2_COL3_NAME; + index2_fields[3].name = TEST_IDX2_COL4_NAME; + index2_stat_n_diff_key_vals[0] = TEST_IDX2_N_DIFF1; + index2_stat_n_diff_key_vals[1] = TEST_IDX2_N_DIFF2; + index2_stat_n_diff_key_vals[2] = TEST_IDX2_N_DIFF3; + index2_stat_n_diff_key_vals[3] = TEST_IDX2_N_DIFF4; + index2_stat_n_sample_sizes[0] = TEST_IDX2_N_DIFF1_SAMPLE_SIZE; + index2_stat_n_sample_sizes[1] = TEST_IDX2_N_DIFF2_SAMPLE_SIZE; + index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE; + index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE; + + ret = dict_stats_save(&table, NULL); + + ut_a(ret == DB_SUCCESS); + + printf("\nOK: stats saved successfully, now go ahead and read" + " what's inside %s and %s:\n\n", + TABLE_STATS_NAME_PRINT, + INDEX_STATS_NAME_PRINT); + + printf("SELECT COUNT(*) = 1 AS table_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "n_rows = %d AND\n" + "clustered_index_size = %d AND\n" + "sum_of_other_index_sizes = %d;\n" + "\n", + TABLE_STATS_NAME_PRINT, + TEST_DATABASE_NAME, + TEST_TABLE_NAME, + TEST_N_ROWS, + TEST_CLUSTERED_INDEX_SIZE, + TEST_SUM_OF_OTHER_INDEX_SIZES); + + printf("SELECT COUNT(*) = 3 AS tidx1_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "index_name = '%s' AND\n" + "(\n" + " (stat_name = 'size' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_leaf_pages' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s')\n" + ");\n" + "\n", + INDEX_STATS_NAME_PRINT, + TEST_DATABASE_NAME, + TEST_TABLE_NAME, + TEST_IDX1_NAME, + TEST_IDX1_INDEX_SIZE, + TEST_IDX1_N_LEAF_PAGES, + TEST_IDX1_N_DIFF1, + TEST_IDX1_N_DIFF1_SAMPLE_SIZE, + TEST_IDX1_COL1_NAME); + + printf("SELECT COUNT(*) = 6 AS tidx2_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "index_name = '%s' AND\n" + "(\n" + " (stat_name = 'size' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_leaf_pages' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s') OR\n" + " (stat_name = 'n_diff_pfx02' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s,%s') OR\n" + " (stat_name = 'n_diff_pfx03' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s,%s,%s') OR\n" + " (stat_name = 'n_diff_pfx04' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s,%s,%s,%s')\n" + ");\n" + "\n", + INDEX_STATS_NAME_PRINT, + TEST_DATABASE_NAME, + TEST_TABLE_NAME, + TEST_IDX2_NAME, + TEST_IDX2_INDEX_SIZE, + TEST_IDX2_N_LEAF_PAGES, + TEST_IDX2_N_DIFF1, + TEST_IDX2_N_DIFF1_SAMPLE_SIZE, TEST_IDX2_COL1_NAME, + TEST_IDX2_N_DIFF2, + TEST_IDX2_N_DIFF2_SAMPLE_SIZE, + TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, + TEST_IDX2_N_DIFF3, + TEST_IDX2_N_DIFF3_SAMPLE_SIZE, + TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME, + TEST_IDX2_N_DIFF4, + TEST_IDX2_N_DIFF4_SAMPLE_SIZE, + TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME, + TEST_IDX2_COL4_NAME); +} +/* @} */ + +/* test_dict_stats_fetch_from_ps() @{ */ +void +test_dict_stats_fetch_from_ps() +{ + dict_table_t table; + dict_index_t index1; + 
ib_uint64_t index1_stat_n_diff_key_vals[1]; + ib_uint64_t index1_stat_n_sample_sizes[1]; + dict_index_t index2; + ib_uint64_t index2_stat_n_diff_key_vals[4]; + ib_uint64_t index2_stat_n_sample_sizes[4]; + dberr_t ret; + + /* craft a dummy dict_table_t */ + table.name.m_name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME); + UT_LIST_INIT(table.indexes, &dict_index_t::indexes); +#ifdef BTR_CUR_HASH_ADAPT + UT_LIST_INIT(table.freed_indexes, &dict_index_t::indexes); +#endif /* BTR_CUR_HASH_ADAPT */ + UT_LIST_ADD_LAST(table.indexes, &index1); + UT_LIST_ADD_LAST(table.indexes, &index2); + ut_d(table.magic_n = DICT_TABLE_MAGIC_N); + + index1.name = TEST_IDX1_NAME; + ut_d(index1.magic_n = DICT_INDEX_MAGIC_N); + index1.cached = 1; + index1.n_uniq = 1; + index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals; + index1.stat_n_sample_sizes = index1_stat_n_sample_sizes; + + index2.name = TEST_IDX2_NAME; + ut_d(index2.magic_n = DICT_INDEX_MAGIC_N); + index2.cached = 1; + index2.n_uniq = 4; + index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals; + index2.stat_n_sample_sizes = index2_stat_n_sample_sizes; + + ret = dict_stats_fetch_from_ps(&table); + + ut_a(ret == DB_SUCCESS); + + ut_a(table.stat_n_rows == TEST_N_ROWS); + ut_a(table.stat_clustered_index_size == TEST_CLUSTERED_INDEX_SIZE); + ut_a(table.stat_sum_of_other_index_sizes + == TEST_SUM_OF_OTHER_INDEX_SIZES); + + ut_a(index1.stat_index_size == TEST_IDX1_INDEX_SIZE); + ut_a(index1.stat_n_leaf_pages == TEST_IDX1_N_LEAF_PAGES); + ut_a(index1_stat_n_diff_key_vals[0] == TEST_IDX1_N_DIFF1); + ut_a(index1_stat_n_sample_sizes[0] == TEST_IDX1_N_DIFF1_SAMPLE_SIZE); + + ut_a(index2.stat_index_size == TEST_IDX2_INDEX_SIZE); + ut_a(index2.stat_n_leaf_pages == TEST_IDX2_N_LEAF_PAGES); + ut_a(index2_stat_n_diff_key_vals[0] == TEST_IDX2_N_DIFF1); + ut_a(index2_stat_n_sample_sizes[0] == TEST_IDX2_N_DIFF1_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[1] == TEST_IDX2_N_DIFF2); + ut_a(index2_stat_n_sample_sizes[1] == TEST_IDX2_N_DIFF2_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[2] == TEST_IDX2_N_DIFF3); + ut_a(index2_stat_n_sample_sizes[2] == TEST_IDX2_N_DIFF3_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[3] == TEST_IDX2_N_DIFF4); + ut_a(index2_stat_n_sample_sizes[3] == TEST_IDX2_N_DIFF4_SAMPLE_SIZE); + + printf("OK: fetch successful\n"); +} +/* @} */ + +/* test_dict_stats_all() @{ */ +void +test_dict_stats_all() +{ + test_dict_table_schema_check(); + + test_dict_stats_save(); + + test_dict_stats_fetch_from_ps(); +} +/* @} */ + +#endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */ +/* @} */ diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc new file mode 100644 index 00000000..a66aac22 --- /dev/null +++ b/storage/innobase/dict/dict0stats_bg.cc @@ -0,0 +1,424 @@ +/***************************************************************************** + +Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats_bg.cc
+Code used for background table and index stats gathering.
+
+Created Apr 25, 2012 Vasil Dimov
+*******************************************************/
+
+#include "dict0dict.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "dict0defrag_bg.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+#include "mysqld.h"
+#ifdef WITH_WSREP
+# include "trx0trx.h"
+# include "mysql/service_wsrep.h"
+# include "wsrep.h"
+# include "log.h"
+#endif
+
+#include <vector>
+
+/** Minimum time interval between stats recalc for a given table */
+#define MIN_RECALC_INTERVAL 10 /* seconds */
+static void dict_stats_schedule(int ms);
+
+/** Protects recalc_pool */
+static mysql_mutex_t recalc_pool_mutex;
+
+/** for signaling recalc::state */
+static pthread_cond_t recalc_pool_cond;
+
+/** Work item of the recalc_pool; protected by recalc_pool_mutex */
+struct recalc
+{
+  /** identifies a table with persistent statistics */
+  table_id_t id;
+  /** state of the entry */
+  enum { IDLE, IN_PROGRESS, IN_PROGRESS_DELETING, DELETING} state;
+};
+
+/** The multitude of tables whose stats are to be automatically recalculated */
+typedef std::vector<recalc, ut_allocator<recalc>> recalc_pool_t;
+
+/** Pool where we store information on which tables are to be processed
+by background statistics gathering. */
+static recalc_pool_t recalc_pool;
+/** Whether the global data structures have been initialized */
+static bool stats_initialised;
+
+/*****************************************************************//**
+Free the resources occupied by the recalc pool, called once during
+thread de-initialization. */
+static void dict_stats_recalc_pool_deinit()
+{
+  ut_ad(!srv_read_only_mode);
+
+  recalc_pool.clear();
+  defrag_pool.clear();
+  /*
+    recalc_pool may still have its buffer allocated. It will free it when
+    its destructor is called.
+    The problem is, memory leak detector is run before the recalc_pool's
+    destructor is invoked, and will report recalc_pool's buffer as leaked
+    memory. To avoid that, we force recalc_pool to surrender its buffer
+    to empty_pool object, which will free it when leaving this function:
+  */
+  recalc_pool_t recalc_empty_pool;
+  defrag_pool_t defrag_empty_pool;
+  recalc_pool.swap(recalc_empty_pool);
+  defrag_pool.swap(defrag_empty_pool);
+}
+
+/*****************************************************************//**
+Add a table to the recalc pool, which is processed by the
+background stats gathering thread. Only the table id is added to the
+list, so the table can be closed after being enqueued and it will be
+opened when needed. If the table does not exist later (has been DROPped),
+then it will be removed from the pool and skipped.
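+
+An illustrative enqueue/consume sequence (a sketch using names from
+this file only):
+
+	dict_stats_recalc_pool_add(table->id);	-- store the id only
+	... the dict_table_t object may be closed or evicted here ...
+	dict_stats_process_entry_from_recalc_pool() later reopens the
+	table via dict_table_open_on_id() and recalculates its
+	persistent statistics.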
 */
+static void dict_stats_recalc_pool_add(table_id_t id)
+{
+  ut_ad(!srv_read_only_mode);
+  ut_ad(id);
+  bool schedule = false;
+  mysql_mutex_lock(&recalc_pool_mutex);
+
+  const auto begin= recalc_pool.begin(), end= recalc_pool.end();
+  if (end == std::find_if(begin, end, [&](const recalc &r){return r.id == id;}))
+  {
+    recalc_pool.emplace_back(recalc{id, recalc::IDLE});
+    schedule = true;
+  }
+
+  mysql_mutex_unlock(&recalc_pool_mutex);
+  if (schedule)
+    dict_stats_schedule_now();
+}
+
+#ifdef WITH_WSREP
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out] table persistent or temporary table
+@param[in] trx transaction */
+void dict_stats_update_if_needed(dict_table_t *table, const trx_t &trx)
+#else
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out] table persistent or temporary table */
+void dict_stats_update_if_needed_func(dict_table_t *table)
+#endif
+{
+	if (UNIV_UNLIKELY(!table->stat_initialized)) {
+		/* The table may have been evicted from dict_sys
+		and reloaded internally by InnoDB for FOREIGN KEY
+		processing, but not reloaded by the SQL layer.
+
+		We can (re)compute the transient statistics when the
+		table is actually loaded by the SQL layer.
+
+		Note: If InnoDB persistent statistics are enabled,
+		we will skip the updates. We must do this, because
+		dict_table_get_n_rows() below assumes that the
+		statistics have been initialized. The DBA may have
+		to execute ANALYZE TABLE. */
+		return;
+	}
+
+	ulonglong counter = table->stat_modified_counter++;
+	ulonglong n_rows = dict_table_get_n_rows(table);
+
+	if (dict_stats_is_persistent_enabled(table)) {
+		if (table->name.is_temporary()) {
+			return;
+		}
+		if (counter > n_rows / 10 /* 10% */
+		    && dict_stats_auto_recalc_is_enabled(table)) {
+
+#ifdef WITH_WSREP
+			/* Do not add the table to the background
+			statistics calculation if this thread is not
+			an applier but is BF (all DDL that is
+			replicated, i.e. binlogged on the master node,
+			is executed with high priority, a.k.a. BF, on
+			slave nodes). This could again lead to BF lock
+			waits on the applier node, but that is better
+			than having no persistent index/table
+			statistics on applier nodes. TODO: allow BF
+			threads to wait for these InnoDB internal
+			SQL-parser generated row locks and allow BF
+			thread lock waits to be enqueued at the head
+			of the waiting queue. */
+			if (trx.is_wsrep()
+			    && !wsrep_thd_is_applying(trx.mysql_thd)
+			    && wsrep_thd_is_BF(trx.mysql_thd, 0)) {
+				WSREP_DEBUG("Avoiding background statistics"
+					    " calculation for table %s.",
+					    table->name.m_name);
+				return;
+			}
+#endif /* WITH_WSREP */
+
+			dict_stats_recalc_pool_add(table->id);
+			table->stat_modified_counter = 0;
+		}
+		return;
+	}
+
+	/* Calculate new statistics if 1 / 16 of table has been modified
+	since the last time a statistics batch was run.
+	We calculate statistics at most every 16th round, since we may have
+	a counter table which is very small and updated very often. */
+	ulonglong threshold = 16 + n_rows / 16; /* 6.25% */
+
+	if (srv_stats_modified_counter) {
+		threshold = std::min(srv_stats_modified_counter, threshold);
+	}
+
+	if (counter > threshold) {
+		/* this will reset table->stat_modified_counter to 0 */
+		dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT);
+	}
+}
+
+/** Delete a table from the auto recalc pool, and ensure that
+no statistics are being updated on it.
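+
+State handshake, as implemented below (sketch):
+
+	IDLE                  -> entry is erased immediately
+	IN_PROGRESS           -> set to IN_PROGRESS_DELETING; unless we
+	                         hold exclusive MDL, wait on
+	                         recalc_pool_cond until the stats thread
+	                         acknowledges with DELETING
+	IN_PROGRESS_DELETING,
+	DELETING              -> some other thread finishes the deletion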
 */
+void dict_stats_recalc_pool_del(table_id_t id, bool have_mdl_exclusive)
+{
+  ut_ad(!srv_read_only_mode);
+  ut_ad(id);
+
+  mysql_mutex_lock(&recalc_pool_mutex);
+
+  auto end= recalc_pool.end();
+  auto i= std::find_if(recalc_pool.begin(), end,
+                       [&](const recalc &r){return r.id == id;});
+  if (i != end)
+  {
+    switch (i->state) {
+    case recalc::IN_PROGRESS:
+      if (!have_mdl_exclusive)
+      {
+        i->state= recalc::IN_PROGRESS_DELETING;
+        do
+        {
+          my_cond_wait(&recalc_pool_cond, &recalc_pool_mutex.m_mutex);
+          end= recalc_pool.end();
+          i= std::find_if(recalc_pool.begin(), end,
+                          [&](const recalc &r){return r.id == id;});
+          if (i == end)
+            goto done;
+        }
+        while (i->state == recalc::IN_PROGRESS_DELETING);
+      }
+      /* fall through */
+    case recalc::IDLE:
+      recalc_pool.erase(i);
+      break;
+    case recalc::IN_PROGRESS_DELETING:
+    case recalc::DELETING:
+      /* another thread will delete the entry in dict_stats_recalc_pool_del() */
+      break;
+    }
+  }
+
+done:
+  mysql_mutex_unlock(&recalc_pool_mutex);
+}
+
+/*****************************************************************//**
+Initialize global variables needed for the operation of dict_stats_thread().
+Must be called before dict_stats_thread() is started. */
+void dict_stats_init()
+{
+  ut_ad(!srv_read_only_mode);
+  mysql_mutex_init(recalc_pool_mutex_key, &recalc_pool_mutex, nullptr);
+  pthread_cond_init(&recalc_pool_cond, nullptr);
+  dict_defrag_pool_init();
+  stats_initialised= true;
+}
+
+/*****************************************************************//**
+Free resources allocated by dict_stats_init(), must be called
+after the dict_stats task has exited. */
+void dict_stats_deinit()
+{
+	if (!stats_initialised) {
+		return;
+	}
+
+	ut_ad(!srv_read_only_mode);
+	stats_initialised = false;
+
+	dict_stats_recalc_pool_deinit();
+	dict_defrag_pool_deinit();
+
+	mysql_mutex_destroy(&recalc_pool_mutex);
+	pthread_cond_destroy(&recalc_pool_cond);
+}
+
+/**
+Get the first table that has been added for auto recalc and eventually
+update its stats.
+@return whether the first entry can be processed immediately */
+static bool dict_stats_process_entry_from_recalc_pool(THD *thd)
+{
+  ut_ad(!srv_read_only_mode);
+  table_id_t table_id;
+  mysql_mutex_lock(&recalc_pool_mutex);
+next_table_id_with_mutex:
+  for (auto &r : recalc_pool)
+  {
+    if ((table_id= r.id) && r.state == recalc::IDLE)
+    {
+      r.state= recalc::IN_PROGRESS;
+      mysql_mutex_unlock(&recalc_pool_mutex);
+      goto process;
+    }
+  }
+  mysql_mutex_unlock(&recalc_pool_mutex);
+  return false;
+
+process:
+  MDL_ticket *mdl= nullptr;
+  dict_table_t *table= dict_table_open_on_id(table_id, false,
+                                             DICT_TABLE_OP_NORMAL, thd, &mdl);
+  if (!table)
+  {
+invalid_table_id:
+    mysql_mutex_lock(&recalc_pool_mutex);
+    auto i= std::find_if(recalc_pool.begin(), recalc_pool.end(),
+                         [&](const recalc &r){return r.id == table_id;});
+    if (i == recalc_pool.end());
+    else if (UNIV_LIKELY(i->state == recalc::IN_PROGRESS))
+      recalc_pool.erase(i);
+    else
+    {
+      ut_ad(i->state == recalc::IN_PROGRESS_DELETING);
+      i->state= recalc::DELETING;
+      pthread_cond_broadcast(&recalc_pool_cond);
+    }
+    goto next_table_id_with_mutex;
+  }
+
+  ut_ad(!table->is_temporary());
+
+  if (!mdl || !table->is_accessible())
+  {
+    dict_table_close(table, false, thd, mdl);
+    goto invalid_table_id;
+  }
+
+  /* time() could be expensive; the current function is called once
+  every time a table has been changed by more than 10%, and on a
+  system with lots of small tables this could become hot.
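+  For example (summarizing the logic below): with
+  MIN_RECALC_INTERVAL = 10, a continuously modified table is
+  recalculated at most once every 10 seconds; an entry that fires too
+  early is re-queued as IDLE and the timer is re-armed with a
+  MIN_RECALC_INTERVAL * 1000 ms delay.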
+  If we find out that this is a problem, then the check below could
+  eventually be replaced with something else, though a time interval is
+  the natural approach. */
+  const bool update_now=
+    difftime(time(nullptr), table->stats_last_recalc) >= MIN_RECALC_INTERVAL;
+
+  const dberr_t err= update_now
+    ? dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT)
+    : DB_SUCCESS_LOCKED_REC;
+
+  dict_table_close(table, false, thd, mdl);
+
+  mysql_mutex_lock(&recalc_pool_mutex);
+  auto i= std::find_if(recalc_pool.begin(), recalc_pool.end(),
+                       [&](const recalc &r){return r.id == table_id;});
+  if (i == recalc_pool.end())
+    goto done;
+  else if (i->state == recalc::IN_PROGRESS_DELETING)
+  {
+    i->state= recalc::DELETING;
+    pthread_cond_broadcast(&recalc_pool_cond);
+done:
+    mysql_mutex_unlock(&recalc_pool_mutex);
+  }
+  else
+  {
+    ut_ad(i->state == recalc::IN_PROGRESS);
+    recalc_pool.erase(i);
+    const bool reschedule= !update_now && recalc_pool.empty();
+    if (err == DB_SUCCESS_LOCKED_REC)
+      recalc_pool.emplace_back(recalc{table_id, recalc::IDLE});
+    mysql_mutex_unlock(&recalc_pool_mutex);
+    if (reschedule)
+      dict_stats_schedule(MIN_RECALC_INTERVAL * 1000);
+  }
+
+  return update_now;
+}
+
+static tpool::timer* dict_stats_timer;
+static std::mutex dict_stats_mutex;
+
+static void dict_stats_func(void*)
+{
+  THD *thd= innobase_create_background_thd("InnoDB statistics");
+  set_current_thd(thd);
+  while (dict_stats_process_entry_from_recalc_pool(thd)) {}
+  dict_defrag_process_entries_from_defrag_pool(thd);
+  set_current_thd(nullptr);
+  destroy_background_thd(thd);
+}
+
+
+void dict_stats_start()
+{
+  std::lock_guard<std::mutex> lk(dict_stats_mutex);
+  if (!dict_stats_timer)
+    dict_stats_timer= srv_thread_pool->create_timer(dict_stats_func);
+}
+
+
+static void dict_stats_schedule(int ms)
+{
+  std::unique_lock<std::mutex> lk(dict_stats_mutex, std::defer_lock);
+  /*
+    Use try_lock() to avoid a deadlock in dict_stats_shutdown(), which
+    uses dict_stats_mutex too. If there is a simultaneous timer
+    reschedule, the first one will win, which is fine.
+  */
+  if (!lk.try_lock())
+  {
+    return;
+  }
+  if (dict_stats_timer)
+    dict_stats_timer->set_time(ms,0);
+}
+
+void dict_stats_schedule_now()
+{
+  dict_stats_schedule(0);
+}
+
+/** Shut down the dict_stats_thread. */
+void dict_stats_shutdown()
+{
+  std::lock_guard<std::mutex> lk(dict_stats_mutex);
+  delete dict_stats_timer;
+  dict_stats_timer= 0;
+}
diff --git a/storage/innobase/dict/drop.cc b/storage/innobase/dict/drop.cc
new file mode 100644
index 00000000..dce71974
--- /dev/null
+++ b/storage/innobase/dict/drop.cc
@@ -0,0 +1,297 @@
+/*****************************************************************************
+
+Copyright (c) 2021, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+@file dict/drop.cc
+Data Dictionary Language operations that delete .ibd files */
+
+/* We implement atomic data dictionary operations as follows.
+
+1. A data dictionary transaction is started.
+2. We acquire exclusive lock on all the tables that are to be dropped
+during the execution of the transaction.
+3. We lock the data dictionary cache.
+4. All metadata tables will be updated within the single DDL transaction,
+including deleting or renaming InnoDB persistent statistics.
+4b. If any lock wait would occur while we are holding the dict_sys latches,
+we will instantly report a timeout error and roll back the transaction.
+5. The transaction metadata is marked as committed.
+6. If any files were deleted, we will durably write FILE_DELETE
+to the redo log and start deleting the files.
+6b. Purge may also perform file deletion after a commit. This is also the
+recovery mechanism if the server was killed between step 5 and 6.
+7. We unlock the data dictionary cache.
+8. The file handles of the unlinked files will be closed. This will actually
+reclaim the space in the file system (delete-on-close semantics).
+
+Notes:
+
+(a) Purge will be locked out by MDL. For internal tables related to
+FULLTEXT INDEX, purge will not acquire MDL on the user table name,
+and therefore, when we are dropping any FTS_ tables, we must suspend
+and resume purge to prevent a race condition.
+
+(b) If a transaction needs to both drop and create a table by some
+name, it must rename the table in between. This is used by
+ha_innobase::truncate() and fts_drop_common_tables().
+
+(c) No data is ever destroyed before the transaction is committed,
+so we can trivially roll back the transaction at any time.
+Lock waits during a DDL operation are no longer a fatal error
+that would cause InnoDB to hang or to intentionally crash.
+(Only ALTER TABLE...DISCARD TABLESPACE may discard data before commit.)
+
+(d) The only changes to the data dictionary cache that are performed
+before transaction commit and must be rolled back explicitly are as follows:
+(d1) fts_optimize_add_table() to undo fts_optimize_remove_table()
+*/
+
+#include "trx0purge.h"
+#include "dict0dict.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+
+#include "dict0defrag_bg.h"
+#include "btr0defragment.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+
+#include "que0que.h"
+#include "pars0pars.h"
+
+/** Try to drop the foreign key constraints for a persistent table.
+@param name name of persistent table
+@return error code */
+dberr_t trx_t::drop_table_foreign(const table_name_t &name)
+{
+  ut_ad(dict_sys.locked());
+  ut_ad(state == TRX_STATE_ACTIVE);
+  ut_ad(dict_operation);
+  ut_ad(dict_operation_lock_mode);
+
+  if (!dict_sys.sys_foreign || dict_sys.sys_foreign->corrupted)
+    return DB_SUCCESS;
+
+  if (!dict_sys.sys_foreign_cols || dict_sys.sys_foreign_cols->corrupted)
+    return DB_SUCCESS;
+
+  pars_info_t *info= pars_info_create();
+  pars_info_add_str_literal(info, "name", name.m_name);
+  return que_eval_sql(info,
+                      "PROCEDURE DROP_FOREIGN() IS\n"
+                      "fid CHAR;\n"
+
+                      "DECLARE CURSOR fk IS\n"
+                      "SELECT ID FROM SYS_FOREIGN\n"
+                      "WHERE FOR_NAME=:name\n"
+                      "AND TO_BINARY(FOR_NAME)=TO_BINARY(:name)\n"
+                      "FOR UPDATE;\n"
+
+                      "BEGIN\n"
+                      "OPEN fk;\n"
+                      "WHILE 1=1 LOOP\n"
+                      "  FETCH fk INTO fid;\n"
+                      "  IF (SQL % NOTFOUND)THEN RETURN;END IF;\n"
+                      "  DELETE FROM SYS_FOREIGN_COLS"
+                      " WHERE ID=fid;\n"
+                      "  DELETE FROM SYS_FOREIGN WHERE ID=fid;\n"
+                      "END LOOP;\n"
+                      "CLOSE fk;\n"
+                      "END;\n", this);
+}
+
+/** Try to drop the statistics for a persistent table.
+@param name name of persistent table
+@return error code */
+dberr_t trx_t::drop_table_statistics(const table_name_t &name)
+{
+  ut_ad(dict_sys.locked());
+  ut_ad(dict_operation_lock_mode);
+
+  if (strstr(name.m_name, "/" TEMP_FILE_PREFIX_INNODB) ||
+      !strcmp(name.m_name, TABLE_STATS_NAME) ||
+      !strcmp(name.m_name, INDEX_STATS_NAME))
+    return DB_SUCCESS;
+
+  char db[MAX_DB_UTF8_LEN], table[MAX_TABLE_UTF8_LEN];
+  dict_fs2utf8(name.m_name, db, sizeof db, table, sizeof table);
+
+  dberr_t err= dict_stats_delete_from_table_stats(db, table, this);
+  if (err == DB_SUCCESS || err == DB_STATS_DO_NOT_EXIST)
+  {
+    err= dict_stats_delete_from_index_stats(db, table, this);
+    if (err == DB_STATS_DO_NOT_EXIST)
+      err= DB_SUCCESS;
+  }
+  return err;
+}
+
+/** Try to drop a persistent table.
+@param table persistent table
+@return error code */
+dberr_t trx_t::drop_table(const dict_table_t &table)
+{
+  ut_ad(dict_sys.locked());
+  ut_ad(state == TRX_STATE_ACTIVE);
+  ut_ad(dict_operation);
+  ut_ad(dict_operation_lock_mode);
+  ut_ad(!table.is_temporary());
+  /* The table must be exclusively locked by this transaction. */
+  ut_ad(table.get_ref_count() <= 1);
+  ut_ad(table.n_lock_x_or_s == 1);
+  ut_ad(UT_LIST_GET_LEN(table.locks) >= 1);
+#ifdef UNIV_DEBUG
+  bool found_x= false;
+  for (lock_t *lock= UT_LIST_GET_FIRST(table.locks); lock;
+       lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+  {
+    ut_ad(lock->trx == this);
+    switch (lock->type_mode) {
+    case LOCK_TABLE | LOCK_X:
+      found_x= true;
+      break;
+    case LOCK_TABLE | LOCK_IX:
+    case LOCK_TABLE | LOCK_AUTO_INC:
+      break;
+    default:
+      ut_ad("unexpected lock type" == 0);
+    }
+  }
+  ut_ad(found_x);
+#endif
+
+  if (dict_sys.sys_virtual && !dict_sys.sys_virtual->corrupted)
+  {
+    pars_info_t *info= pars_info_create();
+    pars_info_add_ull_literal(info, "id", table.id);
+    if (dberr_t err= que_eval_sql(info,
+                                  "PROCEDURE DROP_VIRTUAL() IS\n"
+                                  "BEGIN\n"
+                                  "DELETE FROM SYS_VIRTUAL"
+                                  " WHERE TABLE_ID=:id;\n"
+                                  "END;\n", this))
+      return err;
+  }
+
+  /* Once DELETE FROM SYS_INDEXES is committed, purge may invoke
+  dict_drop_index_tree(). */
+
+  if (!(table.flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)));
+  else if (dberr_t err= fts_drop_tables(this, table))
+  {
+    ib::error() << "Unable to remove FTS tables for "
+                << table.name << ": " << err;
+    return err;
+  }
+
+  mod_tables.emplace(const_cast<dict_table_t*>(&table), undo_no).
+    first->second.set_dropped();
+
+  pars_info_t *info= pars_info_create();
+  pars_info_add_ull_literal(info, "id", table.id);
+  return que_eval_sql(info,
+                      "PROCEDURE DROP_TABLE() IS\n"
+                      "iid CHAR;\n"
+
+                      "DECLARE CURSOR idx IS\n"
+                      "SELECT ID FROM SYS_INDEXES\n"
+                      "WHERE TABLE_ID=:id FOR UPDATE;\n"
+
+                      "BEGIN\n"
+
+                      "DELETE FROM SYS_TABLES WHERE ID=:id;\n"
+                      "DELETE FROM SYS_COLUMNS WHERE TABLE_ID=:id;\n"
+
+                      "OPEN idx;\n"
+                      "WHILE 1 = 1 LOOP\n"
+                      "  FETCH idx INTO iid;\n"
+                      "  IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
+                      "  DELETE FROM SYS_INDEXES WHERE CURRENT OF idx;\n"
+                      "  DELETE FROM SYS_FIELDS WHERE INDEX_ID=iid;\n"
+                      "END LOOP;\n"
+                      "CLOSE idx;\n"
+
+                      "END;\n", this);
+}
+
+/** Commit the transaction, possibly after drop_table().
+@param deleted handles of data files that were deleted */
+void trx_t::commit(std::vector<pfs_os_file_t> &deleted)
+{
+  ut_ad(dict_operation);
+  flush_log_later= true;
+  commit_persist();
+  flush_log_later= false;
+  if (dict_operation)
+  {
+    std::vector<uint32_t> space_ids;
+    space_ids.reserve(mod_tables.size());
+    ut_ad(dict_sys.locked());
+    lock_sys.wr_lock(SRW_LOCK_CALL);
+    mutex_lock();
+    lock_release_on_drop(this);
+    ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+    ut_ad(ib_vector_is_empty(autoinc_locks));
+    mem_heap_empty(lock.lock_heap);
+    lock.table_locks.clear();
+    /* commit_persist() already reset this. */
+    ut_ad(!lock.was_chosen_as_deadlock_victim);
+    lock.n_rec_locks= 0;
+    while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
+    {
+      UT_LIST_REMOVE(lock.evicted_tables, table);
+      dict_mem_table_free(table);
+    }
+    dict_operation= false;
+    id= 0;
+    mutex_unlock();
+
+    for (const auto &p : mod_tables)
+    {
+      if (p.second.is_dropped())
+      {
+        dict_table_t *table= p.first;
+        dict_stats_recalc_pool_del(table->id, true);
+        dict_stats_defrag_pool_del(table, nullptr);
+        if (btr_defragment_active)
+          btr_defragment_remove_table(table);
+        const fil_space_t *space= table->space;
+        ut_ad(!p.second.is_aux_table() || purge_sys.must_wait_FTS());
+        dict_sys.remove(table);
+        if (const auto id= space ? space->id : 0)
+        {
+          space_ids.emplace_back(id);
+          pfs_os_file_t d= fil_delete_tablespace(id);
+          if (d != OS_FILE_CLOSED)
+            deleted.emplace_back(d);
+        }
+      }
+    }
+
+    lock_sys.wr_unlock();
+
+    mysql_mutex_lock(&lock_sys.wait_mutex);
+    lock_sys.deadlock_check();
+    mysql_mutex_unlock(&lock_sys.wait_mutex);
+
+    for (const auto id : space_ids)
+      ibuf_delete_for_discarded_space(id);
+  }
+  commit_cleanup();
+}
diff --git a/storage/innobase/eval/eval0eval.cc b/storage/innobase/eval/eval0eval.cc
new file mode 100644
index 00000000..bafb0b55
--- /dev/null
+++ b/storage/innobase/eval/eval0eval.cc
@@ -0,0 +1,643 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file eval/eval0eval.cc
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "eval0eval.h"
+#include "data0data.h"
+#include "row0sel.h"
+#include "rem0cmp.h"
+
+/** Dummy address used when we should allocate a buffer of size 0 in
+eval_node_alloc_val_buf */
+
+static byte	eval_dummy;
+
+/*************************************************************************
+Gets the like node from the node */
+UNIV_INLINE
+que_node_t*
+que_node_get_like_node(
+/*===================*/
+				/* out: like node */
+	que_node_t*	node)	/* in: node in a list */
+{
+	return(((sym_node_t*) node)->like_node);
+}
+
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has an allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+	que_node_t*	node,	/*!< in: query graph node; sets the val field
+				data field to point to the new buffer, and
+				len field equal to size */
+	ulint		size)	/*!< in: buffer size */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+	      || que_node_get_type(node) == QUE_NODE_FUNC);
+
+	dfield = que_node_get_val(node);
+
+	data = static_cast<byte*>(dfield_get_data(dfield));
+
+	if (data != &eval_dummy) {
+		ut_free(data);
+	}
+
+	if (size == 0) {
+		data = &eval_dummy;
+	} else {
+		data = static_cast<byte*>(ut_malloc_nokey(size));
+	}
+
+	que_node_set_val_buf_size(node, size);
+
+	dfield_set_data(dfield, data, size);
+
+	return(data);
+}
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+void
+eval_node_free_val_buf(
+/*===================*/
+	que_node_t*	node)	/*!< in: query graph node */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+	      || que_node_get_type(node) == QUE_NODE_FUNC);
+
+	dfield = que_node_get_val(node);
+
+	data = static_cast<byte*>(dfield_get_data(dfield));
+
+	if (que_node_get_val_buf_size(node) > 0) {
+		ut_a(data);
+
+		ut_free(data);
+	}
+}
+
+/*********************************************************************
+Evaluates a LIKE comparison node.
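+For context, a sketch of the classification that the parser performs
+(the resulting tag is stored in the like node that is read below):
+
+	col LIKE 'abc'	-> IB_LIKE_EXACT  (plain equality compare)
+	col LIKE 'abc%'	-> IB_LIKE_PREFIX (prefix compare on 'abc')
+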
+@return the result of the comparison */
+UNIV_INLINE
+ibool
+eval_cmp_like(
+/*==========*/
+	que_node_t*	arg1,	/*!< in: left operand */
+	que_node_t*	arg2)	/*!< in: right operand */
+{
+	ib_like_t	op;
+	que_node_t*	arg3;
+	que_node_t*	arg4;
+	const dfield_t*	dfield;
+
+	arg3 = que_node_get_like_node(arg2);
+
+	/* Get the comparison type operator */
+	ut_a(arg3);
+
+	dfield = que_node_get_val(arg3);
+	ut_ad(dtype_get_mtype(dfield_get_type(dfield)) == DATA_INT);
+	op = static_cast<ib_like_t>(
+		mach_read_from_4(static_cast<const byte*>(
+			dfield_get_data(dfield))));
+
+	switch (op) {
+	case IB_LIKE_PREFIX:
+		arg4 = que_node_get_next(arg3);
+		return(cmp_dfield_dfield_eq_prefix(que_node_get_val(arg1),
+						   que_node_get_val(arg4)));
+	case IB_LIKE_EXACT:
+		return(!cmp_dfield_dfield(que_node_get_val(arg1),
+					  que_node_get_val(arg2)));
+	}
+
+	ut_error;
+	return(FALSE);
+}
+
+/*********************************************************************
+Evaluates a comparison node.
+@return the result of the comparison */
+ibool
+eval_cmp(
+/*=====*/
+	func_node_t*	cmp_node)	/*!< in: comparison node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	int		res;
+	ibool		val = FALSE;	/* remove warning */
+
+	ut_ad(que_node_get_type(cmp_node) == QUE_NODE_FUNC);
+
+	arg1 = cmp_node->args;
+	arg2 = que_node_get_next(arg1);
+
+	switch (cmp_node->func) {
+	case '<':
+	case '=':
+	case '>':
+	case PARS_LE_TOKEN:
+	case PARS_NE_TOKEN:
+	case PARS_GE_TOKEN:
+		res = cmp_dfield_dfield(
+			que_node_get_val(arg1), que_node_get_val(arg2));
+
+		switch (cmp_node->func) {
+		case '<':
+			val = (res < 0);
+			break;
+		case '=':
+			val = (res == 0);
+			break;
+		case '>':
+			val = (res > 0);
+			break;
+		case PARS_LE_TOKEN:
+			val = (res <= 0);
+			break;
+		case PARS_NE_TOKEN:
+			val = (res != 0);
+			break;
+		case PARS_GE_TOKEN:
+			val = (res >= 0);
+			break;
+		}
+		break;
+	default:
+		val = eval_cmp_like(arg1, arg2);
+		break;
+	}
+
+	eval_node_set_ibool_val(cmp_node, val);
+
+	return(val);
+}
+
+/*****************************************************************//**
+Evaluates a logical operation node. */
+UNIV_INLINE
+void
+eval_logical(
+/*=========*/
+	func_node_t*	logical_node)	/*!< in: logical operation node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	ibool		val1;
+	ibool		val2 = 0; /* remove warning */
+	ibool		val = 0; /* remove warning */
+	int		func;
+
+	ut_ad(que_node_get_type(logical_node) == QUE_NODE_FUNC);
+
+	arg1 = logical_node->args;
+	arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is 'NOT' */
+
+	val1 = eval_node_get_ibool_val(arg1);
+
+	if (arg2) {
+		val2 = eval_node_get_ibool_val(arg2);
+	}
+
+	func = logical_node->func;
+
+	if (func == PARS_AND_TOKEN) {
+		val = val1 & val2;
+	} else if (func == PARS_OR_TOKEN) {
+		val = val1 | val2;
+	} else if (func == PARS_NOT_TOKEN) {
+		val = TRUE - val1;
+	} else {
+		ut_error;
+	}
+
+	eval_node_set_ibool_val(logical_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an arithmetic operation node.
 */
+UNIV_INLINE
+void
+eval_arith(
+/*=======*/
+	func_node_t*	arith_node)	/*!< in: arithmetic operation node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	lint		val1;
+	lint		val2 = 0; /* remove warning */
+	lint		val;
+	int		func;
+
+	ut_ad(que_node_get_type(arith_node) == QUE_NODE_FUNC);
+
+	arg1 = arith_node->args;
+	arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is unary '-' */
+
+	val1 = eval_node_get_int_val(arg1);
+
+	if (arg2) {
+		val2 = eval_node_get_int_val(arg2);
+	}
+
+	func = arith_node->func;
+
+	if (func == '+') {
+		val = val1 + val2;
+	} else if ((func == '-') && arg2) {
+		val = val1 - val2;
+	} else if (func == '-') {
+		val = -val1;
+	} else if (func == '*') {
+		val = val1 * val2;
+	} else {
+		ut_ad(func == '/');
+		val = val1 / val2;
+	}
+
+	eval_node_set_int_val(arith_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an aggregate operation node. */
+UNIV_INLINE
+void
+eval_aggregate(
+/*===========*/
+	func_node_t*	node)	/*!< in: aggregate operation node */
+{
+	lint		val;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+	val = eval_node_get_int_val(node);
+
+	ut_a(node->func == PARS_COUNT_TOKEN);
+	val = val + 1;
+	eval_node_set_int_val(node, val);
+}
+
+/*****************************************************************//**
+Evaluates a notfound-function node. */
+UNIV_INLINE
+void
+eval_notfound(
+/*==========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	sym_node_t*	cursor;
+	sel_node_t*	sel_node;
+	ibool		ibool_val;
+
+	ut_ad(func_node->func == PARS_NOTFOUND_TOKEN);
+
+	cursor = static_cast<sym_node_t*>(func_node->args);
+
+	ut_ad(que_node_get_type(cursor) == QUE_NODE_SYMBOL);
+
+	if (cursor->token_type == SYM_LIT) {
+		ut_ad(!memcmp(dfield_get_data(que_node_get_val(cursor)),
+			      "SQL", 3));
+		sel_node = cursor->sym_table->query_graph->last_sel_node;
+	} else {
+		sel_node = cursor->alias->cursor_def;
+	}
+
+	if (sel_node->state == SEL_NODE_NO_MORE_ROWS) {
+		ibool_val = TRUE;
+	} else {
+		ibool_val = FALSE;
+	}
+
+	eval_node_set_ibool_val(func_node, ibool_val);
+}
+
+/*****************************************************************//**
+Evaluates a substr-function node. */
+UNIV_INLINE
+void
+eval_substr(
+/*========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	que_node_t*	arg3;
+	dfield_t*	dfield;
+	byte*		str1;
+	ulint		len1;
+	ulint		len2;
+
+	arg1 = func_node->args;
+	arg2 = que_node_get_next(arg1);
+
+	ut_ad(func_node->func == PARS_SUBSTR_TOKEN);
+
+	arg3 = que_node_get_next(arg2);
+
+	str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+
+	const ulint str1_len = dfield_get_len(que_node_get_val(arg1));
+
+	len1 = (ulint) eval_node_get_int_val(arg2);
+	len2 = (ulint) eval_node_get_int_val(arg3);
+
+	dfield = que_node_get_val(func_node);
+
+	if (len1 > str1_len) {
+		len2 = 0;
+	} else {
+		str1 += len1;
+		if (len2 > str1_len - len1) {
+			len2 = str1_len - len1;
+		}
+	}
+
+	dfield_set_data(dfield, str1, len2);
+}
+
+/*****************************************************************//**
+Evaluates an instr-function node.
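+The result uses the usual 1-based INSTR semantics; worked examples
+matching the loops below:
+
+	INSTR('abcabc', 'bc') = 2	-- match starts at 0-based offset 1
+	INSTR('abc', 'd') = 0		-- no match
+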
 */
+static
+void
+eval_instr(
+/*=======*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	dfield_t*	dfield1;
+	dfield_t*	dfield2;
+	lint		int_val;
+	byte*		str1;
+	byte*		str2;
+	byte		match_char;
+	ulint		len1;
+	ulint		len2;
+	ulint		i;
+	ulint		j;
+
+	arg1 = func_node->args;
+	arg2 = que_node_get_next(arg1);
+
+	dfield1 = que_node_get_val(arg1);
+	dfield2 = que_node_get_val(arg2);
+
+	str1 = static_cast<byte*>(dfield_get_data(dfield1));
+	str2 = static_cast<byte*>(dfield_get_data(dfield2));
+
+	len1 = dfield_get_len(dfield1);
+	len2 = dfield_get_len(dfield2);
+
+	if (len2 == 0) {
+		ut_error;
+	}
+
+	match_char = str2[0];
+
+	for (i = 0; i < len1; i++) {
+		/* In this outer loop, the number of matched characters is 0 */
+
+		if (str1[i] == match_char) {
+
+			if (i + len2 > len1) {
+
+				break;
+			}
+
+			for (j = 1;; j++) {
+				/* We have already matched j characters */
+
+				if (j == len2) {
+					int_val = lint(i) + 1;
+
+					goto match_found;
+				}
+
+				if (str1[i + j] != str2[j]) {
+
+					break;
+				}
+			}
+		}
+	}
+
+	int_val = 0;
+
+match_found:
+	eval_node_set_int_val(func_node, int_val);
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. */
+static
+void
+eval_concat(
+/*========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg;
+	dfield_t*	dfield;
+	byte*		data;
+	ulint		len;
+	ulint		len1;
+
+	arg = func_node->args;
+	len = 0;
+
+	while (arg) {
+		len1 = dfield_get_len(que_node_get_val(arg));
+
+		len += len1;
+
+		arg = que_node_get_next(arg);
+	}
+
+	data = eval_node_ensure_val_buf(func_node, len);
+
+	arg = func_node->args;
+	len = 0;
+
+	while (arg) {
+		dfield = que_node_get_val(arg);
+		len1 = dfield_get_len(dfield);
+
+		memcpy(data + len, dfield_get_data(dfield), len1);
+
+		len += len1;
+
+		arg = que_node_get_next(arg);
+	}
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. If the first argument is an integer,
+this function looks at the second argument which is the integer length in
+bytes, and converts the integer to a VARCHAR.
+If the first argument is of some other type, this function converts it to
+BINARY. */
+UNIV_INLINE
+void
+eval_to_binary(
+/*===========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	dfield_t*	dfield;
+	byte*		str1;
+	ulint		len;
+	ulint		len1;
+
+	arg1 = func_node->args;
+
+	str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+
+	if (dtype_get_mtype(que_node_get_data_type(arg1)) != DATA_INT) {
+
+		len = dfield_get_len(que_node_get_val(arg1));
+
+		dfield = que_node_get_val(func_node);
+
+		dfield_set_data(dfield, str1, len);
+
+		return;
+	}
+
+	arg2 = que_node_get_next(arg1);
+
+	len1 = (ulint) eval_node_get_int_val(arg2);
+
+	if (len1 > 4) {
+
+		ut_error;
+	}
+
+	dfield = que_node_get_val(func_node);
+
+	dfield_set_data(dfield, str1 + (4 - len1), len1);
+}
+
+/*****************************************************************//**
+Evaluate LENGTH(). */
+inline void eval_length(func_node_t* func_node)
+{
+	eval_node_set_int_val(func_node,
+			      dfield_get_len(que_node_get_val
+					     (func_node->args)));
+}
+
+/*****************************************************************//**
+Evaluates a function node.
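+Arguments are evaluated first; SQL NULL argument values are tolerated
+only by the comparison class and by NOTFOUND. For example (sketch):
+evaluating CONCAT(:a, :b) with :a being SQL NULL hits ut_error, while
+:a = :b merely compares the two values.
+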
*/ +void +eval_func( +/*======*/ + func_node_t* func_node) /*!< in: function node */ +{ + que_node_t* arg; + ulint fclass; + + ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC); + + fclass = func_node->fclass; + const int func = func_node->func; + + arg = func_node->args; + + /* Evaluate first the argument list */ + while (arg) { + eval_exp(arg); + + /* The functions are not defined for SQL null argument + values, except for eval_cmp and notfound */ + + if (dfield_is_null(que_node_get_val(arg)) + && (fclass != PARS_FUNC_CMP) + && (func != PARS_NOTFOUND_TOKEN)) { + ut_error; + } + + arg = que_node_get_next(arg); + } + + switch (fclass) { + case PARS_FUNC_CMP: + eval_cmp(func_node); + return; + case PARS_FUNC_ARITH: + eval_arith(func_node); + return; + case PARS_FUNC_AGGREGATE: + eval_aggregate(func_node); + return; + case PARS_FUNC_PREDEFINED: + switch (func) { + case PARS_NOTFOUND_TOKEN: + eval_notfound(func_node); + return; + case PARS_SUBSTR_TOKEN: + eval_substr(func_node); + return; + case PARS_INSTR_TOKEN: + eval_instr(func_node); + return; + case PARS_CONCAT_TOKEN: + eval_concat(func_node); + return; + case PARS_TO_BINARY_TOKEN: + eval_to_binary(func_node); + return; + case PARS_LENGTH_TOKEN: + eval_length(func_node); + return; + default: + ut_error; + } + case PARS_FUNC_LOGICAL: + eval_logical(func_node); + return; + } + + ut_error; +} diff --git a/storage/innobase/eval/eval0proc.cc b/storage/innobase/eval/eval0proc.cc new file mode 100644 index 00000000..7e39443f --- /dev/null +++ b/storage/innobase/eval/eval0proc.cc @@ -0,0 +1,286 @@ +/***************************************************************************** + +Copyright (c) 1998, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file eval/eval0proc.cc +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#include "eval0proc.h" + +/**********************************************************************//** +Performs an execution step of an if-statement node. 
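+The node corresponds to the conditional construct of the InnoDB SQL
+parser, e.g. (sketch of the accepted form):
+
+	IF c1 THEN ... ELSIF c2 THEN ... ELSE ... END IF;
+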
+@return query thread to run next or NULL */
+que_thr_t*
+if_step(
+/*====*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	if_node_t*	node;
+	elsif_node_t*	elsif_node;
+
+	ut_ad(thr);
+
+	node = static_cast<if_node_t*>(thr->run_node);
+	ut_ad(que_node_get_type(node) == QUE_NODE_IF);
+
+	if (thr->prev_node == que_node_get_parent(node)) {
+
+		/* Evaluate the condition */
+
+		eval_exp(node->cond);
+
+		if (eval_node_get_ibool_val(node->cond)) {
+
+			/* The condition evaluated to TRUE: start execution
+			from the first statement in the statement list */
+
+			thr->run_node = node->stat_list;
+
+		} else if (node->else_part) {
+			thr->run_node = node->else_part;
+
+		} else if (node->elsif_list) {
+			elsif_node = node->elsif_list;
+
+			for (;;) {
+				eval_exp(elsif_node->cond);
+
+				if (eval_node_get_ibool_val(
+					    elsif_node->cond)) {
+
+					/* The condition evaluated to TRUE:
+					start execution from the first
+					statement in the statement list */
+
+					thr->run_node = elsif_node->stat_list;
+
+					break;
+				}
+
+				elsif_node = static_cast<elsif_node_t*>(
+					que_node_get_next(elsif_node));
+
+				if (elsif_node == NULL) {
+					thr->run_node = NULL;
+
+					break;
+				}
+			}
+		} else {
+			thr->run_node = NULL;
+		}
+	} else {
+		/* Move to the next statement */
+		ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+		thr->run_node = NULL;
+	}
+
+	if (thr->run_node == NULL) {
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+while_step(
+/*=======*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	while_node_t*	node;
+
+	ut_ad(thr);
+
+	node = static_cast<while_node_t*>(thr->run_node);
+	ut_ad(que_node_get_type(node) == QUE_NODE_WHILE);
+
+	ut_ad((thr->prev_node == que_node_get_parent(node))
+	      || (que_node_get_next(thr->prev_node) == NULL));
+
+	/* Evaluate the condition */
+
+	eval_exp(node->cond);
+
+	if (eval_node_get_ibool_val(node->cond)) {
+
+		/* The condition evaluated to TRUE: start execution
+		from the first statement in the statement list */
+
+		thr->run_node = node->stat_list;
+	} else {
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+assign_step(
+/*========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	assign_node_t*	node;
+
+	ut_ad(thr);
+
+	node = static_cast<assign_node_t*>(thr->run_node);
+	ut_ad(que_node_get_type(node) == QUE_NODE_ASSIGNMENT);
+
+	/* Evaluate the value to assign */
+
+	eval_exp(node->val);
+
+	eval_node_copy_val(node->var->alias, node->val);
+
+	thr->run_node = que_node_get_parent(node);
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
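+The node corresponds to the counted loop of the InnoDB SQL parser,
+e.g. (sketch of the accepted form):
+
+	FOR i IN 1 .. 10 LOOP ... END LOOP;
+
+Both limits are evaluated once, when the loop is entered.
+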
+@return query thread to run next or NULL */
+que_thr_t*
+for_step(
+/*=====*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	for_node_t*	node;
+	que_node_t*	parent;
+	lint		loop_var_value;
+
+	ut_ad(thr);
+
+	node = static_cast<for_node_t*>(thr->run_node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_FOR);
+
+	parent = que_node_get_parent(node);
+
+	if (thr->prev_node != parent) {
+
+		/* Move to the next statement */
+		thr->run_node = que_node_get_next(thr->prev_node);
+
+		if (thr->run_node != NULL) {
+
+			return(thr);
+		}
+
+		/* Increment the value of loop_var */
+
+		loop_var_value = 1 + eval_node_get_int_val(node->loop_var);
+	} else {
+		/* Initialize the loop */
+
+		eval_exp(node->loop_start_limit);
+		eval_exp(node->loop_end_limit);
+
+		loop_var_value = eval_node_get_int_val(node->loop_start_limit);
+
+		node->loop_end_value
+			= (int) eval_node_get_int_val(node->loop_end_limit);
+	}
+
+	/* Check if we should do another loop */
+
+	if (loop_var_value > node->loop_end_value) {
+
+		/* Enough loops done */
+
+		thr->run_node = parent;
+	} else {
+		eval_node_set_int_val(node->loop_var, loop_var_value);
+
+		thr->run_node = node->stat_list;
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+exit_step(
+/*======*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	exit_node_t*	node;
+	que_node_t*	loop_node;
+
+	ut_ad(thr);
+
+	node = static_cast<exit_node_t*>(thr->run_node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_EXIT);
+
+	/* Loops exit by setting thr->run_node as the loop node's parent, so
+	find our containing loop node and get its parent. */
+
+	loop_node = que_node_get_containing_loop_node(node);
+
+	/* If someone uses an EXIT statement outside of a loop, this will
+	trigger. */
+	ut_a(loop_node);
+
+	thr->run_node = que_node_get_parent(loop_node);
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a return-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+return_step(
+/*========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	return_node_t*	node;
+	que_node_t*	parent;
+
+	ut_ad(thr);
+
+	node = static_cast<return_node_t*>(thr->run_node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_RETURN);
+
+	parent = node;
+
+	while (que_node_get_type(parent) != QUE_NODE_PROC) {
+
+		parent = que_node_get_parent(parent);
+	}
+
+	ut_a(parent);
+
+	thr->run_node = que_node_get_parent(parent);
+
+	return(thr);
+}
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
new file mode 100644
index 00000000..97cb3994
--- /dev/null
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -0,0 +1,2425 @@
+/*****************************************************************************
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
new file mode 100644
index 00000000..97cb3994
--- /dev/null
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -0,0 +1,2425 @@
+/*****************************************************************************
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file fil0crypt.cc
+Innodb file space encrypt/decrypt
+
+Created            Jonas Oreland Google
+Modified           Jan Lindström jan.lindstrom@mariadb.com
+*******************************************************/
+
+#include "fil0crypt.h"
+#include "mach0data.h"
+#include "page0zip.h"
+#include "buf0checksum.h"
+#ifdef UNIV_INNOCHECKSUM
+# include "buf0buf.h"
+#else
+#include "buf0flu.h"
+#include "buf0dblwr.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0ut.h"
+#include "fsp0fsp.h"
+#include "fil0pagecompress.h"
+#include <my_crypt.h>
+
+static bool fil_crypt_threads_inited = false;
+
+/** Is encryption enabled/disabled */
+ulong	srv_encrypt_tables;
+
+/** No of key rotation threads requested */
+uint	srv_n_fil_crypt_threads;
+
+/** No of key rotation threads started */
+uint	srv_n_fil_crypt_threads_started;
+
+/** At this age or older a space/page will be rotated */
+uint	srv_fil_crypt_rotate_key_age;
+
+/** Whether the encryption plugin does key rotation */
+Atomic_relaxed<bool> srv_encrypt_rotate;
+
+/** Condition variable for srv_n_fil_crypt_threads_started */
+static pthread_cond_t fil_crypt_cond;
+
+/** Condition variable to signal the key rotation threads */
+static pthread_cond_t fil_crypt_threads_cond;
+
+/** Condition variable for interrupting sleeptime_ms sleep at the end
+of fil_crypt_rotate_page() */
+static pthread_cond_t fil_crypt_throttle_sleep_cond;
+
+/** Mutex for key rotation threads. Acquired before fil_system.mutex! */
+static mysql_mutex_t fil_crypt_threads_mutex;
+
+/** Variable ensuring only 1 thread at a time does initial conversion */
+static bool fil_crypt_start_converting;
+
+/** Variables for throttling */
+uint srv_n_fil_crypt_iops;		  // 10ms per iop
+static constexpr uint srv_alloc_time = 3; // allocate iops for 3s at a time
+static uint n_fil_crypt_iops_allocated;
+
+#define DEBUG_KEYROTATION_THROTTLING 0
+
+/** Statistics variables */
+static fil_crypt_stat_t crypt_stat;
+static mysql_mutex_t crypt_stat_mutex;
+
+/** Wake up the encryption threads */
+void fil_crypt_threads_signal(bool broadcast)
+{
+  mysql_mutex_lock(&fil_crypt_threads_mutex);
+  if (broadcast)
+    pthread_cond_broadcast(&fil_crypt_threads_cond);
+  else
+    pthread_cond_signal(&fil_crypt_threads_cond);
+  mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/***********************************************************************
+Check if a key needs rotation given a key_state
+@param[in]	crypt_data		Encryption information
+@param[in]	key_version		Current key version
+@param[in]	latest_key_version	Latest key version
+@param[in]	rotate_key_age		when to rotate
+@return true if key needs rotation, false if not */
+static bool
+fil_crypt_needs_rotation(
+	const fil_space_crypt_t*	crypt_data,
+	uint				key_version,
+	uint				latest_key_version,
+	uint				rotate_key_age)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************
+Init space crypt */
+void fil_space_crypt_init()
+{
+  pthread_cond_init(&fil_crypt_throttle_sleep_cond, nullptr);
+  mysql_mutex_init(0, &crypt_stat_mutex, nullptr);
+}
+
+/*********************************************************************
+Cleanup space crypt */
+void fil_space_crypt_cleanup()
+{
+  pthread_cond_destroy(&fil_crypt_throttle_sleep_cond);
+  mysql_mutex_destroy(&crypt_stat_mutex);
+}
+
+/**
+Get latest key version from encryption plugin.
+@return key version or ENCRYPTION_KEY_VERSION_INVALID */
+uint
+fil_space_crypt_t::key_get_latest_version(void)
+{
+	uint key_version = key_found;
+
+	if (is_key_found()) {
+		key_version = encryption_key_get_latest_version(key_id);
+		/* InnoDB does a dirty read of srv_fil_crypt_rotate_key_age.
+		It doesn't matter because srv_encrypt_rotate
+		can be set to true only once */
+		if (!srv_encrypt_rotate
+		    && key_version > srv_fil_crypt_rotate_key_age) {
+			srv_encrypt_rotate = true;
+		}
+
+		srv_stats.n_key_requests.inc();
+		key_found = key_version;
+	}
+
+	return key_version;
+}
+
+/******************************************************************
+Get the latest key version, waking the encrypt thread if needed
+@param[in,out]	crypt_data	Crypt data */
+static inline
+uint
+fil_crypt_get_latest_key_version(
+	fil_space_crypt_t* crypt_data)
+{
+	ut_ad(crypt_data != NULL);
+
+	uint key_version = crypt_data->key_get_latest_version();
+
+	if (crypt_data->is_key_found()) {
+
+		if (fil_crypt_needs_rotation(
+			    crypt_data,
+			    crypt_data->min_key_version,
+			    key_version,
+			    srv_fil_crypt_rotate_key_age)) {
+			if (fil_crypt_threads_inited) {
+				fil_crypt_threads_signal();
+			}
+		}
+	}
+
+	return key_version;
+}
+
+/******************************************************************
+Mutex helper for crypt_data->scheme */
+void
+crypt_data_scheme_locker(
+/*=====================*/
+	st_encryption_scheme*	scheme,
+	int			exit)
+{
+	fil_space_crypt_t* crypt_data =
+		static_cast<fil_space_crypt_t*>(scheme);
+
+	if (exit) {
+		mysql_mutex_unlock(&crypt_data->mutex);
+	} else {
+		mysql_mutex_lock(&crypt_data->mutex);
+	}
+}
+
+/******************************************************************
+Create a fil_space_crypt_t object
+@param[in]	type		CRYPT_SCHEME_UNENCRYPTED or
+				CRYPT_SCHEME_1
+@param[in]	encrypt_mode	FIL_ENCRYPTION_DEFAULT or
+				FIL_ENCRYPTION_ON or
+				FIL_ENCRYPTION_OFF
+@param[in]	min_key_version key_version or 0
+@param[in]	key_id		Used key id
+@return crypt object */
+static
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+	uint			type,
+	fil_encryption_t	encrypt_mode,
+	uint			min_key_version,
+	uint			key_id)
+{
+	fil_space_crypt_t* crypt_data = NULL;
+	if (void* buf = ut_zalloc_nokey(sizeof(fil_space_crypt_t))) {
+		crypt_data = new(buf)
+			fil_space_crypt_t(
+				type,
+				min_key_version,
+				key_id,
+				encrypt_mode);
+	}
+
+	return crypt_data;
+}
+
+/******************************************************************
+Create a fil_space_crypt_t object
+@param[in]	encrypt_mode	FIL_ENCRYPTION_DEFAULT or
+				FIL_ENCRYPTION_ON or
+				FIL_ENCRYPTION_OFF
+
+@param[in]	key_id		Encryption key id
+@return crypt object */
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+	fil_encryption_t	encrypt_mode,
+	uint			key_id)
+{
+	return (fil_space_create_crypt_data(0, encrypt_mode, 0, key_id));
+}
+
+/******************************************************************
+Merge fil_space_crypt_t object
+@param[in,out]	dst		Destination crypt data
+@param[in]	src		Source crypt data */
+static
+void
+fil_space_merge_crypt_data(
+	fil_space_crypt_t*		dst,
+	const fil_space_crypt_t*	src)
+{
+	mysql_mutex_lock(&dst->mutex);
+
+	/* validate that they are mergeable */
+	ut_a(src->type == CRYPT_SCHEME_UNENCRYPTED ||
+	     src->type == CRYPT_SCHEME_1);
+
+	ut_a(dst->type == CRYPT_SCHEME_UNENCRYPTED ||
+	     dst->type == CRYPT_SCHEME_1);
+
+	dst->encryption = src->encryption;
+	dst->type = src->type;
+	dst->min_key_version = src->min_key_version;
+	dst->keyserver_requests += src->keyserver_requests;
+
+	mysql_mutex_unlock(&dst->mutex);
+}
+
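+/* Editorial note (not upstream): the crypt metadata serialized on page 0,
+at offset FSP_HEADER_OFFSET + fsp_header_get_encryption_offset(), is laid
+out as
+
+	CRYPT_MAGIC (MAGIC_SZ bytes) | type (1) | iv_length (1)
+	| iv (iv_length) | min_key_version (4) | key_id (4)
+	| fil_encryption_t (1)
+
+for a total of 11 + MY_AES_BLOCK_SIZE bytes after the magic. This is what
+fil_space_read_crypt_data() below parses and what
+fil_space_crypt_t::write_page0() writes back. */
+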
+/** Initialize encryption parameters from a tablespace header page.
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	page		first page of the tablespace
+@return crypt data from page 0
+@retval	NULL	if not present or not valid */
+fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page)
+{
+	const ulint offset = FSP_HEADER_OFFSET
+		+ fsp_header_get_encryption_offset(zip_size);
+
+	if (memcmp(page + offset, CRYPT_MAGIC, MAGIC_SZ) != 0) {
+		/* Crypt data is not stored. */
+		return NULL;
+	}
+
+	uint8_t type = mach_read_from_1(page + offset + MAGIC_SZ + 0);
+	uint8_t iv_length = mach_read_from_1(page + offset + MAGIC_SZ + 1);
+	fil_space_crypt_t* crypt_data;
+
+	if (!(type == CRYPT_SCHEME_UNENCRYPTED ||
+	      type == CRYPT_SCHEME_1)
+	    || iv_length != sizeof crypt_data->iv) {
+		ib::error() << "Found non sensible crypt scheme: "
+			    << type << "," << iv_length
+			    << " for space: "
+			    << page_get_space_id(page);
+		return NULL;
+	}
+
+	uint min_key_version = mach_read_from_4
+		(page + offset + MAGIC_SZ + 2 + iv_length);
+
+	uint key_id = mach_read_from_4
+		(page + offset + MAGIC_SZ + 2 + iv_length + 4);
+
+	fil_encryption_t encryption = (fil_encryption_t)mach_read_from_1(
+		page + offset + MAGIC_SZ + 2 + iv_length + 8);
+
+	crypt_data = fil_space_create_crypt_data(encryption, key_id);
+	/* We need to overwrite these as above function will initialize
+	members */
+	crypt_data->type = type;
+	crypt_data->min_key_version = min_key_version;
+	memcpy(crypt_data->iv, page + offset + MAGIC_SZ + 2, iv_length);
+
+	return crypt_data;
+}
+
+/******************************************************************
+Free a crypt data object
+@param[in,out] crypt_data	crypt data to be freed */
+void fil_space_destroy_crypt_data(fil_space_crypt_t **crypt_data)
+{
+	if (crypt_data != NULL && (*crypt_data) != NULL) {
+		fil_space_crypt_t* c;
+		if (UNIV_LIKELY(fil_crypt_threads_inited)) {
+			mysql_mutex_lock(&fil_crypt_threads_mutex);
+			c = *crypt_data;
+			*crypt_data = NULL;
+			mysql_mutex_unlock(&fil_crypt_threads_mutex);
+		} else {
+			ut_ad(srv_read_only_mode || !srv_was_started);
+			c = *crypt_data;
+			*crypt_data = NULL;
+		}
+		if (c) {
+			c->~fil_space_crypt_t();
+			ut_free(c);
+		}
+	}
+}
+
+/** Amend encryption information from redo log.
+@param[in]	space	tablespace
+@param[in]	data	encryption metadata */
+void fil_crypt_parse(fil_space_t* space, const byte* data)
+{
+	ut_ad(data[1] == MY_AES_BLOCK_SIZE);
+	if (void* buf = ut_zalloc_nokey(sizeof(fil_space_crypt_t))) {
+		fil_space_crypt_t* crypt_data = new(buf)
+			fil_space_crypt_t(
+				data[0],
+				mach_read_from_4(&data[2 + MY_AES_BLOCK_SIZE]),
+				mach_read_from_4(&data[6 + MY_AES_BLOCK_SIZE]),
+				static_cast<fil_encryption_t>
+				(data[10 + MY_AES_BLOCK_SIZE]));
+		memcpy(crypt_data->iv, data + 2, MY_AES_BLOCK_SIZE);
+		mysql_mutex_lock(&fil_system.mutex);
+		if (space->crypt_data) {
+			fil_space_merge_crypt_data(space->crypt_data,
+						   crypt_data);
+			fil_space_destroy_crypt_data(&crypt_data);
+			crypt_data = space->crypt_data;
+		} else {
+			space->crypt_data = crypt_data;
+		}
+		mysql_mutex_unlock(&fil_system.mutex);
+	}
+}
+
+/** Write encryption metadata to the first page.
+@param[in,out]	block	first page of the tablespace
+@param[in,out]	mtr	mini-transaction */
+void fil_space_crypt_t::write_page0(buf_block_t* block, mtr_t* mtr)
+{
+	const ulint offset = FSP_HEADER_OFFSET
+		+ fsp_header_get_encryption_offset(block->zip_size());
+	byte* b = block->page.frame + offset;
+
+	mtr->memcpy<mtr_t::MAYBE_NOP>(*block, b, CRYPT_MAGIC, MAGIC_SZ);
+
+	b += MAGIC_SZ;
+	byte* const start = b;
+	*b++ = static_cast<byte>(type);
+	compile_time_assert(sizeof iv == MY_AES_BLOCK_SIZE);
+	compile_time_assert(sizeof iv == CRYPT_SCHEME_1_IV_LEN);
+	*b++ = sizeof iv;
+	memcpy(b, iv, sizeof iv);
+	b += sizeof iv;
+	mach_write_to_4(b, min_key_version);
+	b += 4;
+	mach_write_to_4(b, key_id);
+	b += 4;
+	*b++ = byte(encryption);
+	ut_ad(b - start == 11 + MY_AES_BLOCK_SIZE);
+	/* We must log also any unchanged bytes, because recovery will
+	invoke fil_crypt_parse() based on this log record. */
+	mtr->memcpy(*block, offset + MAGIC_SZ, b - start);
+}
+
+/** Encrypt a buffer for non full checksum.
+@param[in,out]		crypt_data	Crypt data
+@param[in]		space		space_id
+@param[in]		offset		Page offset
+@param[in]		lsn		Log sequence number
+@param[in]		src_frame	Page to encrypt
+@param[in]		zip_size	ROW_FORMAT=COMPRESSED
+					page size, or 0
+@param[in,out]		dst_frame	Output buffer
+@return encrypted buffer or NULL */
+static byte* fil_encrypt_buf_for_non_full_checksum(
+	fil_space_crypt_t*	crypt_data,
+	ulint			space,
+	ulint			offset,
+	lsn_t			lsn,
+	const byte*		src_frame,
+	ulint			zip_size,
+	byte*			dst_frame)
+{
+	uint size = uint(zip_size ? zip_size : srv_page_size);
+	uint key_version = fil_crypt_get_latest_key_version(crypt_data);
+	ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID);
+	ut_ad(!ut_align_offset(src_frame, 8));
+	ut_ad(!ut_align_offset(dst_frame, 8));
+
+	const bool page_compressed = fil_page_get_type(src_frame)
+		== FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED;
+	uint header_len = FIL_PAGE_DATA;
+
+	if (page_compressed) {
+		header_len += FIL_PAGE_ENCRYPT_COMP_METADATA_LEN;
+	}
+
+	/* FIL page header is not encrypted */
+	memcpy(dst_frame, src_frame, header_len);
+	mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
+			key_version);
+
+	/* Calculate the start offset in a page */
+	uint		unencrypted_bytes = header_len + FIL_PAGE_DATA_END;
+	uint		srclen = size - unencrypted_bytes;
+	const byte*	src = src_frame + header_len;
+	byte*		dst = dst_frame + header_len;
+	uint32		dstlen = 0;
+
+	if (page_compressed) {
+		srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
+	}
+
+	int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen,
+					   crypt_data, key_version,
+					   (uint32)space, (uint32)offset, lsn);
+	ut_a(rc == MY_AES_OK);
+	ut_a(dstlen == srclen);
+
+	/* For compressed tables we do not store the FIL header because
+	the whole page is not stored to the disk. In compressed tables only
+	the FIL header + compressed (and now encrypted) payload aligned
+	to sector boundary is written. */
+	if (!page_compressed) {
+		/* FIL page trailer is also not encrypted */
+		static_assert(FIL_PAGE_DATA_END == 8, "alignment");
+		memcpy_aligned<8>(dst_frame + size - FIL_PAGE_DATA_END,
+				  src_frame + size - FIL_PAGE_DATA_END, 8);
+	} else {
+		/* Clean up rest of buffer */
+		memset(dst_frame+header_len+srclen, 0,
+		       size - (header_len + srclen));
+	}
+
+	/* store the post-encryption checksum after the key-version */
+	mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4,
+			zip_size
+			? 
page_zip_calc_checksum(dst_frame, zip_size, + SRV_CHECKSUM_ALGORITHM_CRC32) + : buf_calc_page_crc32(dst_frame)); + + ut_ad(fil_space_verify_crypt_checksum(dst_frame, zip_size)); + + srv_stats.pages_encrypted.inc(); + + return dst_frame; +} + +/** Encrypt a buffer for full checksum format. +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] lsn Log sequence number +@param[in] src_frame Page to encrypt +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ +static byte* fil_encrypt_buf_for_full_crc32( + fil_space_crypt_t* crypt_data, + ulint space, + ulint offset, + lsn_t lsn, + const byte* src_frame, + byte* dst_frame) +{ + uint key_version = fil_crypt_get_latest_key_version(crypt_data); + ut_d(bool corrupted = false); + const uint size = buf_page_full_crc32_size(src_frame, NULL, +#ifdef UNIV_DEBUG + &corrupted +#else + NULL +#endif + ); + ut_ad(!corrupted); + uint srclen = size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + + FIL_PAGE_FCRC32_CHECKSUM); + const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + byte* dst = dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + uint dstlen = 0; + + ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID); + + /* Till FIL_PAGE_LSN, page is not encrypted */ + memcpy(dst_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + /* Write key version to the page. */ + mach_write_to_4(dst_frame + FIL_PAGE_FCRC32_KEY_VERSION, key_version); + + int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen, + crypt_data, key_version, + uint(space), uint(offset), lsn); + ut_a(rc == MY_AES_OK); + ut_a(dstlen == srclen); + + const ulint payload = size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(dst_frame + payload, my_crc32c(0, dst_frame, payload)); + /* Clean the rest of the buffer. FIXME: Punch holes when writing! */ + memset(dst_frame + (payload + 4), 0, srv_page_size - (payload + 4)); + + srv_stats.pages_encrypted.inc(); + + return dst_frame; +} + +/** Encrypt a buffer. +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] src_frame Page to encrypt +@param[in] zip_size ROW_FORMAT=COMPRESSED + page size, or 0 +@param[in,out] dst_frame Output buffer +@param[in] use_full_checksum full crc32 algo is used +@return encrypted buffer or NULL */ +byte* fil_encrypt_buf( + fil_space_crypt_t* crypt_data, + ulint space, + ulint offset, + const byte* src_frame, + ulint zip_size, + byte* dst_frame, + bool use_full_checksum) +{ + const lsn_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN); + + if (use_full_checksum) { + ut_ad(!zip_size); + return fil_encrypt_buf_for_full_crc32( + crypt_data, space, offset, + lsn, src_frame, dst_frame); + } + + return fil_encrypt_buf_for_non_full_checksum( + crypt_data, space, offset, lsn, + src_frame, zip_size, dst_frame); +} + +/** Check whether these page types are allowed to encrypt. 
+@param[in] space tablespace object +@param[in] src_frame source page +@return true if it is valid page type */ +static bool fil_space_encrypt_valid_page_type( + const fil_space_t* space, + const byte* src_frame) +{ + switch (fil_page_get_type(src_frame)) { + case FIL_PAGE_RTREE: + return space->full_crc32(); + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + return false; + } + + return true; +} + +/****************************************************************** +Encrypt a page + +@param[in] space Tablespace +@param[in] offset Page offset +@param[in] src_frame Page to encrypt +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ +byte* fil_space_encrypt( + const fil_space_t* space, + ulint offset, + byte* src_frame, + byte* dst_frame) +{ + if (!fil_space_encrypt_valid_page_type(space, src_frame)) { + return src_frame; + } + + if (!space->crypt_data || !space->crypt_data->is_encrypted()) { + return (src_frame); + } + + ut_ad(space->referenced()); + + return fil_encrypt_buf(space->crypt_data, space->id, offset, + src_frame, space->zip_size(), + dst_frame, space->full_crc32()); +} + +/** Decrypt a page for full checksum format. +@param[in] space space id +@param[in] crypt_data crypt_data +@param[in] tmp_frame Temporary buffer +@param[in,out] src_frame Page to decrypt +@return DB_SUCCESS or error */ +static dberr_t fil_space_decrypt_full_crc32( + ulint space, + fil_space_crypt_t* crypt_data, + byte* tmp_frame, + byte* src_frame) +{ + uint key_version = mach_read_from_4( + src_frame + FIL_PAGE_FCRC32_KEY_VERSION); + lsn_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN); + uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET); + + ut_ad(key_version != ENCRYPTION_KEY_NOT_ENCRYPTED); + + memcpy(tmp_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + /* Calculate the offset where decryption starts */ + const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + byte* dst = tmp_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + uint dstlen = 0; + bool corrupted = false; + uint size = buf_page_full_crc32_size(src_frame, NULL, &corrupted); + if (UNIV_UNLIKELY(corrupted)) { + return DB_DECRYPTION_FAILED; + } + + uint srclen = size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + + FIL_PAGE_FCRC32_CHECKSUM); + + int rc = encryption_scheme_decrypt(src, srclen, dst, &dstlen, + crypt_data, key_version, + (uint) space, offset, lsn); + + if (rc != MY_AES_OK || dstlen != srclen) { + return DB_DECRYPTION_FAILED; + } + + /* Copy only checksum part in the trailer */ + memcpy(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + FIL_PAGE_FCRC32_CHECKSUM); + + srv_stats.pages_decrypted.inc(); + + return DB_SUCCESS; /* page was decrypted */ +} + +/** Decrypt a page for non full checksum format. 
+@param[in]	crypt_data		crypt_data
+@param[in]	tmp_frame		Temporary buffer
+@param[in]	physical_size		page size
+@param[in,out]	src_frame		Page to decrypt
+@return DB_SUCCESS or error */
+static dberr_t fil_space_decrypt_for_non_full_checksum(
+	fil_space_crypt_t*	crypt_data,
+	byte*			tmp_frame,
+	ulint			physical_size,
+	byte*			src_frame)
+{
+	uint key_version = mach_read_from_4(
+		src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+	bool page_compressed = (fil_page_get_type(src_frame)
+				== FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+	uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
+	uint space = mach_read_from_4(
+		src_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	ib_uint64_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
+
+	ut_ad(key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
+
+	/* read space & lsn */
+	uint header_len = FIL_PAGE_DATA;
+
+	if (page_compressed) {
+		header_len += FIL_PAGE_ENCRYPT_COMP_METADATA_LEN;
+	}
+
+	/* Copy FIL page header, it is not encrypted */
+	memcpy(tmp_frame, src_frame, header_len);
+
+	/* Calculate the offset where decryption starts */
+	const byte*	src = src_frame + header_len;
+	byte*		dst = tmp_frame + header_len;
+	uint32		dstlen = 0;
+	uint		srclen = uint(physical_size)
+		- header_len - FIL_PAGE_DATA_END;
+
+	if (page_compressed) {
+		srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
+	}
+
+	int rc = encryption_scheme_decrypt(src, srclen, dst, &dstlen,
+					   crypt_data, key_version,
+					   space, offset, lsn);
+
+	if (! ((rc == MY_AES_OK) && ((ulint) dstlen == srclen))) {
+		return DB_DECRYPTION_FAILED;
+	}
+
+	/* For compressed tables we do not store the FIL header because
+	the whole page is not stored to the disk. In compressed tables only
+	the FIL header + compressed (and now encrypted) payload aligned
+	to sector boundary is written. */
+	if (!page_compressed) {
+		/* Copy FIL trailer */
+		memcpy(tmp_frame + physical_size - FIL_PAGE_DATA_END,
+		       src_frame + physical_size - FIL_PAGE_DATA_END,
+		       FIL_PAGE_DATA_END);
+	}
+
+	srv_stats.pages_decrypted.inc();
+
+	return DB_SUCCESS; /* page was decrypted */
+}
+
+/** Decrypt a page.
+@param[in]	space_id		tablespace id
+@param[in]	fsp_flags		Tablespace flags
+@param[in]	crypt_data		crypt_data
+@param[in]	tmp_frame		Temporary buffer
+@param[in]	physical_size		page size
+@param[in,out]	src_frame		Page to decrypt
+@retval DB_SUCCESS on success
+@retval DB_DECRYPTION_FAILED on error */
+dberr_t
+fil_space_decrypt(
+	uint32_t		space_id,
+	uint32_t		fsp_flags,
+	fil_space_crypt_t*	crypt_data,
+	byte*			tmp_frame,
+	ulint			physical_size,
+	byte*			src_frame)
+{
+	if (!crypt_data || !crypt_data->is_encrypted()) {
+		return DB_DECRYPTION_FAILED;
+	}
+
+	if (fil_space_t::full_crc32(fsp_flags)) {
+		return fil_space_decrypt_full_crc32(
+			space_id, crypt_data, tmp_frame, src_frame);
+	}
+
+	return fil_space_decrypt_for_non_full_checksum(crypt_data, tmp_frame,
+						       physical_size,
+						       src_frame);
+}
+
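+/* Editorial sketch (not upstream code): a typical caller dispatches
+through the dberr_t overload above, which picks the full_crc32 or
+non-full-checksum path from the tablespace flags:
+
+	if (fil_space_decrypt(space->id, space->flags, space->crypt_data,
+			      tmp, space->physical_size(), read_buf)
+	    != DB_SUCCESS) {
+		// treat the page as unreadable (DB_DECRYPTION_FAILED)
+	}
+
+where read_buf and tmp are hypothetical page-sized, aligned buffers. */
+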
+/**
+Decrypt a page.
+@param[in]	space			Tablespace
+@param[in]	tmp_frame		Temporary buffer used for decrypting
+@param[in,out]	src_frame		Page to decrypt
+@return decrypted page, or original not encrypted page if decryption is
+not needed.
+@retval nullptr on failure */
+byte*
+fil_space_decrypt(
+	const fil_space_t* space,
+	byte*		tmp_frame,
+	byte*		src_frame)
+{
+	const ulint physical_size = space->physical_size();
+
+	ut_ad(space->referenced());
+
+	if (DB_SUCCESS != fil_space_decrypt(space->id, space->flags,
+					    space->crypt_data,
+					    tmp_frame, physical_size,
+					    src_frame)) {
+		return nullptr;
+	}
+
+	/* Copy the decrypted page back to page buffer, not
+	really any other options. */
+	return static_cast<byte*>(memcpy(src_frame, tmp_frame,
+					 physical_size));
+}
+
+/***********************************************************************/
+
+/** A copy of global key state */
+struct key_state_t {
+	key_state_t() : key_id(0), key_version(0),
+			rotate_key_age(srv_fil_crypt_rotate_key_age) {}
+	bool operator==(const key_state_t& other) const {
+		return key_version == other.key_version &&
+			rotate_key_age == other.rotate_key_age;
+	}
+	uint key_id;
+	uint key_version;
+	uint rotate_key_age;
+};
+
+/***********************************************************************
+Copy global key state
+@param[in,out]	new_state	key state
+@param[in]	crypt_data	crypt data */
+static void
+fil_crypt_get_key_state(
+	key_state_t*		new_state,
+	fil_space_crypt_t*	crypt_data)
+{
+	if (srv_encrypt_tables) {
+		new_state->key_version = crypt_data->key_get_latest_version();
+		new_state->rotate_key_age = srv_fil_crypt_rotate_key_age;
+
+		ut_a(new_state->key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
+	} else {
+		new_state->key_version = 0;
+		new_state->rotate_key_age = 0;
+	}
+}
+
+/***********************************************************************
+Check if a key needs rotation given a key_state
+@param[in]	crypt_data		Encryption information
+@param[in]	key_version		Current key version
+@param[in]	latest_key_version	Latest key version
+@param[in]	rotate_key_age		when to rotate
+@return true if key needs rotation, false if not */
+static bool
+fil_crypt_needs_rotation(
+	const fil_space_crypt_t*	crypt_data,
+	uint				key_version,
+	uint				latest_key_version,
+	uint				rotate_key_age)
+{
+	if (key_version == ENCRYPTION_KEY_VERSION_INVALID) {
+		return false;
+	}
+
+	if (key_version == 0 && latest_key_version != 0) {
+		/* this is rotation unencrypted => encrypted
+		* ignore rotate_key_age */
+		return true;
+	}
+
+	if (latest_key_version == 0 && key_version != 0) {
+		if (crypt_data->encryption == FIL_ENCRYPTION_DEFAULT) {
+			/* this is rotation encrypted => unencrypted */
+			return true;
+		}
+		return false;
+	}
+
+	if (crypt_data->encryption == FIL_ENCRYPTION_DEFAULT
+	    && crypt_data->type == CRYPT_SCHEME_1
+	    && !srv_encrypt_tables) {
+		/* This is rotation encrypted => unencrypted */
+		return true;
+	}
+
+	if (rotate_key_age == 0) {
+		return false;
+	}
+
+	/* this is rotation encrypted => encrypted,
+	* only reencrypt if key is sufficiently old */
+	if (key_version + rotate_key_age < latest_key_version) {
+		return true;
+	}
+
+	return false;
+}
+
+/** Read page 0 and possible crypt data from there.
+@param[in,out]	space	Tablespace */
+static inline void fil_crypt_read_crypt_data(fil_space_t *space)
+{
+  if (space->crypt_data || space->size || !space->get_size())
+    /* The encryption metadata has already been read, or the
+    tablespace is not encrypted and the file has been opened already,
+    or the file cannot be accessed, likely due to a concurrent DROP
+    (possibly as part of TRUNCATE or ALTER TABLE).
+
+    FIXME: The file can become inaccessible any time after this check!
+    We should really remove this function and instead make crypt_data
+    an integral part of fil_space_t.
*/ + return; + + const ulint zip_size= space->zip_size(); + mtr_t mtr; + mtr.start(); + if (buf_block_t* b= buf_page_get_gen(page_id_t{space->id, 0}, zip_size, + RW_S_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, &mtr)) + { + mysql_mutex_lock(&fil_system.mutex); + if (!space->crypt_data && !space->is_stopping()) + space->crypt_data= fil_space_read_crypt_data(zip_size, b->page.frame); + mysql_mutex_unlock(&fil_system.mutex); + } + mtr.commit(); +} + +/** Start encrypting a space +@param[in,out] space Tablespace +@return true if a recheck of tablespace is needed by encryption thread. */ +static bool fil_crypt_start_encrypting_space(fil_space_t* space) +{ + mysql_mutex_lock(&fil_crypt_threads_mutex); + + fil_space_crypt_t *crypt_data = space->crypt_data; + + /* If space is not encrypted and encryption is not enabled, then + do not continue encrypting the space. */ + if (!crypt_data && !srv_encrypt_tables) { +func_exit: + mysql_mutex_unlock(&fil_crypt_threads_mutex); + return false; + } + + const bool recheck = fil_crypt_start_converting; + + if (recheck || crypt_data || space->is_stopping()) { + mysql_mutex_unlock(&fil_crypt_threads_mutex); + return recheck; + } + + /* NOTE: we need to write and flush page 0 before publishing + * the crypt data. This so that after restart there is no + * risk of finding encrypted pages without having + * crypt data in page 0 */ + + /* 1 - create crypt data */ + crypt_data = fil_space_create_crypt_data( + FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); + + if (!crypt_data) { + goto func_exit; + } + + fil_crypt_start_converting = true; + mysql_mutex_unlock(&fil_crypt_threads_mutex); + + mtr_t mtr; + mtr.start(); + + /* 2 - get page 0 */ + if (buf_block_t* block = buf_page_get_gen( + page_id_t(space->id, 0), space->zip_size(), + RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, &mtr)) { + crypt_data->type = CRYPT_SCHEME_1; + crypt_data->min_key_version = 0; // all pages are unencrypted + crypt_data->rotate_state.start_time = time(0); + crypt_data->rotate_state.starting = true; + crypt_data->rotate_state.active_threads = 1; + + mysql_mutex_lock(&fil_system.mutex); + const bool stopping = space->is_stopping(); + if (!stopping) { + space->crypt_data = crypt_data; + } + mysql_mutex_unlock(&fil_system.mutex); + + if (stopping) { + goto abort; + } + + /* 3 - write crypt data to page 0 */ + mtr.set_named_space(space); + crypt_data->write_page0(block, &mtr); + + mtr.commit(); + + /* 4 - sync tablespace before publishing crypt data */ + while (buf_flush_list_space(space)); + + /* 5 - publish crypt data */ + mysql_mutex_lock(&fil_crypt_threads_mutex); + mysql_mutex_lock(&crypt_data->mutex); + crypt_data->type = CRYPT_SCHEME_1; + ut_a(crypt_data->rotate_state.active_threads == 1); + crypt_data->rotate_state.active_threads = 0; + crypt_data->rotate_state.starting = false; + + fil_crypt_start_converting = false; + mysql_mutex_unlock(&fil_crypt_threads_mutex); + mysql_mutex_unlock(&crypt_data->mutex); + + return false; + } + +abort: + mtr.commit(); + mysql_mutex_lock(&fil_crypt_threads_mutex); + fil_crypt_start_converting = false; + mysql_mutex_unlock(&fil_crypt_threads_mutex); + + crypt_data->~fil_space_crypt_t(); + ut_free(crypt_data); + return false; +} + +/** State of a rotation thread */ +struct rotate_thread_t { + explicit rotate_thread_t(uint no) : thread_no(no) {} + + uint thread_no; + bool first = true; /*!< is position before first space */ + space_list_t::iterator space + = fil_system.space_list.end();/*!< current space or .end() */ + uint32_t offset = 0; /*!< current 
page number */
+	ulint	batch = 0;		/*!< #pages to rotate */
+	uint	min_key_version_found = 0; /*!< min key version found
+					but not rotated */
+	lsn_t	end_lsn = 0;		/*!< max lsn when rotating this
+					space */
+
+	uint	estimated_max_iops = 20;/*!< estimation of max iops */
+	uint	allocated_iops = 0;	/*!< allocated iops */
+	ulint	cnt_waited = 0;		/*!< #times waited during this slot */
+	uintmax_t sum_waited_us = 0;	/*!< wait time during this slot */
+
+	fil_crypt_stat_t crypt_stat; // statistics
+
+	/** @return whether this thread should terminate */
+	bool should_shutdown() const {
+		mysql_mutex_assert_owner(&fil_crypt_threads_mutex);
+		switch (srv_shutdown_state) {
+		case SRV_SHUTDOWN_NONE:
+			return thread_no >= srv_n_fil_crypt_threads;
+		case SRV_SHUTDOWN_EXIT_THREADS:
+			/* srv_init_abort() must have been invoked */
+		case SRV_SHUTDOWN_CLEANUP:
+		case SRV_SHUTDOWN_INITIATED:
+			return true;
+		case SRV_SHUTDOWN_LAST_PHASE:
+			break;
+		}
+		ut_ad(0);
+		return true;
+	}
+};
+
+/** Check whether a tablespace is to be removed from
+default_encrypt_tables. It must be kept in the list while
+1) another active encryption thread is working on the tablespace,
+2) the tablespace is still eligible for key rotation, or
+3) the tablespace is in the flushing phase.
+@return true if the tablespace should be removed from
+default encrypt */
+static bool fil_crypt_must_remove(const fil_space_t &space)
+{
+  ut_ad(space.purpose == FIL_TYPE_TABLESPACE);
+  fil_space_crypt_t *crypt_data = space.crypt_data;
+  mysql_mutex_assert_owner(&fil_system.mutex);
+  const ulong encrypt_tables= srv_encrypt_tables;
+  if (!crypt_data)
+    return !encrypt_tables;
+  if (!crypt_data->is_key_found())
+    return true;
+
+  mysql_mutex_lock(&crypt_data->mutex);
+  const bool remove= (space.is_stopping() || crypt_data->not_encrypted()) &&
+    (!crypt_data->rotate_state.flushing &&
+     !encrypt_tables == !!crypt_data->min_key_version &&
+     !crypt_data->rotate_state.active_threads);
+  mysql_mutex_unlock(&crypt_data->mutex);
+  return remove;
+}
+
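+/* Editorial example (not upstream): fil_crypt_needs_rotation() above
+re-encrypts a page only once its key version is sufficiently old, via
+the check key_version + rotate_key_age < latest_key_version. With
+innodb_encryption_rotate_key_age=100, a page written under key
+version 1 becomes eligible for rotation once the encryption plugin
+reports version 102 or later. */
+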
+/***********************************************************************
+Check if space needs rotation given a key_state
+@param[in,out]	state		Key rotation state
+@param[in,out]	key_state	Key state
+@param[in,out]	recheck		needs recheck?
+@return true if space needs key rotation */
+static
+bool
+fil_crypt_space_needs_rotation(
+	rotate_thread_t*	state,
+	key_state_t*		key_state,
+	bool*			recheck)
+{
+	mysql_mutex_assert_not_owner(&fil_crypt_threads_mutex);
+
+	fil_space_t* space = &*state->space;
+
+	ut_ad(space->referenced());
+	ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
+
+	fil_space_crypt_t *crypt_data = space->crypt_data;
+
+	if (crypt_data == NULL) {
+		/**
+		* space has no crypt data
+		* start encrypting it...
+		*/
+		*recheck = fil_crypt_start_encrypting_space(space);
+		crypt_data = space->crypt_data;
+
+		if (crypt_data == NULL) {
+			return false;
+		}
+
+		crypt_data->key_get_latest_version();
+	}
+
+	/* If the used key_id is not found from the encryption plugin
+	we can't continue to rotate the tablespace */
+	if (!crypt_data->is_key_found()) {
+		return false;
+	}
+
+	bool need_key_rotation = false;
+
+	mysql_mutex_lock(&crypt_data->mutex);
+
+	do {
+		/* prevent threads from starting to rotate space */
+		if (crypt_data->rotate_state.starting) {
+			/* recheck this space later */
+			*recheck = true;
+			break;
+		}
+
+		/* prevent threads from starting to rotate space */
+		if (space->is_stopping()) {
+			break;
+		}
+
+		if (crypt_data->rotate_state.flushing) {
+			break;
+		}
+
+		/* No need to rotate space if encryption is disabled */
+		if (crypt_data->not_encrypted()) {
+			break;
+		}
+
+		if (crypt_data->key_id != key_state->key_id) {
+			key_state->key_id= crypt_data->key_id;
+			fil_crypt_get_key_state(key_state, crypt_data);
+		}
+
+		need_key_rotation = fil_crypt_needs_rotation(
+			crypt_data,
+			crypt_data->min_key_version,
+			key_state->key_version,
+			key_state->rotate_key_age);
+	} while (0);
+
+	mysql_mutex_unlock(&crypt_data->mutex);
+	return need_key_rotation;
+}
+
+/***********************************************************************
+Update global statistics with thread statistics
+@param[in,out]	state	key rotation statistics */
+static void
+fil_crypt_update_total_stat(
+	rotate_thread_t *state)
+{
+	mysql_mutex_lock(&crypt_stat_mutex);
+	crypt_stat.pages_read_from_cache +=
+		state->crypt_stat.pages_read_from_cache;
+	crypt_stat.pages_read_from_disk +=
+		state->crypt_stat.pages_read_from_disk;
+	crypt_stat.pages_modified += state->crypt_stat.pages_modified;
+	crypt_stat.pages_flushed += state->crypt_stat.pages_flushed;
+	// remove old estimate
+	crypt_stat.estimated_iops -= state->crypt_stat.estimated_iops;
+	// add new estimate
+	crypt_stat.estimated_iops += state->estimated_max_iops;
+	mysql_mutex_unlock(&crypt_stat_mutex);
+
+	// make new estimate "current" estimate
+	state->crypt_stat.pages_read_from_cache = 0;
+	state->crypt_stat.pages_read_from_disk = 0;
+	state->crypt_stat.pages_modified = 0;
+	state->crypt_stat.pages_flushed = 0;
+	// record our old (current) estimate
+	state->crypt_stat.estimated_iops = state->estimated_max_iops;
+}
+
+/***********************************************************************
+Allocate iops to thread from global setting,
+used before starting to rotate a space.
+@param[in,out]	state	Rotation state
+@return true if allocation succeeded, false if failed */
+static bool fil_crypt_alloc_iops(rotate_thread_t *state)
+{
+	mysql_mutex_assert_owner(&fil_crypt_threads_mutex);
+	ut_ad(state->allocated_iops == 0);
+
+	/* We have not yet selected the space to rotate, thus
+	state might not contain space and we can't check
+	its status yet.
*/ + + uint max_iops = state->estimated_max_iops; + + if (n_fil_crypt_iops_allocated >= srv_n_fil_crypt_iops) { +wait: + my_cond_wait(&fil_crypt_threads_cond, + &fil_crypt_threads_mutex.m_mutex); + return false; + } + + uint alloc = srv_n_fil_crypt_iops - n_fil_crypt_iops_allocated; + + if (alloc > max_iops) { + alloc = max_iops; + } + + if (!alloc) { + goto wait; + } + + n_fil_crypt_iops_allocated += alloc; + + state->allocated_iops = alloc; + return true; +} + +/** +Reallocate iops to thread when processing a tablespace +@param[in,out] state Rotation state +@return whether the thread should continue running */ +static bool fil_crypt_realloc_iops(rotate_thread_t *state) +{ + ut_a(state->allocated_iops > 0); + + if (10 * state->cnt_waited > state->batch) { + /* if we waited more than 10% re-estimate max_iops */ + ulint avg_wait_time_us = + ulint(state->sum_waited_us / state->cnt_waited); + + if (avg_wait_time_us == 0) { + avg_wait_time_us = 1; // prevent division by zero + } + + DBUG_PRINT("ib_crypt", + ("thr_no: %u - update estimated_max_iops from %u to " + ULINTPF ".", + state->thread_no, + state->estimated_max_iops, + 1000000 / avg_wait_time_us)); + + state->estimated_max_iops = std::max( + 1U, uint(1000000 / avg_wait_time_us)); + state->cnt_waited = 0; + state->sum_waited_us = 0; + } else { + DBUG_PRINT("ib_crypt", + ("thr_no: %u only waited " ULINTPF + "%% skip re-estimate.", + state->thread_no, + (100 * state->cnt_waited) + / (state->batch ? state->batch : 1))); + } + + ut_ad(state->estimated_max_iops); + + mysql_mutex_lock(&fil_crypt_threads_mutex); + + if (state->should_shutdown()) { + mysql_mutex_unlock(&fil_crypt_threads_mutex); + return false; + } + + if (state->allocated_iops > state->estimated_max_iops) { + /* release iops */ + uint extra = state->allocated_iops - state->estimated_max_iops; + state->allocated_iops = state->estimated_max_iops; + ut_ad(n_fil_crypt_iops_allocated >= extra); + n_fil_crypt_iops_allocated -= extra; + pthread_cond_broadcast(&fil_crypt_threads_cond); + } else if (srv_n_fil_crypt_iops > n_fil_crypt_iops_allocated) { + /* there are extra iops free */ + uint add = srv_n_fil_crypt_iops - n_fil_crypt_iops_allocated; + if (state->allocated_iops + add > state->estimated_max_iops) { + /* but don't alloc more than our max */ + add= state->estimated_max_iops - state->allocated_iops; + } + n_fil_crypt_iops_allocated += add; + state->allocated_iops += add; + + DBUG_PRINT("ib_crypt", + ("thr_no: %u increased iops from %u to %u.", + state->thread_no, + state->allocated_iops - add, + state->allocated_iops)); + } + + fil_crypt_update_total_stat(state); + mysql_mutex_unlock(&fil_crypt_threads_mutex); + return true; +} + +/** Release excess allocated iops +@param state rotation state +@param wake whether to wake up other threads */ +static void fil_crypt_return_iops(rotate_thread_t *state, bool wake= true) +{ + mysql_mutex_assert_owner(&fil_crypt_threads_mutex); + + if (uint iops= state->allocated_iops) + { + ut_ad(n_fil_crypt_iops_allocated >= iops); + n_fil_crypt_iops_allocated-= iops; + state->allocated_iops= 0; + if (wake) + pthread_cond_broadcast(&fil_crypt_threads_cond); + } + + fil_crypt_update_total_stat(state); +} + +/** Acquire a tablespace reference. 
+@return whether a tablespace reference was successfully acquired */
+inline bool fil_space_t::acquire_if_not_stopped()
+{
+  mysql_mutex_assert_owner(&fil_system.mutex);
+  const uint32_t n= acquire_low();
+  if (UNIV_LIKELY(!(n & (STOPPING | CLOSING))))
+    return true;
+  if (UNIV_UNLIKELY(n & STOPPING))
+    return false;
+  return UNIV_LIKELY(!(n & CLOSING)) || prepare_acquired();
+}
+
+bool fil_crypt_must_default_encrypt()
+{
+  return !srv_fil_crypt_rotate_key_age || !srv_encrypt_rotate;
+}
+
+/** Return the next tablespace from default_encrypt_tables list.
+@param space   previous tablespace (nullptr to start from the start)
+@param recheck whether the removal condition needs to be rechecked after
+the encryption parameters were changed
+@param encrypt expected state of innodb_encrypt_tables
+@return the next tablespace to process (n_pending_ops incremented)
+@retval fil_system.temp_space if there is no work to do
+@retval nullptr upon reaching the end of the iteration */
+inline fil_space_t *fil_system_t::default_encrypt_next(fil_space_t *space,
+                                                       bool recheck,
+                                                       bool encrypt)
+{
+  mysql_mutex_assert_owner(&mutex);
+
+  auto it= space && space->is_in_default_encrypt
+    ? sized_ilist<fil_space_t, rotation_list_tag_t>::iterator(space)
+    : default_encrypt_tables.begin();
+  const auto end= default_encrypt_tables.end();
+
+  if (space)
+  {
+    const bool released= !space->release();
+
+    if (space->is_in_default_encrypt)
+    {
+      while (++it != end &&
+             (!UT_LIST_GET_LEN(it->chain) || it->is_stopping()));
+
+      /* If one of the encryption threads already started
+      the encryption of the table then don't remove the
+      unencrypted spaces from default encrypt list.
+
+      If there is a change in innodb_encrypt_tables variables
+      value then don't remove the last processed tablespace
+      from the default encrypt list. */
+      if (released && !recheck && fil_crypt_must_remove(*space))
+      {
+        ut_a(!default_encrypt_tables.empty());
+        default_encrypt_tables.remove(*space);
+        space->is_in_default_encrypt= false;
+      }
+    }
+  }
+  else while (it != end &&
+              (!UT_LIST_GET_LEN(it->chain) || it->is_stopping()))
+  {
+    /* Find the next suitable default encrypt table if
+    beginning of default_encrypt_tables list has been scheduled
+    to be deleted */
+    it++;
+  }
+
+  if (it == end)
+    return temp_space;
+
+  do
+  {
+    space= &*it;
+    if (space->acquire_if_not_stopped())
+      return space;
+    if (++it == end)
+      return nullptr;
+  }
+  while (!UT_LIST_GET_LEN(it->chain) || it->is_stopping());
+
+  return nullptr;
+}
+
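+/* Editorial note (not upstream): default_encrypt_next() uses two
+distinct sentinels: fil_system.temp_space means "no work to do", while
+nullptr means the end of the iteration was reached. fil_space_t::next()
+below converts nullptr into space_list.end() and passes the temp_space
+sentinel through, so that fil_crypt_find_space_to_rotate() can tell the
+two cases apart. */
+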
+/** Determine the next tablespace for encryption key rotation.
+@param space   current tablespace (nullptr to start from the beginning)
+@param recheck whether the removal condition needs to be rechecked after
+encryption parameters were changed
+@param encrypt expected state of innodb_encrypt_tables
+@return the next tablespace
+@retval fil_system.temp_space if there is no work to do
+@retval end() upon reaching the end of the iteration */
+space_list_t::iterator fil_space_t::next(space_list_t::iterator space,
+                                         bool recheck, bool encrypt)
+{
+  mysql_mutex_lock(&fil_system.mutex);
+
+  if (fil_crypt_must_default_encrypt())
+  {
+    fil_space_t *next_space=
+      fil_system.default_encrypt_next(space == fil_system.space_list.end()
+                                      ? nullptr : &*space, recheck, encrypt);
+    space= next_space
+      ? space_list_t::iterator(next_space)
+      : fil_system.space_list.end();
+  }
+  else
+  {
+    if (space == fil_system.space_list.end())
+      space= fil_system.space_list.begin();
+    else
+    {
+      /* Move on to the next fil_space_t */
+      space->release();
+      ++space;
+    }
+
+    for (; space != fil_system.space_list.end(); ++space)
+    {
+      if (space->purpose != FIL_TYPE_TABLESPACE)
+        continue;
+      const uint32_t n= space->acquire_low();
+      if (UNIV_LIKELY(!(n & (STOPPING | CLOSING))))
+        break;
+      if (!(n & STOPPING) && space->prepare_acquired())
+        break;
+    }
+  }
+
+  mysql_mutex_unlock(&fil_system.mutex);
+  return space;
+}
+
+/** Search for a space needing rotation
+@param[in,out]	key_state	Key state
+@param[in,out]	state		Rotation state
+@param[in,out]	recheck		whether the tablespace needs to be
+				rechecked later (an encryption thread
+				is still writing page 0)
+@return whether the thread should keep running */
+static bool fil_crypt_find_space_to_rotate(
+	key_state_t*		key_state,
+	rotate_thread_t*	state,
+	bool*			recheck)
+{
+	/* we need iops to start rotating */
+	do {
+		if (state->should_shutdown()) {
+			if (state->space != fil_system.space_list.end()) {
+				state->space->release();
+				state->space = fil_system.space_list.end();
+			}
+			return false;
+		}
+	} while (!fil_crypt_alloc_iops(state));
+
+	if (state->first) {
+		state->first = false;
+		if (state->space != fil_system.space_list.end()) {
+			state->space->release();
+		}
+		state->space = fil_system.space_list.end();
+	}
+
+	state->space = fil_space_t::next(state->space, *recheck,
+					 key_state->key_version != 0);
+
+	bool wake = true;
+	while (state->space != fil_system.space_list.end()) {
+		if (state->space
+		    == space_list_t::iterator(fil_system.temp_space)) {
+			wake = false;
+			goto done;
+		}
+
+		if (state->should_shutdown()) {
+			state->space->release();
+done:
+			state->space = fil_system.space_list.end();
+			break;
+		}
+
+		mysql_mutex_unlock(&fil_crypt_threads_mutex);
+		/* If there is no crypt data and we have not yet read
+		page 0 for this tablespace, we need to read it before
+		we can continue.
*/ + if (!state->space->crypt_data) { + fil_crypt_read_crypt_data(&*state->space); + } + + if (fil_crypt_space_needs_rotation(state, key_state, recheck)) { + ut_ad(key_state->key_id); + /* init state->min_key_version_found before + * starting on a space */ + state->min_key_version_found = key_state->key_version; + mysql_mutex_lock(&fil_crypt_threads_mutex); + return true; + } + + state->space = fil_space_t::next(state->space, *recheck, + key_state->key_version != 0); + mysql_mutex_lock(&fil_crypt_threads_mutex); + } + + /* no work to do; release our allocation of I/O capacity */ + fil_crypt_return_iops(state, wake); + return true; +} + +/*********************************************************************** +Start rotating a space +@param[in] key_state Key state +@param[in,out] state Rotation state */ +static +void +fil_crypt_start_rotate_space( + const key_state_t* key_state, + rotate_thread_t* state) +{ + fil_space_crypt_t *crypt_data = state->space->crypt_data; + + ut_ad(crypt_data); + mysql_mutex_lock(&crypt_data->mutex); + ut_ad(key_state->key_id == crypt_data->key_id); + + if (crypt_data->rotate_state.active_threads == 0) { + /* only first thread needs to init */ + crypt_data->rotate_state.next_offset = 1; // skip page 0 + /* no need to rotate beyond current max + * if space extends, it will be encrypted with newer version */ + /* FIXME: max_offset could be removed and instead + space->size consulted.*/ + crypt_data->rotate_state.max_offset = state->space->size; + crypt_data->rotate_state.end_lsn = 0; + crypt_data->rotate_state.min_key_version_found = + key_state->key_version; + + crypt_data->rotate_state.start_time = time(0); + + if (crypt_data->type == CRYPT_SCHEME_UNENCRYPTED && + crypt_data->is_encrypted() && + key_state->key_version != 0) { + /* this is rotation unencrypted => encrypted */ + crypt_data->type = CRYPT_SCHEME_1; + } + } + + /* count active threads in space */ + crypt_data->rotate_state.active_threads++; + + /* Initialize thread local state */ + state->end_lsn = crypt_data->rotate_state.end_lsn; + state->min_key_version_found = + crypt_data->rotate_state.min_key_version_found; + + mysql_mutex_unlock(&crypt_data->mutex); +} + +/*********************************************************************** +Search for batch of pages needing rotation +@param[in] key_state Key state +@param[in,out] state Rotation state +@return true if page needing key rotation found, false if not found */ +static +bool +fil_crypt_find_page_to_rotate( + const key_state_t* key_state, + rotate_thread_t* state) +{ + ulint batch = srv_alloc_time * state->allocated_iops; + + ut_ad(state->space == fil_system.space_list.end() + || state->space->referenced()); + + /* If space is marked to be dropped stop rotation. 
*/
+	if (state->space == fil_system.space_list.end()
+	    || state->space->is_stopping()) {
+		return false;
+	}
+
+	fil_space_crypt_t *crypt_data = state->space->crypt_data;
+
+	mysql_mutex_lock(&crypt_data->mutex);
+	ut_ad(key_state->key_id == crypt_data->key_id);
+
+	bool found = crypt_data->rotate_state.max_offset >=
+		crypt_data->rotate_state.next_offset;
+
+	if (found) {
+		state->offset = crypt_data->rotate_state.next_offset;
+		ulint remaining = crypt_data->rotate_state.max_offset -
+			crypt_data->rotate_state.next_offset;
+
+		if (batch <= remaining) {
+			state->batch = batch;
+		} else {
+			state->batch = remaining;
+		}
+	}
+
+	crypt_data->rotate_state.next_offset += uint32_t(batch);
+	mysql_mutex_unlock(&crypt_data->mutex);
+	return found;
+}
+
+/***********************************************************************
+Get a page and compute sleep time
+@param[in,out]	state		Rotation state
+@param[in]	offset		Page offset
+@param[in,out]	mtr		Mini-transaction
+@param[out]	sleeptime_ms	Sleep time
+@return page or NULL */
+static
+buf_block_t*
+fil_crypt_get_page_throttle(
+	rotate_thread_t*	state,
+	uint32_t		offset,
+	mtr_t*			mtr,
+	ulint*			sleeptime_ms)
+{
+	fil_space_t* space = &*state->space;
+	const ulint zip_size = space->zip_size();
+	const page_id_t page_id(space->id, offset);
+	ut_ad(space->referenced());
+
+	/* Before reading from tablespace we need to make sure that
+	the tablespace is not about to be dropped. */
+	if (space->is_stopping()) {
+		return NULL;
+	}
+
+	buf_block_t* block = buf_page_get_gen(page_id, zip_size, RW_X_LATCH,
+					      NULL,
+					      BUF_PEEK_IF_IN_POOL, mtr);
+	if (block != NULL) {
+		/* page was in buffer pool */
+		state->crypt_stat.pages_read_from_cache++;
+		return block;
+	}
+
+	if (space->is_stopping()) {
+		return NULL;
+	}
+
+	if (offset % (zip_size ? zip_size : srv_page_size)
+	    && DB_SUCCESS_LOCKED_REC
+	    != fseg_page_is_allocated(space, offset)) {
+		/* page is already freed */
+		return NULL;
+	}
+
+	state->crypt_stat.pages_read_from_disk++;
+
+	const ulonglong start = my_interval_timer();
+	block = buf_page_get_gen(page_id, zip_size,
+				 RW_X_LATCH,
+				 NULL, BUF_GET_POSSIBLY_FREED, mtr);
+	const ulonglong end = my_interval_timer();
+
+	state->cnt_waited++;
+
+	if (end > start) {
+		state->sum_waited_us += (end - start) / 1000;
+	}
+
+	/* average page load */
+	ulint add_sleeptime_ms = 0;
+	ulint avg_wait_time_us = ulint(state->sum_waited_us
+				       / state->cnt_waited);
+	ulint alloc_wait_us = 1000000 / state->allocated_iops;
+
+	if (avg_wait_time_us < alloc_wait_us) {
+		/* we are reading faster than the allocated rate */
+		add_sleeptime_ms = (alloc_wait_us - avg_wait_time_us) / 1000;
+	} else {
+		/* if page load time is longer than we want, skip sleeping */
+	}
+
+	*sleeptime_ms += add_sleeptime_ms;
+
+	return block;
+}
+
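+/* Editorial note (not upstream): the throttle above targets
+state->allocated_iops page reads per second. For example, with
+allocated_iops=100 each read has a budget of alloc_wait_us =
+1000000/100 = 10000us; whenever the running average read time stays
+below that budget, the difference is accumulated into sleeptime_ms and
+slept off at the end of fil_crypt_rotate_page() below. */
+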
+/***********************************************************************
+Rotate one page
+@param[in,out]	key_state	Key state
+@param[in,out]	state		Rotation state */
+static
+void
+fil_crypt_rotate_page(
+	const key_state_t*	key_state,
+	rotate_thread_t*	state)
+{
+	fil_space_t *space = &*state->space;
+	ulint space_id = space->id;
+	uint32_t offset = state->offset;
+	ulint sleeptime_ms = 0;
+	fil_space_crypt_t *crypt_data = space->crypt_data;
+
+	ut_ad(space->referenced());
+	ut_ad(offset > 0);
+
+	/* In fil_crypt_thread where key rotation is done we have
+	acquired space and checked that this space is not yet
+	marked to be dropped. Similarly, in fil_crypt_find_page_to_rotate().
+	Check here also to give DROP TABLE or similar a chance. */
+	if (space->is_stopping()) {
+		return;
+	}
+
+	if (space_id == TRX_SYS_SPACE && offset == TRX_SYS_PAGE_NO) {
+		/* don't encrypt this as it contains address to dblwr buffer */
+		return;
+	}
+
+	mtr_t mtr;
+	mtr.start();
+	if (buf_block_t* block = fil_crypt_get_page_throttle(state,
+							     offset, &mtr,
+							     &sleeptime_ms)) {
+		bool modified = false;
+		byte* frame = buf_block_get_frame(block);
+		const lsn_t block_lsn = mach_read_from_8(FIL_PAGE_LSN + frame);
+		uint kv = buf_page_get_key_version(frame, space->flags);
+
+		if (block->page.oldest_modification() > 1) {
+			/* Do not unnecessarily touch pages that are
+			already dirty. */
+		} else if (space->is_stopping()) {
+			/* The tablespace is closing (in DROP TABLE or
+			TRUNCATE TABLE or similar): avoid further access */
+		} else if (!kv && !*reinterpret_cast<uint16_t*>
+			   (&frame[FIL_PAGE_TYPE])) {
+			/* It looks like this page is not
+			allocated. Because key rotation is accessing
+			pages in a pattern that is unlike the normal
+			B-tree and undo log access pattern, we cannot
+			invoke fseg_page_is_allocated() here, because that
+			could result in a deadlock. If we invoked
+			fseg_page_is_allocated() and released the
+			tablespace latch before acquiring block->lock,
+			then the fseg_page_is_allocated() information
+			could be stale already. */
+
+			/* If the data file was originally created
+			before MariaDB 10.0 or MySQL 5.6, some
+			allocated data pages could carry 0 in
+			FIL_PAGE_TYPE. The FIL_PAGE_TYPE on those
+			pages will be updated in
+			buf_flush_init_for_writing() when the page
+			is modified the next time.
+
+			Also, when the doublewrite buffer pages are
+			allocated on bootstrap in a non-debug build,
+			some dummy pages will be allocated, with 0 in
+			the FIL_PAGE_TYPE. Those pages should be
+			skipped from key rotation forever. */
+		} else if (fil_crypt_needs_rotation(
+				   crypt_data,
+				   kv,
+				   key_state->key_version,
+				   key_state->rotate_key_age)) {
+
+			mtr.set_named_space(space);
+			modified = true;
+
+			/* force rotation by dummy updating page */
+			mtr.write<1,mtr_t::FORCED>(*block,
+						   &frame[FIL_PAGE_SPACE_ID],
+						   frame[FIL_PAGE_SPACE_ID]);
+
+			/* statistics */
+			state->crypt_stat.pages_modified++;
+		} else {
+			if (crypt_data->is_encrypted()) {
+				if (kv < state->min_key_version_found) {
+					state->min_key_version_found = kv;
+				}
+			}
+		}
+
+		mtr.commit();
+		lsn_t end_lsn = mtr.commit_lsn();
+
+		if (modified) {
+			/* if we modified page, we take lsn from mtr */
+			ut_a(end_lsn > state->end_lsn);
+			ut_a(end_lsn > block_lsn);
+			state->end_lsn = end_lsn;
+		} else {
+			/* if we did not modify page, check for max lsn */
+			if (block_lsn > state->end_lsn) {
+				state->end_lsn = block_lsn;
+			}
+		}
+	} else {
+		/* If block read failed mtr memo and log should be empty.
*/ + ut_ad(!mtr.has_modifications()); + ut_ad(mtr.is_empty()); + mtr.commit(); + } + + if (sleeptime_ms) { + mysql_mutex_lock(&fil_crypt_threads_mutex); + timespec abstime; + set_timespec_nsec(abstime, 1000000ULL * sleeptime_ms); + my_cond_timedwait(&fil_crypt_throttle_sleep_cond, + &fil_crypt_threads_mutex.m_mutex, &abstime); + mysql_mutex_unlock(&fil_crypt_threads_mutex); + } +} + +/*********************************************************************** +Rotate a batch of pages +@param[in,out] key_state Key state +@param[in,out] state Rotation state */ +static +void +fil_crypt_rotate_pages( + const key_state_t* key_state, + rotate_thread_t* state) +{ + const uint32_t space_id = state->space->id; + uint32_t end = std::min(state->offset + uint32_t(state->batch), + state->space->free_limit); + + ut_ad(state->space->referenced()); + + for (; state->offset < end; state->offset++) { + + /* we can't rotate pages in dblwr buffer as + * it's not possible to read those due to lots of asserts + * in buffer pool. + * + * However since these are only (short-lived) copies of + * real pages, they will be updated anyway when the + * real page is updated + */ + if (buf_dblwr.is_inside(page_id_t(space_id, state->offset))) { + continue; + } + + /* If space is marked as stopping, stop rotating + pages. */ + if (state->space->is_stopping()) { + break; + } + + fil_crypt_rotate_page(key_state, state); + } +} + +/*********************************************************************** +Flush rotated pages and then update page 0 + +@param[in,out] state rotation state */ +static +void +fil_crypt_flush_space( + rotate_thread_t* state) +{ + fil_space_t* space = &*state->space; + fil_space_crypt_t *crypt_data = space->crypt_data; + + ut_ad(space->referenced()); + + /* flush tablespace pages so that there are no pages left with old key */ + lsn_t end_lsn = crypt_data->rotate_state.end_lsn; + + if (end_lsn > 0 && !space->is_stopping()) { + ulint sum_pages = 0; + const ulonglong start = my_interval_timer(); + while (buf_flush_list_space(space, &sum_pages)); + if (sum_pages) { + const ulonglong end = my_interval_timer(); + + state->cnt_waited += sum_pages; + state->sum_waited_us += (end - start) / 1000; + + /* statistics */ + state->crypt_stat.pages_flushed += sum_pages; + } + } + + if (crypt_data->min_key_version == 0) { + crypt_data->type = CRYPT_SCHEME_UNENCRYPTED; + } + + if (space->is_stopping()) { + return; + } + + /* update page 0 */ + mtr_t mtr; + mtr.start(); + + if (buf_block_t* block = buf_page_get_gen( + page_id_t(space->id, 0), space->zip_size(), + RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, &mtr)) { + mtr.set_named_space(space); + crypt_data->write_page0(block, &mtr); + } + + mtr.commit(); +} + +/*********************************************************************** +Complete rotating a space +@param[in,out] state Rotation state */ +static void fil_crypt_complete_rotate_space(rotate_thread_t* state) +{ + fil_space_crypt_t *crypt_data = state->space->crypt_data; + + ut_ad(crypt_data); + ut_ad(state->space->referenced()); + + mysql_mutex_lock(&crypt_data->mutex); + + /* Space might already be dropped */ + if (!state->space->is_stopping()) { + /** + * Update crypt data state with state from thread + */ + if (state->min_key_version_found < + crypt_data->rotate_state.min_key_version_found) { + crypt_data->rotate_state.min_key_version_found = + state->min_key_version_found; + } + + if (state->end_lsn > crypt_data->rotate_state.end_lsn) { + crypt_data->rotate_state.end_lsn = state->end_lsn; + } + + 
ut_a(crypt_data->rotate_state.active_threads > 0);
+		crypt_data->rotate_state.active_threads--;
+		bool last = crypt_data->rotate_state.active_threads == 0;
+
+		/**
+		* check if space is fully done
+		* this matters when threads shut down, as we could
+		* "complete" iterating before we have scanned the
+		* full space.
+		*/
+		bool done = crypt_data->rotate_state.next_offset >=
+			crypt_data->rotate_state.max_offset;
+
+		/**
+		* we should flush space if we're last thread AND
+		* the iteration is done
+		*/
+		bool should_flush = last && done;
+
+		if (should_flush) {
+			/* we're the last active thread */
+			crypt_data->rotate_state.flushing = true;
+			crypt_data->min_key_version =
+				crypt_data->rotate_state.min_key_version_found;
+			mysql_mutex_unlock(&crypt_data->mutex);
+			fil_crypt_flush_space(state);
+
+			mysql_mutex_lock(&crypt_data->mutex);
+			crypt_data->rotate_state.flushing = false;
+		}
+	} else {
+		ut_a(crypt_data->rotate_state.active_threads > 0);
+		crypt_data->rotate_state.active_threads--;
+	}
+
+	mysql_mutex_unlock(&crypt_data->mutex);
+}
+
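+/* Editorial summary (not upstream): each encryption thread runs the
+loop below: wait on fil_crypt_threads_cond for a key-state change, then
+repeatedly pick a tablespace (fil_crypt_find_space_to_rotate), claim a
+batch of pages (fil_crypt_find_page_to_rotate), rotate them, and
+finally flush the space and update page 0 via
+fil_crypt_complete_rotate_space(). */
+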
+ */
+ if (thr.space->is_stopping()) {
+ fil_crypt_complete_rotate_space(&thr);
+ thr.space->release();
+ thr.space = fil_system.space_list.end();
+ break;
+ }
+
+ fil_crypt_rotate_pages(&new_state, &thr);
+ /* realloc iops */
+ if (!fil_crypt_realloc_iops(&thr)) {
+ break;
+ }
+ }
+
+ /* complete rotation */
+ if (thr.space != fil_system.space_list.end()) {
+ fil_crypt_complete_rotate_space(&thr);
+ }
+
+ /* force key state refresh */
+ new_state.key_id = 0;
+
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ /* release iops */
+ fil_crypt_return_iops(&thr);
+ }
+
+ if (thr.space != fil_system.space_list.end()) {
+ thr.space->release();
+ thr.space = fil_system.space_list.end();
+ }
+ }
+
+ fil_crypt_return_iops(&thr);
+ srv_n_fil_crypt_threads_started--;
+ pthread_cond_signal(&fil_crypt_cond); /* signal that we stopped */
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_delete_thread();
+#endif
+}
+
+/*********************************************************************
+Adjust thread count for key rotation
+@param[in] new_cnt Number of threads to be used */
+void fil_crypt_set_thread_cnt(const uint new_cnt)
+{
+ if (!fil_crypt_threads_inited) {
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE)
+ return;
+ fil_crypt_threads_init();
+ }
+
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+
+ if (new_cnt > srv_n_fil_crypt_threads) {
+ uint add = new_cnt - srv_n_fil_crypt_threads;
+ srv_n_fil_crypt_threads = new_cnt;
+ for (uint i = 0; i < add; i++) {
+ std::thread thd(fil_crypt_thread);
+ ib::info() << "Creating #"
+ << i+1 << " encryption thread id "
+ << thd.get_id()
+ << " total threads " << new_cnt << ".";
+ thd.detach();
+ }
+ } else if (new_cnt < srv_n_fil_crypt_threads) {
+ srv_n_fil_crypt_threads = new_cnt;
+ }
+
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+
+ while (srv_n_fil_crypt_threads_started != srv_n_fil_crypt_threads) {
+ my_cond_wait(&fil_crypt_cond,
+ &fil_crypt_threads_mutex.m_mutex);
+ }
+
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/** Initialize the tablespace default_encrypt_tables
+if innodb_encryption_rotate_key_age=0. */
+static void fil_crypt_default_encrypt_tables_fill()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+
+ for (fil_space_t& space : fil_system.space_list) {
+ if (space.purpose != FIL_TYPE_TABLESPACE
+ || space.is_in_default_encrypt
+ || UT_LIST_GET_LEN(space.chain) == 0
+ || !space.acquire_if_not_stopped()) {
+ continue;
+ }
+
+ /* Ensure that crypt_data has been initialized. */
+ ut_ad(space.size);
+
+ /* Skip ENCRYPTION!=DEFAULT tablespaces. */
+ if (space.crypt_data
+ && !space.crypt_data->is_default_encryption()) {
+ goto next;
+ }
+
+ if (srv_encrypt_tables) {
+ /* Skip encrypted tablespaces if
+ innodb_encrypt_tables!=OFF */
+ if (space.crypt_data
+ && space.crypt_data->min_key_version) {
+ goto next;
+ }
+ } else {
+ /* Skip unencrypted tablespaces if
+ innodb_encrypt_tables=OFF */
+ if (!space.crypt_data
+ || !space.crypt_data->min_key_version) {
+ goto next;
+ }
+ }
+
+ fil_system.default_encrypt_tables.push_back(space);
+ space.is_in_default_encrypt = true;
+next:
+ space.release();
+ }
+}
+
+/*********************************************************************
+Adjust max key age
+@param[in] val New max key age */
+void fil_crypt_set_rotate_key_age(uint val)
+{
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ mysql_mutex_lock(&fil_system.mutex);
+ srv_fil_crypt_rotate_key_age= val;
+ if (val == 0)
+ fil_crypt_default_encrypt_tables_fill();
+ mysql_mutex_unlock(&fil_system.mutex);
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/*********************************************************************
+Adjust rotation iops
+@param[in] val New max rotation iops */
+void fil_crypt_set_rotation_iops(uint val)
+{
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ srv_n_fil_crypt_iops= val;
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/*********************************************************************
+Adjust encrypt tables
+@param[in] val New setting for innodb-encrypt-tables */
+void fil_crypt_set_encrypt_tables(ulong val)
+{
+ if (!fil_crypt_threads_inited)
+ return;
+
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+
+ mysql_mutex_lock(&fil_system.mutex);
+ srv_encrypt_tables= val;
+
+ if (fil_crypt_must_default_encrypt())
+ fil_crypt_default_encrypt_tables_fill();
+
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/*********************************************************************
+Init threads for key rotation */
+void fil_crypt_threads_init()
+{
+ if (!fil_crypt_threads_inited) {
+ pthread_cond_init(&fil_crypt_cond, nullptr);
+ pthread_cond_init(&fil_crypt_threads_cond, nullptr);
+ mysql_mutex_init(0, &fil_crypt_threads_mutex, nullptr);
+ uint cnt = srv_n_fil_crypt_threads;
+ srv_n_fil_crypt_threads = 0;
+ fil_crypt_threads_inited = true;
+ fil_crypt_set_thread_cnt(cnt);
+ }
+}
+
+/*********************************************************************
+Clean up key rotation threads resources */
+void fil_crypt_threads_cleanup()
+{
+ if (!fil_crypt_threads_inited) {
+ return;
+ }
+ ut_a(!srv_n_fil_crypt_threads_started);
+ pthread_cond_destroy(&fil_crypt_cond);
+ pthread_cond_destroy(&fil_crypt_threads_cond);
+ mysql_mutex_destroy(&fil_crypt_threads_mutex);
+ fil_crypt_threads_inited = false;
+}
+
+/*********************************************************************
+Wait for crypt threads to stop accessing space
+@param[in] space Tablespace */
+void fil_space_crypt_close_tablespace(const fil_space_t *space)
+{
+ fil_space_crypt_t* crypt_data = space->crypt_data;
+
+ if (!crypt_data || srv_n_fil_crypt_threads == 0
+ || !fil_crypt_threads_inited) {
+ return;
+ }
+
+ time_t start = time(0);
+ time_t last = start;
+
+ mysql_mutex_lock(&crypt_data->mutex);
+
+ while (crypt_data->rotate_state.active_threads
+ || crypt_data->rotate_state.flushing) {
+ mysql_mutex_unlock(&crypt_data->mutex);
+
+ /* wake up all throttle sleepers */
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ pthread_cond_broadcast(&fil_crypt_throttle_sleep_cond);
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+
+ time_t now = time(0);
+
+ if (UNIV_UNLIKELY(now >= last + 30)) {
+ ib::warn() << "Waited "
+ << now - start
+ << " seconds to drop space: "
+ << space->chain.start->name << " ("
+ << space->id << ") active threads "
+ << crypt_data->rotate_state.active_threads
+ << " flushing="
+ << crypt_data->rotate_state.flushing << ".";
+ last = now;
+ }
+
+ mysql_mutex_lock(&crypt_data->mutex);
+ }
+
+ mysql_mutex_unlock(&crypt_data->mutex);
+}
+
+/*********************************************************************
+Get crypt status for a space (used by information_schema)
+@param[in] space Tablespace
+@param[out] status Crypt status */
+void
+fil_space_crypt_get_status(
+ const fil_space_t* space,
+ struct fil_space_crypt_status_t* status)
+{
+ memset(status, 0, sizeof(*status));
+
+ ut_ad(space->referenced());
+
+ /* If there is no crypt data and we have not yet read
+ page 0 for this tablespace, we need to read it before
+ we can continue. */
+ if (!space->crypt_data) {
+ fil_crypt_read_crypt_data(const_cast<fil_space_t*>(space));
+ }
+
+ status->space = ULINT_UNDEFINED;
+
+ if (fil_space_crypt_t* crypt_data = space->crypt_data) {
+ status->space = space->id;
+ mysql_mutex_lock(&crypt_data->mutex);
+ status->scheme = crypt_data->type;
+ status->keyserver_requests = crypt_data->keyserver_requests;
+ status->min_key_version = crypt_data->min_key_version;
+ status->key_id = crypt_data->key_id;
+
+ if (crypt_data->rotate_state.active_threads > 0 ||
+ crypt_data->rotate_state.flushing) {
+ status->rotating = true;
+ status->flushing =
+ crypt_data->rotate_state.flushing;
+ status->rotate_next_page_number =
+ crypt_data->rotate_state.next_offset;
+ status->rotate_max_page_number =
+ crypt_data->rotate_state.max_offset;
+ }
+
+ mysql_mutex_unlock(&crypt_data->mutex);
+
+ if (srv_encrypt_tables || crypt_data->min_key_version) {
+ status->current_key_version =
+ fil_crypt_get_latest_key_version(crypt_data);
+ }
+ }
+}
+
+/*********************************************************************
+Return crypt statistics
+@param[out] stat Crypt statistics */
+void fil_crypt_total_stat(fil_crypt_stat_t *stat)
+{
+ mysql_mutex_lock(&crypt_stat_mutex);
+ *stat = crypt_stat;
+ mysql_mutex_unlock(&crypt_stat_mutex);
+}
+
+#endif /* UNIV_INNOCHECKSUM */
+
+/**
+Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if the tablespace contains crypt_data
+metadata (this is a strong indication that the tablespace is encrypted).
+The function also verifies that the traditional checksum does not match
+the calculated checksum, because if it does, the page could be valid
+unencrypted, encrypted, or corrupted.
+
+@param[in,out] page page frame (checksum is temporarily modified)
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return true if page is encrypted AND OK, false otherwise */
+bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size)
+{
+ if (ENCRYPTION_KEY_NOT_ENCRYPTED == mach_read_from_4(
+ page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)) {
+ return false;
+ }
+
+ /* Compressed and encrypted pages do not have a checksum. Assume not
+ corrupted. Page verification happens after decompression in
+ buf_page_t::read_complete() using buf_page_is_corrupted(). */
+ if (fil_page_get_type(page) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ return true;
+ }
+
+ /* Read the stored post-encryption checksum. */
+ const ib_uint32_t checksum = mach_read_from_4(
+ page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4);
+
+ /* If the stored checksum matches one of the calculated checksums,
+ the page is not corrupted. */
+
+#ifndef UNIV_INNOCHECKSUM
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+#endif /* !UNIV_INNOCHECKSUM */
+ if (zip_size) {
+ return checksum == page_zip_calc_checksum(
+ page, zip_size, false);
+ }
+
+ return checksum == buf_calc_page_crc32(page);
+#ifndef UNIV_INNOCHECKSUM
+ default:
+ if (checksum == BUF_NO_CHECKSUM_MAGIC) {
+ return true;
+ }
+ if (zip_size) {
+ return checksum == page_zip_calc_checksum(
+ page, zip_size, false)
+ || checksum == page_zip_calc_checksum(
+ page, zip_size, true);
+ }
+
+ return checksum == buf_calc_page_crc32(page)
+ || checksum == buf_calc_page_new_checksum(page);
+ }
+#endif /* !UNIV_INNOCHECKSUM */
+}
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
new file mode 100644
index 00000000..8a88f4e2
--- /dev/null
+++ b/storage/innobase/fil/fil0fil.cc
@@ -0,0 +1,3282 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2021, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fil/fil0fil.cc
+The tablespace memory cache
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#include "fil0fil.h"
+#include "fil0crypt.h"
+
+#include "btr0btr.h"
+#include "buf0buf.h"
+#include "dict0boot.h"
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "fsp0file.h"
+#include "fsp0fsp.h"
+#include "hash0hash.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "log.h"
+#ifdef __linux__
+# include <dirent.h>
+# include <sys/stat.h>
+# include <sys/sysmacros.h>
+#endif
+
+#include "lz4.h"
+#include "lzo/lzo1x.h"
+#include "lzma.h"
+#include "bzlib.h"
+#include "snappy-c.h"
+
+ATTRIBUTE_COLD void fil_space_t::set_corrupted() const
+{
+ if (!is_stopping() && !is_corrupted.test_and_set())
+ sql_print_error("InnoDB: File '%s' is corrupted", chain.start->name);
+}
+
+/** Try to close a file to adhere to the innodb_open_files limit.
+@param print_info whether to diagnose why a file cannot be closed +@return whether a file was closed */ +bool fil_space_t::try_to_close(bool print_info) +{ + mysql_mutex_assert_owner(&fil_system.mutex); + for (fil_space_t &space : fil_system.space_list) + { + switch (space.purpose) { + case FIL_TYPE_TEMPORARY: + continue; + case FIL_TYPE_IMPORT: + break; + case FIL_TYPE_TABLESPACE: + if (is_predefined_tablespace(space.id)) + continue; + } + + /* We are using an approximation of LRU replacement policy. In + fil_node_open_file_low(), newly opened files are moved to the end + of fil_system.space_list, so that they would be less likely to be + closed here. */ + fil_node_t *node= UT_LIST_GET_FIRST(space.chain); + if (!node) + /* fil_ibd_create() did not invoke fil_space_t::add() yet */ + continue; + ut_ad(!UT_LIST_GET_NEXT(chain, node)); + + if (!node->is_open()) + continue; + + const auto n= space.set_closing(); + if (n & STOPPING) + /* Let fil_space_t::drop() in another thread handle this. */ + continue; + if (n & (PENDING | NEEDS_FSYNC)) + { + if (!print_info) + continue; + print_info= false; + const time_t now= time(nullptr); + if (now - fil_system.n_open_exceeded_time < 5) + continue; /* We display messages at most once in 5 seconds. */ + fil_system.n_open_exceeded_time= now; + + if (n & PENDING) + sql_print_information("InnoDB: Cannot close file %s because of " + UINT32PF " pending operations%s", node->name, + n & PENDING, + (n & NEEDS_FSYNC) ? " and pending fsync" : ""); + else if (n & NEEDS_FSYNC) + sql_print_information("InnoDB: Cannot close file %s because of " + "pending fsync", node->name); + continue; + } + + node->close(); + + fil_system.move_closed_last_to_space_list(node->space); + + return true; + } + + return false; +} + +/* + IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE + ============================================= + +The tablespace cache is responsible for providing fast read/write access to +tablespaces and logs of the database. File creation and deletion is done +in other modules which know more of the logic of the operation, however. + +A tablespace consists of a chain of files. The size of the files does not +have to be divisible by the database block size, because we may just leave +the last incomplete block unused. When a new file is appended to the +tablespace, the maximum size of the file is also specified. At the moment, +we think that it is best to extend the file to its maximum size already at +the creation of the file, because then we can avoid dynamically extending +the file when more space is needed for the tablespace. + +A block's position in the tablespace is specified with a 32-bit unsigned +integer. The files in the chain are thought to be catenated, and the block +corresponding to an address n is the nth block in the catenated file (where +the first block is named the 0th block, and the incomplete block fragments +at the end of files are not taken into account). A tablespace can be extended +by appending a new file at the end of the chain. + +Our tablespace concept is similar to the one of Oracle. + +To acquire more speed in disk transfers, a technique called disk striping is +sometimes used. This means that logical block addresses are divided in a +round-robin fashion across several disks. Windows NT supports disk striping, +so there we do not need to support it in the database. Disk striping is +implemented in hardware in RAID disks. We conclude that it is not necessary +to implement it in the database. 
Oracle 7 does not support disk striping,
+either.
+
+Another trick used at some database sites is replacing tablespace files by
+raw disks, that is, the whole physical disk drive, or a partition of it, is
+opened as a single file, and it is accessed through byte offsets calculated
+from the start of the disk or the partition. This is recommended in some
+books on database tuning to achieve more speed in i/o. Using a raw disk
+certainly prevents the OS from fragmenting disk space, but it is not clear
+if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
+system + EIDE Conner disk only a negligible difference in speed when reading
+from a file, versus reading from a raw disk.
+
+To have fast access to a tablespace or a log file, we put the data structures
+into a hash table. Each tablespace and log file is given a unique 32-bit
+identifier. */
+
+/** Reference to the server data directory. Usually it is the
+current working directory ".", but in the MariaDB Embedded Server Library
+it is an absolute path. */
+const char* fil_path_to_mysql_datadir;
+
+/** Common InnoDB file extensions */
+const char* dot_ext[] = { "", ".ibd", ".isl", ".cfg" };
+
+/** Number of pending tablespace flushes */
+Atomic_counter<ulint> fil_n_pending_tablespace_flushes;
+
+/** The tablespace memory cache. This variable is NULL before the module is
+initialized. */
+fil_system_t fil_system;
+
+/** At this age or older a space/page will be rotated */
+extern uint srv_fil_crypt_rotate_key_age;
+
+#ifdef UNIV_DEBUG
+/** Try fil_validate() every this many times */
+# define FIL_VALIDATE_SKIP 17
+
+/******************************************************************//**
+Checks the consistency of the tablespace cache some of the time.
+@return true if ok or the check was skipped */
+static
+bool
+fil_validate_skip(void)
+/*===================*/
+{
+ /** The fil_validate() call skip counter. */
+ static Atomic_counter<uint32_t> fil_validate_count;
+
+ /* We want to reduce the call frequency of the costly fil_validate()
+ check in debug builds. */
+ return (fil_validate_count++ % FIL_VALIDATE_SKIP) || fil_validate();
+}
+#endif /* UNIV_DEBUG */
+
+/** Look up a tablespace.
+@param id tablespace identifier
+@return tablespace
+@retval nullptr if not found */
+fil_space_t *fil_space_get_by_id(uint32_t id)
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system.is_initialised());
+ mysql_mutex_assert_owner(&fil_system.mutex);
+
+ HASH_SEARCH(hash, &fil_system.spaces, id,
+ fil_space_t*, space,, space->id == id);
+
+ return(space);
+}
+
+/** Look up a tablespace.
+The caller should hold an InnoDB table lock or an MDL that prevents
+the tablespace from being dropped during the operation,
+or the caller should be in single-threaded crash recovery mode
+(no user connections that could drop tablespaces).
+Normally, fil_space_t::get() should be used instead.
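+Unlike fil_space_t::get(), this function does not acquire a reference
+on the tablespace; it only performs the lookup while briefly holding
+fil_system.mutex.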
+@param[in] id tablespace ID
+@return tablespace, or NULL if not found */
+fil_space_t *fil_space_get(uint32_t id)
+{
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_space_t *space= fil_space_get_by_id(id);
+ mysql_mutex_unlock(&fil_system.mutex);
+ return space;
+}
+
+/** Check if the compression algorithm is loaded
+@param[in] comp_algo compression algorithm
+@return whether the compression algorithm is loaded */
+bool fil_comp_algo_loaded(ulint comp_algo)
+{
+ switch (comp_algo) {
+ case PAGE_UNCOMPRESSED:
+ case PAGE_ZLIB_ALGORITHM:
+ return true;
+
+ case PAGE_LZ4_ALGORITHM:
+ return provider_service_lz4->is_loaded;
+
+ case PAGE_LZO_ALGORITHM:
+ return provider_service_lzo->is_loaded;
+
+ case PAGE_LZMA_ALGORITHM:
+ return provider_service_lzma->is_loaded;
+
+ case PAGE_BZIP2_ALGORITHM:
+ return provider_service_bzip2->is_loaded;
+
+ case PAGE_SNAPPY_ALGORITHM:
+ return provider_service_snappy->is_loaded;
+ }
+
+ return false;
+}
+
+/** Append a file to the chain of files of a space.
+@param[in] name file name of a file that is not open
+@param[in] handle file handle, or OS_FILE_CLOSED
+@param[in] size file size in entire database pages
+@param[in] is_raw whether this is a raw device
+@param[in] atomic_write true if atomic write could be enabled
+@param[in] max_pages maximum number of pages in file,
+or UINT32_MAX for unlimited
+@return file object */
+fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle,
+ uint32_t size, bool is_raw, bool atomic_write,
+ uint32_t max_pages)
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+
+ fil_node_t* node;
+
+ ut_ad(name != NULL);
+ ut_ad(fil_system.is_initialised());
+
+ node = reinterpret_cast<fil_node_t*>(ut_zalloc_nokey(sizeof(*node)));
+
+ node->handle = handle;
+
+ node->name = mem_strdup(name);
+
+ ut_a(!is_raw || srv_start_raw_disk_in_use);
+
+ node->is_raw_disk = is_raw;
+
+ node->size = size;
+
+ node->init_size = size;
+ node->max_size = max_pages;
+
+ node->space = this;
+
+ node->atomic_write = atomic_write;
+
+ this->size += size;
+ UT_LIST_ADD_LAST(chain, node);
+ if (node->is_open()) {
+ clear_closing();
+ if (++fil_system.n_open >= srv_max_n_open_files) {
+ reacquire();
+ try_to_close(true);
+ release();
+ }
+ }
+
+ return node;
+}
+
+__attribute__((warn_unused_result, nonnull))
+/** Open a tablespace file.
+@param node data file
+@return whether the file was successfully opened */
+static bool fil_node_open_file_low(fil_node_t *node)
+{
+ ut_ad(!node->is_open());
+ ut_ad(node->space->is_closing());
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ulint type;
+ static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility");
+ switch (FSP_FLAGS_GET_ZIP_SSIZE(node->space->flags)) {
+ case 1:
+ case 2:
+ type= OS_DATA_FILE_NO_O_DIRECT;
+ break;
+ default:
+ type= OS_DATA_FILE;
+ }
+
+ for (;;)
+ {
+ bool success;
+ node->handle= os_file_create(innodb_data_file_key, node->name,
+ node->is_raw_disk
+ ? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT
+ : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_AIO, type,
+ srv_read_only_mode, &success);
+
+ if (success && node->is_open())
+ {
+#ifndef _WIN32
+ if (!node->space->id && !srv_read_only_mode && my_disable_locking &&
+ os_file_lock(node->handle, node->name))
+ {
+ os_file_close(node->handle);
+ node->handle= OS_FILE_CLOSED;
+ return false;
+ }
+#endif
+ break;
+ }
+
+ /* The following call prints an error message */
+ if (os_file_get_last_error(true) == EMFILE + 100 &&
+ fil_space_t::try_to_close(true))
+ continue;
+
+ ib::warn() << "Cannot open '" << node->name << "'.";
+ return false;
+ }
+
+ ulint comp_algo = node->space->get_compression_algo();
+ bool comp_algo_invalid = false;
+
+ if (node->size);
+ else if (!node->read_page0() ||
+ // validate the compression algorithm for the full_crc32 format
+ (node->space->full_crc32() &&
+ (comp_algo_invalid = !fil_comp_algo_loaded(comp_algo))))
+ {
+ if (comp_algo_invalid)
+ {
+ if (comp_algo <= PAGE_ALGORITHM_LAST)
+ ib::warn() << "'" << node->name << "' is compressed with "
+ << page_compression_algorithms[comp_algo]
+ << ", which is not currently loaded";
+ else
+ ib::warn() << "'" << node->name << "' is compressed with "
+ << "invalid algorithm: " << comp_algo;
+ }
+
+ os_file_close(node->handle);
+ node->handle= OS_FILE_CLOSED;
+ return false;
+ }
+
+ ut_ad(node->is_open());
+
+ fil_system.move_opened_last_to_space_list(node->space);
+
+ fil_system.n_open++;
+ return true;
+}
+
+/** Open a tablespace file.
+@param node data file
+@return whether the file was successfully opened */
+static bool fil_node_open_file(fil_node_t *node)
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(!node->is_open());
+ ut_ad(!is_predefined_tablespace(node->space->id) ||
+ srv_operation == SRV_OPERATION_BACKUP ||
+ srv_operation == SRV_OPERATION_RESTORE ||
+ srv_operation == SRV_OPERATION_RESTORE_DELTA);
+ ut_ad(node->space->purpose != FIL_TYPE_TEMPORARY);
+ ut_ad(node->space->referenced());
+
+ const auto old_time= fil_system.n_open_exceeded_time;
+
+ for (ulint count= 0; fil_system.n_open >= srv_max_n_open_files; count++)
+ {
+ if (fil_space_t::try_to_close(count > 1))
+ count= 0;
+ else if (count >= 2)
+ {
+ if (old_time != fil_system.n_open_exceeded_time)
+ sql_print_warning("InnoDB: innodb_open_files=" ULINTPF
+ " is exceeded (" ULINTPF " files stay open)",
+ srv_max_n_open_files, fil_system.n_open);
+ break;
+ }
+ else
+ {
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ /* Flush tablespaces so that we can close modified files. */
+ fil_flush_file_spaces();
+ mysql_mutex_lock(&fil_system.mutex);
+ if (node->is_open())
+ return true;
+ }
+ }
+
+ /* The node can be opened between releasing and acquiring fil_system.mutex
+ in the above code */
+ return node->is_open() || fil_node_open_file_low(node);
+}
+
+/** Close the file handle.
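+The caller must hold fil_system.mutex; prepare_to_close_or_detach()
+asserts this and decrements fil_system.n_open before the handle is
+closed.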
+*/
+void fil_node_t::close()
+{
+ prepare_to_close_or_detach();
+
+ /* printf("Closing file %s\n", name); */
+ int ret= os_file_close(handle);
+ ut_a(ret);
+ handle= OS_FILE_CLOSED;
+}
+
+pfs_os_file_t fil_node_t::detach()
+{
+ prepare_to_close_or_detach();
+
+ pfs_os_file_t result= handle;
+ handle= OS_FILE_CLOSED;
+ return result;
+}
+
+void fil_node_t::prepare_to_close_or_detach()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(space->is_ready_to_close() || srv_operation == SRV_OPERATION_BACKUP ||
+ srv_operation == SRV_OPERATION_RESTORE_DELTA);
+ ut_a(is_open());
+ ut_a(!being_extended);
+ ut_a(space->is_ready_to_close() || space->purpose == FIL_TYPE_TEMPORARY ||
+ srv_fast_shutdown == 2 || !srv_was_started);
+
+ ut_a(fil_system.n_open > 0);
+ fil_system.n_open--;
+}
+
+/** Flush any writes cached by the file system. */
+void fil_space_t::flush_low()
+{
+ mysql_mutex_assert_not_owner(&fil_system.mutex);
+
+ uint32_t n= 1;
+ while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ {
+ ut_ad(n & PENDING);
+ if (n & STOPPING_WRITES)
+ return;
+ if (n & NEEDS_FSYNC)
+ break;
+ }
+
+ fil_n_pending_tablespace_flushes++;
+ for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+ if (!node->is_open())
+ {
+ ut_ad(!is_in_unflushed_spaces);
+ continue;
+ }
+ IF_WIN(if (node->is_raw_disk) continue,);
+ os_file_flush(node->handle);
+ }
+
+ if (is_in_unflushed_spaces)
+ {
+ mysql_mutex_lock(&fil_system.mutex);
+ if (is_in_unflushed_spaces)
+ {
+ is_in_unflushed_spaces= false;
+ fil_system.unflushed_spaces.remove(*this);
+ }
+ mysql_mutex_unlock(&fil_system.mutex);
+ }
+
+ clear_flush();
+ fil_n_pending_tablespace_flushes--;
+}
+
+/** Try to extend a tablespace.
+@param[in,out] space tablespace to be extended
+@param[in,out] node last file of the tablespace
+@param[in] size desired size in number of pages
+@param[out] success whether the operation succeeded
+@return whether the operation should be retried */
+static ATTRIBUTE_COLD __attribute__((warn_unused_result, nonnull))
+bool
+fil_space_extend_must_retry(
+ fil_space_t* space,
+ fil_node_t* node,
+ uint32_t size,
+ bool* success)
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(UT_LIST_GET_LAST(space->chain) == node);
+ ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE);
+ ut_ad(node->space == space);
+ ut_ad(space->referenced() || space->is_being_truncated);
+
+ *success = space->size >= size;
+
+ if (*success) {
+ /* Space already big enough */
+ return(false);
+ }
+
+ if (node->being_extended) {
+ /* Another thread is currently extending the file. Wait
+ for it to finish. It would have been better to use an
+ event-driven mechanism, but the entire module is
+ peppered with polling code. */
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ return(true);
+ }
+
+ node->being_extended = true;
+
+ /* At this point it is safe to release fil_system.mutex. No
+ other thread can rename, delete, close or extend the file because
+ we have set the node->being_extended flag. */
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ ut_ad(size >= space->size);
+
+ uint32_t last_page_no = space->size;
+ const uint32_t file_start_page_no = last_page_no - node->size;
+
+ const unsigned page_size = space->physical_size();
+
+ /* Datafile::read_first_page() expects innodb_page_size bytes.
+ fil_node_t::read_page0() expects at least 4 * innodb_page_size bytes.
+ os_file_set_size() expects multiples of 4096 bytes. + For ROW_FORMAT=COMPRESSED tables using 1024-byte or 2048-byte + pages, we will preallocate up to an integer multiple of 4096 bytes, + and let normal writes append 1024, 2048, or 3072 bytes to the file. */ + os_offset_t new_size = std::max( + (os_offset_t(size - file_start_page_no) * page_size) + & ~os_offset_t(4095), + os_offset_t(FIL_IBD_FILE_INITIAL_SIZE << srv_page_size_shift)); + + *success = os_file_set_size(node->name, node->handle, new_size, + node->punch_hole == 1); + + os_has_said_disk_full = *success; + if (*success) { + os_file_flush(node->handle); + last_page_no = size; + } else { + /* Let us measure the size of the file + to determine how much we were able to + extend it */ + os_offset_t fsize = os_file_get_size(node->handle); + ut_a(fsize != os_offset_t(-1)); + + last_page_no = uint32_t(fsize / page_size) + + file_start_page_no; + } + mysql_mutex_lock(&fil_system.mutex); + + ut_a(node->being_extended); + node->being_extended = false; + ut_a(last_page_no - file_start_page_no >= node->size); + + uint32_t file_size = last_page_no - file_start_page_no; + space->size += file_size - node->size; + node->size = file_size; + const uint32_t pages_in_MiB = node->size + & ~uint32_t((1U << (20U - srv_page_size_shift)) - 1); + + /* Keep the last data file size info up to date, rounded to + full megabytes */ + + switch (space->id) { + case TRX_SYS_SPACE: + srv_sys_space.set_last_file_size(pages_in_MiB); + do_flush: + space->reacquire(); + mysql_mutex_unlock(&fil_system.mutex); + space->flush_low(); + space->release(); + mysql_mutex_lock(&fil_system.mutex); + break; + default: + ut_ad(space->purpose == FIL_TYPE_TABLESPACE + || space->purpose == FIL_TYPE_IMPORT); + if (space->purpose == FIL_TYPE_TABLESPACE + && !space->is_being_truncated) { + goto do_flush; + } + break; + case SRV_TMP_SPACE_ID: + ut_ad(space->purpose == FIL_TYPE_TEMPORARY); + srv_tmp_space.set_last_file_size(pages_in_MiB); + break; + } + + return false; +} + +/** @return whether the file is usable for io() */ +ATTRIBUTE_COLD bool fil_space_t::prepare_acquired() +{ + ut_ad(referenced()); + mysql_mutex_assert_owner(&fil_system.mutex); + fil_node_t *node= UT_LIST_GET_LAST(chain); + ut_ad(!id || purpose == FIL_TYPE_TEMPORARY || + node == UT_LIST_GET_FIRST(chain)); + + const bool is_open= node && (node->is_open() || fil_node_open_file(node)); + + if (!is_open) + release(); + else if (node->deferred); + else if (auto desired_size= recv_size) + { + bool success; + while (fil_space_extend_must_retry(this, node, desired_size, &success)) + mysql_mutex_lock(&fil_system.mutex); + + mysql_mutex_assert_owner(&fil_system.mutex); + /* Crash recovery requires the file extension to succeed. */ + ut_a(success); + /* InnoDB data files cannot shrink. */ + ut_a(size >= desired_size); + if (desired_size > committed_size) + committed_size= desired_size; + + /* There could be multiple concurrent I/O requests for this + tablespace (multiple threads trying to extend this tablespace). + + Also, fil_space_set_recv_size_and_flags() may have been invoked + again during the file extension while fil_system.mutex was not + being held by us. + + Only if recv_size matches what we read originally, reset the + field. In this way, a subsequent I/O request will handle any + pending fil_space_set_recv_size_and_flags(). 
*/ + + if (desired_size == recv_size) + { + recv_size= 0; + goto clear; + } + } + else +clear: + clear_closing(); + + return is_open; +} + +/** @return whether the file is usable for io() */ +ATTRIBUTE_COLD bool fil_space_t::acquire_and_prepare() +{ + mysql_mutex_lock(&fil_system.mutex); + const auto flags= acquire_low() & (STOPPING | CLOSING); + const bool is_open= !flags || (flags == CLOSING && prepare_acquired()); + mysql_mutex_unlock(&fil_system.mutex); + return is_open; +} + +/** Try to extend a tablespace if it is smaller than the specified size. +@param[in,out] space tablespace +@param[in] size desired size in pages +@return whether the tablespace is at least as big as requested */ +bool fil_space_extend(fil_space_t *space, uint32_t size) +{ + ut_ad(!srv_read_only_mode || space->purpose == FIL_TYPE_TEMPORARY); + bool success= false; + const bool acquired= space->acquire(); + mysql_mutex_lock(&fil_system.mutex); + if (acquired || space->is_being_truncated) + { + while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain), + size, &success)) + mysql_mutex_lock(&fil_system.mutex); + } + mysql_mutex_unlock(&fil_system.mutex); + if (acquired) + space->release(); + return success; +} + +/** Prepare to free a file from fil_system. */ +inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle) +{ + mysql_mutex_assert_owner(&fil_system.mutex); + ut_a(!being_extended); + + if (is_open() && + (space->n_pending.fetch_or(fil_space_t::CLOSING, + std::memory_order_acquire) & + fil_space_t::PENDING)) + { + mysql_mutex_unlock(&fil_system.mutex); + while (space->referenced()) + std::this_thread::sleep_for(std::chrono::microseconds(100)); + mysql_mutex_lock(&fil_system.mutex); + } + + while (is_open()) + { + if (space->is_in_unflushed_spaces) + { + ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC); + space->is_in_unflushed_spaces= false; + fil_system.unflushed_spaces.remove(*space); + } + + ut_a(!being_extended); + if (detach_handle) + { + auto result= handle; + handle= OS_FILE_CLOSED; + return result; + } + bool ret= os_file_close(handle); + ut_a(ret); + handle= OS_FILE_CLOSED; + break; + } + + return OS_FILE_CLOSED; +} + +/** Detach a tablespace from the cache and close the files. 
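+The tablespace is removed from fil_system.spaces as well as from the
+unflushed_spaces and default_encrypt_tables lists; any open file handle
+is either closed or, when detach_handle is set, returned to the caller.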
+@param space tablespace +@param detach_handle whether to detach the handle, instead of closing +@return detached handle +@retval OS_FILE_CLOSED if no handle was detached */ +pfs_os_file_t fil_system_t::detach(fil_space_t *space, bool detach_handle) +{ + mysql_mutex_assert_owner(&fil_system.mutex); + HASH_DELETE(fil_space_t, hash, &spaces, space->id, space); + + if (space->is_in_unflushed_spaces) + { + ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC); + space->is_in_unflushed_spaces= false; + unflushed_spaces.remove(*space); + } + + if (space->is_in_default_encrypt) + { + space->is_in_default_encrypt= false; + default_encrypt_tables.remove(*space); + } + + { + space_list_t::iterator s= space_list_t::iterator(space); + if (space_list_last_opened == space) + { + if (s == space_list.begin()) + { + ut_ad(srv_operation > SRV_OPERATION_EXPORT_RESTORED || + srv_shutdown_state > SRV_SHUTDOWN_NONE); + space_list_last_opened= nullptr; + } + else + { + space_list_t::iterator prev= s; + space_list_last_opened= &*--prev; + } + } + space_list.erase(s); + } + + if (space == sys_space) + sys_space= nullptr; + else if (space == temp_space) + temp_space= nullptr; + + for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + if (node->is_open()) + { + ut_ad(n_open > 0); + n_open--; + } + + ut_ad(!detach_handle || space->id); + ut_ad(!detach_handle || UT_LIST_GET_LEN(space->chain) <= 1); + + pfs_os_file_t handle= OS_FILE_CLOSED; + + for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + handle= node->close_to_free(detach_handle); + + ut_ad(!space->referenced()); + return handle; +} + +/** Free a tablespace object on which fil_system_t::detach() was invoked. +There must not be any pending i/o's or flushes on the files. +@param[in,out] space tablespace */ +static +void +fil_space_free_low( + fil_space_t* space) +{ + /* The tablespace must not be in fil_system.named_spaces. */ + ut_ad(srv_fast_shutdown == 2 || !srv_was_started + || space->max_lsn == 0); + + /* Wait for fil_space_t::release() after + fil_system_t::detach(), the tablespace cannot be found, so + fil_space_t::get() would return NULL */ + while (space->referenced()) { + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } + + for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain); + node != NULL; ) { + ut_d(space->size -= node->size); + ut_free(node->name); + fil_node_t* old_node = node; + node = UT_LIST_GET_NEXT(chain, node); + ut_free(old_node); + } + + ut_ad(space->size == 0); + + fil_space_destroy_crypt_data(&space->crypt_data); + + space->~fil_space_t(); + ut_free(space); +} + +/** Frees a space object from the tablespace memory cache. +Closes the files in the chain but does not delete them. +There must not be any pending i/o's or flushes on the files. 
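+The tablespace is detached from fil_system with fil_system_t::detach(),
+removed from fil_system.named_spaces if needed, and finally freed with
+fil_space_free_low().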
+@param id tablespace identifier
+@param x_latched whether the caller holds exclusive fil_space_t::latch
+@return true if success */
+bool fil_space_free(uint32_t id, bool x_latched)
+{
+ ut_ad(id != TRX_SYS_SPACE);
+
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_space_t* space = fil_space_get_by_id(id);
+
+ if (space != NULL) {
+ fil_system.detach(space);
+ }
+
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ if (space != NULL) {
+ if (x_latched) {
+ space->x_unlock();
+ }
+
+ if (!recv_recovery_is_on()) {
+ log_sys.latch.wr_lock(SRW_LOCK_CALL);
+
+ if (space->max_lsn) {
+ ut_d(space->max_lsn = 0);
+ fil_system.named_spaces.remove(*space);
+ }
+
+ log_sys.latch.wr_unlock();
+ } else {
+#ifndef SUX_LOCK_GENERIC
+ ut_ad(log_sys.latch.is_write_locked());
+#endif
+ if (space->max_lsn) {
+ ut_d(space->max_lsn = 0);
+ fil_system.named_spaces.remove(*space);
+ }
+ }
+
+ fil_space_free_low(space);
+ }
+
+ return(space != NULL);
+}
+
+/** Create a tablespace in fil_system.
+@param id tablespace identifier
+@param flags tablespace flags
+@param purpose tablespace purpose
+@param crypt_data encryption information
+@param mode encryption mode
+@param opened true if space files are opened
+@return pointer to created tablespace, to be filled in with add()
+@retval nullptr on failure (such as when the same tablespace exists) */
+fil_space_t *fil_space_t::create(uint32_t id, uint32_t flags,
+ fil_type_t purpose,
+ fil_space_crypt_t *crypt_data,
+ fil_encryption_t mode,
+ bool opened)
+{
+ fil_space_t* space;
+
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(fil_system.is_initialised());
+ ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id));
+ ut_ad(srv_page_size == UNIV_PAGE_SIZE_ORIG || flags != 0);
+
+ DBUG_EXECUTE_IF("fil_space_create_failure", return(NULL););
+
+ /* FIXME: if calloc() is defined as an inline function that calls
+ memset() or bzero(), then GCC 6 -flifetime-dse can optimize it away */
+ space= new (ut_zalloc_nokey(sizeof(*space))) fil_space_t;
+
+ space->id = id;
+
+ UT_LIST_INIT(space->chain, &fil_node_t::chain);
+
+ space->purpose = purpose;
+ space->flags = flags;
+
+ space->crypt_data = crypt_data;
+ space->n_pending.store(CLOSING, std::memory_order_relaxed);
+
+ DBUG_LOG("tablespace", "Created metadata for " << id);
+ if (crypt_data) {
+ DBUG_LOG("crypt",
+ "Tablespace " << id
+ << " encryption " << crypt_data->encryption
+ << " key id " << crypt_data->key_id
+ << ":" << fil_crypt_get_mode(crypt_data)
+ << " " << fil_crypt_get_type(crypt_data));
+ }
+
+ space->latch.SRW_LOCK_INIT(fil_space_latch_key);
+
+ if (const fil_space_t *old_space = fil_space_get_by_id(id)) {
+ ib::error() << "Trying to add tablespace with id " << id
+ << " to the cache, but tablespace '"
+ << (old_space->chain.start
+ ? old_space->chain.start->name
+ : "")
+ << "' already exists in the cache!";
+ space->~fil_space_t();
+ ut_free(space);
+ return(NULL);
+ }
+
+ HASH_INSERT(fil_space_t, hash, &fil_system.spaces, id, space);
+
+ if (opened)
+ fil_system.add_opened_last_to_space_list(space);
+ else
+ fil_system.space_list.push_back(*space);
+
+ switch (id) {
+ case 0:
+ ut_ad(!fil_system.sys_space);
+ fil_system.sys_space = space;
+ break;
+ case SRV_TMP_SPACE_ID:
+ ut_ad(!fil_system.temp_space);
+ fil_system.temp_space = space;
+ break;
+ default:
+ ut_ad(purpose != FIL_TYPE_TEMPORARY);
+ if (UNIV_LIKELY(id <= fil_system.max_assigned_id)) {
+ break;
+ }
+ if (UNIV_UNLIKELY(srv_operation == SRV_OPERATION_BACKUP)) {
+ break;
+ }
+ if (!fil_system.space_id_reuse_warned) {
+ ib::warn() << "Allocated tablespace ID " << id
+ << ", old maximum was "
+ << fil_system.max_assigned_id;
+ }
+
+ fil_system.max_assigned_id = id;
+ }
+
+ const bool rotate = purpose == FIL_TYPE_TABLESPACE
+ && (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF
+ || srv_encrypt_tables)
+ && fil_crypt_must_default_encrypt();
+
+ if (rotate) {
+ fil_system.default_encrypt_tables.push_back(*space);
+ space->is_in_default_encrypt = true;
+
+ if (srv_n_fil_crypt_threads_started) {
+ mysql_mutex_unlock(&fil_system.mutex);
+ fil_crypt_threads_signal();
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+ }
+
+ return(space);
+}
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion id's is not enough, we may need
+to recycle id's.
+@return true if assigned, false if not */
+bool fil_assign_new_space_id(uint32_t *space_id)
+{
+ uint32_t id = *space_id;
+ bool success;
+
+ mysql_mutex_lock(&fil_system.mutex);
+
+ if (id < fil_system.max_assigned_id) {
+ id = fil_system.max_assigned_id;
+ }
+
+ id++;
+
+ if (id > (SRV_SPACE_ID_UPPER_BOUND / 2) && (id % 1000000UL == 0)) {
+ ib::warn() << "You are running out of new single-table"
+ " tablespace id's. Current counter is " << id
+ << " and it must not exceed " << SRV_SPACE_ID_UPPER_BOUND
+ << "! To reset the counter to zero you have to dump"
+ " all your tables and recreate the whole InnoDB"
+ " installation.";
+ }
+
+ success = (id < SRV_SPACE_ID_UPPER_BOUND);
+
+ if (success) {
+ *space_id = fil_system.max_assigned_id = id;
+ } else {
+ ib::warn() << "You have run out of single-table tablespace"
+ " id's! Current counter is " << id
+ << ". To reset the counter to zero"
+ " you have to dump all your tables and"
+ " recreate the whole InnoDB installation.";
+ }
+
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ return(success);
+}
+
+/** Read the first page of a data file.
+@return whether the page was found valid */
+bool fil_space_t::read_page0()
+{
+ ut_ad(fil_system.is_initialised());
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ if (size)
+ return true;
+
+ fil_node_t *node= UT_LIST_GET_FIRST(chain);
+ if (!node)
+ return false;
+ ut_ad(!UT_LIST_GET_NEXT(chain, node));
+
+ if (UNIV_UNLIKELY(acquire_low() & STOPPING))
+ {
+ ut_ad("this should not happen" == 0);
+ return false;
+ }
+ const bool ok= node->is_open() || fil_node_open_file(node);
+ release();
+ return ok;
+}
+
+/** Look up a tablespace and ensure that its first page has been validated. */
+static fil_space_t *fil_space_get_space(uint32_t id)
+{
+ if (fil_space_t *space= fil_space_get_by_id(id))
+ if (space->read_page0())
+ return space;
+ return nullptr;
+}
+
+void fil_space_set_recv_size_and_flags(uint32_t id, uint32_t size,
+ uint32_t flags)
+{
+ ut_ad(id < SRV_SPACE_ID_UPPER_BOUND);
+ mysql_mutex_lock(&fil_system.mutex);
+ if (fil_space_t *space= fil_space_get_space(id))
+ {
+ if (size)
+ space->recv_size= size;
+ if (flags != FSP_FLAGS_FCRC32_MASK_MARKER)
+ space->flags= flags;
+ }
+ mysql_mutex_unlock(&fil_system.mutex);
+}
+
+/** Open each file. Never invoked on .ibd files.
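+Unless create_new_db holds, the first page of the first file is read via
+read_page0() to validate the tablespace size and flags; if that read
+fails, the file is closed again and the function fails.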
+@param create_new_db whether to skip the call to fil_node_t::read_page0()
+@return whether all files were opened */
+bool fil_space_t::open(bool create_new_db)
+{
+ ut_ad(fil_system.is_initialised());
+ ut_ad(!id || create_new_db);
+
+ bool success= true;
+ bool skip_read= create_new_db;
+
+ mysql_mutex_lock(&fil_system.mutex);
+
+ for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+ if (!node->is_open() && !fil_node_open_file_low(node))
+ {
+err_exit:
+ success= false;
+ break;
+ }
+
+ if (create_new_db)
+ {
+ node->find_metadata(node->handle);
+ continue;
+ }
+ if (skip_read)
+ {
+ size+= node->size;
+ continue;
+ }
+
+ if (!node->read_page0())
+ {
+ fil_system.n_open--;
+ os_file_close(node->handle);
+ node->handle= OS_FILE_CLOSED;
+ goto err_exit;
+ }
+
+ skip_read= true;
+ }
+
+ if (!create_new_db)
+ committed_size= size;
+ mysql_mutex_unlock(&fil_system.mutex);
+ return success;
+}
+
+/** Close each file. Only invoked on fil_system.temp_space. */
+void fil_space_t::close()
+{
+ if (!fil_system.is_initialised()) {
+ return;
+ }
+
+ mysql_mutex_lock(&fil_system.mutex);
+ ut_ad(this == fil_system.temp_space
+ || srv_operation == SRV_OPERATION_BACKUP
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_DELTA);
+
+ for (fil_node_t* node = UT_LIST_GET_FIRST(chain);
+ node != NULL;
+ node = UT_LIST_GET_NEXT(chain, node)) {
+ if (node->is_open()) {
+ node->close();
+ }
+ }
+
+ mysql_mutex_unlock(&fil_system.mutex);
+}
+
+void fil_system_t::create(ulint hash_size)
+{
+ ut_ad(this == &fil_system);
+ ut_ad(!is_initialised());
+ ut_ad(!(srv_page_size % FSP_EXTENT_SIZE));
+ ut_ad(srv_page_size);
+ ut_ad(!spaces.array);
+
+ m_initialised = true;
+
+ compile_time_assert(!(UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX));
+ compile_time_assert(!(UNIV_PAGE_SIZE_MIN % FSP_EXTENT_SIZE_MIN));
+
+ ut_ad(hash_size > 0);
+
+ mysql_mutex_init(fil_system_mutex_key, &mutex, nullptr);
+
+ spaces.create(hash_size);
+
+ fil_space_crypt_init();
+#ifdef __linux__
+ ssd.clear();
+ char fn[sizeof(dirent::d_name)
+ + sizeof "/sys/block/" "/queue/rotational"];
+ const size_t sizeof_fnp = (sizeof fn) - sizeof "/sys/block";
+ memcpy(fn, "/sys/block/", sizeof "/sys/block");
+ char* fnp = &fn[sizeof "/sys/block"];
+
+ std::set<std::string> ssd_devices;
+ if (DIR* d = opendir("/sys/block")) {
+ while (struct dirent* e = readdir(d)) {
+ if (e->d_name[0] == '.') {
+ continue;
+ }
+ snprintf(fnp, sizeof_fnp, "%s/queue/rotational",
+ e->d_name);
+ int f = open(fn, O_RDONLY);
+ if (f == -1) {
+ continue;
+ }
+ char b[sizeof "4294967295:4294967295\n"];
+ ssize_t l = read(f, b, sizeof b);
+ ::close(f);
+ if (l != 2 || memcmp("0\n", b, 2)) {
+ continue;
+ }
+ snprintf(fnp, sizeof_fnp, "%s/dev", e->d_name);
+ f = open(fn, O_RDONLY);
+ if (f == -1) {
+ continue;
+ }
+ l = read(f, b, sizeof b);
+ ::close(f);
+ if (l <= 0 || b[l - 1] != '\n') {
+ continue;
+ }
+ b[l - 1] = '\0';
+ char* end = b;
+ unsigned long dev_major = strtoul(b, &end, 10);
+ if (b == end || *end != ':'
+ || dev_major != unsigned(dev_major)) {
+ continue;
+ }
+ char* c = end + 1;
+ unsigned long dev_minor = strtoul(c, &end, 10);
+ if (c == end || *end
+ || dev_minor != unsigned(dev_minor)) {
+ continue;
+ }
+ ssd.push_back(makedev(unsigned(dev_major),
+ unsigned(dev_minor)));
+ }
+ closedir(d);
+ }
+ /* fil_system_t::is_ssd() assumes the following */
+ ut_ad(makedev(0, 8) == 8);
+ ut_ad(makedev(0, 4) == 4);
+ ut_ad(makedev(0, 2) == 2);
+ ut_ad(makedev(0, 1) == 1);
+#endif
+}
+
+void
fil_system_t::close() +{ + ut_ad(this == &fil_system); + ut_a(unflushed_spaces.empty()); + ut_a(space_list.empty()); + ut_ad(!sys_space); + ut_ad(!temp_space); + + if (is_initialised()) + { + m_initialised= false; + spaces.free(); + mysql_mutex_destroy(&mutex); + fil_space_crypt_cleanup(); + } + + ut_ad(!spaces.array); + +#ifdef __linux__ + ssd.clear(); + ssd.shrink_to_fit(); +#endif /* __linux__ */ +} + +void fil_system_t::add_opened_last_to_space_list(fil_space_t *space) +{ + if (UNIV_LIKELY(space_list_last_opened != nullptr)) + space_list.insert(++space_list_t::iterator(space_list_last_opened), *space); + else + space_list.push_front(*space); + space_list_last_opened= space; +} + +/** Extend all open data files to the recovered size */ +ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size() +{ + ut_ad(is_initialised()); + mysql_mutex_lock(&mutex); + for (fil_space_t &space : fil_system.space_list) + { + const uint32_t size= space.recv_size; + + if (size > space.size) + { + if (space.is_closing()) + continue; + space.reacquire(); + bool success; + while (fil_space_extend_must_retry(&space, UT_LIST_GET_LAST(space.chain), + size, &success)) + mysql_mutex_lock(&mutex); + /* Crash recovery requires the file extension to succeed. */ + ut_a(success); + space.release(); + } + } + mysql_mutex_unlock(&mutex); +} + +/** Close all tablespace files at shutdown */ +void fil_space_t::close_all() +{ + if (!fil_system.is_initialised()) + return; + + /* At shutdown, we should not have any files in this list. */ + ut_ad(srv_fast_shutdown == 2 || !srv_was_started || + fil_system.named_spaces.empty()); + fil_flush_file_spaces(); + + mysql_mutex_lock(&fil_system.mutex); + + while (!fil_system.space_list.empty()) + { + fil_space_t &space= fil_system.space_list.front(); + + for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node != NULL; + node= UT_LIST_GET_NEXT(chain, node)) + { + + if (!node->is_open()) + { + next: + continue; + } + + for (ulint count= 10000; count--;) + { + const auto n= space.set_closing(); + if (n & STOPPING) + goto next; + if (!(n & (PENDING | NEEDS_FSYNC))) + { + node->close(); + goto next; + } + mysql_mutex_unlock(&fil_system.mutex); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + mysql_mutex_lock(&fil_system.mutex); + if (!node->is_open()) + goto next; + } + + ib::error() << "File '" << node->name << "' has " << space.referenced() + << " operations"; + } + + fil_system.detach(&space); + mysql_mutex_unlock(&fil_system.mutex); + fil_space_free_low(&space); + mysql_mutex_lock(&fil_system.mutex); + } + + mysql_mutex_unlock(&fil_system.mutex); + + ut_ad(srv_fast_shutdown == 2 || !srv_was_started || + fil_system.named_spaces.empty()); +} + +/*******************************************************************//** +Sets the max tablespace id counter if the given number is bigger than the +previous value. */ +void fil_set_max_space_id_if_bigger(uint32_t max_id) +{ + ut_a(max_id < SRV_SPACE_ID_UPPER_BOUND); + + mysql_mutex_lock(&fil_system.mutex); + + if (fil_system.max_assigned_id < max_id) { + + fil_system.max_assigned_id = max_id; + } + + mysql_mutex_unlock(&fil_system.mutex); +} + +/** Acquire a tablespace reference. +@param id tablespace identifier +@return tablespace +@retval nullptr if the tablespace is missing or inaccessible */ +fil_space_t *fil_space_t::get(uint32_t id) +{ + mysql_mutex_lock(&fil_system.mutex); + fil_space_t *space= fil_space_get_by_id(id); + const uint32_t n= space ? 
space->acquire_low() : 0;
+
+ if (n & STOPPING)
+ space= nullptr;
+ else if ((n & CLOSING) && !space->prepare_acquired())
+ space= nullptr;
+
+ mysql_mutex_unlock(&fil_system.mutex);
+ return space;
+}
+
+/** Write a log record about a file operation.
+@param type file operation
+@param space_id tablespace identifier
+@param path file path
+@param new_path new file path for type=FILE_RENAME */
+inline void mtr_t::log_file_op(mfile_type_t type, uint32_t space_id,
+ const char *path, const char *new_path)
+{
+ ut_ad((new_path != nullptr) == (type == FILE_RENAME));
+ ut_ad(!(byte(type) & 15));
+
+ /* fil_name_parse() requires that there be at least one path
+ separator and that the file path end with ".ibd". */
+ ut_ad(strchr(path, '/'));
+ ut_ad(!strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD));
+
+ m_modifications= true;
+ if (!is_logged())
+ return;
+ m_last= nullptr;
+
+ const size_t len= strlen(path);
+ const size_t new_len= type == FILE_RENAME ? 1 + strlen(new_path) : 0;
+ ut_ad(len > 0);
+ byte *const log_ptr= m_log.open(1 + 3/*length*/ + 5/*space_id*/
+ + 1/*page_no=0*/);
+ byte *end= log_ptr + 1;
+ end= mlog_encode_varint(end, space_id);
+ *end++= 0;
+ if (UNIV_LIKELY(end + len + new_len >= &log_ptr[16]))
+ {
+ *log_ptr= type;
+ size_t total_len= len + new_len + end - log_ptr - 15;
+ if (total_len >= MIN_3BYTE)
+ total_len+= 2;
+ else if (total_len >= MIN_2BYTE)
+ total_len++;
+ end= mlog_encode_varint(log_ptr + 1, total_len);
+ end= mlog_encode_varint(end, space_id);
+ *end++= 0;
+ }
+ else
+ {
+ *log_ptr= static_cast<byte>(type | (end + len + new_len - &log_ptr[1]));
+ ut_ad(*log_ptr & 15);
+ }
+
+ m_log.close(end);
+
+ if (type == FILE_RENAME)
+ {
+ ut_ad(strchr(new_path, '/'));
+ m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len + 1));
+ m_log.push(reinterpret_cast<const byte*>(new_path), uint32_t(new_len - 1));
+ }
+ else
+ m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len));
+}
+
+/** Write FILE_MODIFY for a file.
+@param[in] space_id tablespace id
+@param[in] name tablespace file name
+@param[in,out] mtr mini-transaction */
+static void fil_name_write(uint32_t space_id, const char *name,
+ mtr_t *mtr)
+{
+ ut_ad(!is_predefined_tablespace(space_id));
+ mtr->log_file_op(FILE_MODIFY, space_id, name);
+}
+
+fil_space_t *fil_space_t::drop(uint32_t id, pfs_os_file_t *detached_handle)
+{
+ ut_a(!is_system_tablespace(id));
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_space_t *space= fil_space_get_by_id(id);
+
+ if (!space)
+ {
+ mysql_mutex_unlock(&fil_system.mutex);
+ return nullptr;
+ }
+
+ if (space->pending() & STOPPING)
+ {
+ /* A thread executing DDL and another thread executing purge may
+ be executing fil_delete_tablespace() concurrently for the same
+ tablespace. Wait for the other thread to complete the operation. */
+ for (ulint count= 0;; count++)
+ {
+ space= fil_space_get_by_id(id);
+ ut_ad(!space || space->is_stopping());
+ mysql_mutex_unlock(&fil_system.mutex);
+ if (!space)
+ return nullptr;
+ /* Issue a warning every 10.24 seconds, starting after 2.56 seconds */
+ if ((count & 511) == 128)
+ sql_print_warning("InnoDB: Waiting for tablespace " UINT32PF
+ " to be deleted", id);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+ }
+
+ /* We must be the first one to set either STOPPING flag on the .ibd file,
+ because the flags are only being set here, within a critical section of
+ fil_system.mutex.
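+ The fetch_add() of STOPPING_READS + 1 below both sets the first
+ stopping flag and acquires a reference, so that the tablespace
+ object cannot be freed while its files are being deleted.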
*/ + unsigned pending; + ut_d(pending=) + space->n_pending.fetch_add(STOPPING_READS + 1, std::memory_order_relaxed); + ut_ad(!(pending & STOPPING)); + mysql_mutex_unlock(&fil_system.mutex); + + if (space->crypt_data) + fil_space_crypt_close_tablespace(space); + + if (space->purpose == FIL_TYPE_TABLESPACE) + { + if (id >= srv_undo_space_id_start && + id < srv_undo_space_id_start + srv_undo_tablespaces_open) + { + os_file_delete(innodb_data_file_key, space->chain.start->name); + goto deleted; + } + + /* Before deleting the file, persistently write a log record. */ + mtr_t mtr; + mtr.start(); + mtr.log_file_op(FILE_DELETE, id, space->chain.start->name); + mtr.commit_file(*space, nullptr); + + if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) + RemoteDatafile::delete_link_file(space->name()); + + os_file_delete(innodb_data_file_key, space->chain.start->name); + } + else + ut_ad(space->purpose == FIL_TYPE_IMPORT); + + if (char *cfg_name= fil_make_filepath(space->chain.start->name, + fil_space_t::name_type{}, CFG, false)) + { + os_file_delete_if_exists(innodb_data_file_key, cfg_name, nullptr); + ut_free(cfg_name); + } + + deleted: + mysql_mutex_lock(&fil_system.mutex); + ut_ad(space == fil_space_get_by_id(id)); + pending= + space->n_pending.fetch_add(STOPPING_WRITES - 1, std::memory_order_relaxed); + ut_ad((pending & STOPPING) == STOPPING_READS); + ut_ad(pending & PENDING); + pending&= PENDING; + if (--pending) + { + for (ulint count= 0;; count++) + { + ut_ad(space == fil_space_get_by_id(id)); + pending= space->n_pending.load(std::memory_order_relaxed) & PENDING; + if (!pending) + break; + mysql_mutex_unlock(&fil_system.mutex); + /* Issue a warning every 10.24 seconds, starting after 2.56 seconds */ + if ((count & 511) == 128) + sql_print_warning("InnoDB: Trying to delete tablespace '%s' " + "but there are %u pending operations", + space->chain.start->name, pending); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + mysql_mutex_lock(&fil_system.mutex); + } + } + + pfs_os_file_t handle= fil_system.detach(space, true); + mysql_mutex_unlock(&fil_system.mutex); + if (detached_handle) + *detached_handle = handle; + else + os_file_close(handle); + return space; +} + +/** Close a single-table tablespace on failed IMPORT TABLESPACE. +The tablespace must be cached in the memory cache. +Free all pages used by the tablespace. */ +void fil_close_tablespace(uint32_t id) +{ + ut_ad(!is_system_tablespace(id)); + fil_space_t* space = fil_space_t::drop(id, nullptr); + if (!space) { + return; + } + + space->x_lock(); + ut_ad(space->is_stopping()); + + /* Invalidate in the buffer pool all pages belonging to the + tablespace. Since space->is_stopping() holds, readahead + can no longer read more pages of this tablespace to buf_pool. + Thus we can clean the tablespace out of buf_pool + completely and permanently. */ + while (buf_flush_list_space(space)); + + space->x_unlock(); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + if (space->max_lsn != 0) { + ut_d(space->max_lsn = 0); + fil_system.named_spaces.remove(*space); + } + log_sys.latch.wr_unlock(); + fil_space_free_low(space); +} + +/** Delete a tablespace and associated .ibd file. 
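+This is a thin wrapper around fil_space_t::drop() that also frees the
+detached fil_space_t object.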
+@param id tablespace identifier
+@return detached file handle (to be closed by the caller)
+@retval OS_FILE_CLOSED if no file existed */
+pfs_os_file_t fil_delete_tablespace(uint32_t id)
+{
+ ut_ad(!is_system_tablespace(id));
+ pfs_os_file_t handle= OS_FILE_CLOSED;
+ if (fil_space_t *space= fil_space_t::drop(id, &handle))
+ fil_space_free_low(space);
+ return handle;
+}
+
+/*******************************************************************//**
+Allocates and builds a file name from a path, a table or tablespace name
+and a suffix. The string must be freed by the caller with ut_free().
+@param[in] path NULL or the directory path or the full path and filename.
+@param[in] name {} if path is full, or Table/Tablespace name
+@param[in] ext the file extension to use
+@param[in] trim_name true if the last name on the path should be trimmed.
+@return own: file name */
+char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
+ ib_extention ext, bool trim_name)
+{
+ /* The path may contain the basename of the file, if so we do not
+ need the name. If the path is NULL, we can use the default path,
+ but there needs to be a name. */
+ ut_ad(path || name.data());
+
+ /* If we are going to strip a name off the path, there had better be a
+ path and a new name to put back on. */
+ ut_ad(!trim_name || (path && name.data()));
+
+ if (path == NULL) {
+ path = fil_path_to_mysql_datadir;
+ }
+
+ ulint len = 0; /* current length */
+ ulint path_len = strlen(path);
+ const char* suffix = dot_ext[ext];
+ ulint suffix_len = strlen(suffix);
+ ulint full_len = path_len + 1 + name.size() + suffix_len + 1;
+
+ char* full_name = static_cast<char*>(ut_malloc_nokey(full_len));
+ if (full_name == NULL) {
+ return NULL;
+ }
+
+ /* If the name is a relative or absolute path, do not prepend "./". */
+ if (path[0] == '.'
+ && (path[1] == '\0' || path[1] == '/' IF_WIN(|| path[1] == '\\',))
+ && name.size() && (name.data()[0] == '.'
+ || is_absolute_path(name.data()))) {
+ path = NULL;
+ path_len = 0;
+ }
+
+ if (path != NULL) {
+ memcpy(full_name, path, path_len);
+ len = path_len;
+ }
+
+ full_name[len] = '\0';
+
+ if (trim_name) {
+ /* Find the offset of the last DIR separator and set it to
+ null in order to strip off the old basename from this path. */
+ char* last_dir_sep = strrchr(full_name, '/');
+#ifdef _WIN32
+ if (char *last = strrchr(full_name, '\\')) {
+ if (last > last_dir_sep) {
+ last_dir_sep = last;
+ }
+ }
+#endif
+ if (last_dir_sep) {
+ last_dir_sep[0] = '\0';
+ len = strlen(full_name);
+ }
+ }
+
+ if (name.size()) {
+ if (len && full_name[len - 1] != '/') {
+ /* Add a DIR separator */
+ full_name[len] = '/';
+ full_name[++len] = '\0';
+ }
+
+ char* ptr = &full_name[len];
+ memcpy(ptr, name.data(), name.size());
+ len += name.size();
+ full_name[len] = '\0';
+ }
+
+ /* Make sure that the specified suffix is at the end of the filepath
+ string provided. This assumes that the suffix starts with '.'.
+ If the first char of the suffix is found in the filepath at the same
+ length as the suffix from the end, then we will assume that there is
+ a previous suffix that needs to be replaced. */
+ if (suffix != NULL) {
+ /* Need room for the trailing null byte. */
+ ut_ad(len < full_len);
+
+ if ((len > suffix_len)
+ && (full_name[len - suffix_len] == suffix[0])) {
+ /* Another suffix exists, make it the one requested. */
+ memcpy(&full_name[len - suffix_len], suffix, suffix_len);
+
+ } else {
+ /* No previous suffix, add it.
*/ + ut_ad(len + suffix_len < full_len); + memcpy(&full_name[len], suffix, suffix_len); + full_name[len + suffix_len] = '\0'; + } + } + + return(full_name); +} + +char *fil_make_filepath(const char* path, const table_name_t name, + ib_extention suffix, bool strip_name) +{ + return fil_make_filepath(path, {name.m_name, strlen(name.m_name)}, + suffix, strip_name); +} + +dberr_t fil_space_t::rename(const char *path, bool log, bool replace) +{ + ut_ad(UT_LIST_GET_LEN(chain) == 1); + ut_ad(!is_predefined_tablespace(id)); + + const char *old_path= chain.start->name; + + ut_ad(strchr(old_path, '/')); + ut_ad(strchr(path, '/')); + + if (!strcmp(path, old_path)) + return DB_SUCCESS; + + if (!log) + { + if (!os_file_rename(innodb_data_file_key, old_path, path)) + return DB_ERROR; + mysql_mutex_lock(&fil_system.mutex); + ut_free(chain.start->name); + chain.start->name= mem_strdup(path); + mysql_mutex_unlock(&fil_system.mutex); + return DB_SUCCESS; + } + + bool exists= false; + os_file_type_t ftype; + + /* Check upfront if the rename operation might succeed, because we + must durably write redo log before actually attempting to execute + the rename in the file system. */ + if (os_file_status(old_path, &exists, &ftype) && !exists) + { + sql_print_error("InnoDB: Cannot rename '%s' to '%s'" + " because the source file does not exist.", + old_path, path); + return DB_TABLESPACE_NOT_FOUND; + } + + exists= false; + if (replace); + else if (!os_file_status(path, &exists, &ftype) || exists) + { + sql_print_error("InnoDB: Cannot rename '%s' to '%s'" + " because the target file exists.", + old_path, path); + return DB_TABLESPACE_EXISTS; + } + + mtr_t mtr; + mtr.start(); + mtr.log_file_op(FILE_RENAME, id, old_path, path); + return mtr.commit_file(*this, path) ? DB_SUCCESS : DB_ERROR; +} + +/** Create a tablespace file. +@param[in] space_id Tablespace ID +@param[in] name Tablespace name in dbname/tablename format. +@param[in] path Path and filename of the datafile to create. +@param[in] flags Tablespace flags +@param[in] size Initial size of the tablespace file in pages, +must be >= FIL_IBD_FILE_INITIAL_SIZE +@param[in] mode MariaDB encryption mode +@param[in] key_id MariaDB encryption key_id +@param[out] err DB_SUCCESS or error code +@return the created tablespace +@retval NULL on error */ +fil_space_t* +fil_ibd_create( + uint32_t space_id, + const table_name_t name, + const char* path, + uint32_t flags, + uint32_t size, + fil_encryption_t mode, + uint32_t key_id, + dberr_t* err) +{ + pfs_os_file_t file; + bool success; + mtr_t mtr; + bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags) != 0; + + ut_ad(!is_system_tablespace(space_id)); + ut_ad(!srv_read_only_mode); + ut_a(space_id < SRV_SPACE_ID_UPPER_BOUND); + ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE); + ut_a(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, space_id)); + + /* Create the subdirectories in the path, if they are + not there already. 
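+ For illustration, a typical call is sketched below (simplified; the
+ real callers live in the dictionary and DDL code, and the variable
+ names here are only placeholders):
+   dberr_t err;
+   fil_space_t *space = fil_ibd_create(space_id, name, path, flags,
+                                       FIL_IBD_FILE_INITIAL_SIZE,
+                                       FIL_ENCRYPTION_DEFAULT, key_id,
+                                       &err);
+ where path would normally come from fil_make_filepath().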
*/ + *err = os_file_create_subdirs_if_needed(path); + if (*err != DB_SUCCESS) { + return NULL; + } + + mtr.start(); + mtr.log_file_op(FILE_CREATE, space_id, path); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + auto lsn= mtr.commit_files(); + log_sys.latch.wr_unlock(); + mtr.flag_wr_unlock(); + log_write_up_to(lsn, true); + + ulint type; + static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, + "compatibility"); + switch (FSP_FLAGS_GET_ZIP_SSIZE(flags)) { + case 1: + case 2: + type = OS_DATA_FILE_NO_O_DIRECT; + break; + default: + type = OS_DATA_FILE; + } + + file = os_file_create( + innodb_data_file_key, path, + OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_AIO, type, srv_read_only_mode, &success); + + if (!success) { + /* The following call will print an error message */ + switch (os_file_get_last_error(true)) { + case OS_FILE_ALREADY_EXISTS: + ib::info() << "The file '" << path << "'" + " already exists though the" + " corresponding table did not exist" + " in the InnoDB data dictionary." + " You can resolve the problem by removing" + " the file."; + *err = DB_TABLESPACE_EXISTS; + break; + case OS_FILE_DISK_FULL: + *err = DB_OUT_OF_FILE_SPACE; + break; + default: + *err = DB_ERROR; + } + ib::error() << "Cannot create file '" << path << "'"; + return NULL; + } + + const bool is_compressed = fil_space_t::is_compressed(flags); +#ifdef _WIN32 + const bool is_sparse = is_compressed; + if (is_compressed) { + os_file_set_sparse_win32(file); + } +#else + const bool is_sparse = is_compressed + && DB_SUCCESS == os_file_punch_hole(file, 0, 4096) + && !my_test_if_thinly_provisioned(file); +#endif + + if (fil_space_t::full_crc32(flags)) { + flags |= FSP_FLAGS_FCRC32_PAGE_SSIZE(); + } else { + flags |= FSP_FLAGS_PAGE_SSIZE(); + } + + /* Create crypt data if the tablespace is either encrypted or user has + requested it to remain unencrypted. */ + fil_space_crypt_t* crypt_data = (mode != FIL_ENCRYPTION_DEFAULT + || srv_encrypt_tables) + ? fil_space_create_crypt_data(mode, key_id) + : nullptr; + + if (!os_file_set_size(path, file, + os_offset_t(size) << srv_page_size_shift, + is_sparse)) { + *err = DB_OUT_OF_FILE_SPACE; +err_exit: + os_file_close(file); + os_file_delete(innodb_data_file_key, path); + free(crypt_data); + return nullptr; + } + + fil_space_t::name_type space_name; + + if (has_data_dir) { + /* Make the ISL file if the IBD file is not + in the default location. */ + space_name = {name.m_name, strlen(name.m_name)}; + *err = RemoteDatafile::create_link_file(space_name, path); + if (*err != DB_SUCCESS) { + goto err_exit; + } + } + + DBUG_EXECUTE_IF("checkpoint_after_file_create", + log_make_checkpoint();); + + mysql_mutex_lock(&fil_system.mutex); + if (fil_space_t* space = fil_space_t::create(space_id, flags, + FIL_TYPE_TABLESPACE, + crypt_data, mode, true)) { + fil_node_t* node = space->add(path, file, size, false, true); + IF_WIN(node->find_metadata(), node->find_metadata(file, true)); + mysql_mutex_unlock(&fil_system.mutex); + mtr.start(); + mtr.set_named_space(space); + ut_a(fsp_header_init(space, size, &mtr) == DB_SUCCESS); + mtr.commit(); + return space; + } else { + mysql_mutex_unlock(&fil_system.mutex); + } + + if (space_name.data()) { + RemoteDatafile::delete_link_file(space_name); + } + + *err = DB_ERROR; + goto err_exit; +} + +/** Try to open a single-table tablespace and optionally check that the +space id in it is correct. If this does not succeed, print an error message +to the .err log. 
This function is used to open a tablespace when we start +mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE. + +NOTE that we assume this operation is used either at the database startup +or under the protection of dict_sys.latch, so that two users cannot +race here. This operation does not leave the file associated with the +tablespace open, but closes it after we have looked at the space id in it. + +If the validate boolean is set, we read the first page of the file and +check that the space id in the file is what we expect. We assume that +this function runs much faster if no check is made, since accessing the +file inode probably is much faster (the OS caches them) than accessing +the first page of the file. This boolean may be initially false, but if +a remote tablespace is found it will be changed to true. + +@param[in] validate 0=maybe missing, 1=do not validate, 2=validate +@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY +@param[in] id tablespace ID +@param[in] flags expected FSP_SPACE_FLAGS +@param[in] name table name +If file-per-table, it is the table name in the databasename/tablename format +@param[in] path_in expected filepath, usually read from dictionary +@param[out] err DB_SUCCESS or error code +@return tablespace +@retval NULL if the tablespace could not be opened */ +fil_space_t* +fil_ibd_open( + unsigned validate, + fil_type_t purpose, + uint32_t id, + uint32_t flags, + fil_space_t::name_type name, + const char* path_in, + dberr_t* err) +{ + mysql_mutex_lock(&fil_system.mutex); + fil_space_t* space = fil_space_get_by_id(id); + mysql_mutex_unlock(&fil_system.mutex); + if (space) { + if (validate > 1 && !srv_read_only_mode) { + fsp_flags_try_adjust(space, + flags & ~FSP_FLAGS_MEM_MASK); + } + return space; + } + + dberr_t local_err = DB_SUCCESS; + + /* Table flags can be ULINT_UNDEFINED if + dict_tf_to_fsp_flags_failure is set. */ + if (flags == UINT32_MAX) { +corrupted: + local_err = DB_CORRUPTION; +func_exit: + if (err) *err = local_err; + return space; + } + + ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id)); + + Datafile df_default; /* default location */ + RemoteDatafile df_remote; /* remote location */ + ulint tablespaces_found = 0; + ulint valid_tablespaces_found = 0; + + df_default.init(flags); + df_remote.init(flags); + + /* Discover the correct file by looking in the possible locations + while avoiding unnecessary effort. */ + + /* We will always look for an ibd in the default location. */ + df_default.make_filepath(nullptr, name, IBD); + + /* Look for a filepath embedded in an ISL where the default file + would be. */ + bool must_validate = df_remote.open_link_file(name); + + if (must_validate) { + if (df_remote.open_read_only(true) == DB_SUCCESS) { + ut_ad(df_remote.is_open()); + ++tablespaces_found; + } else { + /* The following call prints an error message */ + os_file_get_last_error(true); + ib::error() << "A link file was found named '" + << df_remote.link_filepath() + << "' but the linked tablespace '" + << df_remote.filepath() + << "' could not be opened read-only."; + } + } else if (path_in && !df_default.same_filepath_as(path_in)) { + /* Dict path is not the default path. Always validate + remote files. If default is opened, it was moved.
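+ In effect, the discovery logic reduces to
+   must_validate = ISL link file exists
+                   || (path_in && path_in != default path)
+                   || validate > 1;
+ and validation is skipped only when none of these hold and exactly
+ one candidate file is found.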
*/ + must_validate = true; + } else if (validate > 1) { + must_validate = true; + } + + const bool operation_not_for_export = + srv_operation != SRV_OPERATION_RESTORE_EXPORT + && srv_operation != SRV_OPERATION_EXPORT_RESTORED; + + /* Always look for a file at the default location. But don't log + an error if the tablespace is already open in remote or dict. */ + ut_a(df_default.filepath()); + + /* Mariabackup will not copy files whose names start with + #sql-. We will suppress messages about such files missing on + the first server startup. The tables ought to be dropped by + drop_garbage_tables_after_restore() a little later. */ + + const bool strict = validate && !tablespaces_found + && operation_not_for_export + && !(srv_operation == SRV_OPERATION_NORMAL + && srv_start_after_restore + && srv_force_recovery < SRV_FORCE_NO_BACKGROUND + && dict_table_t::is_temporary_name( + df_default.filepath())); + + if (df_default.open_read_only(strict) == DB_SUCCESS) { + ut_ad(df_default.is_open()); + ++tablespaces_found; + } + + /* Check if multiple locations point to the same file. */ + if (tablespaces_found > 1 && df_default.same_as(df_remote)) { + /* A link file was found with the default path in it. + Use the default path and delete the link file. */ + --tablespaces_found; + df_remote.delete_link_file(); + df_remote.close(); + } + + /* We have now checked all possible tablespace locations and + have a count of how many unique files we found. If things are + normal, we only found 1. */ + /* For encrypted tablespace, we need to check the + encryption in header of first page. */ + if (!must_validate && tablespaces_found == 1) { + goto skip_validate; + } + + /* Read and validate the first page of these tablespace + locations, if found. */ + valid_tablespaces_found += + (df_remote.validate_to_dd(id, flags) == DB_SUCCESS); + + valid_tablespaces_found += + (df_default.validate_to_dd(id, flags) == DB_SUCCESS); + + /* Make sense of these possible locations. + First, bail out if no tablespace files were found. */ + if (valid_tablespaces_found == 0) { + if (!strict + && IF_WIN(GetLastError() == ERROR_FILE_NOT_FOUND + || GetLastError() == ERROR_PATH_NOT_FOUND, + errno == ENOENT)) { + /* Suppress a message about a missing file. */ + goto corrupted; + } + + os_file_get_last_error(operation_not_for_export, + !operation_not_for_export); + if (!operation_not_for_export) { + goto corrupted; + } + sql_print_error("InnoDB: Could not find a valid tablespace" + " file for %.*s. %s", + static_cast<int>(name.size()), name.data(), + TROUBLESHOOT_DATADICT_MSG); + goto corrupted; + } + if (!must_validate) { + goto skip_validate; + } + + /* Do not open any tablespaces if more than one tablespace with + the correct space ID and flags were found. */ + if (df_default.is_open() && df_remote.is_open()) { + ib::error() + << "A tablespace has been found in multiple places: " + << df_default.filepath() + << "(Space ID=" << df_default.space_id() + << ", Flags=" << df_default.flags() + << ") and " + << df_remote.filepath() + << "(Space ID=" << df_remote.space_id() + << ", Flags=" << df_remote.flags() + << (valid_tablespaces_found > 1 || srv_force_recovery + ? "); will not open" + : ")"); + + /* Force-recovery will allow some tablespaces to be + skipped by REDO if there was more than one file found. + Unlike during the REDO phase of recovery, we now know + if the tablespace is valid according to the dictionary, + which was not available then.
So if we did not force + recovery and there is only one good tablespace, ignore + any bad tablespaces. */ + if (valid_tablespaces_found > 1 || srv_force_recovery > 0) { + /* If the file is not open it cannot be valid. */ + ut_ad(df_default.is_open() || !df_default.is_valid()); + ut_ad(df_remote.is_open() || !df_remote.is_valid()); + + /* Having established that, this is an easy way to + look for corrupted data files. */ + if (df_default.is_open() != df_default.is_valid() + || df_remote.is_open() != df_remote.is_valid()) { + goto corrupted; + } +error: + local_err = DB_ERROR; + goto func_exit; + } + + /* There is only one valid tablespace found and we did + not use srv_force_recovery during REDO. Use this one + tablespace and clean up invalid tablespace pointers */ + if (df_default.is_open() && !df_default.is_valid()) { + df_default.close(); + tablespaces_found--; + } + + if (df_remote.is_open() && !df_remote.is_valid()) { + df_remote.close(); + tablespaces_found--; + } + } + + /* At this point, there should be only one filepath. */ + ut_a(tablespaces_found == 1); + ut_a(valid_tablespaces_found == 1); + +skip_validate: + const byte* first_page = + df_default.is_open() ? df_default.get_first_page() : + df_remote.get_first_page(); + + fil_space_crypt_t* crypt_data = first_page + ? fil_space_read_crypt_data(fil_space_t::zip_size(flags), + first_page) + : NULL; + + mysql_mutex_lock(&fil_system.mutex); + space = fil_space_t::create(id, flags, purpose, crypt_data); + if (!space) { + mysql_mutex_unlock(&fil_system.mutex); + goto error; + } + + /* We do not measure the size of the file, that is why + we pass the 0 below */ + + space->add( + df_remote.is_open() ? df_remote.filepath() : + df_default.filepath(), OS_FILE_CLOSED, 0, false, true); + mysql_mutex_unlock(&fil_system.mutex); + + if (must_validate && !srv_read_only_mode) { + df_remote.close(); + df_default.close(); + if (space->acquire()) { + if (purpose != FIL_TYPE_IMPORT) { + fsp_flags_try_adjust(space, flags + & ~FSP_FLAGS_MEM_MASK); + } + space->release(); + } + } + + goto func_exit; +} + +/** Discover the correct IBD file to open given a remote or missing +filepath from the REDO log. Administrators can move a crashed +database to another location on the same machine and try to recover it. +Remote IBD files might be moved as well to the new location. + The problem with this is that the REDO log contains the old location +which may still be accessible. During recovery, if files are found in +both locations, we can choose one based on these priorities: +1. Default location +2. ISL location +3. REDO location +@param[in] space_id tablespace ID +@param[in] df Datafile object with path from redo +@return true if a valid datafile was found, false if not */ +static +bool +fil_ibd_discover( + ulint space_id, + Datafile& df) +{ + Datafile df_def_per; /* default file-per-table datafile */ + RemoteDatafile df_rem_per; /* remote file-per-table datafile */ + + /* Look for the datafile in the default location. */ + const char* filename = df.filepath(); + const char* basename = base_name(filename); + + /* If this datafile is file-per-table it will have a schema dir.
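+ For example, with filename "/var/lib/mysql/db1/t1.ibd" (an
+ illustrative path) the backward scan below stops after seeing two
+ separators, leaving db pointing at "db1/t1.ibd", which is then probed
+ as a file-per-table candidate relative to the current directory.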
*/ + ulint sep_found = 0; + const char* db = basename; + for (; db > filename && sep_found < 2; db--) { + switch (db[0]) { +#ifdef _WIN32 + case '\\': +#endif + case '/': + sep_found++; + } + } + if (sep_found == 2) { + db += 2; + df_def_per.init(0); + df_def_per.set_filepath(db); + if (df_def_per.open_read_only(false) == DB_SUCCESS + && df_def_per.validate_for_recovery() == DB_SUCCESS + && df_def_per.space_id() == space_id) { + df.set_filepath(df_def_per.filepath()); + df.open_read_only(false); + return(true); + } + + /* Look for a remote file-per-table tablespace. */ + + switch (srv_operation) { + case SRV_OPERATION_BACKUP: + case SRV_OPERATION_RESTORE_DELTA: + case SRV_OPERATION_BACKUP_NO_DEFER: + ut_ad(0); + break; + case SRV_OPERATION_RESTORE_EXPORT: + case SRV_OPERATION_RESTORE: + break; + case SRV_OPERATION_NORMAL: + case SRV_OPERATION_EXPORT_RESTORED: + size_t len= strlen(db); + if (len <= 4 || strcmp(db + len - 4, dot_ext[IBD])) { + break; + } + df_rem_per.open_link_file({db, len - 4}); + + if (!df_rem_per.filepath()) { + break; + } + + /* An ISL file was found with contents. */ + if (df_rem_per.open_read_only(false) != DB_SUCCESS + || df_rem_per.validate_for_recovery() + != DB_SUCCESS) { + + /* Assume that this ISL file is intended to + be used. Do not continue looking for another + if this file cannot be opened or is not + a valid IBD file. */ + ib::error() << "ISL file '" + << df_rem_per.link_filepath() + << "' was found but the linked file '" + << df_rem_per.filepath() + << "' could not be opened or is" + " not correct."; + return(false); + } + + /* Use this file if it has the space_id from the + FILE_ record. */ + if (df_rem_per.space_id() == space_id) { + df.set_filepath(df_rem_per.filepath()); + df.open_read_only(false); + return(true); + } + + /* Since old MLOG records can use the same basename + in multiple CREATE/DROP TABLE sequences, this ISL + file could be pointing to a later version of this + basename.ibd file which has a different space_id. + Keep looking. */ + } + } + + /* No ISL files were found in the default location. Use the location + given in the redo log. */ + if (df.open_read_only(false) == DB_SUCCESS + && df.validate_for_recovery() == DB_SUCCESS + && df.space_id() == space_id) { + return(true); + } + + /* A datafile was not discovered for the filename given. */ + return(false); +} + +bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name) +{ + if (crypt_data->is_key_found()) + return true; + sql_print_error("InnoDB: Encryption key is not found for %s", f_name); + crypt_data->~fil_space_crypt_t(); + ut_free(crypt_data); + return false; +} + +/** Open an ibd tablespace and add it to the InnoDB data structures. +This is similar to fil_ibd_open() except that it is used while processing +the REDO log, so the data dictionary is not available and very little +validation is done. The tablespace name is extracted from the +dbname/tablename.ibd portion of the filename, which assumes that the file +is a file-per-table tablespace. Any name will do for now. General +tablespace names will be read from the dictionary after it has been +recovered. The tablespace flags are read at this time from the first page +of the file in validate_for_recovery().
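+For example, a FILE_CREATE or FILE_MODIFY record naming "./db1/t1.ibd"
+is loaded here under the provisional name "db1/t1"; as noted above, any
+general tablespace name is corrected later from the recovered dictionary.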
+@param[in] space_id tablespace ID +@param[in] filename path/to/databasename/tablename.ibd +@param[out] space the tablespace, or NULL on error +@return status of the operation */ +enum fil_load_status +fil_ibd_load(uint32_t space_id, const char *filename, fil_space_t *&space) +{ + /* If a space is already in the file system cache with this + space ID, then there is nothing to do. */ + mysql_mutex_lock(&fil_system.mutex); + space = fil_space_get_by_id(space_id); + mysql_mutex_unlock(&fil_system.mutex); + + if (space) { + /* Compare the filename we are trying to open with the + filename from the first node of the tablespace we opened + previously. Fail if it is different. */ + fil_node_t* node = UT_LIST_GET_FIRST(space->chain); + if (0 != strcmp(innobase_basename(filename), + innobase_basename(node->name))) { + ib::info() + << "Ignoring data file '" << filename + << "' with space ID " << space->id + << ". Another data file called " << node->name + << " exists with the same space ID."; + space = NULL; + return(FIL_LOAD_ID_CHANGED); + } + return(FIL_LOAD_OK); + } + + if (srv_operation == SRV_OPERATION_RESTORE) { + /* Replace absolute DATA DIRECTORY file paths with + short names relative to the backup directory. */ + const char* name = strrchr(filename, '/'); +#ifdef _WIN32 + if (const char *last = strrchr(filename, '\\')) { + if (last > name) { + name = last; + } + } +#endif + if (name) { + while (--name > filename +#ifdef _WIN32 + && *name != '\\' +#endif + && *name != '/'); + if (name > filename) { + filename = name + 1; + } + } + } + + Datafile file; + file.set_filepath(filename); + file.open_read_only(false); + + if (!file.is_open()) { + /* The file has been moved or it is a remote datafile. */ + if (!fil_ibd_discover(space_id, file) + || !file.is_open()) { + return(FIL_LOAD_NOT_FOUND); + } + } + + os_offset_t size; + bool deferred_space = false; + + /* Read and validate the first page of the tablespace. + Assign a tablespace name based on the tablespace type. */ + switch (file.validate_for_recovery()) { + os_offset_t minimum_size; + case DB_SUCCESS: + deferred_space = file.m_defer; + + if (deferred_space) { + goto tablespace_check; + } + + if (file.space_id() != space_id) { + return(FIL_LOAD_ID_CHANGED); + } +tablespace_check: + /* Get and test the file size. */ + size = os_file_get_size(file.handle()); + + /* Every .ibd file is created >= 4 pages in size. + Smaller files cannot be OK. */ + minimum_size = os_offset_t(FIL_IBD_FILE_INITIAL_SIZE) + << srv_page_size_shift; + + if (size == static_cast<os_offset_t>(-1)) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib::error() << "Could not measure the size of" + " single-table tablespace file '" + << file.filepath() << "'"; + } else if (deferred_space) { + return FIL_LOAD_DEFER; + } else if (size < minimum_size) { + ib::error() << "The size of tablespace file '" + << file.filepath() << "' is only " << size + << ", should be at least " << minimum_size + << "!"; + } else { + /* Everything is fine so far. */ + break; + } + + /* fall through */ + + case DB_TABLESPACE_EXISTS: + return(FIL_LOAD_INVALID); + + default: + return(FIL_LOAD_NOT_FOUND); + } + + ut_ad(space == NULL); + + /* Adjust the memory-based flags that would normally be set by + dict_tf_to_fsp_flags(). In recovery, we have no data dictionary.
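+(As a worked example of the size check earlier in this function: with
+the default 16KiB page size, srv_page_size_shift = 14, the minimum is
+4 << 14 = 65536 bytes, so any .ibd file shorter than 64KiB is reported
+and treated as invalid.)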
*/ + uint32_t flags = file.flags(); + if (fil_space_t::is_compressed(flags)) { + flags |= page_zip_level + << FSP_FLAGS_MEM_COMPRESSION_LEVEL; + } + + const byte* first_page = file.get_first_page(); + fil_space_crypt_t* crypt_data = first_page + ? fil_space_read_crypt_data(fil_space_t::zip_size(flags), + first_page) + : NULL; + + if (crypt_data && !fil_crypt_check(crypt_data, filename)) { + return FIL_LOAD_INVALID; + } + + mysql_mutex_lock(&fil_system.mutex); + + space = fil_space_t::create( + space_id, flags, FIL_TYPE_TABLESPACE, crypt_data); + + if (space == NULL) { + mysql_mutex_unlock(&fil_system.mutex); + return(FIL_LOAD_INVALID); + } + + ut_ad(space->id == file.space_id()); + ut_ad(space->id == space_id); + + /* We do not use the size information we have about the file, because + the rounding formula for extents and pages is somewhat complex; we + let fil_node_open() do that task. */ + + space->add(file.filepath(), OS_FILE_CLOSED, 0, false, false); + mysql_mutex_unlock(&fil_system.mutex); + + return(FIL_LOAD_OK); +} + +/** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations. +(Typically when upgrading from MariaDB 10.1.0..10.1.20.) +@param[in,out] space tablespace +@param[in] flags desired tablespace flags */ +void fsp_flags_try_adjust(fil_space_t *space, uint32_t flags) +{ + ut_ad(!srv_read_only_mode); + ut_ad(fil_space_t::is_valid_flags(flags, space->id)); + if (space->full_crc32() || fil_space_t::full_crc32(flags)) { + return; + } + if (!space->size && (space->purpose != FIL_TYPE_TABLESPACE + || !space->get_size())) { + return; + } + /* This code is executed during server startup while no + connections are allowed. We do not need to protect against + DROP TABLE by fil_space_acquire(). */ + mtr_t mtr; + mtr.start(); + if (buf_block_t* b = buf_page_get( + page_id_t(space->id, 0), space->zip_size(), + RW_X_LATCH, &mtr)) { + uint32_t f = fsp_header_get_flags(b->page.frame); + if (fil_space_t::full_crc32(f)) { + goto func_exit; + } + if (fil_space_t::is_flags_equal(f, flags)) { + goto func_exit; + } + /* Suppress the message if only the DATA_DIR flag differs. */ + if ((f ^ flags) & ~(1U << FSP_FLAGS_POS_RESERVED)) { + ib::warn() + << "adjusting FSP_SPACE_FLAGS of file '" + << UT_LIST_GET_FIRST(space->chain)->name + << "' from " << ib::hex(f) + << " to " << ib::hex(flags); + } + mtr.set_named_space(space); + mtr.write<4,mtr_t::FORCED>(*b, + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + + b->page.frame, flags); + } +func_exit: + mtr.commit(); +} + +/** Determine if a matching tablespace exists in the InnoDB tablespace +memory cache. Note that if we have not done a crash recovery at the database +startup, there may be many tablespaces which are not yet in the memory cache. +@param[in] id Tablespace ID +@param[in] table_flags table flags +@return the tablespace +@retval NULL if no matching tablespace exists in the memory cache */ +fil_space_t *fil_space_for_table_exists_in_mem(uint32_t id, + uint32_t table_flags) +{ + const uint32_t expected_flags = dict_tf_to_fsp_flags(table_flags); + + mysql_mutex_lock(&fil_system.mutex); + if (fil_space_t* space = fil_space_get_by_id(id)) { + uint32_t tf = expected_flags & ~FSP_FLAGS_MEM_MASK; + uint32_t sf = space->flags & ~FSP_FLAGS_MEM_MASK; + + if (!fil_space_t::is_flags_equal(tf, sf) + && !fil_space_t::is_flags_equal(sf, tf)) { + goto func_exit; + } + + /* Adjust the flags that are in FSP_FLAGS_MEM_MASK. + FSP_SPACE_FLAGS will not be written back here.
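+ In fsp_flags_try_adjust() above, the warning is suppressed when only
+ the reserved DATA_DIR bit differs, i.e. when
+ ((f ^ flags) & ~(1U << FSP_FLAGS_POS_RESERVED)) == 0, while the forced
+ 4-byte write of the corrected FSP_SPACE_FLAGS happens either way;
+ here, by contrast, only the in-memory FSP_FLAGS_MEM_MASK bits are
+ patched: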
*/ + space->flags = (space->flags & ~FSP_FLAGS_MEM_MASK) + | (expected_flags & FSP_FLAGS_MEM_MASK); + mysql_mutex_unlock(&fil_system.mutex); + if (!srv_read_only_mode) { + fsp_flags_try_adjust(space, expected_flags + & ~FSP_FLAGS_MEM_MASK); + } + return space; + } + +func_exit: + mysql_mutex_unlock(&fil_system.mutex); + return NULL; +} + +/*============================ FILE I/O ================================*/ + +/** Report information about an invalid page access. */ +ATTRIBUTE_COLD +static void fil_invalid_page_access_msg(const char *name, + os_offset_t offset, ulint len, + bool is_read) +{ + sql_print_error("%s %zu bytes at " UINT64PF + " outside the bounds of the file: %s", + is_read + ? "InnoDB: Trying to read" + : "[FATAL] InnoDB: Trying to write", len, offset, name); + if (!is_read) + abort(); +} + +/** Update the data structures on write completion */ +inline void fil_node_t::complete_write() +{ + mysql_mutex_assert_not_owner(&fil_system.mutex); + + if (space->purpose != FIL_TYPE_TEMPORARY && + srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC && + space->set_needs_flush()) + { + mysql_mutex_lock(&fil_system.mutex); + if (!space->is_in_unflushed_spaces) + { + space->is_in_unflushed_spaces= true; + fil_system.unflushed_spaces.push_front(*space); + } + mysql_mutex_unlock(&fil_system.mutex); + } +} + +/** Read or write data. +@param type I/O context +@param offset offset in bytes +@param len number of bytes +@param buf the data to be read or written +@param bpage buffer block (for type.is_async() completion callback) +@return status and file descriptor */ +fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len, + void *buf, buf_page_t *bpage) +{ + ut_ad(referenced()); + ut_ad(offset % UNIV_ZIP_SIZE_MIN == 0); + ut_ad(len % 512 == 0); /* page_compressed */ + ut_ad(fil_validate_skip()); + ut_ad(type.is_read() || type.is_write()); + ut_ad(type.type != IORequest::DBLWR_BATCH); + + if (type.is_read()) { + srv_stats.data_read.add(len); + } else { + ut_ad(!srv_read_only_mode || this == fil_system.temp_space); + srv_stats.data_written.add(len); + } + + fil_node_t* node= UT_LIST_GET_FIRST(chain); + ut_ad(node); + ulint p = static_cast<ulint>(offset >> srv_page_size_shift); + dberr_t err; + + if (type.type == IORequest::READ_ASYNC && is_stopping()) { + err = DB_TABLESPACE_DELETED; + node = nullptr; + goto release; + } + + DBUG_EXECUTE_IF("intermittent_recovery_failure", + if (type.is_read() && !(~get_rnd_value() & 0x3ff0)) + goto io_error;); + + DBUG_EXECUTE_IF("intermittent_read_failure", + if (srv_was_started && type.is_read() && + !(~get_rnd_value() & 0x3ff0)) goto io_error;); + + if (UNIV_LIKELY_NULL(UT_LIST_GET_NEXT(chain, node))) { + ut_ad(this == fil_system.sys_space + || this == fil_system.temp_space); + ut_ad(!(offset & ((1 << srv_page_size_shift) - 1))); + + while (node->size <= p) { + p -= node->size; + node = UT_LIST_GET_NEXT(chain, node); + if (!node) { +fail: + if (type.type != IORequest::READ_ASYNC) { + fil_invalid_page_access_msg( + node->name, + offset, len, + type.is_read()); + } +#ifndef DBUG_OFF +io_error: +#endif + set_corrupted(); + err = DB_CORRUPTION; + node = nullptr; + goto release; + } + } + + offset = os_offset_t{p} << srv_page_size_shift; + } + + if (UNIV_UNLIKELY(node->size <= p)) { + goto fail; + } + + if (type.type == IORequest::PUNCH_RANGE) { + err = os_file_punch_hole(node->handle, offset, len); + /* If punch hole is not supported, mark the file so that + we will not attempt it again */ + if (UNIV_UNLIKELY(err == DB_IO_NO_PUNCH_HOLE)) { + node->punch_hole =
false; + err = DB_SUCCESS; + } + goto release_sync_write; + } else { + /* Queue the aio request */ + err = os_aio(IORequest{bpage, type.slot, node, type.type}, + buf, offset, len); + } + + if (!type.is_async()) { + if (type.is_write()) { +release_sync_write: + node->complete_write(); +release: + release(); + goto func_exit; + } + ut_ad(fil_validate_skip()); + } + if (err != DB_SUCCESS) { + goto release; + } +func_exit: + return {err, node}; +} + +#include <tpool.h> + +void IORequest::write_complete(int io_error) const +{ + ut_ad(fil_validate_skip()); + ut_ad(node); + ut_ad(is_write()); + node->complete_write(); + + if (!bpage) + { + ut_ad(!srv_read_only_mode); + if (type == IORequest::DBLWR_BATCH) + buf_dblwr.flush_buffered_writes_completed(*this); + else + ut_ad(type == IORequest::WRITE_ASYNC); + } + else + buf_page_write_complete(*this, io_error); + + node->space->release(); +} + +void IORequest::read_complete(int io_error) const +{ + ut_ad(fil_validate_skip()); + ut_ad(node); + ut_ad(is_read()); + ut_ad(bpage); + + /* IMPORTANT: since i/o handling for reads will read also the insert + buffer in fil_system.sys_space, we have to be very careful not to + introduce deadlocks. We never close fil_system.sys_space data files + and never issue asynchronous reads of change buffer pages. */ + const page_id_t id(bpage->id()); + + if (UNIV_UNLIKELY(io_error != 0)) + { + sql_print_error("InnoDB: Read error %d of page " UINT32PF " in file %s", + io_error, id.page_no(), node->name); + buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX); + corrupted: + if (recv_recovery_is_on() && !srv_force_recovery) + { + mysql_mutex_lock(&recv_sys.mutex); + recv_sys.set_corrupt_fs(); + mysql_mutex_unlock(&recv_sys.mutex); + } + } + else if (dberr_t err= bpage->read_complete(*node)) + { + if (err != DB_FAIL) + ib::error() << "Failed to read page " << id.page_no() + << " from file '" << node->name << "': " << err; + goto corrupted; + } + + node->space->release(); +} + +/** Flush to disk the writes in file spaces of the given type +possibly cached by the OS. */ +void fil_flush_file_spaces() +{ + if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) + { + ut_d(mysql_mutex_lock(&fil_system.mutex)); + ut_ad(fil_system.unflushed_spaces.empty()); + ut_d(mysql_mutex_unlock(&fil_system.mutex)); + return; + } + +rescan: + mysql_mutex_lock(&fil_system.mutex); + + for (fil_space_t &space : fil_system.unflushed_spaces) + { + if (space.needs_flush_not_stopping()) + { + space.reacquire(); + mysql_mutex_unlock(&fil_system.mutex); + space.flush_low(); + space.release(); + goto rescan; + } + } + + mysql_mutex_unlock(&fil_system.mutex); +} + +/** Functor to validate the file node list of a tablespace. */ +struct Check { + /** Total size of file nodes visited so far */ + ulint size; + /** Total number of open files visited so far */ + ulint n_open; + + /** Constructor */ + Check() : size(0), n_open(0) {} + + /** Visit a file node + @param[in] elem file node to visit */ + void operator()(const fil_node_t* elem) + { + n_open += elem->is_open(); + size += elem->size; + } + + /** Validate a tablespace.
+ @param[in] space tablespace to validate + @return number of open file nodes */ + static ulint validate(const fil_space_t* space) + { + mysql_mutex_assert_owner(&fil_system.mutex); + Check check; + ut_list_validate(space->chain, check); + ut_a(space->size == check.size); + + switch (space->id) { + case TRX_SYS_SPACE: + ut_ad(fil_system.sys_space == NULL + || fil_system.sys_space == space); + break; + case SRV_TMP_SPACE_ID: + ut_ad(fil_system.temp_space == NULL + || fil_system.temp_space == space); + break; + default: + break; + } + + return(check.n_open); + } +}; + +/******************************************************************//** +Checks the consistency of the tablespace cache. +@return true if ok */ +bool fil_validate() +{ + ulint n_open = 0; + + mysql_mutex_lock(&fil_system.mutex); + + for (fil_space_t &space : fil_system.space_list) { + n_open += Check::validate(&space); + } + + ut_a(fil_system.n_open == n_open); + + mysql_mutex_unlock(&fil_system.mutex); + + return(true); +} + +/*********************************************************************//** +Sets the file page type. */ +void +fil_page_set_type( +/*==============*/ + byte* page, /*!< in/out: file page */ + ulint type) /*!< in: type */ +{ + ut_ad(page); + + mach_write_to_2(page + FIL_PAGE_TYPE, type); +} + +/********************************************************************//** +Delete the tablespace file and any related files like .cfg. +This should not be called for temporary tables. +@param[in] ibd_filepath File path of the IBD tablespace */ +void fil_delete_file(const char *ibd_filepath) +{ + ib::info() << "Deleting " << ibd_filepath; + os_file_delete_if_exists(innodb_data_file_key, ibd_filepath, nullptr); + + if (char *cfg_filepath= fil_make_filepath(ibd_filepath, + fil_space_t::name_type{}, CFG, + false)) + { + os_file_delete_if_exists(innodb_data_file_key, cfg_filepath, nullptr); + ut_free(cfg_filepath); + } +} + +#ifdef UNIV_DEBUG +/** Check that a tablespace is valid for mtr_commit(). +@param[in] space persistent tablespace that has been changed */ +static +void +fil_space_validate_for_mtr_commit( + const fil_space_t* space) +{ + mysql_mutex_assert_not_owner(&fil_system.mutex); + ut_ad(space != NULL); + ut_ad(space->purpose == FIL_TYPE_TABLESPACE); + ut_ad(!is_predefined_tablespace(space->id)); + + /* We are serving mtr_commit(). While there is an active + mini-transaction, we should have !space->stop_new_ops. This is + guaranteed by meta-data locks or transactional locks. */ + ut_ad(!space->is_stopping() + || space->is_being_truncated /* fil_truncate_prepare() */ + || space->referenced()); +} +#endif /* UNIV_DEBUG */ + +/** Note that a non-predefined persistent tablespace has been modified +by redo log. +@param[in,out] space tablespace */ +void +fil_names_dirty( + fil_space_t* space) +{ +#ifndef SUX_LOCK_GENERIC + ut_ad(log_sys.latch.is_write_locked()); +#endif + ut_ad(recv_recovery_is_on()); + ut_ad(log_sys.get_lsn() != 0); + ut_ad(space->max_lsn == 0); + ut_d(fil_space_validate_for_mtr_commit(space)); + + fil_system.named_spaces.push_back(*space); + space->max_lsn = log_sys.get_lsn(); +} + +/** Write a FILE_MODIFY record when a non-predefined persistent +tablespace was modified for the first time since fil_names_clear(). 
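+In outline (an editorial sketch of the protocol): the first commit that
+modifies a tablespace after a checkpoint stamps space->max_lsn and links
+the space into fil_system.named_spaces via fil_names_dirty() above; at
+the next checkpoint, fil_names_clear() below re-emits FILE_MODIFY for
+spaces that are still dirty and unlinks those whose max_lsn precedes the
+checkpoint LSN.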
*/ +ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void mtr_t::name_write() +{ +#ifndef SUX_LOCK_GENERIC + ut_ad(log_sys.latch.is_write_locked()); +#endif + ut_d(fil_space_validate_for_mtr_commit(m_user_space)); + ut_ad(!m_user_space->max_lsn); + m_user_space->max_lsn= log_sys.get_lsn(); + + fil_system.named_spaces.push_back(*m_user_space); + ut_ad(UT_LIST_GET_LEN(m_user_space->chain) == 1); + + mtr_t mtr; + mtr.start(); + fil_name_write(m_user_space->id, + UT_LIST_GET_FIRST(m_user_space->chain)->name, + &mtr); + mtr.commit_files(); +} + +/** On a log checkpoint, reset fil_names_dirty_and_write() flags +and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT. +@param lsn checkpoint LSN +@return current LSN */ +lsn_t fil_names_clear(lsn_t lsn) +{ + mtr_t mtr; + +#ifndef SUX_LOCK_GENERIC + ut_ad(log_sys.latch.is_write_locked()); +#endif + ut_ad(lsn); + ut_ad(log_sys.is_latest()); + + mtr.start(); + + for (auto it = fil_system.named_spaces.begin(); + it != fil_system.named_spaces.end(); ) { + if (mtr.get_log_size() + strlen(it->chain.start->name) + >= recv_sys.MTR_SIZE_MAX - (3 + 5)) { + /* Prevent log parse buffer overflow */ + mtr.commit_files(); + mtr.start(); + } + + auto next = std::next(it); + + ut_ad(it->max_lsn > 0); + if (it->max_lsn < lsn) { + /* The tablespace was last dirtied before the + checkpoint LSN. Remove it from the list, so + that if the tablespace is not going to be + modified any more, subsequent checkpoints will + avoid calling fil_names_write() on it. */ + it->max_lsn = 0; + fil_system.named_spaces.erase(it); + } + + /* max_lsn is the last LSN where fil_names_dirty_and_write() + was called. If we kept track of "min_lsn" (the first LSN + where max_lsn turned nonzero), we could avoid the + fil_names_write() call if min_lsn > lsn. */ + ut_ad(UT_LIST_GET_LEN((*it).chain) == 1); + fil_name_write((*it).id, UT_LIST_GET_FIRST((*it).chain)->name, + &mtr); + it = next; + } + + return mtr.commit_files(lsn); +} + +/* Unit Tests */ +#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH +#define MF fil_make_filepath +#define DISPLAY ib::info() << path +void +test_make_filepath() +{ + char* path; + const char* long_path = + "this/is/a/very/long/path/including/a/very/" + "looooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooong" + "/folder/name"; + path = MF("/this/is/a/path/with/a/filename", NULL, IBD, false); DISPLAY; + path = MF("/this/is/a/path/with/a/filename", NULL, ISL, false); DISPLAY; + path = MF("/this/is/a/path/with/a/filename", NULL, CFG, false); DISPLAY; + path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY; + path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY; + path = MF("/this/is/a/path/with/a/filename.dat", NULL, IBD, false); DISPLAY; + path = MF(NULL, "tablespacename", NO_EXT, false); DISPLAY; + path = MF(NULL, "tablespacename", IBD, false); DISPLAY; + path = MF(NULL, "dbname/tablespacename", NO_EXT, false); DISPLAY; + path = MF(NULL, "dbname/tablespacename", IBD, false); DISPLAY; + path = MF(NULL, "dbname/tablespacename", ISL, false); DISPLAY; + path = 
MF(NULL, "dbname/tablespacename", CFG, false); DISPLAY; + path = MF(NULL, "dbname\\tablespacename", NO_EXT, false); DISPLAY; + path = MF(NULL, "dbname\\tablespacename", IBD, false); DISPLAY; + path = MF("/this/is/a/path", "dbname/tablespacename", IBD, false); DISPLAY; + path = MF("/this/is/a/path", "dbname/tablespacename", IBD, true); DISPLAY; + path = MF("./this/is/a/path", "dbname/tablespacename.ibd", IBD, true); DISPLAY; + path = MF("this\\is\\a\\path", "dbname/tablespacename", IBD, true); DISPLAY; + path = MF("/this/is/a/path", "dbname\\tablespacename", IBD, true); DISPLAY; + path = MF(long_path, NULL, IBD, false); DISPLAY; + path = MF(long_path, "tablespacename", IBD, false); DISPLAY; + path = MF(long_path, "tablespacename", IBD, true); DISPLAY; +} +#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */ +/* @} */ + +/** Determine the block size of the data file. +@param[in] space tablespace +@param[in] offset page number +@return block size */ +ulint fil_space_get_block_size(const fil_space_t *space, unsigned offset) +{ + ulint block_size = 512; + + for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + block_size = node->block_size; + if (node->size > offset) { + ut_ad(node->size <= 0xFFFFFFFFU); + break; + } + offset -= static_cast(node->size); + } + + /* Currently supporting block size up to 4K, + fall back to default if bigger requested. */ + if (block_size > 4096) { + block_size = 512; + } + + return block_size; +} + +/** @return the tablespace name (databasename/tablename) */ +fil_space_t::name_type fil_space_t::name() const +{ + switch (id) { + case 0: + return name_type{"innodb_system", 13}; + case SRV_TMP_SPACE_ID: + return name_type{"innodb_temporary", 16}; + } + + if (!UT_LIST_GET_FIRST(chain) || srv_is_undo_tablespace(id)) + return name_type{}; + + ut_ad(purpose != FIL_TYPE_TEMPORARY); + ut_ad(UT_LIST_GET_LEN(chain) == 1); + + const char *path= UT_LIST_GET_FIRST(chain)->name; + const char *sep= strchr(path, '/'); + ut_ad(sep); + + while (const char *next_sep= strchr(sep + 1, '/')) + path= sep + 1, sep= next_sep; + +#ifdef _WIN32 + if (const char *last_sep= strchr(path, '\\')) + if (last_sep < sep) + path= last_sep; +#endif + + size_t len= strlen(path); + ut_ad(len > 4); + len-= 4; + ut_ad(!strcmp(&path[len], DOT_IBD)); + + return name_type{path, len}; +} + +#ifdef UNIV_DEBUG + +fil_space_t *fil_space_t::next_in_space_list() +{ + space_list_t::iterator it(this); + auto end= fil_system.space_list.end(); + if (it == end) + return nullptr; + ++it; + return it == end ? nullptr : &*it; +} + +fil_space_t *fil_space_t::prev_in_space_list() +{ + space_list_t::iterator it(this); + if (it == fil_system.space_list.begin()) + return nullptr; + --it; + return &*it; +} + +fil_space_t *fil_space_t::next_in_unflushed_spaces() +{ + sized_ilist::iterator it(this); + auto end= fil_system.unflushed_spaces.end(); + if (it == end) + return nullptr; + ++it; + return it == end ? nullptr : &*it; +} + +fil_space_t *fil_space_t::prev_in_unflushed_spaces() +{ + sized_ilist::iterator it(this); + if (it == fil_system.unflushed_spaces.begin()) + return nullptr; + --it; + return &*it; +} + +#endif diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc new file mode 100644 index 00000000..16aea2a7 --- /dev/null +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -0,0 +1,584 @@ +/***************************************************************************** + +Copyright (C) 2013, 2021, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fil/fil0pagecompress.cc +Implementation for page compressed file spaces. + +Created 11/12/2013 Jan Lindström jan.lindstrom@mariadb.com +Updated 14/02/2015 +***********************************************************************/ + +#include "fil0fil.h" +#include "fil0pagecompress.h" + +#include + +#include "mem0mem.h" +#include "hash0hash.h" +#include "os0file.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0page.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "row0mysql.h" +#include "buf0lru.h" +#include "ibuf0ibuf.h" +#include "zlib.h" +#ifdef __linux__ +#include <linux/fs.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#endif +#include "row0mysql.h" +#include "lz4.h" +#include "lzo/lzo1x.h" +#include "lzma.h" +#include "bzlib.h" +#include "snappy-c.h" + +/** Compress a page for the given compression algorithm.
+@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] header_len header length of the page +@param[in] comp_algo compression algorithm +@param[in] comp_level compression level +@return actual length of compressed page data +@retval 0 if the page was not compressed */ +static ulint fil_page_compress_low( + const byte* buf, + byte* out_buf, + ulint header_len, + ulint comp_algo, + unsigned comp_level) +{ + ulint write_size = srv_page_size - header_len; + + switch (comp_algo) { + default: + ut_ad("unknown compression method" == 0); + /* fall through */ + case PAGE_UNCOMPRESSED: + return 0; + + case PAGE_ZLIB_ALGORITHM: + { + ulong len = uLong(write_size); + if (Z_OK == compress2( + out_buf + header_len, &len, buf, + uLong(srv_page_size), int(comp_level))) { + return len; + } + } + break; + + case PAGE_LZ4_ALGORITHM: + write_size = LZ4_compress_default( + reinterpret_cast<const char*>(buf), + reinterpret_cast<char*>(out_buf) + header_len, + int(srv_page_size), int(write_size)); + + return write_size; + + case PAGE_LZO_ALGORITHM: { + lzo_uint len = write_size; + + if (LZO_E_OK == lzo1x_1_15_compress( + buf, srv_page_size, + out_buf + header_len, &len, + out_buf + srv_page_size) + && len <= write_size) { + return len; + } + break; + } + + case PAGE_LZMA_ALGORITHM: { + size_t out_pos = 0; + + if (LZMA_OK == lzma_easy_buffer_encode( + comp_level, LZMA_CHECK_NONE, NULL, + buf, srv_page_size, out_buf + header_len, + &out_pos, write_size) + && out_pos <= write_size) { + return out_pos; + } + break; + } + + case PAGE_BZIP2_ALGORITHM: { + unsigned len = unsigned(write_size); + if (BZ_OK == BZ2_bzBuffToBuffCompress( + reinterpret_cast<char*>(out_buf + header_len), + &len, + const_cast<char*>( + reinterpret_cast<const char*>(buf)), + unsigned(srv_page_size), 1, 0, 0) + && len <= write_size) { + return len; + } + break; + } + + case PAGE_SNAPPY_ALGORITHM: { + size_t len = snappy_max_compressed_length(srv_page_size); + + if (SNAPPY_OK == snappy_compress( + reinterpret_cast<const char*>(buf), + srv_page_size, + reinterpret_cast<char*>(out_buf) + header_len, + &len) + && len <= write_size) { + return len; + } + break; + } + } + + return 0; +} + +/** Compress a page_compressed page for full crc32 format. +@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] flags tablespace flags +@param[in] block_size file system block size +@param[in] encrypted whether the page will be subsequently encrypted +@return actual length of compressed page +@retval 0 if the page was not compressed */ +static ulint fil_page_compress_for_full_crc32( + const byte* buf, + byte* out_buf, + uint32_t flags, + ulint block_size, + bool encrypted) +{ + ulint comp_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint comp_algo = fil_space_t::get_compression_algo(flags); + + if (comp_level == 0) { + comp_level = page_zip_level; + } + + const ulint header_len = FIL_PAGE_COMP_ALGO; + + ulint write_size = fil_page_compress_low( + buf, out_buf, header_len, + comp_algo, + static_cast<unsigned>(comp_level)); + + if (write_size == 0) { +fail: + if (comp_algo != PAGE_UNCOMPRESSED) + srv_stats.pages_page_compression_error.inc(); + return 0; + } + + write_size += header_len; + const ulint actual_size = write_size; + /* Write the actual length of the data & page type + for full crc32 format. */ + const bool lsb = fil_space_t::full_crc32_page_compressed_len(flags); + /* In the MSB, store the rounded-up page size.
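+For example, with lsb = 0 and a compressed payload of actual_size = 5000
+bytes, the expression below gives (5000 + 259) & ~255 = 5120: payload
+plus 4-byte checksum rounded up to a multiple of 256, whose high byte
+5120 >> 8 = 20 is what gets stored in FIL_PAGE_TYPE + 1.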
*/ + write_size = (write_size + lsb + (4 + 255)) & ~255; + if (write_size >= srv_page_size) { + goto fail; + } + + /* Set up the page header */ + memcpy(out_buf, buf, header_len); + out_buf[FIL_PAGE_TYPE] = 1U << (FIL_PAGE_COMPRESS_FCRC32_MARKER - 8); + out_buf[FIL_PAGE_TYPE + 1] = byte(write_size >> 8); + /* Clean up the buffer for the remaining write_size (except checksum) */ + memset(out_buf + actual_size, 0, write_size - actual_size - 4); + if (lsb) { + /* Store the LSB */ + out_buf[write_size - 5] = byte(actual_size + (1 + 4)); + } + + if (!block_size) { + block_size = 512; + } + + ut_ad(write_size); + if (write_size & (block_size - 1)) { + size_t tmp = write_size; + write_size = (write_size + (block_size - 1)) + & ~(block_size - 1); + memset(out_buf + tmp, 0, write_size - tmp); + } + + srv_stats.page_compression_saved.add(srv_page_size - write_size); + srv_stats.pages_page_compressed.inc(); + + return write_size; +} + +/** Compress a page_compressed page for non full crc32 format. +@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] flags tablespace flags +@param[in] block_size file system block size +@param[in] encrypted whether the page will be subsequently encrypted +@return actual length of compressed page +@retval 0 if the page was not compressed */ +static ulint fil_page_compress_for_non_full_crc32( + const byte* buf, + byte* out_buf, + ulint flags, + ulint block_size, + bool encrypted) +{ + uint comp_level = static_cast<uint>( + FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags)); + ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN; + /* Cache to avoid change during function execution */ + ulint comp_algo = innodb_compression_algorithm; + + if (encrypted) { + header_len += FIL_PAGE_ENCRYPT_COMP_ALGO; + } + + /* If no compression level was provided to this table, use system + default level */ + if (comp_level == 0) { + comp_level = page_zip_level; + } + + ulint write_size = fil_page_compress_low( + buf, out_buf, + header_len, comp_algo, comp_level); + + if (write_size == 0) { + if (comp_algo != PAGE_UNCOMPRESSED) + srv_stats.pages_page_compression_error.inc(); + return 0; + } + + /* Set up the page header */ + memcpy(out_buf, buf, FIL_PAGE_DATA); + /* Set up the checksum */ + mach_write_to_4(out_buf + FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + + /* Set up the compression algorithm */ + mach_write_to_8(out_buf + FIL_PAGE_COMP_ALGO, comp_algo); + + if (encrypted) { + /* Set up the correct page type */ + mach_write_to_2(out_buf + FIL_PAGE_TYPE, + FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED); + + mach_write_to_2(out_buf + FIL_PAGE_DATA + + FIL_PAGE_ENCRYPT_COMP_ALGO, comp_algo); + } else { + /* Set up the correct page type */ + mach_write_to_2(out_buf + FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + } + + /* Set up the actual payload length */ + mach_write_to_2(out_buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE, + write_size); + + ut_ad(mach_read_from_4(out_buf + FIL_PAGE_SPACE_OR_CHKSUM) + == BUF_NO_CHECKSUM_MAGIC); + + ut_ad(mach_read_from_2(out_buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE) + == write_size); + +#ifdef UNIV_DEBUG + bool is_compressed = (mach_read_from_8(out_buf + FIL_PAGE_COMP_ALGO) + == (ulint) comp_algo); + + bool is_encrypted_compressed = + (mach_read_from_2(out_buf + FIL_PAGE_DATA + + FIL_PAGE_ENCRYPT_COMP_ALGO) + == (ulint) comp_algo); +#endif /* UNIV_DEBUG */ + + ut_ad(is_compressed || is_encrypted_compressed); + + write_size+=header_len; + + if (block_size <= 0) { + block_size = 512; + } + + ut_ad(write_size > 0 && block_size > 0); + +
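+	/* Worked example (illustrative numbers): write_size = 5321 on a
+	4096-byte filesystem block is padded by the code below to 8192,
+	with the padding zero-filled so the tail of the block is
+	deterministic on disk. */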
/* Actual write needs to be aligned on block size */ + if (write_size % block_size) { + size_t tmp = write_size; + write_size = (size_t)ut_uint64_align_up( + (ib_uint64_t)write_size, block_size); + /* Clean up the end of buffer */ + memset(out_buf+tmp, 0, write_size - tmp); +#ifdef UNIV_DEBUG + ut_a(write_size > 0 && ((write_size % block_size) == 0)); + ut_a(write_size >= tmp); +#endif + } + + srv_stats.page_compression_saved.add(srv_page_size - write_size); + srv_stats.pages_page_compressed.inc(); + + return write_size; +} + +/** Compress a page_compressed page before writing to a data file. +@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] flags tablespace flags +@param[in] block_size file system block size +@param[in] encrypted whether the page will be subsequently encrypted +@return actual length of compressed page +@retval 0 if the page was not compressed */ +ulint fil_page_compress( + const byte* buf, + byte* out_buf, + uint32_t flags, + ulint block_size, + bool encrypted) +{ + /* The full_crc32 page_compressed format assumes this. */ + ut_ad(!(block_size & 255)); + ut_ad(ut_is_2pow(block_size)); + + /* Let's not compress file space header or + extent descriptor */ + switch (fil_page_get_type(buf)) { + case 0: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_PAGE_COMPRESSED: + return 0; + } + + if (fil_space_t::full_crc32(flags)) { + return fil_page_compress_for_full_crc32( + buf, out_buf, flags, block_size, encrypted); + } + + return fil_page_compress_for_non_full_crc32( + buf, out_buf, flags, block_size, encrypted); +} + +/** Decompress a page that may be subject to page_compressed compression. +@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@param[in] comp_algo compression algorithm +@param[in] header_len header length of the page +@param[in] actual_size actual size of the page +@return true if the page was decompressed, false on failure */ +static bool fil_page_decompress_low( + byte* tmp_buf, + byte* buf, + ulint comp_algo, + ulint header_len, + ulint actual_size) +{ + switch (comp_algo) { + default: + ib::error() << "Unknown compression algorithm " + << comp_algo; + return false; + case PAGE_ZLIB_ALGORITHM: + { + uLong len = srv_page_size; + return (Z_OK == uncompress(tmp_buf, &len, + buf + header_len, + uLong(actual_size)) + && len == srv_page_size); + } + + case PAGE_LZ4_ALGORITHM: + return LZ4_decompress_safe( + reinterpret_cast<const char*>(buf) + header_len, + reinterpret_cast<char*>(tmp_buf), + static_cast<int>(actual_size), + static_cast<int>(srv_page_size)) == + static_cast<int>(srv_page_size); + + case PAGE_LZO_ALGORITHM: + { + lzo_uint len_lzo = srv_page_size; + return (LZO_E_OK == lzo1x_decompress_safe( + buf + header_len, + actual_size, tmp_buf, &len_lzo, NULL) + && len_lzo == srv_page_size); + } + + case PAGE_LZMA_ALGORITHM: + { + size_t src_pos = 0; + size_t dst_pos = 0; + uint64_t memlimit = UINT64_MAX; + + return LZMA_OK == lzma_stream_buffer_decode( + &memlimit, 0, NULL, buf + header_len, + &src_pos, actual_size, tmp_buf, &dst_pos, + srv_page_size) + && dst_pos == srv_page_size; + } + + case PAGE_BZIP2_ALGORITHM: + { + uint dst_pos = static_cast<uint>(srv_page_size); + return BZ_OK == BZ2_bzBuffToBuffDecompress( + reinterpret_cast<char*>(tmp_buf), + &dst_pos, + reinterpret_cast<char*>(buf) + header_len, + static_cast<uint>(actual_size), 1, 0) + && dst_pos == srv_page_size; + } + + case PAGE_SNAPPY_ALGORITHM: + { + size_t olen = srv_page_size; + + return SNAPPY_OK == snappy_uncompress(
reinterpret_cast<const char*>(buf) + + header_len, + actual_size, + reinterpret_cast<char*>(tmp_buf), &olen) + && olen == srv_page_size; + } + } + + return false; +} + +/** Decompress a page for full crc32 format. +@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@param[in] flags tablespace flags +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +static size_t fil_page_decompress_for_full_crc32(byte *tmp_buf, byte *buf, + uint32_t flags) +{ + ut_ad(fil_space_t::full_crc32(flags)); + bool compressed = false; + size_t size = buf_page_full_crc32_size(buf, &compressed, NULL); + if (!compressed) { + ut_ad(size == srv_page_size); + return size; + } + + if (!fil_space_t::is_compressed(flags)) { + return 0; + } + + if (size >= srv_page_size) { + return 0; + } + + if (fil_space_t::full_crc32_page_compressed_len(flags)) { + compile_time_assert(FIL_PAGE_FCRC32_CHECKSUM == 4); + if (size_t lsb = buf[size - 5]) { + size += lsb - 0x100; + } + size -= 5; + } + + const size_t header_len = FIL_PAGE_COMP_ALGO; + + if (!fil_page_decompress_low(tmp_buf, buf, + fil_space_t::get_compression_algo(flags), + header_len, size - header_len)) { + return 0; + } + + srv_stats.pages_page_decompressed.inc(); + memcpy(buf, tmp_buf, srv_page_size); + return size; +} + +/** Decompress a page for non full crc32 format. +@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +static size_t fil_page_decompress_for_non_full_crc32(byte *tmp_buf, byte *buf) +{ + ulint header_len; + uint comp_algo; + switch (fil_page_get_type(buf)) { + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + header_len= FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_METADATA_LEN; + comp_algo = mach_read_from_2( + FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_ALGO + buf); + break; + case FIL_PAGE_PAGE_COMPRESSED: + header_len = FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN; + if (mach_read_from_6(FIL_PAGE_COMP_ALGO + buf)) { + return 0; + } + comp_algo = mach_read_from_2(FIL_PAGE_COMP_ALGO + 6 + buf); + break; + default: + return srv_page_size; + } + + if (mach_read_from_4(buf + FIL_PAGE_SPACE_OR_CHKSUM) + != BUF_NO_CHECKSUM_MAGIC) { + return 0; + } + + ulint actual_size = mach_read_from_2(buf + FIL_PAGE_DATA + + FIL_PAGE_COMP_SIZE); + + /* Check if payload size is corrupted */ + if (actual_size == 0 || actual_size > srv_page_size - header_len) { + return 0; + } + + if (!fil_page_decompress_low(tmp_buf, buf, comp_algo, header_len, + actual_size)) { + return 0; + } + + srv_stats.pages_page_decompressed.inc(); + memcpy(buf, tmp_buf, srv_page_size); + return actual_size; +} + +/** Decompress a page that may be subject to page_compressed compression.
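+In summary (editorial note): the full_crc32 format derives the
+compressed length from the page frame itself via
+buf_page_full_crc32_size(), while the older format reads it from
+FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE and additionally requires the
+BUF_NO_CHECKSUM_MAGIC marker; both paths decompress into tmp_buf and
+copy the result back over buf on success.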
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@param[in] flags tablespace flags +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +ulint fil_page_decompress(byte *tmp_buf, byte *buf, uint32_t flags) +{ + if (fil_space_t::full_crc32(flags)) { + return fil_page_decompress_for_full_crc32(tmp_buf, buf, flags); + } + + return fil_page_decompress_for_non_full_crc32(tmp_buf, buf); +} diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc new file mode 100644 index 00000000..cafff419 --- /dev/null +++ b/storage/innobase/fsp/fsp0file.cc @@ -0,0 +1,936 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fsp/fsp0file.cc +Tablespace data file implementation + +Created 2013-7-26 by Kevin Lewis +*******************************************************/ + +#include "fil0fil.h" +#include "fsp0types.h" +#include "os0file.h" +#include "page0page.h" +#include "srv0start.h" +#include "log.h" + +/** Release the resources. */ +void +Datafile::shutdown() +{ + close(); + + free_filepath(); + free_first_page(); +} + +/** Create/open a data file. +@param[in] read_only_mode if true, then readonly mode checks are enforced. +@return DB_SUCCESS or error code */ +dberr_t +Datafile::open_or_create(bool read_only_mode) +{ + bool success; + ut_a(m_filepath != NULL); + ut_ad(m_handle == OS_FILE_CLOSED); + + m_handle = os_file_create( + innodb_data_file_key, m_filepath, m_open_flags, + OS_FILE_NORMAL, OS_DATA_FILE, read_only_mode, &success); + + if (!success) { + m_last_os_error = os_file_get_last_error(true); + ib::error() << "Cannot open datafile '" << m_filepath << "'"; + return(DB_CANNOT_OPEN_FILE); + } + + return(DB_SUCCESS); +} + +/** Open a data file in read-only mode to check if it exists so that it +can be validated. 
+@param[in]	strict	whether to issue error messages
+@return DB_SUCCESS or error code */
+dberr_t
+Datafile::open_read_only(bool strict)
+{
+	bool	success = false;
+	ut_ad(m_handle == OS_FILE_CLOSED);
+
+	/* This function can be called for file objects that do not need
+	to be opened, which is the case when the m_filepath is NULL */
+	if (m_filepath == NULL) {
+		return(DB_ERROR);
+	}
+
+	set_open_flags(OS_FILE_OPEN);
+	m_handle = os_file_create_simple_no_error_handling(
+		innodb_data_file_key, m_filepath, m_open_flags,
+		OS_FILE_READ_ONLY, true, &success);
+
+	if (success) {
+		m_exists = true;
+		init_file_info();
+
+		return(DB_SUCCESS);
+	}
+
+	if (strict) {
+		m_last_os_error = os_file_get_last_error(true);
+		ib::error() << "Cannot open datafile for read-only: '"
+			<< m_filepath << "' OS error: " << m_last_os_error;
+	}
+
+	return(DB_CANNOT_OPEN_FILE);
+}
+
+/** Open a data file in read-write mode during start-up so that
+doublewrite pages can be restored and then it can be validated.
+@return DB_SUCCESS or error code */
+inline dberr_t Datafile::open_read_write()
+{
+	bool	success = false;
+	ut_ad(m_handle == OS_FILE_CLOSED);
+	ut_ad(!srv_read_only_mode);
+
+	/* This function can be called for file objects that do not need
+	to be opened, which is the case when the m_filepath is NULL */
+	if (m_filepath == NULL) {
+		return(DB_ERROR);
+	}
+
+	set_open_flags(OS_FILE_OPEN);
+	m_handle = os_file_create_simple_no_error_handling(
+		innodb_data_file_key, m_filepath, m_open_flags,
+		OS_FILE_READ_WRITE, false, &success);
+
+	if (!success) {
+		m_last_os_error = os_file_get_last_error(true);
+		ib::error() << "Cannot open datafile for read-write: '"
+			<< m_filepath << "'";
+		return(DB_CANNOT_OPEN_FILE);
+	}
+
+	m_exists = true;
+
+	init_file_info();
+
+	return(DB_SUCCESS);
+}
+
+/** Initialize OS specific file info. */
+void
+Datafile::init_file_info()
+{
+#ifdef _WIN32
+	GetFileInformationByHandle((os_file_t)m_handle, &m_file_info);
+#else
+	fstat(m_handle, &m_file_info);
+#endif	/* _WIN32 */
+}
+
+/** Close a data file.
+@return DB_SUCCESS or error code */
+dberr_t
+Datafile::close()
+{
+	if (m_handle != OS_FILE_CLOSED) {
+		ibool	success = os_file_close(m_handle);
+		ut_a(success);
+
+		m_handle = OS_FILE_CLOSED;
+	}
+
+	return(DB_SUCCESS);
+}
+
+/** Make a full filepath from a directory path and a filename.
+Prepend the dirpath to filename using the extension given.
+If dirpath is NULL, prepend the default datadir to filepath.
+Store the result in m_filepath.
+@param dirpath	directory path
+@param name	tablespace (table) name
+@param ext	filename extension */
+void Datafile::make_filepath(const char *dirpath, fil_space_t::name_type name,
+			     ib_extention ext)
+{
+	ut_ad(dirpath || name.size());
+	free_filepath();
+	m_filepath= fil_make_filepath(dirpath, name, ext, false);
+	ut_ad(m_filepath);
+	set_filename();
+}
+
+/** Set the filepath by duplicating the filepath sent in. This is the
+name of the file with its extension and absolute or relative path.
+@param[in]	filepath	filepath to set */
+void
+Datafile::set_filepath(const char* filepath)
+{
+	free_filepath();
+	m_filepath = static_cast<char*>(ut_malloc_nokey(strlen(filepath) + 1));
+	::strcpy(m_filepath, filepath);
+	set_filename();
+}
+
+/** Free the filepath buffer. */
+void
+Datafile::free_filepath()
+{
+	if (m_filepath != NULL) {
+		ut_free(m_filepath);
+		m_filepath = NULL;
+		m_filename = NULL;
+	}
+}
+
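+/* Illustration, not part of the upstream source: on POSIX systems two
+paths name the same file exactly when stat() reports the same device
+and inode pair, which is the comparison Datafile::same_as() below
+performs on the cached m_file_info. A standalone equivalent: */
+#if 0
+#include <sys/stat.h>
+
+static bool same_file_posix(const char *path1, const char *path2)
+{
+	struct stat s1, s2;
+	if (stat(path1, &s1) || stat(path2, &s2)) {
+		return false;	/* at least one path cannot be stat()ed */
+	}
+	/* same device, same inode: one underlying file */
+	return s1.st_dev == s2.st_dev && s1.st_ino == s2.st_ino;
+}
+#endif
+
+/** Do a quick test if the filepath provided looks the same as this filepath
+byte by byte.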
If they are two different looking paths to the same file,
+same_as() will be used to show that after the files are opened.
+@param[in]	other	filepath to compare with
+@retval true if it is the same filename by byte comparison
+@retval false if it looks different */
+bool
+Datafile::same_filepath_as(
+	const char* other) const
+{
+	return(0 == strcmp(m_filepath, other));
+}
+
+/** Test if another opened datafile is the same file as this object.
+@param[in]	other	Datafile to compare with
+@return true if it is the same file, else false */
+bool
+Datafile::same_as(
+	const Datafile&	other) const
+{
+#ifdef _WIN32
+	return(m_file_info.dwVolumeSerialNumber
+	       == other.m_file_info.dwVolumeSerialNumber
+	       && m_file_info.nFileIndexHigh
+	       == other.m_file_info.nFileIndexHigh
+	       && m_file_info.nFileIndexLow
+	       == other.m_file_info.nFileIndexLow);
+#else
+	return(m_file_info.st_ino == other.m_file_info.st_ino
+	       && m_file_info.st_dev == other.m_file_info.st_dev);
+#endif	/* _WIN32 */
+}
+
+/** Reads a few significant fields from the first page of the first
+datafile.  The Datafile must already be open.
+@param[in]	read_only_mode	If true, then readonly mode checks are enforced.
+@return DB_SUCCESS or DB_IO_ERROR if page cannot be read */
+dberr_t
+Datafile::read_first_page(bool read_only_mode)
+{
+	if (m_handle == OS_FILE_CLOSED) {
+
+		dberr_t err = open_or_create(read_only_mode);
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+	}
+
+	/* Align the memory for a possible read from a raw device */
+
+	m_first_page = static_cast<byte*>(
+		aligned_malloc(UNIV_PAGE_SIZE_MAX, srv_page_size));
+
+	dberr_t err = DB_ERROR;
+	size_t page_size = UNIV_PAGE_SIZE_MAX;
+
+	/* Don't want unnecessary complaints about partial reads. */
+
+	while (page_size >= UNIV_PAGE_SIZE_MIN) {
+
+		ulint n_read = 0;
+
+		err = os_file_read(
+			IORequestReadPartial, m_handle, m_first_page, 0,
+			page_size, &n_read);
+
+		if (err == DB_SUCCESS) {
+			break;
+		}
+
+		if (err == DB_IO_ERROR && n_read == 0) {
+			break;
+		}
+		if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) {
+			page_size >>= 1;
+		} else if (srv_operation == SRV_OPERATION_BACKUP) {
+			break;
+		} else {
+			ib::info() << "Cannot read first page of '"
+				<< m_filepath << "': " << err;
+			break;
+		}
+	}
+
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	if (m_order == 0) {
+		if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + m_first_page,
+				      FSP_HEADER_OFFSET + FSP_SPACE_ID
+				      + m_first_page, 4)) {
+			ib::error()
+				<< "Inconsistent tablespace ID in "
+				<< m_filepath;
+			return DB_CORRUPTION;
+		}
+
+		m_space_id = mach_read_from_4(FIL_PAGE_SPACE_ID
+					      + m_first_page);
+		m_flags = fsp_header_get_flags(m_first_page);
+		if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) {
+			uint32_t cflags = fsp_flags_convert_from_101(m_flags);
+			if (cflags == UINT32_MAX) {
+				switch (fsp_flags_is_incompatible_mysql(m_flags)) {
+				case 0:
+					sql_print_error("InnoDB: Invalid flags 0x%" PRIx32 " in %s",
+							m_flags, m_filepath);
+					return DB_CORRUPTION;
+				case 3:
+				case 2:
+					sql_print_error("InnoDB: MySQL-8.0 tablespace in %s",
+							m_filepath);
+					break;
+				case 1:
+					sql_print_error("InnoDB: MySQL Encrypted tablespace in %s",
+							m_filepath);
+					break;
+				}
+				sql_print_error("InnoDB: Restart in MySQL for migration/recovery.");
+				return DB_UNSUPPORTED;
+			} else {
+				m_flags = cflags;
+			}
+		}
+	}
+
+	const size_t physical_size = fil_space_t::physical_size(m_flags);
+
+	if (physical_size > page_size) {
+		ib::error() << "File " << m_filepath
+			<< " should be longer than "
+			<< page_size << " bytes";
+		return(DB_CORRUPTION);
+	}
+
+	return(err);
+}
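+
+/* Illustration, not part of the upstream source: read_first_page()
+above cannot know the page size before page 0 has been read, so it
+requests the maximum supported size and halves the request after a
+short read. A standalone sketch of the same probing loop over a plain
+POSIX file descriptor (PAGE_MIN/PAGE_MAX stand in for
+UNIV_PAGE_SIZE_MIN/UNIV_PAGE_SIZE_MAX): */
+#if 0
+#include <unistd.h>
+
+static const size_t PAGE_MIN = 4096, PAGE_MAX = 65536;
+
+static ssize_t probe_first_page(int fd, unsigned char *out)
+{
+	for (size_t size = PAGE_MAX; size >= PAGE_MIN; size >>= 1) {
+		ssize_t n = pread(fd, out, size, 0);
+		if (n == ssize_t(size)) {
+			return n;	/* complete read at this size */
+		}
+		if (n < ssize_t(PAGE_MIN)) {
+			break;	/* shorter than any valid first page */
+		}
+		/* partial read: retry with half the size */
+	}
+	return -1;
+}
+#endif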
+
+/** Free the first page from memory when it is no longer needed. */
+void Datafile::free_first_page()
+{
+	aligned_free(m_first_page);
+	m_first_page= nullptr;
+}
+
+/** Validates the datafile and checks that it conforms with the expected
+space ID and flags.  The file should exist and be successfully opened
+in order for this function to validate it.
+@param[in]	space_id	The expected tablespace ID.
+@param[in]	flags		The expected tablespace flags.
+@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+m_is_valid is also set true on success, else false. */
+dberr_t Datafile::validate_to_dd(uint32_t space_id, uint32_t flags)
+{
+	dberr_t err;
+
+	if (!is_open()) {
+		return DB_ERROR;
+	}
+
+	/* Validate this single-table-tablespace with the data dictionary,
+	but do not compare the DATA_DIR flag, in case the tablespace was
+	remotely located. */
+	err = validate_first_page();
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	flags &= ~FSP_FLAGS_MEM_MASK;
+
+	/* Make sure the datafile we found matched the space ID.
+	If the datafile is a file-per-table tablespace then also match
+	the row format and zip page size. */
+	if (m_space_id == space_id
+	    && (fil_space_t::is_flags_equal(flags, m_flags)
+		|| fil_space_t::is_flags_equal(m_flags, flags))) {
+		/* Datafile matches the tablespace expected. */
+		return(DB_SUCCESS);
+	}
+
+	/* else do not use this tablespace. */
+	m_is_valid = false;
+
+	ib::error() << "Refusing to load '" << m_filepath << "' (id="
+		<< m_space_id << ", flags=" << ib::hex(m_flags)
+		<< "); dictionary contains id="
+		<< space_id << ", flags=" << ib::hex(flags);
+
+	return(DB_ERROR);
+}
+
+/** Validates this datafile for the purpose of recovery.  The file should
+exist and be successfully opened. We initially open it in read-only mode
+because we just want to read the SpaceID.  However, if the first page is
+corrupt and needs to be restored from the doublewrite buffer, we will
+reopen it in write mode and try to restore that page.
+@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+m_is_valid is also set true on success, else false. */
+dberr_t
+Datafile::validate_for_recovery()
+{
+	dberr_t err;
+
+	ut_ad(is_open());
+	ut_ad(!srv_read_only_mode);
+
+	err = validate_first_page();
+
+	switch (err) {
+	case DB_TABLESPACE_EXISTS:
+		break;
+	case DB_SUCCESS:
+		if (!m_defer || !m_space_id) {
+			break;
+		}
+		/* InnoDB should check whether the deferred
+		tablespace page0 can be recovered from
+		double write buffer. InnoDB should try
+		to recover only if m_space_id exists because
+		dblwr pages can be searched via {space_id, 0}.
+		m_space_id is set in read_first_page(). */
+		/* fall through */
+	default:
+		/* Re-open the file in read-write mode. Attempt to restore
+		page 0 from doublewrite and read the space ID from a survey
+		of the first few pages. */
+		close();
+		err = open_read_write();
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+
+		if (!m_defer) {
+			err = find_space_id();
+			if (err != DB_SUCCESS || m_space_id == 0) {
+				ib::error() << "Datafile '" << m_filepath
+					<< "' is corrupted. Cannot determine "
+					"the space ID from the first 64 pages.";
+				return(err);
+			}
+		}
+
+		if (m_space_id == UINT32_MAX) {
+			return DB_SUCCESS; /* empty file */
+		}
+
+		if (recv_sys.dblwr.restore_first_page(
+			    m_space_id, m_filepath, m_handle)) {
+			return m_defer ? err : DB_CORRUPTION;
+		}
+
+		/* Free the previously read first page and then re-validate.
*/
+		free_first_page();
+		m_defer = false;
+		err = validate_first_page();
+	}
+
+	return(err);
+}
+
+/** Check the consistency of the first page of a datafile when the
+tablespace is opened.  This occurs before the fil_space_t is created
+so the Space ID found here must not already be open.
+m_is_valid is set true on success, else false.
+@retval DB_SUCCESS if the datafile is valid
+@retval DB_CORRUPTION if the datafile is not readable
+@retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */
+dberr_t Datafile::validate_first_page()
+{
+	const char*	error_txt = NULL;
+
+	m_is_valid = true;
+
+	if (m_first_page == NULL
+	    && read_first_page(srv_read_only_mode) != DB_SUCCESS) {
+
+		error_txt = "Cannot read first page";
+	}
+
+	if (error_txt != NULL) {
+err_exit:
+		free_first_page();
+
+		if (recv_recovery_is_on()
+		    || srv_operation == SRV_OPERATION_BACKUP) {
+			m_defer= true;
+			return DB_SUCCESS;
+		}
+
+		ib::info() << error_txt << " in datafile: " << m_filepath
+			<< ", Space ID:" << m_space_id << ", Flags: "
+			<< m_flags;
+		m_is_valid = false;
+		return(DB_CORRUPTION);
+	}
+
+	/* Check if the whole page is blank. */
+	if (!m_space_id && !m_flags) {
+		const byte*	b		= m_first_page;
+		ulint		nonzero_bytes	= srv_page_size;
+
+		while (*b == '\0' && --nonzero_bytes != 0) {
+
+			b++;
+		}
+
+		if (nonzero_bytes == 0) {
+			error_txt = "Header page consists of zero bytes";
+			goto err_exit;
+		}
+	}
+
+	if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) {
+		/* Tablespace flags must be valid. */
+		error_txt = "Tablespace flags are invalid";
+		goto err_exit;
+	}
+
+	ulint logical_size = fil_space_t::logical_size(m_flags);
+
+	if (srv_page_size != logical_size) {
+		free_first_page();
+		if (recv_recovery_is_on()
+		    || srv_operation == SRV_OPERATION_BACKUP) {
+			m_defer= true;
+			return DB_SUCCESS;
+		}
+		/* Logical size must be innodb_page_size. */
+		ib::error()
+			<< "Data file '" << m_filepath << "' uses page size "
+			<< logical_size << ", but the innodb_page_size"
+			" start-up parameter is "
+			<< srv_page_size;
+		return(DB_ERROR);
+	}
+
+	if (page_get_page_no(m_first_page) != 0) {
+		/* First page must be number 0 */
+		error_txt = "Header page contains inconsistent data";
+		goto err_exit;
+	}
+
+	if (m_space_id >= SRV_SPACE_ID_UPPER_BOUND) {
+		error_txt = "A bad Space ID was found";
+		goto err_exit;
+	}
+
+	if (buf_page_is_corrupted(false, m_first_page, m_flags)) {
+		/* Look for checksum and other corruptions. */
+		error_txt = "Checksum mismatch";
+		goto err_exit;
+	}
+
+	mysql_mutex_lock(&fil_system.mutex);
+
+	fil_space_t*	space = fil_space_get_by_id(m_space_id);
+
+	if (space) {
+		fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
+
+		if (node && !strcmp(m_filepath, node->name)) {
+ok_exit:
+			mysql_mutex_unlock(&fil_system.mutex);
+			return DB_SUCCESS;
+		}
+
+		if (!m_space_id
+		    && (recv_recovery_is_on()
+			|| srv_operation == SRV_OPERATION_BACKUP)) {
+			m_defer= true;
+			goto ok_exit;
+		}
+
+		/* Make sure the space_id has not already been opened. */
+		ib::error() << "Attempted to open a previously opened"
+			" tablespace. Previous tablespace: "
+			<< (node ? node->name : "(unknown)")
+			<< " uses space ID: " << m_space_id
+			<< ". Cannot open filepath: " << m_filepath
+			<< " which uses the same space ID.";
+	}
+
+	mysql_mutex_unlock(&fil_system.mutex);
+
+	if (space) {
+		m_is_valid = false;
+
+		free_first_page();
+
+		return(is_predefined_tablespace(m_space_id)
+		       ? DB_CORRUPTION
+		       : DB_TABLESPACE_EXISTS);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/** Determine the space id of the given file descriptor by reading a few
+pages from the beginning of the .ibd file.
+@return DB_SUCCESS if space id was successfully identified,
+else DB_CORRUPTION. */
+dberr_t
+Datafile::find_space_id()
+{
+	os_offset_t	file_size;
+
+	ut_ad(m_handle != OS_FILE_CLOSED);
+
+	file_size = os_file_get_size(m_handle);
+
+	if (!file_size) {
+		return DB_SUCCESS;
+	}
+
+	if (file_size == (os_offset_t) -1) {
+		ib::error() << "Could not get file size of datafile '"
+			<< m_filepath << "'";
+		return(DB_CORRUPTION);
+	}
+
+	/* Assuming a page size, read the space_id from each page and store it
+	in a map.  Find out which space_id is agreed on by majority of the
+	pages.  Choose that space_id. */
+	for (ulint page_size = UNIV_ZIP_SIZE_MIN;
+	     page_size <= UNIV_PAGE_SIZE_MAX;
+	     page_size <<= 1) {
+		/* map[space_id] = count of pages */
+		typedef std::map<
+			uint32_t,
+			uint32_t,
+			std::less<uint32_t>,
+			ut_allocator<std::pair<const uint32_t, uint32_t> > >
+			Pages;
+
+		Pages	verify;
+		uint32_t page_count = 64;
+		uint32_t valid_pages = 0;
+
+		/* Adjust the number of pages to analyze based on file size */
+		while ((page_count * page_size) > file_size) {
+			--page_count;
+		}
+
+		ib::info()
+			<< "Page size:" << page_size
+			<< ". Pages to analyze:" << page_count;
+
+		byte*	page = static_cast<byte*>(
+			aligned_malloc(page_size, page_size));
+
+		uint32_t fsp_flags;
+		/* provide dummy value if the first os_file_read() fails */
+		switch (srv_checksum_algorithm) {
+		case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+		case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+			fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER
+				| FSP_FLAGS_FCRC32_PAGE_SSIZE()
+				| uint(innodb_compression_algorithm)
+				<< FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+			break;
+		default:
+			fsp_flags = 0;
+		}
+
+		for (ulint j = 0; j < page_count; ++j) {
+			if (os_file_read(IORequestRead, m_handle, page,
+					 j * page_size, page_size, nullptr)) {
+				ib::info()
+					<< "READ FAIL: page_no:" << j;
+				continue;
+			}
+
+			if (j == 0) {
+				fsp_flags = mach_read_from_4(
+					page + FSP_HEADER_OFFSET
+					+ FSP_SPACE_FLAGS);
+			}
+
+			bool	noncompressed_ok = false;
+
+			/* For noncompressed pages, the page size must be
+			equal to srv_page_size. */
+			if (page_size == srv_page_size
+			    && !fil_space_t::zip_size(fsp_flags)) {
+				noncompressed_ok = !buf_page_is_corrupted(
+					false, page, fsp_flags);
+			}
+
+			bool	compressed_ok = false;
+
+			if (srv_page_size <= UNIV_PAGE_SIZE_DEF
+			    && page_size == fil_space_t::zip_size(fsp_flags)) {
+				compressed_ok = !buf_page_is_corrupted(
+					false, page, fsp_flags);
+			}
+
+			if (noncompressed_ok || compressed_ok) {
+
+				uint32_t space_id = mach_read_from_4(page
+					+ FIL_PAGE_SPACE_ID);
+
+				if (space_id > 0) {
+
+					ib::info()
+						<< "VALID: space:"
+						<< space_id << " page_no:" << j
+						<< " page_size:" << page_size;
+
+					++valid_pages;
+
+					++verify[space_id];
+				}
+			}
+		}
+
+		aligned_free(page);
+
+		ib::info()
+			<< "Page size: " << page_size
+			<< ". Possible space_id count:" << verify.size();
+
+		const ulint pages_corrupted = 3;
+
+		for (ulint missed = 0; missed <= pages_corrupted; ++missed) {
+
+			for (Pages::const_iterator it = verify.begin();
+			     it != verify.end();
+			     ++it) {
+
+				ib::info() << "space_id:" << it->first
+					<< ", Number of pages matched: "
+					<< it->second << "/" << valid_pages
+					<< " (" << page_size << ")";
+
+				if (it->second == (valid_pages - missed)) {
+					ib::info() << "Chosen space:"
+						<< it->first;
+
+					m_space_id = it->first;
+					return(DB_SUCCESS);
+				}
+			}
+
+		}
+	}
+
+	return(DB_CORRUPTION);
+}
+
+/** Read an InnoDB Symbolic Link (ISL) file by name.
+@param link_filepath	filepath of the ISL file
+@return data file name (must be freed by the caller)
+@retval nullptr	on error */
+static char *read_link_file(const char *link_filepath)
+{
+  if (FILE* file= fopen(link_filepath, "r+b" STR_O_CLOEXEC))
+  {
+    char *filepath= static_cast<char*>(ut_malloc_nokey(OS_FILE_MAX_PATH));
+
+    os_file_read_string(file, filepath, OS_FILE_MAX_PATH);
+    fclose(file);
+
+    if (size_t len= strlen(filepath))
+    {
+      /* Trim whitespace from end of filepath */
+      len--;
+      while (static_cast<unsigned char>(filepath[len]) <= 0x20)
+      {
+        if (!len)
+          return nullptr;
+        filepath[len--]= 0;
+      }
+      /* Ensure that the last 2 path separators are forward slashes,
+      because elsewhere we are assuming that tablespace file names end
+      in "/databasename/tablename.ibd". */
+      unsigned trailing_slashes= 0;
+      for (; len; len--)
+      {
+        switch (filepath[len]) {
+#ifdef _WIN32
+        case '\\':
+          filepath[len]= '/';
+          /* fall through */
+#endif
+        case '/':
+          if (++trailing_slashes >= 2)
+            return filepath;
+        }
+      }
+    }
+  }
+
+  return nullptr;
+}
+
+/** Create a link filename,
+open that file, and read the contents into m_filepath.
+@param name	table name
+@return filepath()
+@retval nullptr	if the .isl file does not exist or cannot be read */
+const char *RemoteDatafile::open_link_file(const fil_space_t::name_type name)
+{
+  if (!m_link_filepath)
+    m_link_filepath= fil_make_filepath(nullptr, name, ISL, false);
+  m_filepath= read_link_file(m_link_filepath);
+  return m_filepath;
+}
+
+/** Release the resources. */
+void
+RemoteDatafile::shutdown()
+{
+	Datafile::shutdown();
+
+	if (m_link_filepath != 0) {
+		ut_free(m_link_filepath);
+		m_link_filepath = 0;
+	}
+}
+
+/** Create InnoDB Symbolic Link (ISL) file.
+@param name	tablespace name
+@param filepath	full file name
+@return DB_SUCCESS or error code */
+dberr_t RemoteDatafile::create_link_file(fil_space_t::name_type name,
+					 const char *filepath)
+{
+	bool		success;
+	dberr_t		err = DB_SUCCESS;
+	char*		link_filepath = NULL;
+	char*		prev_filepath = NULL;
+
+	ut_ad(!srv_read_only_mode);
+
+	link_filepath = fil_make_filepath(NULL, name, ISL, false);
+
+	if (link_filepath == NULL) {
+		return(DB_ERROR);
+	}
+
+	prev_filepath = read_link_file(link_filepath);
+	if (prev_filepath) {
+		/* Truncate (starting with MySQL 5.6, probably no
+		longer since MariaDB Server 10.2.19) used to call this
+		with an existing link file which contains the same filepath. */
+		bool same = !strncmp(prev_filepath, name.data(), name.size())
+			&& !strcmp(prev_filepath + name.size(), DOT_IBD);
+		ut_free(prev_filepath);
+		if (same) {
+			ut_free(link_filepath);
+			return(DB_SUCCESS);
+		}
+	}
+
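+	/* An ISL file is a plain text file that holds nothing but the
+	full path of the remote data file; a stale or conflicting link
+	must not be silently overwritten, hence the existence check
+	below. */
+	/** Check if the file already exists.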
*/ + FILE* file = NULL; + bool exists; + os_file_type_t ftype; + + success = os_file_status(link_filepath, &exists, &ftype); + ulint error = 0; + + if (success && !exists) { + + file = fopen(link_filepath, "w"); + if (file == NULL) { + /* This call will print its own error message */ + error = os_file_get_last_error(true); + } + } else { + error = OS_FILE_ALREADY_EXISTS; + } + + if (error != 0) { + + ib::error() << "Cannot create file " << link_filepath << "."; + + if (error == OS_FILE_ALREADY_EXISTS) { + ib::error() << "The link file: " << link_filepath + << " already exists."; + err = DB_TABLESPACE_EXISTS; + + } else if (error == OS_FILE_DISK_FULL) { + err = DB_OUT_OF_FILE_SPACE; + + } else { + err = DB_ERROR; + } + + /* file is not open, no need to close it. */ + ut_free(link_filepath); + return(err); + } + + const size_t len = strlen(filepath); + if (fwrite(filepath, 1, len, file) != len) { + error = os_file_get_last_error(true); + ib::error() << + "Cannot write link file: " + << link_filepath << " filepath: " << filepath; + err = DB_ERROR; + } + + /* Close the file, we only need it at startup */ + fclose(file); + + ut_free(link_filepath); + + return(err); +} + +/** Delete an InnoDB Symbolic Link (ISL) file. */ +void +RemoteDatafile::delete_link_file(void) +{ + ut_ad(m_link_filepath != NULL); + + if (m_link_filepath != NULL) { + os_file_delete_if_exists(innodb_data_file_key, + m_link_filepath, NULL); + } +} + +/** Delete an InnoDB Symbolic Link (ISL) file by name. +@param name tablespace name */ +void RemoteDatafile::delete_link_file(fil_space_t::name_type name) +{ + if (char *link_filepath= fil_make_filepath(NULL, name, ISL, false)) + { + os_file_delete_if_exists(innodb_data_file_key, link_filepath, nullptr); + ut_free(link_filepath); + } +} diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc new file mode 100644 index 00000000..6c5c354e --- /dev/null +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -0,0 +1,3070 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fsp/fsp0fsp.cc +File space management + +Created 11/29/1995 Heikki Tuuri +***********************************************************************/ + +#include "fsp0fsp.h" +#include "buf0buf.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "mtr0log.h" +#include "ut0byte.h" +#include "page0page.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "ibuf0ibuf.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "dict0boot.h" +#include "log0log.h" +#include "dict0mem.h" +#include "fsp0types.h" +#include "log.h" + +typedef uint32_t page_no_t; + +/** Returns the first extent descriptor for a segment. +We think of the extent lists of the segment catenated in the order +FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE. +@param[in] inode segment inode +@param[in] space tablespace +@param[in,out] mtr mini-transaction +@param[out] err error code +@return the first extent descriptor, or NULL if none */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +xdes_t* +fseg_get_first_extent( + fseg_inode_t* inode, + const fil_space_t* space, + mtr_t* mtr, + dberr_t* err); + +ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Put new extents to the free list if there are free extents above the free +limit. If an extent happens to contain an extent descriptor page, the extent +is put to the FSP_FREE_FRAG list with the page marked as used. +@param[in] init_space true if this is a single-table tablespace +and we are only initializing the first extent and the first bitmap pages; +then we will not allocate more extents +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction */ +static +dberr_t +fsp_fill_free_list( + bool init_space, + fil_space_t* space, + buf_block_t* header, + mtr_t* mtr); + +/** Allocates a single free page from a segment. +This function implements the intelligent allocation strategy which tries to +minimize file space fragmentation. +@param[in,out] space tablespace +@param[in,out] seg_inode segment inode +@param[in,out] iblock segment inode page +@param[in] hint hint of which page would be desirable +@param[in] direction if the new page is needed because of +an index page split, and records are inserted there in order, into which +direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR +@param[in,out] mtr mini-transaction +@param[in,out] init_mtr mtr or another mini-transaction in +which the page should be initialized. 
+@param[out]	err	error code
+@return the allocated page
+@retval nullptr	if no page could be allocated */
+static
+buf_block_t*
+fseg_alloc_free_page_low(
+	fil_space_t*		space,
+	fseg_inode_t*		seg_inode,
+	buf_block_t*		iblock,
+	uint32_t		hint,
+	byte			direction,
+#ifdef UNIV_DEBUG
+	bool			has_done_reservation,
+	/*!< whether the space has already been reserved */
+#endif /* UNIV_DEBUG */
+	mtr_t*			mtr,
+	mtr_t*			init_mtr,
+	dberr_t*		err)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Get the tablespace header block, SX-latched
+@param[in]	space	tablespace
+@param[in,out]	mtr	mini-transaction
+@param[out]	err	error code
+@return pointer to the space header, page x-locked
+@retval nullptr	if the page cannot be retrieved or is corrupted */
+static buf_block_t *fsp_get_header(const fil_space_t *space, mtr_t *mtr,
+                                   dberr_t *err)
+{
+  const page_id_t id{space->id, 0};
+  buf_block_t *block= mtr->get_already_latched(id, MTR_MEMO_PAGE_SX_FIX);
+  if (block)
+    *err= DB_SUCCESS;
+  else
+  {
+    block= buf_page_get_gen(id, space->zip_size(), RW_SX_LATCH,
+                            nullptr, BUF_GET_POSSIBLY_FREED,
+                            mtr, err);
+    if (block &&
+        space->id != mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID +
+                                      block->page.frame))
+    {
+      *err= DB_CORRUPTION;
+      block= nullptr;
+    }
+  }
+  return block;
+}
+
+/** Set the XDES_FREE_BIT of a page.
+@tparam	free	desired value of XDES_FREE_BIT
+@param[in]	block	extent descriptor block
+@param[in,out]	descr	extent descriptor
+@param[in]	offset	page offset within the extent
+@param[in,out]	mtr	mini-transaction */
+template<bool free>
+inline void xdes_set_free(const buf_block_t &block, xdes_t *descr,
+                          ulint offset, mtr_t *mtr)
+{
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+                                   MTR_MEMO_PAGE_X_FIX));
+  ut_ad(offset < FSP_EXTENT_SIZE);
+  ut_ad(page_align(descr) == block.page.frame);
+  compile_time_assert(XDES_BITS_PER_PAGE == 2);
+  compile_time_assert(XDES_FREE_BIT == 0);
+  compile_time_assert(XDES_CLEAN_BIT == 1);
+
+  ulint index= XDES_BITS_PER_PAGE * offset;
+  byte *b= &descr[XDES_BITMAP + (index >> 3)];
+  /* xdes_init() should have set all XDES_CLEAN_BIT. */
+  ut_ad(!(~*b & 0xaa));
+  /* Clear or set XDES_FREE_BIT. */
+  byte val= free
+    ? static_cast<byte>(*b | 1 << (index & 7))
+    : static_cast<byte>(*b & ~(1 << (index & 7)));
+  mtr->write<1>(block, b, val);
+}
+
+/**
+Find a free page.
+@param descr	extent descriptor
+@param hint	page offset to start searching from (towards larger pages)
+@return free page offset
+@retval FIL_NULL if no page is free */
+inline uint32_t xdes_find_free(const xdes_t *descr, uint32_t hint= 0)
+{
+  const uint32_t extent_size= FSP_EXTENT_SIZE;
+  ut_ad(hint < extent_size);
+  for (uint32_t i= hint; i < extent_size; i++)
+    if (xdes_is_free(descr, i))
+      return i;
+  for (uint32_t i= 0; i < hint; i++)
+    if (xdes_is_free(descr, i))
+      return i;
+  return FIL_NULL;
+}
+
+/**
+Determine the number of used pages in a descriptor.
+@param descr	file descriptor
+@return number of pages used */
+inline uint32_t xdes_get_n_used(const xdes_t *descr)
+{
+  uint32_t count= 0;
+
+  for (uint32_t i= FSP_EXTENT_SIZE; i--; )
+    if (!xdes_is_free(descr, i))
+      count++;
+
+  return count;
+}
+
+/**
+Determine whether a file extent is full.
+@param descr	file descriptor
+@return whether all pages have been allocated */
+inline bool xdes_is_full(const xdes_t *descr)
+{
+  return FSP_EXTENT_SIZE == xdes_get_n_used(descr);
+}
+
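+/* Illustration, not part of the upstream source: an extent descriptor
+keeps XDES_BITS_PER_PAGE = 2 bits for each of the FSP_EXTENT_SIZE
+pages, so the free bit of the page at `offset` is the overall bit
+2 * offset of the bitmap (XDES_FREE_BIT == 0). A standalone model of
+the byte/bit arithmetic used by xdes_set_free() above: */
+#if 0
+#include <cstdint>
+
+static const unsigned BITS_PER_PAGE = 2;	/* XDES_BITS_PER_PAGE */
+
+static bool extent_page_is_free(const uint8_t *bitmap, unsigned offset)
+{
+	unsigned index = BITS_PER_PAGE * offset;
+	return bitmap[index >> 3] >> (index & 7) & 1;
+}
+
+static void extent_set_page_free(uint8_t *bitmap, unsigned offset, bool free)
+{
+	unsigned index = BITS_PER_PAGE * offset;
+	uint8_t &b = bitmap[index >> 3];
+	b = free ? uint8_t(b | 1 << (index & 7))
+		 : uint8_t(b & ~(1 << (index & 7)));
+}
+#endif
+
+/** Set the state of an extent descriptor.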
+@param[in]	block	extent descriptor block
+@param[in,out]	descr	extent descriptor
+@param[in]	state	the state
+@param[in,out]	mtr	mini-transaction */
+inline void xdes_set_state(const buf_block_t &block, xdes_t *descr,
+                           byte state, mtr_t *mtr)
+{
+  ut_ad(descr && mtr);
+  ut_ad(state >= XDES_FREE);
+  ut_ad(state <= XDES_FSEG);
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+                                   MTR_MEMO_PAGE_X_FIX));
+  ut_ad(page_align(descr) == block.page.frame);
+  ut_ad(mach_read_from_4(descr + XDES_STATE) <= XDES_FSEG);
+  mtr->write<1>(block, XDES_STATE + 3 + descr, state);
+}
+
+/**********************************************************************//**
+Gets the state of an xdes.
+@return state */
+UNIV_INLINE
+ulint
+xdes_get_state(
+/*===========*/
+	const xdes_t*	descr)	/*!< in: descriptor */
+{
+	ulint	state;
+
+	ut_ad(descr);
+	state = mach_read_from_4(descr + XDES_STATE);
+	ut_ad(state - 1 < XDES_FSEG);
+	return(state);
+}
+
+/**********************************************************************//**
+Inits an extent descriptor to the free and clean state. */
+inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr)
+{
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+                                   MTR_MEMO_PAGE_X_FIX));
+  mtr->memset(&block, uint16_t(descr - block.page.frame) + XDES_BITMAP,
+              XDES_SIZE - XDES_BITMAP, 0xff);
+  xdes_set_state(block, descr, XDES_FREE, mtr);
+}
+
+/** Mark a page used in an extent descriptor.
+@param[in,out]	seg_inode	segment inode
+@param[in,out]	iblock		segment inode page
+@param[in]	page		page number
+@param[in,out]	descr		extent descriptor
+@param[in,out]	xdes		extent descriptor page
+@param[in,out]	mtr		mini-transaction
+@return error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock,
+                    ulint page, xdes_t *descr, buf_block_t *xdes, mtr_t *mtr)
+{
+  ut_ad(fil_page_get_type(iblock->page.frame) == FIL_PAGE_INODE);
+  ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
+  ut_ad(!memcmp(seg_inode + FSEG_ID, descr + XDES_ID, 4));
+
+  const uint16_t xoffset= uint16_t(descr - xdes->page.frame + XDES_FLST_NODE);
+  const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+
+  if (!xdes_get_n_used(descr))
+  {
+    /* We move the extent from the free list to the NOT_FULL list */
+    if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_FREE + ioffset),
+                                 xdes, xoffset, mtr))
+      return err;
+    if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
+                                   xdes, xoffset, mtr))
+      return err;
+  }
+
+  if (UNIV_UNLIKELY(!xdes_is_free(descr, page % FSP_EXTENT_SIZE)))
+    return DB_CORRUPTION;
+
+  /* We mark the page as used */
+  xdes_set_free<false>(*xdes, descr, page % FSP_EXTENT_SIZE, mtr);
+
+  byte* p_not_full= seg_inode + FSEG_NOT_FULL_N_USED;
+  const uint32_t not_full_n_used= mach_read_from_4(p_not_full) + 1;
+  mtr->write<4>(*iblock, p_not_full, not_full_n_used);
+  if (xdes_is_full(descr))
+  {
+    /* We move the extent from the NOT_FULL list to the FULL list */
+    if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
+                                 xdes, xoffset, mtr))
+      return err;
+    if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_FULL + ioffset),
+                                   xdes, xoffset, mtr))
+      return err;
+    mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
+                  not_full_n_used - FSP_EXTENT_SIZE);
+  }
+
+  return DB_SUCCESS;
+}
+
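+/* Illustration, not part of the upstream source: with 16KiB pages,
+every 16384th page of a tablespace is an extent descriptor (XDES)
+page (page 0 doubles as the FSP_HDR page), and each XDES page holds
+one XDES_SIZE entry per FSP_EXTENT_SIZE = 64 pages that it covers.
+Locating the descriptor of an arbitrary page P then reduces to the
+arithmetic below, a simplified model of xdes_calc_descriptor_page()
+and xdes_calc_descriptor_index() for that page size: */
+#if 0
+#include <cstdint>
+
+static const uint32_t PAGES_PER_XDES_PAGE = 16384;
+static const uint32_t EXTENT_SIZE = 64;	/* FSP_EXTENT_SIZE */
+
+/* page number of the XDES page describing page_no */
+static uint32_t descriptor_page_no(uint32_t page_no)
+{
+	return page_no - page_no % PAGES_PER_XDES_PAGE;
+}
+
+/* index of the descriptor entry within that XDES page */
+static uint32_t descriptor_index(uint32_t page_no)
+{
+	return page_no % PAGES_PER_XDES_PAGE / EXTENT_SIZE;
+}
+#endif
+
+/** Get a pointer to the extent descriptor of a page.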
+@param[in,out] sp_header tablespace header page, x-latched +@param[in] space tablespace +@param[in] offset page offset +@param[in,out] mtr mini-transaction +@param[out] err error code +@param[out] desc_block descriptor block +@param[in] init_space whether the tablespace is being initialized +@return pointer to the extent descriptor, NULL if the page does not +exist in the space or if the offset exceeds free limit */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) +xdes_t* +xdes_get_descriptor_with_space_hdr( + buf_block_t* header, + const fil_space_t* space, + page_no_t offset, + mtr_t* mtr, + dberr_t* err = nullptr, + buf_block_t** desc_block = nullptr, + bool init_space = false) +{ + ut_ad(space->is_owner()); + ut_ad(mtr->memo_contains_flagged(header, MTR_MEMO_PAGE_SX_FIX + | MTR_MEMO_PAGE_X_FIX)); + /* Read free limit and space size */ + uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + header->page.frame); + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(limit == space->free_limit + || (space->free_limit == 0 + && (init_space + || space->purpose == FIL_TYPE_TEMPORARY + || (srv_startup_is_before_trx_rollback_phase + && (space->id == TRX_SYS_SPACE + || srv_is_undo_tablespace(space->id)))))); + ut_ad(size == space->size_in_header); + + if (offset >= size || offset >= limit) { + return nullptr; + } + + const unsigned zip_size = space->zip_size(); + + uint32_t descr_page_no = xdes_calc_descriptor_page(zip_size, offset); + + buf_block_t* block = header; + + if (descr_page_no) { + block = buf_page_get_gen(page_id_t(space->id, descr_page_no), + zip_size, RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, err); + } + + if (desc_block) { + *desc_block = block; + } + + return block + ? XDES_ARR_OFFSET + XDES_SIZE + * xdes_calc_descriptor_index(zip_size, offset) + + block->page.frame + : nullptr; +} + +MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)) +/** Get the extent descriptor of a page. +The page where the extent descriptor resides is x-locked. If the page +offset is equal to the free limit of the space, we will add new +extents from above the free limit to the space free list, if not free +limit == space size. This adding is necessary to make the descriptor +defined, as they are uninitialized above the free limit. +@param[in] space tablespace +@param[in] offset page offset; if equal to the free limit, we +try to add new extents to the space free list +@param[in,out] mtr mini-transaction +@param[out] err error code +@param[out] xdes extent descriptor page +@return the extent descriptor */ +static xdes_t *xdes_get_descriptor(const fil_space_t *space, page_no_t offset, + mtr_t *mtr, dberr_t *err= nullptr, + buf_block_t **xdes= nullptr) +{ + if (buf_block_t *block= + buf_page_get_gen(page_id_t(space->id, 0), space->zip_size(), RW_SX_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, mtr, err)) + return xdes_get_descriptor_with_space_hdr(block, space, offset, mtr, + err, xdes); + return nullptr; +} + +MY_ATTRIBUTE((nonnull(3), warn_unused_result)) +/** Get a pointer to the extent descriptor. The page where the +extent descriptor resides is x-locked. 
+@param space	tablespace
+@param lst_node	file address of the list node contained in the descriptor
+@param mtr	mini-transaction
+@param err	error code
+@param block	extent descriptor block
+@return pointer to the extent descriptor */
+static inline
+xdes_t *xdes_lst_get_descriptor(const fil_space_t &space, fil_addr_t lst_node,
+                                mtr_t *mtr, buf_block_t **block= nullptr,
+                                dberr_t *err= nullptr)
+{
+  ut_ad(mtr->memo_contains(space));
+  ut_ad(lst_node.boffset < space.physical_size());
+  buf_block_t *b;
+  if (!block)
+    block= &b;
+  *block= buf_page_get_gen(page_id_t{space.id, lst_node.page},
+                           space.zip_size(), RW_SX_LATCH,
+                           nullptr, BUF_GET_POSSIBLY_FREED, mtr, err);
+  if (*block)
+    return (*block)->page.frame + lst_node.boffset - XDES_FLST_NODE;
+
+  space.set_corrupted();
+  return nullptr;
+}
+
+/********************************************************************//**
+Returns page offset of the first page in extent described by a descriptor.
+@return offset of the first page in extent */
+static uint32_t xdes_get_offset(const xdes_t *descr)
+{
+  ut_ad(descr);
+  return page_get_page_no(page_align(descr)) +
+    uint32_t(((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE) *
+             FSP_EXTENT_SIZE);
+}
+
+/** Initialize a file page whose prior contents should be ignored.
+@param[in,out]	block	buffer pool block */
+void fsp_apply_init_file_page(buf_block_t *block)
+{
+  memset_aligned<UNIV_PAGE_SIZE_MIN>(block->page.frame, 0, srv_page_size);
+  const page_id_t id(block->page.id());
+
+  mach_write_to_4(block->page.frame + FIL_PAGE_OFFSET, id.page_no());
+  memset_aligned<8>(block->page.frame + FIL_PAGE_PREV, 0xff, 8);
+  mach_write_to_4(block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+                  id.space());
+  if (page_zip_des_t* page_zip= buf_block_get_page_zip(block))
+  {
+    memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0,
+                                      page_zip_get_size(page_zip));
+    static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+    memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET,
+                      block->page.frame + FIL_PAGE_OFFSET, 4);
+    memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8);
+    static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+                  "not perfect alignment");
+    memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+                      block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+  }
+}
+
+#ifdef UNIV_DEBUG
+/** Assert that the mini-transaction is compatible with
+updating an allocation bitmap page.
+@param[in]	mtr	mini-transaction */
+void fil_space_t::modify_check(const mtr_t& mtr) const
+{
+  switch (mtr.get_log_mode()) {
+  case MTR_LOG_NONE:
+    /* These modes are only allowed within a non-bitmap page
+    when there is a higher-level redo log record written. */
+    ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_TEMPORARY);
+    break;
+  case MTR_LOG_NO_REDO:
+    ut_ad(purpose == FIL_TYPE_TEMPORARY || purpose == FIL_TYPE_IMPORT);
+    break;
+  default:
+    /* We may only write redo log for a persistent tablespace. */
+    ut_ad(purpose == FIL_TYPE_TABLESPACE);
+    ut_ad(mtr.is_named_space(id));
+  }
+}
+#endif
+
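+/* Illustration, not part of the upstream source: after
+fsp_apply_init_file_page() the only nonzero bytes in the frame are the
+big-endian page number at FIL_PAGE_OFFSET (byte 4), the 0xff bytes of
+FIL_PAGE_PREV/FIL_PAGE_NEXT (bytes 8..15, i.e. FIL_NULL), and the
+space id at byte 34. A standalone model of that header layout: */
+#if 0
+#include <cstdint>
+#include <cstring>
+
+static void write_be32(uint8_t *p, uint32_t v)
+{
+	p[0] = uint8_t(v >> 24); p[1] = uint8_t(v >> 16);
+	p[2] = uint8_t(v >> 8); p[3] = uint8_t(v);
+}
+
+static void init_page_header(uint8_t *frame, size_t page_size,
+			     uint32_t page_no, uint32_t space_id)
+{
+	memset(frame, 0, page_size);
+	write_be32(frame + 4, page_no);	/* FIL_PAGE_OFFSET */
+	memset(frame + 8, 0xff, 8);	/* FIL_PAGE_PREV, FIL_PAGE_NEXT */
+	write_be32(frame + 34, space_id);	/* ..._SPACE_ID */
+}
+#endif
+
+/** Initialize a tablespace header.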
+@param[in,out] space tablespace +@param[in] size current size in blocks +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr) +{ + const page_id_t page_id(space->id, 0); + const ulint zip_size = space->zip_size(); + + buf_block_t *free_block = buf_LRU_get_free_block(false); + + mtr->x_lock_space(space); + + buf_block_t* block = buf_page_create(space, 0, zip_size, mtr, + free_block); + if (UNIV_UNLIKELY(block != free_block)) { + buf_pool.free_block(free_block); + } + + space->size_in_header = size; + space->free_len = 0; + space->free_limit = 0; + + /* The prior contents of the file page should be ignored */ + + fsp_init_file_page(space, block, mtr); + + mtr->write<2>(*block, block->page.frame + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_FSP_HDR); + + mtr->write<4,mtr_t::MAYBE_NOP>(*block, FSP_HEADER_OFFSET + FSP_SPACE_ID + + block->page.frame, space->id); + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED + + block->page.frame)); + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. */ + mtr->write<4,mtr_t::FORCED>(*block, FSP_HEADER_OFFSET + FSP_SIZE + + block->page.frame, size); + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + block->page.frame)); + if (auto f = space->flags & ~FSP_FLAGS_MEM_MASK) { + mtr->write<4,mtr_t::FORCED>(*block, + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + + block->page.frame, f); + } + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED + + block->page.frame)); + + flst_init(block, FSP_HEADER_OFFSET + FSP_FREE, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_FULL_FRAG, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, mtr); + + mtr->write<8>(*block, FSP_HEADER_OFFSET + FSP_SEG_ID + + block->page.frame, + 1U); + + if (dberr_t err = fsp_fill_free_list(!is_system_tablespace(space->id), + space, block, mtr)) { + return err; + } + + /* Write encryption metadata to page 0 if tablespace is + encrypted or encryption is disabled by table option. */ + if (space->crypt_data && + (space->crypt_data->should_encrypt() || + space->crypt_data->not_encrypted())) { + space->crypt_data->write_page0(block, mtr); + } + + return DB_SUCCESS; +} + +/** Try to extend a single-table tablespace so that a page would fit in the +data file. +@param[in,out] space tablespace +@param[in] page_no page number +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return true if success */ +static ATTRIBUTE_COLD __attribute__((warn_unused_result)) +bool +fsp_try_extend_data_file_with_pages( + fil_space_t* space, + uint32_t page_no, + buf_block_t* header, + mtr_t* mtr) +{ + bool success; + ulint size; + + ut_ad(!is_system_tablespace(space->id)); + ut_d(space->modify_check(*mtr)); + + size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(size == space->size_in_header); + + ut_a(page_no >= size); + + success = fil_space_extend(space, page_no + 1); + /* The size may be less than we wanted if we ran out of disk space. */ + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. 
*/ + mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame, space->size); + space->size_in_header = space->size; + + return(success); +} + +/** Calculate the number of physical pages in an extent for this file. +@param[in] physical_size page_size of the datafile +@return number of pages in an extent for this file */ +inline uint32_t fsp_get_extent_size_in_pages(ulint physical_size) +{ + return uint32_t((FSP_EXTENT_SIZE << srv_page_size_shift) / physical_size); +} + + +/** Calculate the number of pages to extend a datafile. +We extend single-table tablespaces first one extent at a time, +but 4 at a time for bigger tablespaces. It is not enough to extend always +by one extent, because we need to add at least one extent to FSP_FREE. +A single extent descriptor page will track many extents. And the extent +that uses its extent descriptor page is put onto the FSP_FREE_FRAG list. +Extents that do not use their extent descriptor page are added to FSP_FREE. +The physical page size is used to determine how many extents are tracked +on one extent descriptor page. See xdes_calc_descriptor_page(). +@param[in] physical_size page size in data file +@param[in] size current number of pages in the datafile +@return number of pages to extend the file. */ +static uint32_t fsp_get_pages_to_extend_ibd(unsigned physical_size, + uint32_t size) +{ + uint32_t extent_size = fsp_get_extent_size_in_pages(physical_size); + /* The threshold is set at 32MiB except when the physical page + size is small enough that it must be done sooner. */ + uint32_t threshold = std::min(32 * extent_size, physical_size); + + if (size >= threshold) { + /* Below in fsp_fill_free_list() we assume + that we add at most FSP_FREE_ADD extents at + a time */ + extent_size *= FSP_FREE_ADD; + } + + return extent_size; +} + +/** Try to extend the last data file of a tablespace if it is auto-extending. +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return number of pages added +@retval 0 if the tablespace was not extended */ +ATTRIBUTE_COLD __attribute__((nonnull)) +static +ulint +fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr) +{ + const char* OUT_OF_SPACE_MSG = + "ran out of space. Please add another file or use" + " 'autoextend' for the last file in setting"; + + ut_d(space->modify_check(*mtr)); + + if (space->id == TRX_SYS_SPACE + && !srv_sys_space.can_auto_extend_last_file()) { + + /* We print the error message only once to avoid + spamming the error log. Note that we don't need + to reset the flag to false as dealing with this + error requires server restart. */ + if (!srv_sys_space.get_tablespace_full_status()) { + sql_print_error("InnoDB: The InnoDB system tablespace " + "%s" " innodb_data_file_path.", + OUT_OF_SPACE_MSG); + srv_sys_space.set_tablespace_full_status(true); + } + return(0); + } else if (space->id == SRV_TMP_SPACE_ID + && !srv_tmp_space.can_auto_extend_last_file()) { + + /* We print the error message only once to avoid + spamming the error log. Note that we don't need + to reset the flag to false as dealing with this + error requires server restart. 
*/ + if (!srv_tmp_space.get_tablespace_full_status()) { + sql_print_error("InnoDB: The InnoDB temporary" + " tablespace %s" + " innodb_temp_data_file_path.", + OUT_OF_SPACE_MSG); + srv_tmp_space.set_tablespace_full_status(true); + } + return(0); + } + + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(size == space->size_in_header); + uint32_t size_increase; + + const unsigned ps = space->physical_size(); + + switch (space->id) { + case TRX_SYS_SPACE: + size_increase = srv_sys_space.get_increment(); + break; + case SRV_TMP_SPACE_ID: + size_increase = srv_tmp_space.get_increment(); + break; + default: + uint32_t extent_pages = fsp_get_extent_size_in_pages(ps); + if (size < extent_pages) { + /* Let us first extend the file to extent_size */ + if (!fsp_try_extend_data_file_with_pages( + space, extent_pages - 1, header, mtr)) { + return(0); + } + + size = extent_pages; + } + + size_increase = fsp_get_pages_to_extend_ibd(ps, size); + } + + if (size_increase == 0) { + return(0); + } + + if (!fil_space_extend(space, size + size_increase)) { + return(0); + } + + /* For the system tablespace, we ignore any fragments of a + full megabyte when storing the size to the space header */ + + space->size_in_header = space->id + ? space->size + : ut_2pow_round(space->size, (1024 * 1024) / ps); + + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. */ + mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame, + space->size_in_header); + + return(size_increase); +} + +/** Reset the page type. +Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE. +In MySQL 3.23.53, only undo log pages and index pages were tagged. +Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE. +@param[in] block block with invalid FIL_PAGE_TYPE +@param[in] type expected page type +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD +void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr) +{ + ib::info() << "Resetting invalid page " << block.page.id() << " type " + << fil_page_get_type(block.page.frame) << " to " << type << "."; + mtr->write<2>(block, block.page.frame + FIL_PAGE_TYPE, type); +} + +/** Put new extents to the free list if there are free extents above the free +limit. If an extent happens to contain an extent descriptor page, the extent +is put to the FSP_FREE_FRAG list with the page marked as used. 
+@param[in] init_space true if this is a single-table tablespace +and we are only initializing the first extent and the first bitmap pages; +then we will not allocate more extents +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return error code */ +static +dberr_t +fsp_fill_free_list( + bool init_space, + fil_space_t* space, + buf_block_t* header, + mtr_t* mtr) +{ + ut_d(space->modify_check(*mtr)); + + /* Check if we can fill free list from above the free list limit */ + uint32_t size= + mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + header->page.frame); + uint32_t limit= + mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + header->page.frame); + + ut_ad(size == space->size_in_header); + ut_ad(limit == space->free_limit); + + const auto zip_size= space->zip_size(); + + if (size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) + { + bool skip_resize= init_space; + switch (space->id) { + case TRX_SYS_SPACE: + skip_resize= !srv_sys_space.can_auto_extend_last_file(); + break; + case SRV_TMP_SPACE_ID: + skip_resize= !srv_tmp_space.can_auto_extend_last_file(); + break; + } + + if (!skip_resize) + { + fsp_try_extend_data_file(space, header, mtr); + size= space->size_in_header; + } + } + + uint32_t count= 0; + for (uint32_t i= limit, extent_size= FSP_EXTENT_SIZE, + physical_size= space->physical_size(); + (init_space && i < 1) || + (i + extent_size <= size && count < FSP_FREE_ADD); + i += extent_size) + { + const bool init_xdes= !ut_2pow_remainder(i, physical_size); + space->free_limit= i + extent_size; + mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + header->page.frame, i + extent_size); + + if (init_xdes) + { + /* We are going to initialize a new descriptor page + and a new ibuf bitmap page: the prior contents of the + pages should be ignored. 
*/
+
+      if (i)
+      {
+        buf_block_t *f= buf_LRU_get_free_block(false);
+        buf_block_t *block= buf_page_create(space, static_cast<uint32_t>(i),
+                                            zip_size, mtr, f);
+        if (UNIV_UNLIKELY(block != f))
+          buf_pool.free_block(f);
+        fsp_init_file_page(space, block, mtr);
+        mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
+                      FIL_PAGE_TYPE_XDES);
+      }
+
+      if (space->purpose != FIL_TYPE_TEMPORARY)
+      {
+        buf_block_t *f= buf_LRU_get_free_block(false);
+        buf_block_t *block=
+          buf_page_create(space,
+                          static_cast<uint32_t>(i + FSP_IBUF_BITMAP_OFFSET),
+                          zip_size, mtr, f);
+        if (UNIV_UNLIKELY(block != f))
+          buf_pool.free_block(f);
+        fsp_init_file_page(space, block, mtr);
+        mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
+                      FIL_PAGE_IBUF_BITMAP);
+      }
+    }
+
+    buf_block_t *xdes= nullptr;
+    xdes_t *descr;
+    {
+      dberr_t err= DB_SUCCESS;
+      descr= xdes_get_descriptor_with_space_hdr(header, space, i, mtr,
+                                                &err, &xdes, init_space);
+      if (!descr)
+        return err;
+    }
+
+    if (xdes != header && !space->full_crc32())
+      fil_block_check_type(*xdes, FIL_PAGE_TYPE_XDES, mtr);
+    xdes_init(*xdes, descr, mtr);
+    const uint16_t xoffset=
+      static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
+    if (UNIV_UNLIKELY(init_xdes))
+    {
+      /* The first page in the extent is a descriptor page and the
+      second is an ibuf bitmap page: mark them used */
+      xdes_set_free<false>(*xdes, descr, 0, mtr);
+      xdes_set_free<false>(*xdes, descr, FSP_IBUF_BITMAP_OFFSET, mtr);
+      xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+      if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+                                     xdes, xoffset, mtr))
+        return err;
+      byte *n_used= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
+      mtr->write<4>(*header, n_used, 2U + mach_read_from_4(n_used));
+    }
+    else
+    {
+      if (dberr_t err=
+          flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE,
+                        xdes, xoffset, mtr))
+        return err;
+      count++;
+    }
+  }
+
+  space->free_len+= count;
+  return DB_SUCCESS;
+}
+
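+/* Illustration, not part of the upstream source: fsp_fill_free_list()
+above adds at most FSP_FREE_ADD (4) extents per round. An extent whose
+first page number is a multiple of the physical page size carries the
+XDES page and (for persistent tablespaces) the change buffer bitmap
+page, so it enters FSP_FREE_FRAG with two pages already in use, while
+every other extent enters FSP_FREE entirely free. For 16KiB pages: */
+#if 0
+#include <cstdint>
+
+static const uint32_t PAGES_PER_XDES_PAGE = 16384;
+
+/* pages already marked used in a freshly added extent that starts
+at first_page_no */
+static unsigned initial_used_pages(uint32_t first_page_no)
+{
+	return first_page_no % PAGES_PER_XDES_PAGE == 0 ? 2 : 0;
+}
+#endif
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Allocates a new free extent.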
+@param[in,out]	space	tablespace
+@param[in]	hint	hint of which extent would be desirable: any
+page offset in the extent goes; the hint must not be > FSP_FREE_LIMIT
+@param[out]	xdes	extent descriptor page
+@param[in,out]	mtr	mini-transaction
+@return extent descriptor
+@retval nullptr	if cannot be allocated */
+static xdes_t *fsp_alloc_free_extent(fil_space_t *space, uint32_t hint,
+                                     buf_block_t **xdes, mtr_t *mtr,
+                                     dberr_t *err)
+{
+	fil_addr_t	first;
+	xdes_t*		descr;
+	buf_block_t*	desc_block;
+
+	buf_block_t*	header = fsp_get_header(space, mtr, err);
+	if (!header) {
+corrupted:
+		space->set_corrupted();
+		return nullptr;
+	}
+
+	descr = xdes_get_descriptor_with_space_hdr(
+		header, space, hint, mtr, err, &desc_block);
+	if (!descr) {
+		goto corrupted;
+	}
+
+	if (desc_block != header && !space->full_crc32()) {
+		fil_block_check_type(*desc_block, FIL_PAGE_TYPE_XDES, mtr);
+	}
+
+	if (xdes_get_state(descr) == XDES_FREE) {
+		/* Ok, we can take this extent */
+	} else {
+		/* Take the first extent in the free list */
+		first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
+				       + header->page.frame);
+
+		if (first.page == FIL_NULL) {
+			*err = fsp_fill_free_list(false, space, header, mtr);
+			if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+				goto corrupted;
+			}
+
+			first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
+					       + header->page.frame);
+			if (first.page == FIL_NULL) {
+				return nullptr;	/* No free extents left */
+			}
+		}
+
+		descr = xdes_lst_get_descriptor(*space, first, mtr,
+						&desc_block, err);
+		if (!descr) {
+			return descr;
+		}
+	}
+
+	*err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE, desc_block,
+			   static_cast<uint16_t>(descr
+						 - desc_block->page.frame
+						 + XDES_FLST_NODE),
+			   mtr);
+	if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+		return nullptr;
+	}
+
+	space->free_len--;
+	*xdes = desc_block;
+
+	return(descr);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Allocate a single free page.
+@param[in,out]	header	tablespace header
+@param[in,out]	xdes	extent descriptor page
+@param[in,out]	descr	extent descriptor
+@param[in]	bit	slot to allocate in the extent
+@param[in,out]	mtr	mini-transaction
+@return error code */
+static dberr_t
+fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr,
+                         ulint bit, mtr_t *mtr)
+{
+  if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FREE_FRAG ||
+                    !xdes_is_free(descr, bit)))
+    return DB_CORRUPTION;
+  xdes_set_free<false>(*xdes, descr, bit, mtr);
+
+  /* Update the FRAG_N_USED field */
+  byte *n_used_p= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
+  uint32_t n_used = mach_read_from_4(n_used_p) + 1;
+
+  if (xdes_is_full(descr))
+  {
+    /* The fragment is full: move it to another list */
+    const uint16_t xoffset=
+      static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
+    if (dberr_t err= flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+                                 xdes, xoffset, mtr))
+      return err;
+    if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+                                   xdes, xoffset, mtr))
+      return err;
+    xdes_set_state(*xdes, descr, XDES_FULL_FRAG, mtr);
+    n_used-= FSP_EXTENT_SIZE;
+  }
+
+  mtr->write<4>(*header, n_used_p, n_used);
+  return DB_SUCCESS;
+}
+
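+/* Illustration, not part of the upstream source: FSP_FRAG_N_USED only
+counts pages used in the extents on the FSP_FREE_FRAG list. When an
+allocation fills an extent, fsp_alloc_from_free_frag() above moves it
+to FSP_FULL_FRAG, so its FSP_EXTENT_SIZE pages leave the counter: */
+#if 0
+#include <cstdint>
+
+static const uint32_t EXTENT_SIZE = 64;	/* FSP_EXTENT_SIZE */
+
+static uint32_t frag_n_used_after_alloc(uint32_t frag_n_used,
+					bool extent_now_full)
+{
+	++frag_n_used;	/* the newly allocated page */
+	if (extent_now_full)
+		frag_n_used -= EXTENT_SIZE;	/* extent left FREE_FRAG */
+	return frag_n_used;
+}
+#endif
+
+/** Gets a buffer block for an allocated page.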
+@param[in,out] space tablespace
+@param[in] offset page number of the allocated page
+@param[in,out] mtr mini-transaction
+@return block, initialized */
+static
+buf_block_t*
+fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr)
+{
+  buf_block_t *block, *free_block;
+
+  if (UNIV_UNLIKELY(space->is_being_truncated))
+  {
+    const page_id_t page_id{space->id, offset};
+    buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+    mysql_mutex_lock(&buf_pool.mutex);
+    block= reinterpret_cast<buf_block_t*>
+      (buf_pool.page_hash.get(page_id, chain));
+    if (block && block->page.oldest_modification() <= 1)
+      block= nullptr;
+    mysql_mutex_unlock(&buf_pool.mutex);
+
+    if (block)
+    {
+      ut_ad(block->page.buf_fix_count() >= 1);
+      ut_ad(block->page.lock.x_lock_count() == 1);
+      ut_ad(mtr->have_x_latch(*block));
+      free_block= block;
+      goto got_free_block;
+    }
+  }
+
+  free_block= buf_LRU_get_free_block(false);
+got_free_block:
+  block= buf_page_create(space, static_cast<uint32_t>(offset),
+                         space->zip_size(), mtr, free_block);
+  if (UNIV_UNLIKELY(block != free_block))
+    buf_pool.free_block(free_block);
+
+  fsp_init_file_page(space, block, mtr);
+  return block;
+}
+
+/** Allocates a single free page from a space.
+The page is marked as used.
+@param[in,out] space tablespace
+@param[in] hint hint of which page would be desirable
+@param[in,out] mtr mini-transaction
+@param[in,out] init_mtr mini-transaction in which the page should be
+initialized (may be the same as mtr)
+@param[out] err error code
+@return allocated block
+@retval nullptr if no page could be allocated */
+static MY_ATTRIBUTE((warn_unused_result, nonnull))
+buf_block_t *fsp_alloc_free_page(fil_space_t *space, uint32_t hint,
+                                 mtr_t *mtr, mtr_t *init_mtr, dberr_t *err)
+{
+  ut_d(space->modify_check(*mtr));
+  buf_block_t *block= fsp_get_header(space, mtr, err);
+  if (!block)
+    return block;
+
+  buf_block_t *xdes;
+  /* Get the hinted descriptor */
+  xdes_t *descr= xdes_get_descriptor_with_space_hdr(block, space, hint, mtr,
+                                                    err, &xdes);
+  if (descr && xdes_get_state(descr) == XDES_FREE_FRAG)
+    /* Ok, we can take this extent */;
+  else if (*err != DB_SUCCESS)
+  {
+  err_exit:
+    space->set_corrupted();
+    return nullptr;
+  }
+  else
+  {
+    /* Else take the first extent in free_frag list */
+    fil_addr_t first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE_FRAG +
+                                      block->page.frame);
+    if (first.page == FIL_NULL)
+    {
+      /* There are no partially full fragments: allocate a free extent
+      and add it to the FREE_FRAG list. NOTE that the allocation may
+      have as a side-effect that an extent containing a descriptor
+      page is added to the FREE_FRAG list. But we will allocate our
+      page from the free extent anyway. */
+      descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, err);
+      if (!descr)
+        return nullptr;
+      *err= flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, xdes,
+                          static_cast<uint16_t>(descr - xdes->page.frame +
+                                                XDES_FLST_NODE), mtr);
+      if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+        return nullptr;
+      xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+    }
+    else
+    {
+      descr= xdes_lst_get_descriptor(*space, first, mtr, &xdes, err);
+      if (!descr)
+        return nullptr;
+      /* Reset the hint */
+      hint= 0;
+    }
+  }
+
+  /* Now we have in descr an extent with at least one free page. Look
+  for a free page in the extent.
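+
+  As a sketch of what the bitmap scan amounts to (assuming two state
+  bits per page in the descriptor with the "free" bit first, and a
+  64-page extent; the helper below is illustrative, not from this
+  file):
+
+    int find_free_slot(const unsigned char *bitmap, unsigned hint)
+    {
+      for (unsigned i= 0; i < 64; i++)
+      {
+        unsigned slot= (hint + i) % 64;  // start at the hint, wrap around
+        unsigned bit= 2 * slot;          // 2 state bits per page
+        if (bitmap[bit / 8] & (1u << (bit % 8)))
+          return int(slot);              // this page is free
+      }
+      return -1;                         // the extent is completely used
+    }
+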
*/
+  uint32_t free= xdes_find_free(descr, hint % FSP_EXTENT_SIZE);
+  if (free == FIL_NULL)
+  {
+  corrupted:
+    *err= DB_CORRUPTION;
+    goto err_exit;
+  }
+
+  uint32_t page_no= xdes_get_offset(descr) + free;
+  uint32_t space_size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE +
+                                         block->page.frame);
+  ut_ad(space_size == space->size_in_header ||
+        (space->id == TRX_SYS_SPACE &&
+         srv_startup_is_before_trx_rollback_phase));
+
+  if (space_size <= page_no)
+  {
+    /* It must be that we are extending a single-table tablespace
+    whose size is still < 64 pages */
+    ut_ad(!is_system_tablespace(space->id));
+    if (page_no >= FSP_EXTENT_SIZE)
+    {
+      sql_print_error("InnoDB: Trying to extend %s"
+                      " by single page(s) though the size is " UINT32PF "."
+                      " Page no " UINT32PF ".",
+                      space->chain.start->name, space_size, page_no);
+      goto corrupted;
+    }
+
+    if (!fsp_try_extend_data_file_with_pages(space, page_no, block, mtr))
+    {
+      *err= DB_OUT_OF_FILE_SPACE;
+      return nullptr;
+    }
+  }
+
+  *err= fsp_alloc_from_free_frag(block, xdes, descr, free, mtr);
+  if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+    goto corrupted;
+  return fsp_page_create(space, page_no, init_mtr);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Return an extent to the free list of a space.
+@param[in,out] space tablespace
+@param[in] offset page number in the extent
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t fsp_free_extent(fil_space_t* space, page_no_t offset,
+                               mtr_t* mtr)
+{
+  ut_ad(space->is_owner());
+  dberr_t err;
+  buf_block_t *block= fsp_get_header(space, mtr, &err);
+  if (!block)
+    return err;
+  buf_block_t *xdes;
+  xdes_t *descr= xdes_get_descriptor_with_space_hdr(block, space, offset, mtr,
+                                                    &err, &xdes);
+  if (!descr)
+  {
+    ut_ad(err || space->is_stopping());
+    return err;
+  }
+
+  if (UNIV_UNLIKELY(xdes_get_state(descr) == XDES_FREE))
+  {
+    space->set_corrupted();
+    return DB_CORRUPTION;
+  }
+
+  xdes_init(*xdes, descr, mtr);
+  space->free_len++;
+  return flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE,
+                       xdes, static_cast<uint16_t>(descr - xdes->page.frame +
+                                                   XDES_FLST_NODE), mtr);
+}
+
+MY_ATTRIBUTE((nonnull))
+/** Frees a single page of a space.
+The page is marked as free and clean.
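+
+The FSP_FRAG_N_USED bookkeeping below is easiest to read with a small
+worked example (assuming 64-page extents): the counter only tracks
+pages used in XDES_FREE_FRAG extents. Freeing a page of a full
+fragment extent moves that extent back to FREE_FRAG with 63 pages
+still in use, so the counter grows by 64 - 1; freeing a page of a
+FREE_FRAG extent simply decrements it by 1.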
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t fsp_free_page(fil_space_t *space, page_no_t offset, mtr_t *mtr)
+{
+  xdes_t* descr;
+  ulint frag_n_used;
+
+  ut_ad(mtr);
+  ut_d(space->modify_check(*mtr));
+
+  /* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */
+
+  dberr_t err;
+  buf_block_t* header = fsp_get_header(space, mtr, &err);
+  if (!header) {
+    ut_ad(space->is_stopping());
+    return err;
+  }
+  buf_block_t* xdes;
+
+  descr = xdes_get_descriptor_with_space_hdr(header, space, offset, mtr,
+                                             &err, &xdes);
+  if (!descr) {
+    ut_ad(err || space->is_stopping());
+    return err;
+  }
+
+  const auto state = xdes_get_state(descr);
+
+  switch (state) {
+  case XDES_FREE_FRAG:
+  case XDES_FULL_FRAG:
+    if (!xdes_is_free(descr, offset % FSP_EXTENT_SIZE)) {
+      break;
+    }
+    /* fall through */
+  default:
+    space->set_corrupted();
+    return DB_CORRUPTION;
+  }
+
+  frag_n_used = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+                                 + header->page.frame);
+
+  const uint16_t xoffset= static_cast<uint16_t>(descr - xdes->page.frame
+                                                + XDES_FLST_NODE);
+
+  if (state == XDES_FULL_FRAG) {
+    /* The fragment was full: move it to another list */
+    err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+                      xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    err = flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+                        xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+    mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+                  + header->page.frame,
+                  frag_n_used + FSP_EXTENT_SIZE - 1);
+  } else if (UNIV_UNLIKELY(!frag_n_used)) {
+    return DB_CORRUPTION;
+  } else {
+    mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+                  + header->page.frame, frag_n_used - 1);
+  }
+
+  mtr->free(*space, static_cast<uint32_t>(offset));
+  xdes_set_free<true>(*xdes, descr, offset % FSP_EXTENT_SIZE, mtr);
+  ut_ad(err == DB_SUCCESS);
+
+  if (!xdes_get_n_used(descr)) {
+    /* The extent has become free: move it to another list */
+    err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+                      xdes, xoffset, mtr);
+    if (err == DB_SUCCESS) {
+      err = fsp_free_extent(space, offset, mtr);
+    }
+  }
+
+  return err;
+}
+
+/** @return Number of segment inodes which fit on a single page */
+inline ulint FSP_SEG_INODES_PER_PAGE(ulint physical_size)
+{
+  return (physical_size - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE;
+}
+
+/** Returns the nth inode slot on an inode page.
+@param[in] page segment inode page
+@param[in] i inode index on page
+@return segment inode */
+#define fsp_seg_inode_page_get_nth_inode(page, i) \
+  FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i + page
+
+/** Looks for a used segment inode on a segment inode page.
+@param page segment inode page
+@param physical_size page size
+@return segment inode index
+@retval ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_used(const page_t *page, ulint physical_size)
+{
+  for (ulint i= 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++)
+  {
+    const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i);
+    if (mach_read_from_8(FSEG_ID + inode))
+    {
+      ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+      return i;
+    }
+  }
+
+  return ULINT_UNDEFINED;
+}
+
+/** Looks for an unused segment inode on a segment inode page.
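+
+For scale, a worked instance of FSP_SEG_INODES_PER_PAGE() above
+(assuming the usual FSEG_ARR_OFFSET = 50 and FSEG_INODE_SIZE = 192;
+the authoritative values are in the headers): a 16KiB page holds
+(16384 - 50 - 10) / 192 = 85 inodes, so this scan and the one above
+are bounded by 85 iterations per inode page.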
+@param[in] page segment inode page +@param[in] i search forward starting from this index +@param[in] physical_size page size +@return segment inode index +@retval ULINT_UNDEFINED if not found */ +static +ulint +fsp_seg_inode_page_find_free(const page_t *page, ulint i, ulint physical_size) +{ + for (; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) + { + const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i); + if (mach_read_from_8(FSEG_ID + inode)) + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); + else + /* This is unused */ + return i; + } + return ULINT_UNDEFINED; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Allocate a file segment inode page. +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return error code */ +static dberr_t fsp_alloc_seg_inode_page(fil_space_t *space, + buf_block_t *header, mtr_t *mtr) +{ + ut_ad(header->page.id().space() == space->id); + dberr_t err; + buf_block_t *block= fsp_alloc_free_page(space, 0, mtr, mtr, &err); + + if (!block) + return err; + + ut_ad(block->page.lock.not_recursive()); + + mtr->write<2>(*block, block->page.frame + FIL_PAGE_TYPE, FIL_PAGE_INODE); + +#ifdef UNIV_DEBUG + const byte *inode= FSEG_ID + FSEG_ARR_OFFSET + block->page.frame; + for (ulint i= FSP_SEG_INODES_PER_PAGE(space->physical_size()); i--; + inode += FSEG_INODE_SIZE) + ut_ad(!mach_read_from_8(inode)); +#endif + + return flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + block, FSEG_INODE_PAGE_NODE, mtr); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Allocate a file segment inode. +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[out] iblock segment inode page +@param[in,out] mtr mini-transaction +@param[out] err error code +@return segment inode +@retval nullptr on failure */ +static fseg_inode_t* +fsp_alloc_seg_inode(fil_space_t *space, buf_block_t *header, + buf_block_t **iblock, mtr_t *mtr, dberr_t *err) +{ + /* Allocate a new segment inode page if needed. 
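+
+  The tablespace header keeps inode pages on two lists: pages with at
+  least one unused slot on FSP_SEG_INODES_FREE, and completely used
+  pages on FSP_SEG_INODES_FULL. Allocation always takes the first page
+  of the FREE list, creating a fresh inode page when that list is empty.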
*/ + if (!flst_get_len(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + + header->page.frame)) + { + *err= fsp_alloc_seg_inode_page(space, header, mtr); + if (*err != DB_SUCCESS) + return nullptr; + } + + const page_id_t page_id + { + space->id, + mach_read_from_4(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + FLST_FIRST + + FIL_ADDR_PAGE + header->page.frame) + }; + + buf_block_t *block= + buf_page_get_gen(page_id, space->zip_size(), RW_SX_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, mtr, err); + if (!block) + return nullptr; + + if (!space->full_crc32()) + fil_block_check_type(*block, FIL_PAGE_INODE, mtr); + + const ulint physical_size= space->physical_size(); + ulint n= fsp_seg_inode_page_find_free(block->page.frame, 0, physical_size); + + if (UNIV_UNLIKELY(n >= FSP_SEG_INODES_PER_PAGE(physical_size))) + { + *err= DB_CORRUPTION; + return nullptr; + } + fseg_inode_t *inode= fsp_seg_inode_page_get_nth_inode(block->page.frame, n); + + if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(block->page.frame, n + 1, + physical_size)) + { + /* There are no other unused headers left on the page: move it + to another list */ + *err= flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + block, FSEG_INODE_PAGE_NODE, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) + return nullptr; + *err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, + block, FSEG_INODE_PAGE_NODE, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) + return nullptr; + } + + ut_ad(!mach_read_from_8(inode + FSEG_ID) || + !memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); + *iblock= block; + return inode; +} + +MY_ATTRIBUTE((nonnull)) +/** Frees a file segment inode. +@param[in,out] space tablespace +@param[in,out] inode segment inode +@param[in,out] iblock segment inode page +@param[in,out] mtr mini-transaction */ +static void fsp_free_seg_inode(fil_space_t *space, fseg_inode_t *inode, + buf_block_t *iblock, mtr_t *mtr) +{ + ut_d(space->modify_check(*mtr)); + + dberr_t err; + buf_block_t *header= fsp_get_header(space, mtr, &err); + if (!header) + return; + if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4))) + { + space->set_corrupted(); + return; + } + + const ulint physical_size= space->physical_size(); + + if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(iblock->page.frame, 0, + physical_size)) + { + /* Move the page to another list */ + if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, + iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS) + return; + if (flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS) + return; + } + + mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0); + + if (ULINT_UNDEFINED != fsp_seg_inode_page_find_used(iblock->page.frame, + physical_size)) + return; + + /* There are no other used headers left on the page: free it */ + if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + iblock, FSEG_INODE_PAGE_NODE, mtr) == DB_SUCCESS) + fsp_free_page(space, iblock->page.id().page_no(), mtr); +} + +MY_ATTRIBUTE((nonnull(1,4,5), warn_unused_result)) +/** Returns the file segment inode, page x-latched. 
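+
+A segment header is a 10-byte reference: 4 bytes of space id, 4 bytes
+of inode page number and 2 bytes of byte offset within that page, all
+big-endian. A minimal decoding sketch (illustrative only; the struct
+and helper are not part of this file and assume <cstdint> types):
+
+  struct seg_ref { uint32_t space, page_no; uint16_t offset; };
+  seg_ref decode_fseg_header(const unsigned char *h)
+  {
+    auto be32= [](const unsigned char *p) {
+      return uint32_t(p[0]) << 24 | uint32_t(p[1]) << 16 |
+             uint32_t(p[2]) << 8 | uint32_t(p[3]);
+    };
+    return {be32(h), be32(h + 4),
+            uint16_t(uint16_t(h[8]) << 8 | h[9])};
+  }
+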
+@param[in] header segment header
+@param[in] space space id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@param[out] block inode block
+@param[out] err error code
+@return segment inode, page x-latched
+@retval nullptr if the inode is free or corruption was noticed */
+static
+fseg_inode_t*
+fseg_inode_try_get(
+  const fseg_header_t* header,
+  uint32_t space,
+  ulint zip_size,
+  mtr_t* mtr,
+  buf_block_t** block,
+  dberr_t* err = nullptr)
+{
+  if (UNIV_UNLIKELY(space != mach_read_from_4(header + FSEG_HDR_SPACE)))
+  {
+  corrupted:
+    if (err)
+      *err= DB_CORRUPTION;
+    return nullptr;
+  }
+
+  *block=
+    buf_page_get_gen(page_id_t(space,
+                               mach_read_from_4(header + FSEG_HDR_PAGE_NO)),
+                     zip_size, RW_SX_LATCH, nullptr, BUF_GET_POSSIBLY_FREED,
+                     mtr, err);
+  if (!*block)
+    return nullptr;
+
+  const uint16_t offset= mach_read_from_2(header + FSEG_HDR_OFFSET);
+  if (UNIV_UNLIKELY(offset >= (*block)->physical_size()))
+    goto corrupted;
+
+  fseg_inode_t *inode= (*block)->page.frame + offset;
+  if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID) ||
+                    memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
+    goto corrupted;
+
+  return inode;
+}
+
+/** Get the page number from the nth fragment page slot.
+@param inode file segment inode
+@param n slot index
+@return page number
+@retval FIL_NULL if not in use */
+static uint32_t fseg_get_nth_frag_page_no(const fseg_inode_t *inode, ulint n)
+{
+  ut_ad(inode);
+  ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+  return(mach_read_from_4(inode + FSEG_FRAG_ARR
+                          + n * FSEG_FRAG_SLOT_SIZE));
+}
+
+/** Set the page number in the nth fragment page slot.
+@param[in,out] inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] n slot index
+@param[in] page_no page number to set
+@param[in,out] mtr mini-transaction */
+inline void fseg_set_nth_frag_page_no(fseg_inode_t *inode, buf_block_t *iblock,
+                                      ulint n, ulint page_no, mtr_t *mtr)
+{
+  ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+  ut_ad(mtr->memo_contains_flagged(iblock, MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+
+  mtr->write<4>(*iblock, inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE,
+                page_no);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is free.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_free_frag_page_slot(
+/*==========================*/
+  fseg_inode_t* inode) /*!< in: segment inode */
+{
+  ulint i;
+  ulint page_no;
+
+  for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+    page_no = fseg_get_nth_frag_page_no(inode, i);
+
+    if (page_no == FIL_NULL) {
+
+      return(i);
+    }
+  }
+
+  return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is used and last in the array.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_last_used_frag_page_slot(
+/*===============================*/
+  fseg_inode_t* inode) /*!< in: segment inode */
+{
+  ulint i;
+  ulint page_no;
+
+  for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+    page_no = fseg_get_nth_frag_page_no(
+      inode, FSEG_FRAG_ARR_N_SLOTS - i - 1);
+
+    if (page_no != FIL_NULL) {
+
+      return(FSEG_FRAG_ARR_N_SLOTS - i - 1);
+    }
+  }
+
+  return(ULINT_UNDEFINED);
+}
+
+/** Calculate reserved fragment page slots.
+@param inode file segment index +@return number of fragment pages */ +static ulint fseg_get_n_frag_pages(const fseg_inode_t *inode) +{ + ulint i; + ulint count = 0; + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i)) { + count++; + } + } + + return(count); +} + +/** Create a new segment. +@param space tablespace +@param byte_offset byte offset of the created segment header +@param mtr mini-transaction +@param err error code +@param has_done_reservation whether fsp_reserve_free_extents() was invoked +@param block block where segment header is placed, + or NULL to allocate an additional page for that +@return the block where the segment header is placed, x-latched +@retval nullptr if could not create segment */ +buf_block_t* +fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err, + bool has_done_reservation, buf_block_t *block) +{ + fseg_inode_t* inode; + ib_id_t seg_id; + uint32_t n_reserved; + bool reserved_extent = false; + + DBUG_ENTER("fseg_create"); + + ut_ad(mtr); + ut_ad(byte_offset >= FIL_PAGE_DATA); + ut_ad(byte_offset + FSEG_HEADER_SIZE + <= srv_page_size - FIL_PAGE_DATA_END); + + mtr->x_lock_space(space); + ut_d(space->modify_check(*mtr)); + + ut_ad(!block || block->page.id().space() == space->id); + + buf_block_t* header = fsp_get_header(space, mtr, err); + if (!header) { + block = nullptr; + goto funct_exit; + } + + buf_block_t* iblock; + +inode_alloc: + inode = fsp_alloc_seg_inode(space, header, &iblock, mtr, err); + + if (!inode) { + block = nullptr; +reserve_extent: + if (!has_done_reservation && !reserved_extent) { + *err = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + DBUG_RETURN(nullptr); + } + + /* Extents reserved successfully. 
So + try allocating the page or inode */ + reserved_extent = true; + if (inode) { + goto page_alloc; + } + + goto inode_alloc; + } + + if (inode) { + fsp_free_seg_inode(space, inode, iblock, mtr); + } + goto funct_exit; + } + + /* Read the next segment id from space header and increment the + value in space header */ + + seg_id = mach_read_from_8(FSP_HEADER_OFFSET + FSP_SEG_ID + + header->page.frame); + + mtr->write<8>(*header, + FSP_HEADER_OFFSET + FSP_SEG_ID + header->page.frame, + seg_id + 1); + mtr->write<8>(*iblock, inode + FSEG_ID, seg_id); + ut_ad(!mach_read_from_4(inode + FSEG_NOT_FULL_N_USED)); + + flst_init(*iblock, inode + FSEG_FREE, mtr); + flst_init(*iblock, inode + FSEG_NOT_FULL, mtr); + flst_init(*iblock, inode + FSEG_FULL, mtr); + + mtr->memcpy(*iblock, inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4); + compile_time_assert(FSEG_FRAG_SLOT_SIZE == 4); + compile_time_assert(FIL_NULL == 0xffffffff); + mtr->memset(iblock, + uint16_t(inode - iblock->page.frame) + FSEG_FRAG_ARR, + FSEG_FRAG_SLOT_SIZE * FSEG_FRAG_ARR_N_SLOTS, 0xff); + + if (!block) { +page_alloc: + block = fseg_alloc_free_page_low(space, + inode, iblock, 0, FSP_UP, +#ifdef UNIV_DEBUG + has_done_reservation, +#endif /* UNIV_DEBUG */ + mtr, mtr, err); + + if (!block) { + ut_ad(!has_done_reservation); + goto reserve_extent; + } + + ut_d(const auto x = block->page.lock.x_lock_count()); + ut_ad(x || block->page.lock.not_recursive()); + ut_ad(x == 1 || space->is_being_truncated); + ut_ad(x <= 2); + ut_ad(!fil_page_get_type(block->page.frame)); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame, + FIL_PAGE_TYPE_SYS); + } + + mtr->write<2>(*block, byte_offset + FSEG_HDR_OFFSET + + block->page.frame, page_offset(inode)); + + mtr->write<4>(*block, byte_offset + FSEG_HDR_PAGE_NO + + block->page.frame, iblock->page.id().page_no()); + + mtr->write<4,mtr_t::MAYBE_NOP>(*block, byte_offset + FSEG_HDR_SPACE + + block->page.frame, space->id); + +funct_exit: + if (!has_done_reservation && reserved_extent) { + space->release_free_extents(n_reserved); + } + + DBUG_RETURN(block); +} + +/**********************************************************************//** +Calculates the number of pages reserved by a segment, and how many pages are +currently used. +@return number of reserved pages */ +static +ulint +fseg_n_reserved_pages_low( +/*======================*/ + const fseg_inode_t* inode, /*!< in: segment inode */ + ulint* used) /*!< out: number of pages used (not + more than reserved) */ +{ + *used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL) + + fseg_get_n_frag_pages(inode); + + return fseg_get_n_frag_pages(inode) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL); +} + +/** Calculate the number of pages reserved by a segment, +and how many pages are currently used. 
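+
+A worked instance of the formula above (assuming 64-page extents): a
+segment with 3 FULL extents, 2 NOT_FULL extents holding 70 used pages
+between them, 1 FREE extent and 5 fragment pages reports
+used = 70 + 64 * 3 + 5 = 267 and returns 5 + 64 * (1 + 2 + 3) = 389
+reserved pages.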
+@param[in] block buffer block containing the file segment header
+@param[in] header file segment header
+@param[out] used number of pages that are used (not more than reserved)
+@param[in,out] mtr mini-transaction
+@return number of reserved pages */
+ulint fseg_n_reserved_pages(const buf_block_t &block,
+                            const fseg_header_t *header, ulint *used,
+                            mtr_t *mtr)
+{
+  ut_ad(page_align(header) == block.page.frame);
+  buf_block_t *iblock;
+  if (fseg_inode_t *inode=
+      fseg_inode_try_get(header, block.page.id().space(), block.zip_size(),
+                         mtr, &iblock))
+    return fseg_n_reserved_pages_low(inode, used);
+  return *used= 0;
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Tries to fill the free list of a segment with consecutive free extents.
+This happens if the segment is big enough to allow extents in the free list,
+the free list is empty, and the extents can be allocated consecutively from
+the hint onward.
+@param[in] inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] space tablespace
+@param[in] hint hint which extent would be good as the first extent
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t fseg_fill_free_list(const fseg_inode_t *inode,
+                                   buf_block_t *iblock, fil_space_t *space,
+                                   uint32_t hint, mtr_t *mtr)
+{
+  ulint used;
+
+  ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+  ut_d(space->modify_check(*mtr));
+
+  if (fseg_n_reserved_pages_low(inode, &used) <
+      FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE)
+    /* The segment is too small to allow extents in free list */
+    return DB_SUCCESS;
+
+  if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
+  {
+    space->set_corrupted();
+    return DB_CORRUPTION;
+  }
+
+  if (flst_get_len(inode + FSEG_FREE) > 0)
+    /* Free list is not empty */
+    return DB_SUCCESS;
+
+  for (ulint i= 0; i < FSEG_FREE_LIST_MAX_LEN; i++, hint += FSP_EXTENT_SIZE)
+  {
+    buf_block_t *xdes;
+    dberr_t err;
+    xdes_t *descr= xdes_get_descriptor(space, hint, mtr, &err, &xdes);
+    if (!descr || XDES_FREE != xdes_get_state(descr))
+      /* We cannot allocate the desired extent: stop */
+      return err;
+
+    descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, &err);
+    if (UNIV_UNLIKELY(!descr))
+      return err;
+
+    if (dberr_t err=
+        flst_add_last(iblock,
+                      static_cast<uint16_t>(inode - iblock->page.frame +
+                                            FSEG_FREE), xdes,
+                      static_cast<uint16_t>(descr - xdes->page.frame +
+                                            XDES_FLST_NODE), mtr))
+      return err;
+    xdes_set_state(*xdes, descr, XDES_FSEG, mtr);
+    mtr->memcpy(*xdes, descr + XDES_ID, inode + FSEG_ID, 8);
+  }
+
+  return DB_SUCCESS;
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Allocates a free extent for the segment: looks first in the free list of
+the segment, then tries to allocate from the space free list.
+NOTE that the extent returned still resides in the segment free list, it is
+not yet taken off it!
+@param[in] inode segment inode
+@param[in,out] iblock segment inode page
+@param[out] xdes extent descriptor page
+@param[in,out] space tablespace
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@retval nullptr if no extent could be allocated */
+static
+xdes_t*
+fseg_alloc_free_extent(
+  const fseg_inode_t* inode,
+  buf_block_t* iblock,
+  buf_block_t** xdes,
+  fil_space_t* space,
+  mtr_t* mtr,
+  dberr_t* err)
+{
+  ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+  ut_d(space->modify_check(*mtr));
+
+  if (flst_get_len(inode + FSEG_FREE))
+  {
+    /* Segment free list is not empty, allocate from it */
+    return xdes_lst_get_descriptor(*space, flst_get_first(inode + FSEG_FREE),
+                                   mtr, xdes, err);
+  }
+
+  xdes_t* descr= fsp_alloc_free_extent(space, 0, xdes, mtr, err);
+  if (UNIV_UNLIKELY(!descr))
+    return descr;
+  xdes_set_state(**xdes, descr, XDES_FSEG, mtr);
+  mtr->memcpy(**xdes, descr + XDES_ID, inode + FSEG_ID, 8);
+  *err= flst_add_last(iblock,
+                      static_cast<uint16_t>(inode - iblock->page.frame +
+                                            FSEG_FREE), *xdes,
+                      static_cast<uint16_t>(descr - (*xdes)->page.frame +
+                                            XDES_FLST_NODE), mtr);
+  if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+    return nullptr;
+  /* Try to fill the segment free list */
+  *err= fseg_fill_free_list(inode, iblock, space,
+                            xdes_get_offset(descr) + FSP_EXTENT_SIZE, mtr);
+  if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+    return nullptr;
+
+  return descr;
+}
+
+/** Allocates a single free page from a segment.
+This function implements the intelligent allocation strategy which tries to
+minimize file space fragmentation.
+@param[in,out] space tablespace
+@param[in,out] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] hint hint of which page would be desirable
+@param[in] direction if the new page is needed because of
+an index page split, and records are inserted there in order, into which
+direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
+@param[in,out] mtr mini-transaction
+@param[in,out] init_mtr mtr or another mini-transaction in
+which the page should be initialized.
+@param[out] err error code +@return the allocated page +@retval nullptr if no page could be allocated */ +static +buf_block_t* +fseg_alloc_free_page_low( + fil_space_t* space, + fseg_inode_t* seg_inode, + buf_block_t* iblock, + uint32_t hint, + byte direction, +#ifdef UNIV_DEBUG + bool has_done_reservation, + /*!< whether the space has already been reserved */ +#endif /* UNIV_DEBUG */ + mtr_t* mtr, + mtr_t* init_mtr, + dberr_t* err) +{ + ib_id_t seg_id; + ulint used; + ulint reserved; + xdes_t* descr; /*!< extent of the hinted page */ + uint32_t ret_page; /*!< the allocated page offset, FIL_NULL + if could not be allocated */ + xdes_t* ret_descr; /*!< the extent of the allocated page */ + buf_block_t* xdes; + ulint n; + + ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR)); + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4)); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + seg_id = mach_read_from_8(seg_inode + FSEG_ID); + + ut_ad(seg_id); + ut_d(space->modify_check(*mtr)); + ut_ad(fil_page_get_type(page_align(seg_inode)) == FIL_PAGE_INODE); + + reserved = fseg_n_reserved_pages_low(seg_inode, &used); + + buf_block_t* header = fsp_get_header(space, mtr, err); + if (!header) { + return header; + } + + descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr, + err, &xdes); + if (!descr) { + if (*err != DB_SUCCESS) { + return nullptr; + } + /* Hint outside space or too high above free limit: reset + hint */ + /* The file space header page is always allocated. */ + hint = 0; + descr = xdes_get_descriptor(space, hint, mtr, err, &xdes); + if (!descr) { + return nullptr; + } + } + + /* In the big if-else below we look for ret_page and ret_descr */ + /*-------------------------------------------------------------*/ + if ((xdes_get_state(descr) == XDES_FSEG) + && mach_read_from_8(descr + XDES_ID) == seg_id + && xdes_is_free(descr, hint % FSP_EXTENT_SIZE)) { +take_hinted_page: + /* 1. We can take the hinted page + =================================*/ + ret_descr = descr; + ret_page = hint; + /* Skip the check for extending the tablespace. If the + page hint were not within the size of the tablespace, + we would have got (descr == NULL) above and reset the hint. */ + goto got_hinted_page; + /*-----------------------------------------------------------*/ + } else if (xdes_get_state(descr) == XDES_FREE + && reserved - used < reserved / FSEG_FILLFACTOR + && used >= FSEG_FRAG_LIMIT) { + + /* 2. 
We allocate the free extent from space and can take
+	=========================================================
+	the hinted page
+	===============*/
+		ret_descr = fsp_alloc_free_extent(space, hint, &xdes,
+						  mtr, err);
+
+		if (UNIV_UNLIKELY(ret_descr != descr)) {
+			if (*err == DB_SUCCESS) {
+				*err = DB_CORRUPTION;
+			}
+			return nullptr;
+		}
+
+		xdes_set_state(*xdes, ret_descr, XDES_FSEG, mtr);
+		mtr->write<8,mtr_t::MAYBE_NOP>(*xdes, ret_descr + XDES_ID,
+					       seg_id);
+		*err = flst_add_last(
+			iblock,
+			static_cast<uint16_t>(seg_inode - iblock->page.frame
+					      + FSEG_FREE), xdes,
+			static_cast<uint16_t>(ret_descr
+					      - xdes->page.frame
+					      + XDES_FLST_NODE), mtr);
+		if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+			return nullptr;
+		}
+
+		/* Try to fill the segment free list */
+		*err = fseg_fill_free_list(seg_inode, iblock, space,
+					   hint + FSP_EXTENT_SIZE, mtr);
+		if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+			return nullptr;
+		}
+		goto take_hinted_page;
+		/*-----------------------------------------------------------*/
+	} else if ((direction != FSP_NO_DIR)
+		   && ((reserved - used) < reserved / FSEG_FILLFACTOR)
+		   && (used >= FSEG_FRAG_LIMIT)
+		   && (ret_descr = fseg_alloc_free_extent(seg_inode, iblock,
+							  &xdes, space,
+							  mtr, err))) {
+		/* 3. We take any free extent (which was already assigned above
+		===============================================================
+		in the if-condition to ret_descr) and take the lowest or
+		========================================================
+		highest page in it, depending on the direction
+		==============================================*/
+		ret_page = xdes_get_offset(ret_descr);
+
+		if (direction == FSP_DOWN) {
+			ret_page += FSP_EXTENT_SIZE - 1;
+		}
+		ut_ad(!has_done_reservation || ret_page != FIL_NULL);
+		/*-----------------------------------------------------------*/
+	} else if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+		return nullptr;
+	} else if ((xdes_get_state(descr) == XDES_FSEG)
+		   && mach_read_from_8(descr + XDES_ID) == seg_id
+		   && (!xdes_is_full(descr))) {
+
+		/* 4. We can take the page from the same extent as the
+		======================================================
+		hinted page (and the extent already belongs to the
+		==================================================
+		segment)
+		========*/
+		ret_descr = descr;
+		ret_page = xdes_find_free(ret_descr, hint % FSP_EXTENT_SIZE);
+		if (ret_page == FIL_NULL) {
+			ut_ad(!has_done_reservation);
+		} else {
+			ret_page += xdes_get_offset(ret_descr);
+		}
+		/*-----------------------------------------------------------*/
+	} else if (reserved - used > 0) {
+		/* 5. We take any unused page from the segment
+		==============================================*/
+		fil_addr_t	first;
+
+		if (flst_get_len(seg_inode + FSEG_NOT_FULL) > 0) {
+			first = flst_get_first(seg_inode + FSEG_NOT_FULL);
+		} else if (flst_get_len(seg_inode + FSEG_FREE) > 0) {
+			first = flst_get_first(seg_inode + FSEG_FREE);
+		} else {
+			ut_ad(!has_done_reservation);
+			return(NULL);
+		}
+
+		ret_descr = xdes_lst_get_descriptor(*space, first, mtr, &xdes);
+		if (!ret_descr) {
+			return nullptr;
+		}
+
+		ret_page = xdes_find_free(ret_descr);
+		if (ret_page == FIL_NULL) {
+			ut_ad(!has_done_reservation);
+		} else {
+			ret_page += xdes_get_offset(ret_descr);
+		}
+		/*-----------------------------------------------------------*/
+	} else if (used < FSEG_FRAG_LIMIT) {
+		/* 6.
We allocate an individual page from the space + ===================================================*/ + buf_block_t* block = fsp_alloc_free_page( + space, hint, mtr, init_mtr, err); + + ut_ad(block || !has_done_reservation || *err); + + if (block) { + /* Put the page in the fragment page array of the + segment */ + n = fseg_find_free_frag_page_slot(seg_inode); + if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) { + *err = DB_CORRUPTION; + return nullptr; + } + + fseg_set_nth_frag_page_no( + seg_inode, iblock, n, + block->page.id().page_no(), mtr); + } + + /* fsp_alloc_free_page() invoked fsp_init_file_page() + already. */ + return(block); + /*-----------------------------------------------------------*/ + } else { + /* 7. We allocate a new extent and take its first page + ======================================================*/ + ret_descr = fseg_alloc_free_extent(seg_inode, iblock, &xdes, + space, mtr, err); + + if (!ret_descr) { + ut_ad(!has_done_reservation || *err); + return nullptr; + } else { + ret_page = xdes_get_offset(ret_descr); + } + } + + if (ret_page == FIL_NULL) { + /* Page could not be allocated */ + + ut_ad(!has_done_reservation); + return(NULL); + } + + if (space->size <= ret_page && !is_predefined_tablespace(space->id)) { + /* It must be that we are extending a single-table + tablespace whose size is still < 64 pages */ + + if (ret_page >= FSP_EXTENT_SIZE) { + sql_print_error("InnoDB: Trying to extend '%s'" + " by single page(s) though the" + " space size " UINT32PF "." + " Page no " UINT32PF ".", + space->chain.start->name, space->size, + ret_page); + ut_ad(!has_done_reservation); + return(NULL); + } + + if (!fsp_try_extend_data_file_with_pages( + space, ret_page, header, mtr)) { + /* No disk space left */ + ut_ad(!has_done_reservation); + return(NULL); + } + } + +got_hinted_page: + /* ret_descr == NULL if the block was allocated from free_frag + (XDES_FREE_FRAG) */ + if (ret_descr != NULL) { + /* At this point we know the extent and the page offset. + The extent is still in the appropriate list (FSEG_NOT_FULL + or FSEG_FREE), and the page is not yet marked as used. */ + + ut_d(buf_block_t* xxdes); + ut_ad(xdes_get_descriptor(space, ret_page, mtr, err, &xxdes) + == ret_descr); + ut_ad(xdes == xxdes); + ut_ad(xdes_is_free(ret_descr, ret_page % FSP_EXTENT_SIZE)); + + *err = fseg_mark_page_used(seg_inode, iblock, ret_page, + ret_descr, xdes, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } + } + + return fsp_page_create(space, ret_page, init_mtr); +} + +/**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. 
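+
+When the caller has not reserved space beforehand, the allocation is
+wrapped in the reserve/release pattern sketched here (a simplified
+outline of the body below; the debug-only reservation argument and
+error handling are omitted):
+
+  uint32_t n_reserved;
+  if (fsp_reserve_free_extents(&n_reserved, space, 2, FSP_NORMAL, mtr)
+      == DB_SUCCESS)
+  {
+    block= fseg_alloc_free_page_low(space, inode, iblock, hint, direction,
+                                    mtr, init_mtr, err);
+    space->release_free_extents(n_reserved);
+  }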
+@retval NULL if no page could be allocated */ +buf_block_t* +fseg_alloc_free_page_general( +/*=========================*/ + fseg_header_t* seg_header,/*!< in/out: segment header */ + uint32_t hint, /*!< in: hint of which page would be + desirable */ + byte direction,/*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + bool has_done_reservation, /*!< in: true if the caller has + already done the reservation for the page + with fsp_reserve_free_extents, then there + is no need to do the check for this individual + page */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr,/*!< in/out: mtr or another mini-transaction + in which the page should be initialized. */ + dberr_t* err) /*!< out: error code */ +{ + fseg_inode_t* inode; + fil_space_t* space; + buf_block_t* iblock; + buf_block_t* block; + uint32_t n_reserved; + + const uint32_t space_id = page_get_space_id(page_align(seg_header)); + space = mtr->x_lock_space(space_id); + inode = fseg_inode_try_get(seg_header, space_id, space->zip_size(), + mtr, &iblock, err); + if (!inode) { + return nullptr; + } + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + if (!has_done_reservation) { + *err = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); + if (*err != DB_SUCCESS) { + return nullptr; + } + } + + block = fseg_alloc_free_page_low(space, + inode, iblock, hint, direction, +#ifdef UNIV_DEBUG + has_done_reservation, +#endif /* UNIV_DEBUG */ + mtr, init_mtr, err); + + /* The allocation cannot fail if we have already reserved a + space for the page. */ + ut_ad(block || !has_done_reservation || *err); + + if (!has_done_reservation) { + space->release_free_extents(n_reserved); + } + + return(block); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Check that we have at least n_pages frag pages free in the first extent +of a single-table tablespace, and they are also physically initialized to +the data file. That is we have already extended the data file so that those +pages are inside the data file. If not, this function extends the tablespace +with pages. +@param[in,out] space tablespace +@param[in,out] header tablespace header, x-latched +@param[in] size tablespace size in pages, less than FSP_EXTENT_SIZE +@param[in,out] mtr mini-transaction +@param[in] n_pages number of pages to reserve +@return error code */ +static +dberr_t +fsp_reserve_free_pages( + fil_space_t* space, + buf_block_t* header, + ulint size, + mtr_t* mtr, + uint32_t n_pages) +{ + ut_ad(space != fil_system.sys_space && space != fil_system.temp_space); + ut_ad(size < FSP_EXTENT_SIZE); + + dberr_t err= DB_OUT_OF_FILE_SPACE; + const xdes_t *descr= + xdes_get_descriptor_with_space_hdr(header, space, 0, mtr, &err); + if (!descr) + return err; + const uint32_t n_used= xdes_get_n_used(descr); + if (size >= n_used + n_pages) + return DB_SUCCESS; + if (n_used > size) + return DB_CORRUPTION; + return fsp_try_extend_data_file_with_pages(space, n_used + n_pages - 1, + header, mtr) + ? DB_SUCCESS + : DB_OUT_OF_FILE_SPACE; +} + +/** Reserves free pages from a tablespace. All mini-transactions which may +use several pages from the tablespace should call this function beforehand +and reserve enough free extents so that they certainly will be able +to do their operation, like a B-tree page split, fully. 
Reservations +must be released with function fil_space_t::release_free_extents()! + +The alloc_type below has the following meaning: FSP_NORMAL means an +operation which will probably result in more space usage, like an +insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are +deleting rows, then this allocation will in the long run result in +less space usage (after a purge); FSP_CLEANING means allocation done +in a physical record delete (like in a purge) or other cleaning operation +which will result in less space usage in the long run. We prefer the latter +two types of allocation: when space is scarce, FSP_NORMAL allocations +will not succeed, but the latter two allocations will succeed, if possible. +The purpose is to avoid dead end where the database is full but the +user cannot free any space because these freeing operations temporarily +reserve some space. + +Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special +case. In this function we would liberally reserve several extents for +every page split or merge in a B-tree. But we do not want to waste disk space +if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply +different rules in that special case, just ensuring that there are n_pages +free pages available. + +@param[out] n_reserved number of extents actually reserved; if we + return true and the tablespace size is < + FSP_EXTENT_SIZE pages, then this can be 0, + otherwise it is n_ext +@param[in,out] space tablespace +@param[in] n_ext number of extents to reserve +@param[in] alloc_type page reservation type (FSP_BLOB, etc) +@param[in,out] mtr the mini transaction +@param[in] n_pages for small tablespaces (tablespace size is + less than FSP_EXTENT_SIZE), number of free + pages to reserve. 
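+
+A worked instance of the FSP_NORMAL rule implemented below
+(reserve = 2 + (size / extent_size) * 2 / 200, i.e. two extents plus
+roughly 1%): with the default 16KiB pages, a 10GiB tablespace has
+655360 pages = 10240 extents, so reserve = 2 + 102 = 104 extents, and
+a request for n_ext extents falls back to extending the data file once
+n_free <= 104 + n_ext.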
+@return error code +@retval DB_SUCCESS if we were able to make the reservation */ +dberr_t +fsp_reserve_free_extents( + uint32_t* n_reserved, + fil_space_t* space, + uint32_t n_ext, + fsp_reserve_t alloc_type, + mtr_t* mtr, + uint32_t n_pages) +{ + ulint reserve; + + ut_ad(mtr); + *n_reserved = n_ext; + + const uint32_t extent_size = FSP_EXTENT_SIZE; + + mtr->x_lock_space(space); + const unsigned physical_size = space->physical_size(); + + dberr_t err; + buf_block_t* header = fsp_get_header(space, mtr, &err); + if (!header) { + return err; + } +try_again: + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(size == space->size_in_header); + + if (size < extent_size && n_pages < extent_size / 2) { + /* Use different rules for small single-table tablespaces */ + *n_reserved = 0; + return fsp_reserve_free_pages(space, header, size, + mtr, n_pages); + } + + uint32_t n_free_list_ext = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + + header->page.frame); + ut_ad(space->free_len == n_free_list_ext); + + uint32_t free_limit = mach_read_from_4(FSP_HEADER_OFFSET + + FSP_FREE_LIMIT + + header->page.frame); + ut_ad(space->free_limit == free_limit); + + /* Below we play safe when counting free extents above the free limit: + some of them will contain extent descriptor pages, and therefore + will not be free extents */ + + uint32_t n_free_up; + + if (size >= free_limit) { + n_free_up = (size - free_limit) / extent_size; + if (n_free_up) { + n_free_up--; + n_free_up -= n_free_up / (physical_size / extent_size); + } + } else { + ut_ad(alloc_type == FSP_BLOB); + n_free_up = 0; + } + + uint32_t n_free = n_free_list_ext + n_free_up; + + switch (alloc_type) { + case FSP_NORMAL: + /* We reserve 1 extent + 0.5 % of the space size to undo logs + and 1 extent + 0.5 % to cleaning operations; NOTE: this source + code is duplicated in the function below! */ + + reserve = 2 + ((size / extent_size) * 2) / 200; + + if (n_free <= reserve + n_ext) { + + goto try_to_extend; + } + break; + case FSP_UNDO: + /* We reserve 0.5 % of the space size to cleaning operations */ + + reserve = 1 + ((size / extent_size) * 1) / 200; + + if (n_free <= reserve + n_ext) { + + goto try_to_extend; + } + break; + case FSP_CLEANING: + case FSP_BLOB: + reserve = 0; + break; + default: + ut_error; + } + + if (space->reserve_free_extents(n_free, n_ext)) { + return DB_SUCCESS; + } +try_to_extend: + if (fsp_try_extend_data_file(space, header, mtr)) { + goto try_again; + } + + return DB_OUT_OF_FILE_SPACE; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Frees a single page of a segment. 
+@param[in] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction
+@param[in] ahi Drop adaptive hash index
+@return error code */
+static
+dberr_t
+fseg_free_page_low(
+  fseg_inode_t* seg_inode,
+  buf_block_t* iblock,
+  fil_space_t* space,
+  page_no_t offset,
+  mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+  ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+  )
+{
+  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
+  ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+  ut_ad(iblock->page.frame == page_align(seg_inode));
+  ut_d(space->modify_check(*mtr));
+
+#ifdef BTR_CUR_HASH_ADAPT
+  if (ahi) {
+    btr_search_drop_page_hash_when_freed(
+      page_id_t(space->id, offset));
+  }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+  const uint32_t extent_size = FSP_EXTENT_SIZE;
+  ut_ad(ut_is_2pow(extent_size));
+  buf_block_t* xdes;
+  dberr_t err;
+  xdes_t* descr = xdes_get_descriptor(space, offset, mtr, &err, &xdes);
+
+  if (!descr) {
+    return err;
+  }
+  if (UNIV_UNLIKELY(xdes_is_free(descr, offset & (extent_size - 1)))) {
+corrupted:
+    space->set_corrupted();
+    return DB_CORRUPTION;
+  }
+
+  if (xdes_get_state(descr) != XDES_FSEG) {
+    /* The page is in the fragment pages of the segment */
+    for (ulint i = 0;; i++) {
+      if (fseg_get_nth_frag_page_no(seg_inode, i)
+          != offset) {
+        continue;
+      }
+
+      compile_time_assert(FIL_NULL == 0xffffffff);
+      mtr->memset(iblock, uint16_t(seg_inode
+                                   - iblock->page.frame)
+                  + FSEG_FRAG_ARR
+                  + i * FSEG_FRAG_SLOT_SIZE, 4, 0xff);
+      break;
+    }
+
+    return fsp_free_page(space, offset, mtr);
+  }
+
+  /* If we get here, the page is in some extent of the segment */
+
+  if (UNIV_UNLIKELY(memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8))) {
+    goto corrupted;
+  }
+
+  byte* p_not_full = seg_inode + FSEG_NOT_FULL_N_USED;
+  uint32_t not_full_n_used = mach_read_from_4(p_not_full);
+  const uint16_t xoffset= uint16_t(descr - xdes->page.frame
+                                   + XDES_FLST_NODE);
+  const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+
+  if (xdes_is_full(descr)) {
+    /* The fragment is full: move it to another list */
+    err = flst_remove(iblock,
+                      static_cast<uint16_t>(FSEG_FULL + ioffset),
+                      xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    err = flst_add_last(iblock,
+                        static_cast<uint16_t>(FSEG_NOT_FULL + ioffset),
+                        xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    not_full_n_used += extent_size - 1;
+  } else {
+    if (!not_full_n_used) {
+      goto corrupted;
+    }
+    not_full_n_used--;
+  }
+
+  mtr->write<4>(*iblock, p_not_full, not_full_n_used);
+  xdes_set_free<true>(*xdes, descr, offset & (extent_size - 1), mtr);
+
+  if (!xdes_get_n_used(descr)) {
+    err = flst_remove(iblock,
+                      static_cast<uint16_t>(FSEG_NOT_FULL + ioffset),
+                      xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    err = fsp_free_extent(space, offset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+  }
+
+  mtr->free(*space, static_cast<uint32_t>(offset));
+  return DB_SUCCESS;
+}
+
+/** Free a page in a file segment.
+@param[in,out] seg_header file segment header +@param[in,out] space tablespace +@param[in] offset page number +@param[in,out] mtr mini-transaction +@param[in] have_latch whether space->x_lock() was already called +@return error code */ +dberr_t fseg_free_page(fseg_header_t *seg_header, fil_space_t *space, + uint32_t offset, mtr_t *mtr, bool have_latch) +{ + buf_block_t *iblock; + if (have_latch) + ut_ad(space->is_owner()); + else + mtr->x_lock_space(space); + + DBUG_PRINT("fseg_free_page", + ("space_id: " ULINTPF ", page_no: %u", space->id, offset)); + + dberr_t err; + if (fseg_inode_t *seg_inode= fseg_inode_try_get(seg_header, + space->id, space->zip_size(), + mtr, &iblock, &err)) + { + if (!space->full_crc32()) + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + return fseg_free_page_low(seg_inode, iblock, space, offset, mtr); + } + + return err; +} + +/** Determine whether a page is allocated. +@param space tablespace +@param page page number +@return error code +@retval DB_SUCCESS if the page is marked as free +@retval DB_SUCCESS_LOCKED_REC if the page is marked as allocated */ +dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page) +{ + mtr_t mtr; + uint32_t dpage= xdes_calc_descriptor_page(space->zip_size(), page); + const unsigned zip_size= space->zip_size(); + dberr_t err= DB_SUCCESS; + + mtr.start(); + if (!space->is_owner()) + mtr.x_lock_space(space); + + if (page >= space->free_limit || page >= space->size_in_header); + else if (const buf_block_t *b= + buf_page_get_gen(page_id_t(space->id, dpage), space->zip_size(), + RW_S_LATCH, nullptr, BUF_GET_POSSIBLY_FREED, + &mtr, &err)) + { + if (!dpage && + (space->free_limit != + mach_read_from_4(FSP_FREE_LIMIT + FSP_HEADER_OFFSET + + b->page.frame) || + space->size_in_header != + mach_read_from_4(FSP_SIZE + FSP_HEADER_OFFSET + b->page.frame))) + err= DB_CORRUPTION; + else + err= xdes_is_free(b->page.frame + XDES_ARR_OFFSET + XDES_SIZE + * xdes_calc_descriptor_index(zip_size, page), + page & (FSP_EXTENT_SIZE - 1)) + ? DB_SUCCESS + : DB_SUCCESS_LOCKED_REC; + } + + mtr.commit(); + return err; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Free an extent of a segment to the space free list. 
+@param[in,out] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in,out] space tablespace
+@param[in] page page number in the extent
+@param[in,out] mtr mini-transaction
+@return error code */
+static
+dberr_t
+fseg_free_extent(
+  fseg_inode_t* seg_inode,
+  buf_block_t* iblock,
+  fil_space_t* space,
+  uint32_t page,
+  mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+  ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+  )
+{
+  buf_block_t* xdes;
+  dberr_t err;
+  xdes_t* descr = xdes_get_descriptor(space, page, mtr, &err, &xdes);
+
+  if (!descr) {
+    return err;
+  }
+
+  if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FSEG
+                    || memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8)
+                    || memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N
+                              + seg_inode, 4))) {
+    return DB_CORRUPTION;
+  }
+  ut_d(space->modify_check(*mtr));
+  const uint32_t first_page_in_extent = page - (page % FSP_EXTENT_SIZE);
+
+  const uint16_t xoffset= uint16_t(descr - xdes->page.frame
+                                   + XDES_FLST_NODE);
+  const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+
+#ifdef BTR_CUR_HASH_ADAPT
+  if (ahi) {
+    for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
+      if (!xdes_is_free(descr, i)) {
+        /* Drop search system page hash index
+        if the page is found in the pool and
+        is hashed */
+        btr_search_drop_page_hash_when_freed(
+          page_id_t(space->id,
+                    first_page_in_extent + i));
+      }
+    }
+  }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+  uint16_t lst;
+
+  if (xdes_is_full(descr)) {
+    lst = static_cast<uint16_t>(FSEG_FULL + ioffset);
+remove:
+    err = flst_remove(iblock, lst, xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+  } else if (!xdes_get_n_used(descr)) {
+    lst = static_cast<uint16_t>(FSEG_FREE + ioffset);
+    goto remove;
+  } else {
+    err = flst_remove(
+      iblock, static_cast<uint16_t>(FSEG_NOT_FULL + ioffset),
+      xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    uint32_t not_full_n_used = mach_read_from_4(
+      FSEG_NOT_FULL_N_USED + seg_inode);
+    uint32_t descr_n_used = xdes_get_n_used(descr);
+    if (not_full_n_used < descr_n_used) {
+      return DB_CORRUPTION;
+    }
+    mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
+                  not_full_n_used - descr_n_used);
+  }
+
+  std::vector<uint8_t> going_to_free;
+  static_assert(FSP_EXTENT_SIZE_MIN == 256, "compatibility");
+  static_assert(FSP_EXTENT_SIZE_MAX == 64, "compatibility");
+
+  for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
+    if (!xdes_is_free(descr, i)) {
+      going_to_free.emplace_back(uint8_t(i));
+    }
+  }
+
+  if (dberr_t err = fsp_free_extent(space, page, mtr)) {
+    return err;
+  }
+
+  for (uint32_t i : going_to_free) {
+    mtr->free(*space, first_page_in_extent + i);
+    buf_page_free(space, first_page_in_extent + i, mtr);
+  }
+
+  return DB_SUCCESS;
+}
+
+/** Frees part of a segment. This function can be used to free
+a segment by repeatedly calling this function in different
+mini-transactions. Doing the freeing in a single mini-transaction
+might result in too big a mini-transaction.
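+
+The expected call pattern is therefore a loop with one mini-transaction
+per step, roughly as sketched below (caller-side latching and error
+handling elided; illustrative only):
+
+  for (bool finished= false; !finished;)
+  {
+    mtr_t mtr;
+    mtr.start();
+    finished= fseg_free_step(header, &mtr);
+    mtr.commit();
+  }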
+@param header segment header; NOTE: if the header resides on first + page of the frag list of the segment, this pointer + becomes obsolete after the last freeing step +@param mtr mini-transaction +@param ahi Drop the adaptive hash index +@return whether the freeing was completed */ +bool +fseg_free_step( + fseg_header_t* header, + mtr_t* mtr +#ifdef BTR_CUR_HASH_ADAPT + ,bool ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) +{ + ulint n; + fseg_inode_t* inode; + + const uint32_t space_id = page_get_space_id(page_align(header)); + const uint32_t header_page = page_get_page_no(page_align(header)); + + fil_space_t* space = mtr->x_lock_space(space_id); + xdes_t* descr = xdes_get_descriptor(space, header_page, mtr); + + if (!descr) { + return true; + } + + /* Check that the header resides on a page which has not been + freed yet */ + + if (UNIV_UNLIKELY(xdes_is_free(descr, + header_page & (FSP_EXTENT_SIZE - 1)))) { + /* Some corruption was detected: stop the freeing + in order to prevent a crash. */ + return true; + } + buf_block_t* iblock; + const ulint zip_size = space->zip_size(); + inode = fseg_inode_try_get(header, space_id, zip_size, mtr, &iblock); + if (!inode || space->is_stopping()) { + return true; + } + + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + dberr_t err; + descr = fseg_get_first_extent(inode, space, mtr, &err); + + if (descr) { + /* Free the extent held by the segment */ + return fseg_free_extent(inode, iblock, space, + xdes_get_offset(descr), mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS; + } + + if (err != DB_SUCCESS || space->is_stopping()) { + return true; + } + + /* Free a frag page */ + n = fseg_find_last_used_frag_page_slot(inode); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, inode, iblock, mtr); + return true; + } + + page_no_t page_no = fseg_get_nth_frag_page_no(inode, n); + + if (fseg_free_page_low(inode, iblock, space, page_no, mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS) { + return true; + } + + buf_page_free(space, page_no, mtr); + + n = fseg_find_last_used_frag_page_slot(inode); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, inode, iblock, mtr); + + return true; + } + + return false; +} + +bool +fseg_free_step_not_header( + fseg_header_t* header, + mtr_t* mtr +#ifdef BTR_CUR_HASH_ADAPT + ,bool ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) +{ + fseg_inode_t* inode; + + const uint32_t space_id = page_get_space_id(page_align(header)); + ut_ad(mtr->is_named_space(space_id)); + + fil_space_t* space = mtr->x_lock_space(space_id); + buf_block_t* iblock; + + inode = fseg_inode_try_get(header, space_id, space->zip_size(), + mtr, &iblock); + if (space->is_stopping()) { + return true; + } + + if (!inode) { + ib::warn() << "Double free of " + << page_id_t(space_id, + page_get_page_no(page_align(header))); + return true; + } + + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + dberr_t err; + if (xdes_t* descr = fseg_get_first_extent(inode, space, mtr, &err)) { + /* Free the extent held by the segment */ + return fseg_free_extent(inode, iblock, space, + xdes_get_offset(descr), + mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS; + } else if (err != DB_SUCCESS) { + return true; + } + + /* Free a frag page */ + + ulint n = 
fseg_find_last_used_frag_page_slot(inode); + + if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) { + return true; + } + + uint32_t page_no = fseg_get_nth_frag_page_no(inode, n); + + if (page_no == page_get_page_no(page_align(header))) { + return true; + } + + if (fseg_free_page_low(inode, iblock, space, page_no, mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS) { + return true; + } + buf_page_free(space, page_no, mtr); + return false; +} + +/** Returns the first extent descriptor for a segment. +We think of the extent lists of the segment catenated in the order +FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE. +@param[in] inode segment inode +@param[in] space tablespace +@param[in,out] mtr mini-transaction +@return the first extent descriptor +@retval nullptr if none, or on corruption */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +xdes_t* +fseg_get_first_extent( + fseg_inode_t* inode, + const fil_space_t* space, + mtr_t* mtr, + dberr_t* err) +{ + if (UNIV_UNLIKELY(space->id != page_get_space_id(page_align(inode)) || + memcmp(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4))) + { + corrupted: + *err= DB_CORRUPTION; + return nullptr; + } + + fil_addr_t first; + + if (flst_get_len(inode + FSEG_FULL)) + first= flst_get_first(inode + FSEG_FULL); + else if (flst_get_len(inode + FSEG_NOT_FULL)) + first= flst_get_first(inode + FSEG_NOT_FULL); + else if (flst_get_len(inode + FSEG_FREE)) + first= flst_get_first(inode + FSEG_FREE); + else + { + *err= DB_SUCCESS; + return nullptr; + } + + if (first.page == FIL_NULL) + goto corrupted; + + return xdes_lst_get_descriptor(*space, first, mtr, nullptr, err); +} + +#ifdef UNIV_BTR_PRINT +/*******************************************************************//** +Writes info of a segment. */ +static void fseg_print_low(const fseg_inode_t *inode) +{ + ulint space; + ulint n_used; + ulint n_frag; + ulint n_free; + ulint n_not_full; + ulint n_full; + ulint reserved; + ulint used; + ulint page_no; + ib_id_t seg_id; + + space = page_get_space_id(page_align(inode)); + page_no = page_get_page_no(page_align(inode)); + + reserved = fseg_n_reserved_pages_low(inode, &used); + + seg_id = mach_read_from_8(inode + FSEG_ID); + n_used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED); + n_frag = fseg_get_n_frag_pages(inode); + n_free = flst_get_len(inode + FSEG_FREE); + n_not_full = flst_get_len(inode + FSEG_NOT_FULL); + n_full = flst_get_len(inode + FSEG_FULL); + + ib::info() << "SEGMENT id " << seg_id + << " space " << space << ";" + << " page " << page_no << ";" + << " res " << reserved << " used " << used << ";" + << " full ext " << n_full << ";" + << " fragm pages " << n_frag << ";" + << " free extents " << n_free << ";" + << " not full extents " << n_not_full << ": pages " << n_used; + + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); +} + +/*******************************************************************//** +Writes info of a segment. 
*/ +void +fseg_print( +/*=======*/ + fseg_header_t* header, /*!< in: segment header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + const fil_space_t *space= + mtr->x_lock_space(page_get_space_id(page_align(header))); + buf_block_t *block; + if (fseg_inode_t *inode= + fseg_inode_try_get(header, space->id, space->zip_size(), mtr, &block)) + fseg_print_low(inode); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +std::ostream &fseg_header::to_stream(std::ostream &out) const +{ + out << "[fseg_header_t: space=" + << mach_read_from_4(m_header + FSEG_HDR_SPACE) + << ", page=" << mach_read_from_4(m_header + FSEG_HDR_PAGE_NO) + << ", offset=" << mach_read_from_2(m_header + FSEG_HDR_OFFSET) << "]"; + return out; +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/fsp/fsp0space.cc b/storage/innobase/fsp/fsp0space.cc new file mode 100644 index 00000000..c2152b08 --- /dev/null +++ b/storage/innobase/fsp/fsp0space.cc @@ -0,0 +1,224 @@ +/***************************************************************************** + +Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fsp/fsp0space.cc +Shared tablespace implementation. + +Created 2012-11-16 by Sunny Bains as srv/srv0space.cc +*******************************************************/ + +#include "fsp0sysspace.h" +#include "fsp0fsp.h" +#include "os0file.h" +#include "my_sys.h" + +/** Check if two tablespaces have common data file names. +@param other_space Tablespace to check against this. +@return true if they have the same data filenames and paths */ +bool +Tablespace::intersection( + const Tablespace* other_space) +{ + for (files_t::const_iterator it(other_space->begin()), + end(other_space->end()); it != end; ++it) { + + if (find(it->m_filename)) { + + return(true); + } + } + + return(false); +} + +/** Frees the memory allocated by the SysTablespace object. */ +void +Tablespace::shutdown() +{ + for (iterator it = begin(); it != end(); ++it) { + it->shutdown(); + } + + m_files.clear(); + ut_free(m_path); + m_path = NULL; + m_space_id = UINT32_MAX; +} + +/** Note that the data file was found. +@param[in,out] file Data file object to set */ +void +Tablespace::file_found(Datafile& file) +{ + /* Note that the file exists and can be opened + in the appropriate mode. */ + file.m_exists = true; + + file.set_open_flags( + &file == &m_files.front() + ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN); +} + +/** Open or Create the data files if they do not exist. 
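+Each data file is first opened or created with a temporary handle to
+validate it, then closed again and registered with fil_system, so that
+it can be re-opened lazily through the tablespace cache.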
+@param[in] is_temp whether this is a temporary tablespace +@return DB_SUCCESS or error code */ +dberr_t +Tablespace::open_or_create(bool is_temp) +{ + fil_space_t* space = NULL; + dberr_t err = DB_SUCCESS; + + ut_ad(!m_files.empty()); + + for (iterator it = begin(); it != end(); ++it) { + if (it->m_exists) { + err = it->open_or_create( + m_ignore_read_only + ? false : srv_read_only_mode); + if (err != DB_SUCCESS) { + return err; + } + } else { + err = it->open_or_create( + m_ignore_read_only + ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return err; + } + + /* Set the correct open flags now that we have + successfully created the file. */ + file_found(*it); + } + + /* We can close the handle now and open the tablespace + the proper way. */ + it->close(); + + if (it == begin()) { + /* First data file. */ + + /* Create the tablespace entry for the multi-file + tablespace in the tablespace manager. */ + uint32_t fsp_flags; + + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + fsp_flags = (FSP_FLAGS_FCRC32_MASK_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE()); + break; + default: + fsp_flags = FSP_FLAGS_PAGE_SSIZE(); + } + + mysql_mutex_lock(&fil_system.mutex); + space = fil_space_t::create( + m_space_id, fsp_flags, + is_temp + ? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE, + NULL); + if (!space) { + mysql_mutex_unlock(&fil_system.mutex); + return DB_ERROR; + } + } else { + mysql_mutex_lock(&fil_system.mutex); + } + space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size, + false, true); + mysql_mutex_unlock(&fil_system.mutex); + } + + return(err); +} + +/** Find a filename in the list of Datafiles for a tablespace +@return true if the filename exists in the data files */ +bool +Tablespace::find(const char* filename) const +{ + for (const_iterator it = begin(); it != end(); ++it) { + + if (innobase_strcasecmp(filename, it->m_filename) == 0) { + return(true); + } + } + + return(false); +} + +/** Delete all the data files. */ +void +Tablespace::delete_files() +{ + for (iterator it = begin(); it != end(); ++it) { + + it->close(); + + bool file_pre_exists; + bool success = os_file_delete_if_exists( + innodb_data_file_key, it->m_filepath, &file_pre_exists); + + if (success && file_pre_exists) { + ib::info() << "Removed temporary tablespace data" + " file: \"" << it->m_filepath << "\""; + } + } +} + +/** Use the ADD DATAFILE path to create a Datafile object and add it to the +front of m_files. +Parse the datafile path into a path and a filename with extension 'ibd'. +This datafile_path provided may or may not be an absolute path, but it +must end with the extension .ibd and have a basename of at least 1 byte. + +Set tablespace m_path member and add a Datafile with the filename. +@param[in] datafile_path full path of the tablespace file. */ +dberr_t Tablespace::add_datafile(const char *filepath) +{ + /* The path provided ends in ".ibd". This was assured by + validate_create_tablespace_info() */ + ut_d(const char* dot = strrchr(filepath, '.')); + ut_ad(dot != NULL && 0 == strcmp(dot, DOT_IBD)); + + /* If the path is an absolute path, separate it onto m_path and a + basename. For relative paths, make the whole thing a basename so that + it can be appended to the datadir. */ + bool is_abs_path = is_absolute_path(filepath); + size_t dirlen = (is_abs_path ? 
dirname_length(filepath) : 0);
+	const char*	basename = filepath + dirlen;
+
+	/* If the pathname contains a directory separator, fill the
+	m_path member which is the default directory for files in this
+	tablespace. Leave it null otherwise. */
+	if (dirlen > 0) {
+		set_path(filepath, dirlen);
+	}
+
+	/* Now add a new Datafile and set the filepath
+	using the m_path created above. */
+	m_files.push_back(Datafile(m_flags, FIL_IBD_FILE_INITIAL_SIZE, 0));
+	m_files.back().make_filepath(m_path, {basename, strlen(basename) - 4},
+				     IBD);
+
+	return(DB_SUCCESS);
+}
diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc
new file mode 100644
index 00000000..e4a43e48
--- /dev/null
+++ b/storage/innobase/fsp/fsp0sysspace.cc
@@ -0,0 +1,1019 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fsp/fsp0sysspace.cc
+Multi file, shared, system tablespace implementation.
+
+Created 2012-11-16 by Sunny Bains as srv/srv0space.cc
+Refactored 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#include "fsp0sysspace.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "dict0load.h"
+#include "mem0mem.h"
+#include "os0file.h"
+#include "row0mysql.h"
+#include "buf0dblwr.h"
+
+/** The server header file is included to access the opt_initialize global
+variable. If the server passes the create/open DB option down to the storage
+engine, we should remove this direct reference to a server header and a
+global variable. */
+#include "mysqld.h"
+
+/** The control info of the system tablespace. */
+SysTablespace srv_sys_space;
+
+/** The control info of a temporary table shared tablespace. */
+SysTablespace srv_tmp_space;
+
+/** If the last data file is auto-extended, we add this many pages to it
+at a time. We have to make this public because it is a config variable. */
+uint sys_tablespace_auto_extend_increment;
+
+/** Convert a numeric string that optionally ends in G, M or K
+to a number of megabytes.
+@param[in]	ptr	string with a quantity in bytes
+@param[out]	megs	the number in megabytes
+@return next character in string */
+char*
+SysTablespace::parse_units(
+	char*	ptr,
+	ulint*	megs)
+{
+	char*	endp;
+
+	*megs = strtoul(ptr, &endp, 10);
+
+	ptr = endp;
+
+	switch (*ptr) {
+	case 'G': case 'g':
+		*megs *= 1024;
+		/* fall through */
+	case 'M': case 'm':
+		++ptr;
+		break;
+	case 'K': case 'k':
+		*megs /= 1024;
+		++ptr;
+		break;
+	default:
+		*megs /= 1024 * 1024;
+		break;
+	}
+
+	return(ptr);
+}
+
+/** Parse the input params and populate member variables.
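+
+As a sketch of the accepted syntax (the file names and sizes below are
+hypothetical), a specification such as
+@code
+ibdata1:1G;ibdata2:50M:autoextend:max:2G
+@endcode
+describes two data files: ibdata1 with a fixed size of 1 GiB, and
+ibdata2 starting at 50 MiB, auto-extending up to a maximum of 2 GiB.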
+@param[in] filepath path to data files +@param[in] supports_raw true if the tablespace supports raw devices +@return true on success parse */ +bool +SysTablespace::parse_params( + const char* filepath_spec, + bool supports_raw) +{ + char* filepath; + ulint size; + char* input_str; + ulint n_files = 0; + + ut_ad(m_last_file_size_max == 0); + ut_ad(!m_auto_extend_last_file); + + char* new_str = mem_strdup(filepath_spec); + char* str = new_str; + + input_str = str; + + /*---------------------- PASS 1 ---------------------------*/ + /* First calculate the number of data files and check syntax: + filepath:size[K |M | G];filepath:size[K |M | G]... . + Note that a Windows path may contain a drive name and a ':'. */ + while (*str != '\0') { + filepath = str; + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == '\0') { + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + return(false); + } + + str++; + + str = parse_units(str, &size); + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += (sizeof ":max:") - 1; + + str = parse_units(str, &size); + } + + if (*str != '\0') { + ut_free(new_str); + ib::error() + << "syntax error in file path or" + << " size specified is less than" + << " 1 megabyte"; + return(false); + } + } + + if (::strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + + if (!supports_raw) { + ib::error() + << "Tablespace doesn't support raw" + " devices"; + ut_free(new_str); + return(false); + } + + str += 3; + } + + if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') { + str += 3; + + if (!supports_raw) { + ib::error() + << "Tablespace doesn't support raw" + " devices"; + ut_free(new_str); + return(false); + } + } + + if (size == 0) { + + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + + return(false); + } + + ++n_files; + + if (*str == ';') { + str++; + } else if (*str != '\0') { + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + return(false); + } + } + + if (n_files == 0) { + + /* filepath_spec must contain at least one data file + definition */ + + ut_free(new_str); + + ib::error() + << "syntax error in file path or size specified" + " is less than 1 megabyte"; + + return(false); + } + + /*---------------------- PASS 2 ---------------------------*/ + /* Then store the actual values to our arrays */ + str = input_str; + ulint order = 0; + + while (*str != '\0') { + filepath = str; + + /* Note that we must step over the ':' in a Windows filepath; + a Windows path normally looks like C:\ibdata\ibdata1:1G, but + a Windows raw partition may have a specification like + \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */ + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == ':') { + /* Make filepath a null-terminated string */ + *str = '\0'; + str++; + } + + str = parse_units(str, &size); + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + m_auto_extend_last_file = true; + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += 
(sizeof ":max:") - 1; + + str = parse_units(str, &m_last_file_size_max); + } + + if (*str != '\0') { + ut_free(new_str); + ib::error() << "syntax error in file path or" + " size specified is less than 1" + " megabyte"; + return(false); + } + } + + m_files.push_back(Datafile(flags(), uint32_t(size), order)); + m_files.back().make_filepath(path(), + {filepath, strlen(filepath)}, + NO_EXT); + + if (::strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + + ut_a(supports_raw); + + str += 3; + + /* Initialize new raw device only during initialize */ + /* JAN: TODO: MySQL 5.7 used opt_initialize */ + m_files.back().m_type = + opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW; + } + + if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') { + + ut_a(supports_raw); + + str += 3; + + /* Initialize new raw device only during initialize */ + if (m_files.back().m_type == SRV_NOT_RAW) { + /* JAN: TODO: MySQL 5.7 used opt_initialize */ + m_files.back().m_type = + opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW; + } + } + + if (*str == ';') { + ++str; + } + order++; + } + + ut_ad(n_files == ulint(m_files.size())); + + ut_free(new_str); + + return(true); +} + +/** Frees the memory allocated by the parse method. */ +void +SysTablespace::shutdown() +{ + Tablespace::shutdown(); + + m_auto_extend_last_file = 0; + m_last_file_size_max = 0; + m_created_new_raw = 0; + m_is_tablespace_full = false; + m_sanity_checks_done = false; +} + +/** Verify the size of the physical file. +@param[in] file data file object +@return DB_SUCCESS if OK else error code. */ +dberr_t +SysTablespace::check_size( + Datafile& file) +{ + os_offset_t size = os_file_get_size(file.m_handle); + ut_a(size != (os_offset_t) -1); + + /* Under some error conditions like disk full scenarios + or file size reaching filesystem limit the data file + could contain an incomplete extent at the end. When we + extend a data file and if some failure happens, then + also the data file could contain an incomplete extent. + So we need to round the size downward to a megabyte.*/ + + const uint32_t rounded_size_pages = static_cast( + size >> srv_page_size_shift); + + /* If last file */ + if (&file == &m_files.back() && m_auto_extend_last_file) { + + if (file.m_size > rounded_size_pages + || (m_last_file_size_max > 0 + && m_last_file_size_max < rounded_size_pages)) { + ib::error() << "The Auto-extending data file '" + << file.filepath() + << "' is of a different size " + << rounded_size_pages + << " pages than specified" + " by innodb_data_file_path"; + return(DB_ERROR); + } + + file.m_size = rounded_size_pages; + } + + if (rounded_size_pages != file.m_size) { + ib::error() << "The data file '" + << file.filepath() << "' is of a different size " + << rounded_size_pages << " pages" + " than the " << file.m_size << " pages specified by" + " innodb_data_file_path"; + return(DB_ERROR); + } + + return(DB_SUCCESS); +} + +/** Set the size of the file. +@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::set_size( + Datafile& file) +{ + ut_ad(!srv_read_only_mode || m_ignore_read_only); + const ib::bytes_iec b{uint64_t{file.m_size} << srv_page_size_shift}; + + /* We created the data file and now write it full of zeros */ + ib::info() << "Setting file '" << file.filepath() << "' size to " << b + << ". 
Physically writing the file full; Please wait ..."; + + bool success = os_file_set_size( + file.m_filepath, file.m_handle, + static_cast(file.m_size) << srv_page_size_shift); + + if (success) { + ib::info() << "File '" << file.filepath() << "' size is now " + << b + << "."; + } else { + ib::error() << "Could not set the file size of '" + << file.filepath() << "'. Probably out of disk space"; + + return(DB_ERROR); + } + + return(DB_SUCCESS); +} + +/** Create a data file. +@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::create_file( + Datafile& file) +{ + dberr_t err = DB_SUCCESS; + + ut_a(!file.m_exists); + ut_ad(!srv_read_only_mode || m_ignore_read_only); + + switch (file.m_type) { + case SRV_NEW_RAW: + + /* The partition is opened, not created; then it is + written over */ + m_created_new_raw = true; + + /* Fall through. */ + + case SRV_OLD_RAW: + + srv_start_raw_disk_in_use = TRUE; + + /* Fall through. */ + + case SRV_NOT_RAW: + err = file.open_or_create( + !m_ignore_read_only && srv_read_only_mode); + break; + } + + if (err != DB_SUCCESS) { + return err; + } + + switch (file.m_type) { + case SRV_OLD_RAW: + break; + case SRV_NOT_RAW: +#ifndef _WIN32 + if (!space_id() && my_disable_locking + && os_file_lock(file.m_handle, file.m_filepath)) { + err = DB_ERROR; + break; + } +#endif + /* fall through */ + case SRV_NEW_RAW: + err = set_size(file); + } + + return(err); +} + +/** Open a data file. +@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::open_file( + Datafile& file) +{ + dberr_t err = DB_SUCCESS; + + ut_a(file.m_exists); + + switch (file.m_type) { + case SRV_NEW_RAW: + /* The partition is opened, not created; then it is + written over */ + m_created_new_raw = true; + + /* Fall through */ + + case SRV_OLD_RAW: + srv_start_raw_disk_in_use = TRUE; + + if (srv_read_only_mode && !m_ignore_read_only) { + ib::error() << "Can't open a raw device '" + << file.m_filepath << "' when" + " --innodb-read-only is set"; + + return(DB_ERROR); + } + + /* Fall through */ + + case SRV_NOT_RAW: + err = file.open_or_create( + !m_ignore_read_only && srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + break; + } + + switch (file.m_type) { + case SRV_NEW_RAW: + /* Set file size for new raw device. */ + err = set_size(file); + break; + + case SRV_NOT_RAW: +#ifndef _WIN32 + if (!space_id() && (m_ignore_read_only || !srv_read_only_mode) + && my_disable_locking + && os_file_lock(file.m_handle, file.m_filepath)) { + err = DB_ERROR; + break; + } +#endif + /* Check file size for existing file. */ + err = check_size(file); + break; + + case SRV_OLD_RAW: + err = DB_SUCCESS; + break; + + } + + if (err != DB_SUCCESS) { + file.close(); + } + + return(err); +} + +/** Check the tablespace header for this tablespace. +@return DB_SUCCESS or error code */ +inline dberr_t SysTablespace::read_lsn_and_check_flags() +{ + dberr_t err; + + files_t::iterator it = m_files.begin(); + + ut_a(it->m_exists); + + if (it->m_handle == OS_FILE_CLOSED) { + + err = it->open_or_create( + m_ignore_read_only ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + } + + err = it->read_first_page( + m_ignore_read_only ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + + ut_a(it->order() == 0); + + if (srv_operation <= SRV_OPERATION_EXPORT_RESTORED) { + buf_dblwr.init_or_load_pages(it->handle(), it->filepath()); + } + + /* Check the contents of the first page of the + first datafile. 
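+If the first page fails validation, we attempt to restore it from the
+doublewrite buffer and validate it once more before giving up.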
*/ + for (int retry = 0; retry < 2; ++retry) { + + err = it->validate_first_page(); + + if (err != DB_SUCCESS + && (retry == 1 + || recv_sys.dblwr.restore_first_page( + it->m_space_id, it->m_filepath, + it->handle()))) { + + it->close(); + + return(err); + } + } + + /* Make sure the tablespace space ID matches the + space ID on the first page of the first datafile. */ + if (space_id() != it->m_space_id) { + + ib::error() + << "The data file '" << it->filepath() + << "' has the wrong space ID. It should be " + << space_id() << ", but " << it->m_space_id + << " was found"; + + it->close(); + + return(err); + } + + if (srv_operation == SRV_OPERATION_NORMAL) { + /* Prepare for possible upgrade from 0-sized ib_logfile0. */ + ut_ad(!log_sys.next_checkpoint_lsn); + log_sys.next_checkpoint_lsn = mach_read_from_8( + it->m_first_page + 26/*FIL_PAGE_FILE_FLUSH_LSN*/); + } + + it->close(); + + return(DB_SUCCESS); +} + +/** Check if a file can be opened in the correct mode. +@param[in] file data file object +@param[out] reason exact reason if file_status check failed. +@return DB_SUCCESS or error code. */ +dberr_t +SysTablespace::check_file_status( + const Datafile& file, + file_status_t& reason) +{ + os_file_stat_t stat; + + memset(&stat, 0x0, sizeof(stat)); + + dberr_t err = os_file_get_status( + file.m_filepath, &stat, true, + m_ignore_read_only ? false : srv_read_only_mode); + + reason = FILE_STATUS_VOID; + /* File exists but we can't read the rw-permission settings. */ + switch (err) { + case DB_FAIL: + ib::error() << "os_file_get_status() failed on '" + << file.filepath() + << "'. Can't determine file permissions"; + err = DB_ERROR; + reason = FILE_STATUS_RW_PERMISSION_ERROR; + break; + + case DB_SUCCESS: + /* Note: stat.rw_perm is only valid for "regular" files */ + + if (stat.type == OS_FILE_TYPE_FILE) { + if (!stat.rw_perm) { + ib::error() << "The data file" + << " '" << file.filepath() + << ((!srv_read_only_mode + || m_ignore_read_only) + ? "' must be writable" + : "' must be readable"); + + err = DB_ERROR; + reason = FILE_STATUS_READ_WRITE_ERROR; + } + + } else { + /* Not a regular file, bail out. */ + ib::error() << "The data file '" << file.filepath() + << "' is not a regular file."; + + err = DB_ERROR; + reason = FILE_STATUS_NOT_REGULAR_FILE_ERROR; + } + break; + + case DB_NOT_FOUND: + break; + + default: + ut_ad(0); + } + + return(err); +} + +/** Note that the data file was not found. +@param[in] file data file object +@param[out] create_new_db true if a new instance to be created +@return DB_SUCESS or error code */ +dberr_t +SysTablespace::file_not_found( + Datafile& file, + bool* create_new_db) +{ + file.m_exists = false; + + if (m_ignore_read_only) { + } else if (srv_read_only_mode) { + ib::error() << "Can't create file '" << file.filepath() + << "' when --innodb-read-only is set"; + return(DB_ERROR); + } else if (srv_force_recovery && space_id() == TRX_SYS_SPACE) { + ib::error() << "Can't create file '" << file.filepath() + << "' when --innodb-force-recovery is set"; + return DB_ERROR; + } + + if (&file == &m_files.front()) { + + /* First data file. */ + ut_a(!*create_new_db); + *create_new_db = TRUE; + + if (space_id() == TRX_SYS_SPACE) { + ib::info() << "The first data file '" + << file.filepath() << "' did not exist." + " A new tablespace will be created!"; + } + + } else { + ib::info() << "Need to create a new data file '" + << file.filepath() << "'."; + } + + /* Set the file create mode. 
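+Regular files will be created from scratch (OS_FILE_CREATE), whereas
+raw devices are assumed to exist already and are only opened
+(OS_FILE_OPEN_RAW).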
*/ + switch (file.m_type) { + case SRV_NOT_RAW: + file.set_open_flags(OS_FILE_CREATE); + break; + + case SRV_NEW_RAW: + case SRV_OLD_RAW: + file.set_open_flags(OS_FILE_OPEN_RAW); + break; + } + + return(DB_SUCCESS); +} + +/** Note that the data file was found. +@param[in,out] file data file object +@return true if a new instance to be created */ +bool +SysTablespace::file_found( + Datafile& file) +{ + /* Note that the file exists and can be opened + in the appropriate mode. */ + file.m_exists = true; + + /* Set the file open mode */ + switch (file.m_type) { + case SRV_NOT_RAW: + file.set_open_flags( + &file == &m_files.front() + ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN); + break; + + case SRV_NEW_RAW: + case SRV_OLD_RAW: + file.set_open_flags(OS_FILE_OPEN_RAW); + break; + } + + /* Need to create the system tablespace for new raw device. */ + return(file.m_type == SRV_NEW_RAW); +} + +/** Check the data file specification. +@param[out] create_new_db true if a new database is to be created +@param[in] min_expected_size Minimum expected tablespace size in bytes +@return DB_SUCCESS if all OK else error code */ +dberr_t +SysTablespace::check_file_spec( + bool* create_new_db, + ulint min_expected_size) +{ + *create_new_db = FALSE; + + if (m_files.size() >= 1000) { + ib::error() << "There must be < 1000 data files " + " but " << m_files.size() << " have been" + " defined."; + + return(DB_ERROR); + } + + if (!m_auto_extend_last_file + && get_sum_of_sizes() + < (min_expected_size >> srv_page_size_shift)) { + ib::error() << "Tablespace size must be at least " + << (min_expected_size >> 20) << " MB"; + return(DB_ERROR); + } + + dberr_t err = DB_SUCCESS; + + ut_a(!m_files.empty()); + + /* If there is more than one data file and the last data file + doesn't exist, that is OK. We allow adding of new data files. */ + + files_t::iterator begin = m_files.begin(); + files_t::iterator end = m_files.end(); + + for (files_t::iterator it = begin; it != end; ++it) { + + file_status_t reason_if_failed; + err = check_file_status(*it, reason_if_failed); + + if (err == DB_NOT_FOUND) { + + err = file_not_found(*it, create_new_db); + + if (err != DB_SUCCESS) { + break; + } + + } else if (err != DB_SUCCESS) { + if (reason_if_failed == FILE_STATUS_READ_WRITE_ERROR) { + ib::error() << "The data file '" + << it->filepath() + << ((!srv_read_only_mode + || m_ignore_read_only) + ? "' must be writable" + : "' must be readable"); + } + + ut_a(err != DB_FAIL); + break; + + } else if (*create_new_db) { + ib::error() << "The data file '" + << begin->filepath() + << "' was not found but" + " one of the other data files '" + << it->filepath() << "' exists."; + + err = DB_ERROR; + break; + + } else { + *create_new_db = file_found(*it); + } + } + + return(err); +} + +/** Open or create the data files +@param[in] is_temp whether this is a temporary tablespace +@param[in] create_new_db whether we are creating a new database +@param[out] sum_new_sizes sum of sizes of the new files added +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::open_or_create( + bool is_temp, + bool create_new_db, + ulint* sum_new_sizes) +{ + dberr_t err = DB_SUCCESS; + fil_space_t* space = NULL; + + ut_ad(!m_files.empty()); + + if (sum_new_sizes) { + *sum_new_sizes = 0; + } + + files_t::iterator begin = m_files.begin(); + files_t::iterator end = m_files.end(); + + ut_ad(begin->order() == 0); + + for (files_t::iterator it = begin; it != end; ++it) { + + if (it->m_exists) { + err = open_file(*it); + + /* For new raw device increment new size. 
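+A new raw device counts towards the new size even though the device
+node itself already existed on disk.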
*/ + if (sum_new_sizes && it->m_type == SRV_NEW_RAW) { + + *sum_new_sizes += it->m_size; + } + + } else { + err = create_file(*it); + + if (sum_new_sizes) { + *sum_new_sizes += it->m_size; + } + + /* Set the correct open flags now that we have + successfully created the file. */ + if (err == DB_SUCCESS) { + /* We ignore new_db OUT parameter here + as the information is known at this stage */ + file_found(*it); + } + } + + if (err != DB_SUCCESS) { + return(err); + } + + } + + if (!create_new_db && space_id() == TRX_SYS_SPACE) { + /* Validate the header page in the first datafile. */ + err = read_lsn_and_check_flags(); + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Close the curent handles, add space and file info to the + fil_system cache and the Data Dictionary, and re-open them + in file_system cache so that they stay open until shutdown. */ + mysql_mutex_lock(&fil_system.mutex); + ulint node_counter = 0; + for (files_t::iterator it = begin; it != end; ++it) { + it->close(); + it->m_exists = true; + + if (it != begin) { + } else if (is_temp) { + ut_ad(space_id() == SRV_TMP_SPACE_ID); + space = fil_space_t::create( + SRV_TMP_SPACE_ID, flags(), + FIL_TYPE_TEMPORARY, NULL); + ut_ad(space == fil_system.temp_space); + if (!space) { + err = DB_ERROR; + break; + } + ut_ad(!space->is_compressed()); + ut_ad(space->full_crc32()); + } else { + ut_ad(space_id() == TRX_SYS_SPACE); + space = fil_space_t::create( + TRX_SYS_SPACE, it->flags(), + FIL_TYPE_TABLESPACE, NULL); + ut_ad(space == fil_system.sys_space); + if (!space) { + err = DB_ERROR; + break; + } + } + + uint32_t max_size = (++node_counter == m_files.size() + ? (m_last_file_size_max == 0 + ? UINT32_MAX + : uint32_t(m_last_file_size_max)) + : it->m_size); + + space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size, + it->m_type != SRV_NOT_RAW, true, max_size); + } + + mysql_mutex_unlock(&fil_system.mutex); + return(err); +} + +/** Normalize the file size, convert from megabytes to number of pages. 
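+For example, with the default 16 KiB page size (srv_page_size_shift ==
+14), a configured size of 100 (megabytes) becomes
+100 << (20 - 14) == 6400 pages.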
*/
+void
+SysTablespace::normalize_size()
+{
+	files_t::iterator	end = m_files.end();
+
+	for (files_t::iterator it = m_files.begin(); it != end; ++it) {
+
+		it->m_size <<= (20U - srv_page_size_shift);
+	}
+
+	m_last_file_size_max <<= (20U - srv_page_size_shift);
+}
+
+
+/**
+@return next increment size */
+uint32_t SysTablespace::get_increment() const
+{
+  if (m_last_file_size_max == 0)
+    return get_autoextend_increment();
+
+  if (!is_valid_size())
+  {
+    ib::error() << "The last data file has a size of " << last_file_size()
+                << " but the max size allowed is "
+                << m_last_file_size_max;
+  }
+
+  return std::min(uint32_t(m_last_file_size_max) - last_file_size(),
+                  get_autoextend_increment());
+}
+
+
+/**
+@return true if configured to use raw devices */
+bool
+SysTablespace::has_raw_device()
+{
+	files_t::iterator	end = m_files.end();
+
+	for (files_t::iterator it = m_files.begin(); it != end; ++it) {
+
+		if (it->is_raw_device()) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
diff --git a/storage/innobase/fts/Makefile.query b/storage/innobase/fts/Makefile.query
new file mode 100644
index 00000000..d91b1b92
--- /dev/null
+++ b/storage/innobase/fts/Makefile.query
@@ -0,0 +1,18 @@
+LEX=flex
+YACC=bison
+PREFIX=fts
+
+all: fts0pars.cc fts0blex.cc fts0tlex.cc
+
+fts0pars.cc: fts0pars.y
+fts0blex.cc: fts0blex.l
+fts0tlex.cc: fts0tlex.l
+
+.l.cc:
+	echo '#include "univ.i"' > $*.cc
+	$(LEX) --stdout -P$(subst lex,,$*) -o $*.cc \
+		--header-file=../include/$*.h $< >> $*.cc
+
+.y.cc:
+	$(YACC) -p $(PREFIX) -o $*.cc -d $<
+	mv $*.h ../include
diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc
new file mode 100644
index 00000000..74d02d63
--- /dev/null
+++ b/storage/innobase/fts/fts0ast.cc
@@ -0,0 +1,816 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0ast.cc
+Full Text Search parser helper file.
+
+Created 2007/3/16 Sunny Bains.
+***********************************************************************/
+
+#include "row0sel.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0fts.h"
+#include "trx0trx.h"
+
+/* The FTS ast visit pass. */
+enum fts_ast_visit_pass_t {
+	FTS_PASS_FIRST,		/*!< First visit pass,
+				process operators excluding
+				FTS_EXIST and FTS_IGNORE */
+	FTS_PASS_EXIST,		/*!< Exist visit pass,
+				process operator FTS_EXIST */
+	FTS_PASS_IGNORE		/*!< Ignore visit pass,
+				process operator FTS_IGNORE */
+};
+
+/******************************************************************//**
+Create an empty fts_ast_node_t.
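+The node is zero-initialized; the caller sets its type and registers it
+with the parser state so that it can be reclaimed on error.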
+@return a new, empty node */
+static
+fts_ast_node_t*
+fts_ast_node_create(void)
+/*=====================*/
+{
+	fts_ast_node_t*	node;
+
+	node = (fts_ast_node_t*) ut_zalloc_nokey(sizeof(*node));
+
+	return(node);
+}
+
+/** Track node allocations, in case there is an error during parsing. */
+static
+void
+fts_ast_state_add_node(
+	fts_ast_state_t*state,		/*!< in: ast instance */
+	fts_ast_node_t*	node)		/*!< in: node to add to ast */
+{
+	if (!state->list.head) {
+		ut_a(!state->list.tail);
+
+		state->list.head = state->list.tail = node;
+	} else {
+		state->list.tail->next_alloc = node;
+		state->list.tail = node;
+	}
+}
+
+/******************************************************************//**
+Create an operator fts_ast_node_t.
+@return new node */
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+	void*		arg,		/*!< in: ast state instance */
+	fts_ast_oper_t	oper)		/*!< in: ast operator */
+{
+	fts_ast_node_t*	node = fts_ast_node_create();
+
+	node->type = FTS_AST_OPER;
+	node->oper = oper;
+
+	fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+	return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the ptr and is responsible
+for freeing it.
+@return new node or a node list with tokenized words */
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+	void*			arg,	/*!< in: ast state instance */
+	const fts_ast_string_t*	ptr)	/*!< in: ast term string */
+{
+	fts_ast_state_t*	state = static_cast<fts_ast_state_t*>(arg);
+	ulint			len = ptr->len;
+	ulint			cur_pos = 0;
+	fts_ast_node_t*		node = NULL;
+	fts_ast_node_t*		node_list = NULL;
+	fts_ast_node_t*		first_node = NULL;
+
+	/* Scan the incoming string and filter out any "non-word" characters */
+	while (cur_pos < len) {
+		fts_string_t	str;
+		ulint		cur_len;
+
+		cur_len = innobase_mysql_fts_get_token(
+			state->charset,
+			reinterpret_cast<const byte*>(ptr->str) + cur_pos,
+			reinterpret_cast<const byte*>(ptr->str) + len, &str);
+
+		if (cur_len == 0) {
+			break;
+		}
+
+		cur_pos += cur_len;
+
+		if (str.f_n_char > 0) {
+			/* If the subsequent term (after the first one)'s size
+			is less than fts_min_token_size or the term is greater
+			than fts_max_token_size, we shall ignore it, to stay
+			consistent with MyISAM behavior */
+			if ((first_node && (str.f_n_char < fts_min_token_size))
+			    || str.f_n_char > fts_max_token_size) {
+				continue;
+			}
+
+			node = fts_ast_node_create();
+
+			node->type = FTS_AST_TERM;
+
+			node->term.ptr = fts_ast_string_create(
+				str.f_str, str.f_len);
+
+			fts_ast_state_add_node(
+				static_cast<fts_ast_state_t*>(arg), node);
+
+			if (first_node) {
+				/* There is more than one word, create
+				a list to organize them */
+				if (!node_list) {
+					node_list = fts_ast_create_node_list(
+						static_cast<fts_ast_state_t*>(
+							arg),
+						first_node);
+				}
+
+				fts_ast_add_node(node_list, node);
+			} else {
+				first_node = node;
+			}
+		}
+	}
+
+	return((node_list != NULL) ?
node_list : first_node); +} + +/******************************************************************//** +Create an AST term node, makes a copy of ptr for plugin parser +@return node */ +fts_ast_node_t* +fts_ast_create_node_term_for_parser( +/*================================*/ + void* arg, /*!< in: ast state */ + const char* ptr, /*!< in: term string */ + const ulint len) /*!< in: term string length */ +{ + fts_ast_node_t* node = NULL; + + /* '%' as first char is forbidden for LIKE in internal SQL parser; + '%' as last char is reserved for wildcard search;*/ + if (len == 0 || len > FTS_MAX_WORD_LEN + || ptr[0] == '%' || ptr[len - 1] == '%') { + return(NULL); + } + + node = fts_ast_node_create(); + + node->type = FTS_AST_TERM; + + node->term.ptr = fts_ast_string_create( + reinterpret_cast(ptr), len); + + fts_ast_state_add_node(static_cast(arg), node); + + return(node); +} + +/******************************************************************//** +This function takes ownership of the ptr and is responsible +for free'ing it. +@return new node */ +fts_ast_node_t* +fts_ast_create_node_text( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + const fts_ast_string_t* ptr) /*!< in: ast text string */ +{ + ulint len = ptr->len; + fts_ast_node_t* node = NULL; + + /* Once we come here, the string must have at least 2 quotes "" + around the query string, which could be empty. Also the query + string may contain 0x00 in it, we don't treat it as null-terminated. */ + ut_ad(len >= 2); + ut_ad(ptr->str[0] == '\"' && ptr->str[len - 1] == '\"'); + + if (len == 2) { + /* If the query string contains nothing except quotes, + it's obviously an invalid query. */ + return(NULL); + } + + node = fts_ast_node_create(); + + /*!< We ignore the actual quotes "" */ + len -= 2; + + node->type = FTS_AST_TEXT; + /*!< Skip copying the first quote */ + node->text.ptr = fts_ast_string_create( + reinterpret_cast(ptr->str + 1), len); + node->text.distance = ULINT_UNDEFINED; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Create an AST phrase list node for plugin parser +@return node */ +fts_ast_node_t* +fts_ast_create_node_phrase_list( +/*============================*/ + void* arg) /*!< in: ast state */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_PARSER_PHRASE_LIST; + + node->text.distance = ULINT_UNDEFINED; + node->list.head = node->list.tail = NULL; + + fts_ast_state_add_node(static_cast(arg), node); + + return(node); +} + +/******************************************************************//** +This function takes ownership of the expr and is responsible +for free'ing it. +@return new node */ +fts_ast_node_t* +fts_ast_create_node_list( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr) /*!< in: ast expr instance */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_LIST; + node->list.head = node->list.tail = expr; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Create a sub-expression list node. This function takes ownership of +expr and is responsible for deleting it. 
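+A sub-expression list typically corresponds to a parenthesized group in
+a boolean query, for example the "(pie tart)" part of '+apple (pie tart)'.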
+@return new node */ +fts_ast_node_t* +fts_ast_create_node_subexp_list( +/*============================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr) /*!< in: ast expr instance */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_SUBEXP_LIST; + node->list.head = node->list.tail = expr; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Free an expr list node elements. */ +static +void +fts_ast_free_list( +/*==============*/ + fts_ast_node_t* node) /*!< in: ast node to free */ +{ + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST + || node->type == FTS_AST_PARSER_PHRASE_LIST); + + for (node = node->list.head; + node != NULL; + node = fts_ast_free_node(node)) { + + /*!< No op */ + } +} + +/********************************************************************//** +Free a fts_ast_node_t instance. +@return next node to free */ +fts_ast_node_t* +fts_ast_free_node( +/*==============*/ + fts_ast_node_t* node) /*!< in: the node to free */ +{ + fts_ast_node_t* next_node; + + switch (node->type) { + case FTS_AST_TEXT: + if (node->text.ptr) { + fts_ast_string_free(node->text.ptr); + node->text.ptr = NULL; + } + break; + + case FTS_AST_TERM: + if (node->term.ptr) { + fts_ast_string_free(node->term.ptr); + node->term.ptr = NULL; + } + break; + + case FTS_AST_LIST: + case FTS_AST_SUBEXP_LIST: + case FTS_AST_PARSER_PHRASE_LIST: + fts_ast_free_list(node); + node->list.head = node->list.tail = NULL; + break; + + case FTS_AST_OPER: + break; + + default: + ut_error; + } + + /*!< Get next node before freeing the node itself */ + next_node = node->next; + + ut_free(node); + + return(next_node); +} + +/******************************************************************//** +This AST takes ownership of the expr and is responsible +for free'ing it. +@return in param "list" */ +fts_ast_node_t* +fts_ast_add_node( +/*=============*/ + fts_ast_node_t* node, /*!< in: list instance */ + fts_ast_node_t* elem) /*!< in: node to add to list */ +{ + if (!elem) { + return(NULL); + } + + ut_a(!elem->next); + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST + || node->type == FTS_AST_PARSER_PHRASE_LIST); + + if (!node->list.head) { + ut_a(!node->list.tail); + + node->list.head = node->list.tail = elem; + } else { + ut_a(node->list.tail); + + node->list.tail->next = elem; + node->list.tail = elem; + } + + return(node); +} + +/******************************************************************//** +Set the wildcard attribute of a term. */ +void +fts_ast_term_set_wildcard( +/*======================*/ + fts_ast_node_t* node) /*!< in/out: set attribute of + a term node */ +{ + if (!node) { + return; + } + + /* If it's a node list, the wildcard should be set to the tail node*/ + if (node->type == FTS_AST_LIST) { + ut_ad(node->list.tail != NULL); + node = node->list.tail; + } + + ut_a(node->type == FTS_AST_TERM); + ut_a(!node->term.wildcard); + + node->term.wildcard = TRUE; +} + +/******************************************************************//** +Set the proximity attribute of a text node. 
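+The distance originates from a boolean-mode proximity query such as
+'"word1 word2" @ 4', which requires the quoted words to occur within
+the given distance of each other.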
*/ +void +fts_ast_text_set_distance( +/*======================*/ + fts_ast_node_t* node, /*!< in/out: text node */ + ulint distance) /*!< in: the text proximity + distance */ +{ + if (node == NULL) { + return; + } + + ut_a(node->type == FTS_AST_TEXT); + ut_a(node->text.distance == ULINT_UNDEFINED); + + node->text.distance = distance; +} + +/******************************************************************//** +Free node and expr allocations. */ +void +fts_ast_state_free( +/*===============*/ + fts_ast_state_t*state) /*!< in: ast state to free */ +{ + fts_ast_node_t* node = state->list.head; + + /* Free the nodes that were allocated during parsing. */ + while (node) { + fts_ast_node_t* next = node->next_alloc; + + if (node->type == FTS_AST_TEXT && node->text.ptr) { + fts_ast_string_free(node->text.ptr); + node->text.ptr = NULL; + } else if (node->type == FTS_AST_TERM && node->term.ptr) { + fts_ast_string_free(node->term.ptr); + node->term.ptr = NULL; + } + + ut_free(node); + node = next; + } + + state->root = state->list.head = state->list.tail = NULL; +} + +/** Print the ast string +@param[in] str string to print */ +static +void +fts_ast_string_print( + const fts_ast_string_t* ast_str) +{ + for (ulint i = 0; i < ast_str->len; ++i) { + printf("%c", ast_str->str[i]); + } + + printf("\n"); +} + +/******************************************************************//** +Print an ast node recursively. */ +static +void +fts_ast_node_print_recursive( +/*=========================*/ + fts_ast_node_t* node, /*!< in: ast node to print */ + ulint level) /*!< in: recursive level */ +{ + /* Print alignment blank */ + for (ulint i = 0; i < level; i++) { + printf(" "); + } + + switch (node->type) { + case FTS_AST_TEXT: + printf("TEXT: "); + fts_ast_string_print(node->text.ptr); + break; + + case FTS_AST_TERM: + printf("TERM: "); + fts_ast_string_print(node->term.ptr); + break; + + case FTS_AST_LIST: + printf("LIST: \n"); + + for (node = node->list.head; node; node = node->next) { + fts_ast_node_print_recursive(node, level + 1); + } + break; + + case FTS_AST_SUBEXP_LIST: + printf("SUBEXP_LIST: \n"); + + for (node = node->list.head; node; node = node->next) { + fts_ast_node_print_recursive(node, level + 1); + } + break; + + case FTS_AST_OPER: + printf("OPER: %d\n", node->oper); + break; + + case FTS_AST_PARSER_PHRASE_LIST: + printf("PARSER_PHRASE_LIST: \n"); + + for (node = node->list.head; node; node = node->next) { + fts_ast_node_print_recursive(node, level + 1); + } + break; + + default: + ut_error; + } +} + +/******************************************************************//** +Print an ast node */ +void +fts_ast_node_print( +/*===============*/ + fts_ast_node_t* node) /*!< in: ast node to print */ +{ + fts_ast_node_print_recursive(node, 0); +} + +/** Check only union operation involved in the node +@param[in] node ast node to check +@return true if the node contains only union else false. */ +bool +fts_ast_node_check_union( + fts_ast_node_t* node) +{ + if (node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST) { + + for (node = node->list.head; node; node = node->next) { + if (!fts_ast_node_check_union(node)) { + return(false); + } + } + + } else if (node->type == FTS_AST_PARSER_PHRASE_LIST) { + /* Phrase search for plugin parser */ + return(false); + } else if (node->type == FTS_AST_OPER + && (node->oper == FTS_IGNORE + || node->oper == FTS_EXIST)) { + + return(false); + } else if (node->type == FTS_AST_TEXT) { + /* Distance or phrase search query. 
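+Phrase and proximity searches cannot be evaluated as a plain union of
+individual tokens, so the node does not count as union-only.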
*/ + return(false); + } + + return(true); +} + +/******************************************************************//** +Traverse the AST - in-order traversal, except for the FTX_EXIST and FTS_IGNORE +nodes, which will be ignored in the first pass of each level, and visited in a +second and third pass after all other nodes in the same level are visited. +@return DB_SUCCESS if all went well */ +dberr_t +fts_ast_visit( +/*==========*/ + fts_ast_oper_t oper, /*!< in: current operator */ + fts_ast_node_t* node, /*!< in: current root node */ + fts_ast_callback visitor, /*!< in: callback function */ + void* arg, /*!< in: arg for callback */ + bool* has_ignore) /*!< out: true, if the operator + was ignored during processing, + currently we ignore FTS_EXIST + and FTS_IGNORE operators */ +{ + dberr_t error = DB_SUCCESS; + fts_ast_node_t* oper_node = NULL; + fts_ast_node_t* start_node; + bool revisit = false; + bool will_be_ignored = false; + fts_ast_visit_pass_t visit_pass = FTS_PASS_FIRST; + const trx_t* trx = node->trx; + + start_node = node->list.head; + + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST); + + if (oper == FTS_EXIST_SKIP) { + visit_pass = FTS_PASS_EXIST; + } else if (oper == FTS_IGNORE_SKIP) { + visit_pass = FTS_PASS_IGNORE; + } + + /* In the first pass of the tree, at the leaf level of the + tree, FTS_EXIST and FTS_IGNORE operation will be ignored. + It will be repeated at the level above the leaf level. + + The basic idea here is that when we encounter FTS_EXIST or + FTS_IGNORE, we will change the operator node into FTS_EXIST_SKIP + or FTS_IGNORE_SKIP, and term node & text node with the operators + is ignored in the first pass. We have two passes during the revisit: + We process nodes with FTS_EXIST_SKIP in the exist pass, and then + process nodes with FTS_IGNORE_SKIP in the ignore pass. + + The order should be restrictly followed, or we will get wrong results. + For example, we have a query 'a +b -c d +e -f'. + first pass: process 'a' and 'd' by union; + exist pass: process '+b' and '+e' by intersection; + ignore pass: process '-c' and '-f' by difference. */ + + for (node = node->list.head; + node && (error == DB_SUCCESS); + node = node->next) { + + switch (node->type) { + case FTS_AST_LIST: + if (visit_pass != FTS_PASS_FIRST) { + break; + } + + error = fts_ast_visit(oper, node, visitor, + arg, &will_be_ignored); + + /* If will_be_ignored is set to true, then + we encountered and ignored a FTS_EXIST or FTS_IGNORE + operator. 
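+The skipped sub-list will be processed again by the FTS_PASS_EXIST and
+FTS_PASS_IGNORE passes below.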
*/ + if (will_be_ignored) { + revisit = true; + /* Remember oper for list in case '-abc&def', + ignored oper is from previous node of list.*/ + node->oper = oper; + } + + break; + + case FTS_AST_OPER: + oper = node->oper; + oper_node = node; + + /* Change the operator for revisit */ + if (oper == FTS_EXIST) { + oper_node->oper = FTS_EXIST_SKIP; + } else if (oper == FTS_IGNORE) { + oper_node->oper = FTS_IGNORE_SKIP; + } + + break; + + default: + if (node->visited) { + continue; + } + + ut_a(oper == FTS_NONE || !oper_node + || oper_node->oper == oper + || oper_node->oper == FTS_EXIST_SKIP + || oper_node->oper == FTS_IGNORE_SKIP); + + if (oper== FTS_EXIST || oper == FTS_IGNORE) { + *has_ignore = true; + continue; + } + + /* Process leaf node accroding to its pass.*/ + if (oper == FTS_EXIST_SKIP + && visit_pass == FTS_PASS_EXIST) { + error = visitor(FTS_EXIST, node, arg); + node->visited = true; + } else if (oper == FTS_IGNORE_SKIP + && visit_pass == FTS_PASS_IGNORE) { + error = visitor(FTS_IGNORE, node, arg); + node->visited = true; + } else if (visit_pass == FTS_PASS_FIRST) { + error = visitor(oper, node, arg); + node->visited = true; + } + } + } + + if (trx_is_interrupted(trx)) { + return DB_INTERRUPTED; + } + + if (revisit) { + /* Exist pass processes the skipped FTS_EXIST operation. */ + for (node = start_node; + node && error == DB_SUCCESS; + node = node->next) { + + if (node->type == FTS_AST_LIST + && node->oper != FTS_IGNORE) { + error = fts_ast_visit(FTS_EXIST_SKIP, node, + visitor, arg, &will_be_ignored); + } + } + + /* Ignore pass processes the skipped FTS_IGNORE operation. */ + for (node = start_node; + node && error == DB_SUCCESS; + node = node->next) { + + if (node->type == FTS_AST_LIST) { + error = fts_ast_visit(FTS_IGNORE_SKIP, node, + visitor, arg, &will_be_ignored); + } + } + } + + return(error); +} + +/** +Create an ast string object, with NUL-terminator, so the string +has one more byte than len +@param[in] str pointer to string +@param[in] len length of the string +@return ast string with NUL-terminator */ +fts_ast_string_t* +fts_ast_string_create( + const byte* str, + ulint len) +{ + fts_ast_string_t* ast_str; + + ut_ad(len > 0); + + ast_str = static_cast( + ut_malloc_nokey(sizeof(fts_ast_string_t))); + + ast_str->str = static_cast(ut_malloc_nokey(len + 1)); + + ast_str->len = len; + memcpy(ast_str->str, str, len); + ast_str->str[len] = '\0'; + + return(ast_str); +} + +/** +Free an ast string instance +@param[in,out] ast_str string to free */ +void +fts_ast_string_free( + fts_ast_string_t* ast_str) +{ + if (ast_str != NULL) { + ut_free(ast_str->str); + ut_free(ast_str); + } +} + +/** +Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul +@param[in] str string to translate +@param[in] base the base +@return translated number */ +ulint +fts_ast_string_to_ul( + const fts_ast_string_t* ast_str, + int base) +{ + return(strtoul(reinterpret_cast(ast_str->str), + NULL, base)); +} + +#ifdef UNIV_DEBUG +const char* +fts_ast_node_type_get(fts_ast_type_t type) +{ + switch (type) { + case FTS_AST_OPER: + return("FTS_AST_OPER"); + case FTS_AST_NUMB: + return("FTS_AST_NUMB"); + case FTS_AST_TERM: + return("FTS_AST_TERM"); + case FTS_AST_TEXT: + return("FTS_AST_TEXT"); + case FTS_AST_LIST: + return("FTS_AST_LIST"); + case FTS_AST_SUBEXP_LIST: + return("FTS_AST_SUBEXP_LIST"); + case FTS_AST_PARSER_PHRASE_LIST: + return("FTS_AST_PARSER_PHRASE_LIST"); + } + ut_ad(0); + return("FTS_UNKNOWN"); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/fts/fts0blex.cc 
b/storage/innobase/fts/fts0blex.cc new file mode 100644 index 00000000..6a2b4202 --- /dev/null +++ b/storage/innobase/fts/fts0blex.cc @@ -0,0 +1,2177 @@ +#include "univ.i" +#line 2 "fts0blex.cc" + +#line 4 "fts0blex.cc" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 6 +#define YY_FLEX_SUBMINOR_VERSION 4 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +#ifdef yy_create_buffer +#define fts0b_create_buffer_ALREADY_DEFINED +#else +#define yy_create_buffer fts0b_create_buffer +#endif + +#ifdef yy_delete_buffer +#define fts0b_delete_buffer_ALREADY_DEFINED +#else +#define yy_delete_buffer fts0b_delete_buffer +#endif + +#ifdef yy_scan_buffer +#define fts0b_scan_buffer_ALREADY_DEFINED +#else +#define yy_scan_buffer fts0b_scan_buffer +#endif + +#ifdef yy_scan_string +#define fts0b_scan_string_ALREADY_DEFINED +#else +#define yy_scan_string fts0b_scan_string +#endif + +#ifdef yy_scan_bytes +#define fts0b_scan_bytes_ALREADY_DEFINED +#else +#define yy_scan_bytes fts0b_scan_bytes +#endif + +#ifdef yy_init_buffer +#define fts0b_init_buffer_ALREADY_DEFINED +#else +#define yy_init_buffer fts0b_init_buffer +#endif + +#ifdef yy_flush_buffer +#define fts0b_flush_buffer_ALREADY_DEFINED +#else +#define yy_flush_buffer fts0b_flush_buffer +#endif + +#ifdef yy_load_buffer_state +#define fts0b_load_buffer_state_ALREADY_DEFINED +#else +#define yy_load_buffer_state fts0b_load_buffer_state +#endif + +#ifdef yy_switch_to_buffer +#define fts0b_switch_to_buffer_ALREADY_DEFINED +#else +#define yy_switch_to_buffer fts0b_switch_to_buffer +#endif + +#ifdef yypush_buffer_state +#define fts0bpush_buffer_state_ALREADY_DEFINED +#else +#define yypush_buffer_state fts0bpush_buffer_state +#endif + +#ifdef yypop_buffer_state +#define fts0bpop_buffer_state_ALREADY_DEFINED +#else +#define yypop_buffer_state fts0bpop_buffer_state +#endif + +#ifdef yyensure_buffer_stack +#define fts0bensure_buffer_stack_ALREADY_DEFINED +#else +#define yyensure_buffer_stack fts0bensure_buffer_stack +#endif + +#ifdef yylex +#define fts0blex_ALREADY_DEFINED +#else +#define yylex fts0blex +#endif + +#ifdef yyrestart +#define fts0brestart_ALREADY_DEFINED +#else +#define yyrestart fts0brestart +#endif + +#ifdef yylex_init +#define fts0blex_init_ALREADY_DEFINED +#else +#define yylex_init fts0blex_init +#endif + +#ifdef yylex_init_extra +#define fts0blex_init_extra_ALREADY_DEFINED +#else +#define yylex_init_extra fts0blex_init_extra +#endif + +#ifdef yylex_destroy +#define fts0blex_destroy_ALREADY_DEFINED +#else +#define yylex_destroy fts0blex_destroy +#endif + +#ifdef yyget_debug +#define fts0bget_debug_ALREADY_DEFINED +#else +#define yyget_debug fts0bget_debug +#endif + +#ifdef yyset_debug +#define fts0bset_debug_ALREADY_DEFINED +#else +#define yyset_debug fts0bset_debug +#endif + +#ifdef yyget_extra +#define fts0bget_extra_ALREADY_DEFINED +#else +#define yyget_extra fts0bget_extra +#endif + +#ifdef yyset_extra +#define fts0bset_extra_ALREADY_DEFINED +#else +#define yyset_extra fts0bset_extra +#endif + +#ifdef yyget_in +#define fts0bget_in_ALREADY_DEFINED +#else +#define yyget_in fts0bget_in +#endif + +#ifdef yyset_in +#define fts0bset_in_ALREADY_DEFINED +#else +#define yyset_in fts0bset_in +#endif + +#ifdef yyget_out +#define fts0bget_out_ALREADY_DEFINED +#else +#define yyget_out fts0bget_out +#endif + +#ifdef yyset_out +#define fts0bset_out_ALREADY_DEFINED +#else +#define yyset_out fts0bset_out +#endif + +#ifdef yyget_leng 
+#define fts0bget_leng_ALREADY_DEFINED +#else +#define yyget_leng fts0bget_leng +#endif + +#ifdef yyget_text +#define fts0bget_text_ALREADY_DEFINED +#else +#define yyget_text fts0bget_text +#endif + +#ifdef yyget_lineno +#define fts0bget_lineno_ALREADY_DEFINED +#else +#define yyget_lineno fts0bget_lineno +#endif + +#ifdef yyset_lineno +#define fts0bset_lineno_ALREADY_DEFINED +#else +#define yyset_lineno fts0bset_lineno +#endif + +#ifdef yyget_column +#define fts0bget_column_ALREADY_DEFINED +#else +#define yyget_column fts0bget_column +#endif + +#ifdef yyset_column +#define fts0bset_column_ALREADY_DEFINED +#else +#define yyset_column fts0bset_column +#endif + +#ifdef yywrap +#define fts0bwrap_ALREADY_DEFINED +#else +#define yywrap fts0bwrap +#endif + +#ifdef yyalloc +#define fts0balloc_ALREADY_DEFINED +#else +#define yyalloc fts0balloc +#endif + +#ifdef yyrealloc +#define fts0brealloc_ALREADY_DEFINED +#else +#define yyrealloc fts0brealloc +#endif + +#ifdef yyfree +#define fts0bfree_ALREADY_DEFINED +#else +#define yyfree fts0bfree +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include +#include +#include +#include + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have . Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +/* begin standard C++ headers. */ + +/* TODO: this is always defined, so inline it */ +#define yyconst const + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define yynoreturn __attribute__((__noreturn__)) +#else +#define yynoreturn +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an + * integer in range [0..255] for use as an array index. + */ +#define YY_SC_TO_UI(c) ((YY_CHAR) (c)) + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. 
*/ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN yyg->yy_start = 1 + 2 * +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START ((yyg->yy_start - 1) / 2) +#define YYSTATE YY_START +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart( yyin , yyscanner ) +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + #define YY_LINENO_REWIND_TO(ptr) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = yyg->yy_hold_char; \ + YY_RESTORE_YY_MORE_OFFSET \ + yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) +#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner ) + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. 
*/ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \ + ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \ + : 0) +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. + */ +#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] + +void yyrestart ( FILE *input_file , yyscan_t yyscanner ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner ); +void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +void yypop_buffer_state ( yyscan_t yyscanner ); + +static void yyensure_buffer_stack ( yyscan_t yyscanner ); +static void yy_load_buffer_state ( yyscan_t yyscanner ); +static void yy_init_buffer ( YY_BUFFER_STATE b, FILE *file , yyscan_t yyscanner ); +#define YY_FLUSH_BUFFER yy_flush_buffer( YY_CURRENT_BUFFER , yyscanner) + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner ); + +void *yyalloc ( yy_size_t , yyscan_t yyscanner ); +void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner ); +void yyfree ( void * , yyscan_t yyscanner ); + +#define yy_new_buffer yy_create_buffer +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } +#define yy_set_bol(at_bol) \ + { \ + if ( ! 
YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define fts0bwrap(yyscanner) (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP +typedef flex_uint8_t YY_CHAR; + +typedef int yy_state_type; + +#define yytext_ptr yytext_r + +static yy_state_type yy_get_previous_state ( yyscan_t yyscanner ); +static yy_state_type yy_try_NUL_trans ( yy_state_type current_state , yyscan_t yyscanner); +static int yy_get_next_buffer ( yyscan_t yyscanner ); +static void yynoreturn yy_fatal_error ( const char* msg , yyscan_t yyscanner ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + yyg->yytext_ptr = yy_bp; \ + yyleng = (int) (yy_cp - yy_bp); \ + yyg->yy_hold_char = *yy_cp; \ + *yy_cp = '\0'; \ + yyg->yy_c_buf_p = yy_cp; +#define YY_NUM_RULES 7 +#define YY_END_OF_BUFFER 8 +/* This struct is not used in this scanner, + but its presence is necessary. */ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static const flex_int16_t yy_accept[19] = + { 0, + 4, 4, 8, 4, 1, 6, 1, 7, 7, 2, + 3, 4, 1, 1, 0, 5, 3, 0 + } ; + +static const YY_CHAR yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 5, 1, 1, 6, 1, 1, 7, + 7, 7, 7, 1, 7, 1, 1, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 1, 1, 7, + 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static const YY_CHAR yy_meta[9] = + { 0, + 1, 2, 3, 4, 5, 5, 5, 1 + } ; + +static const flex_int16_t yy_base[22] = + { 0, + 0, 0, 22, 0, 7, 23, 0, 14, 23, 23, + 7, 0, 0, 0, 5, 23, 0, 23, 11, 12, + 16 + } ; + +static const flex_int16_t yy_def[22] = + { 0, + 18, 1, 18, 19, 19, 18, 20, 21, 18, 18, + 19, 19, 5, 20, 21, 18, 11, 0, 18, 18, + 18 + } ; + +static const flex_int16_t yy_nxt[32] = + { 0, + 4, 5, 6, 7, 8, 9, 10, 11, 13, 16, + 14, 12, 12, 14, 17, 14, 15, 15, 16, 15, + 15, 18, 3, 18, 18, 18, 18, 18, 18, 18, + 18 + } ; + +static const flex_int16_t yy_chk[32] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 5, 15, + 5, 19, 19, 20, 11, 20, 21, 21, 8, 21, + 21, 3, 18, 18, 18, 18, 18, 18, 18, 18, + 18 + } ; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +#line 1 "fts0blex.l" +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/** + * @file fts/fts0blex.l + * FTS parser lexical analyzer + * + * Created 2007/5/9 Sunny Bains + */ +#line 27 "fts0blex.l" + +#include "fts0ast.h" +#include "fts0pars.h" + +/* Required for reentrant parser */ +#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner) +#define exit(A) ut_error + +#line 675 "fts0blex.cc" +#define YY_NO_INPUT 1 +#line 677 "fts0blex.cc" + +#define INITIAL 0 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +/* Holds the entire state of the reentrant scanner. */ +struct yyguts_t + { + + /* User-defined. Not touched by flex. */ + YY_EXTRA_TYPE yyextra_r; + + /* The rest are the same as the globals declared in the non-reentrant scanner. */ + FILE *yyin_r, *yyout_r; + size_t yy_buffer_stack_top; /**< index of top of stack. */ + size_t yy_buffer_stack_max; /**< capacity of stack. */ + YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */ + char yy_hold_char; + int yy_n_chars; + int yyleng_r; + char *yy_c_buf_p; + int yy_init; + int yy_start; + int yy_did_buffer_switch_on_eof; + int yy_start_stack_ptr; + int yy_start_stack_depth; + int *yy_start_stack; + yy_state_type yy_last_accepting_state; + char* yy_last_accepting_cpos; + + int yylineno_r; + int yy_flex_debug_r; + + char *yytext_r; + int yy_more_flag; + int yy_more_len; + + }; /* end struct yyguts_t */ + +static int yy_init_globals ( yyscan_t yyscanner ); + +int yylex_init (yyscan_t* scanner); + +int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int yylex_destroy ( yyscan_t yyscanner ); + +int yyget_debug ( yyscan_t yyscanner ); + +void yyset_debug ( int debug_flag , yyscan_t yyscanner ); + +YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner ); + +void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner ); + +FILE *yyget_in ( yyscan_t yyscanner ); + +void yyset_in ( FILE * _in_str , yyscan_t yyscanner ); + +FILE *yyget_out ( yyscan_t yyscanner ); + +void yyset_out ( FILE * _out_str , yyscan_t yyscanner ); + + int yyget_leng ( yyscan_t yyscanner ); + +char *yyget_text ( yyscan_t yyscanner ); + +int yyget_lineno ( yyscan_t yyscanner ); + +void yyset_lineno ( int _line_number , yyscan_t yyscanner ); + +int yyget_column ( yyscan_t yyscanner ); + +void yyset_column ( int _column_no , yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1.
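+ * + * (Editorial aside, not part of the generated file: because YY_DECL is overridden in the user section above, the entry point compiled here is fts_blexer(), not fts0blex(). A minimal, hypothetical driver for the reentrant API declared in this file would look roughly like this -- the real caller is wired up through the FTS query grammar in fts0pars.y: + * + * yyscan_t scanner; + * YYSTYPE val; + * int token; + * + * fts0blex_init(&scanner); + * fts0b_scan_string("apple +banana", scanner); + * while ((token = fts_blexer(&val, scanner)) != 0) { + * // operators arrive as their own character code in val.oper; + * // FTS_TERM/FTS_NUMB/FTS_TEXT carry an fts_ast_string_t in + * // val.token -- see the rule actions further down. + * } + * fts0blex_destroy(scanner); + * + * The fts0b-prefixed names follow the #define mapping at the top of this file.)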
+ */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap ( yyscan_t yyscanner ); +#else +extern int yywrap ( yyscan_t yyscanner ); +#endif +#endif + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen ( const char * , yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus +static int yyinput ( yyscan_t yyscanner ); +#else +static int input ( yyscan_t yyscanner ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( yytext, (size_t) yyleng, 1, yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". + */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + int n; \ + for ( n = 0; n < max_size && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = (int) fread(buf, 1, (yy_size_t) max_size, yyin)) == 0 && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (yyscan_t yyscanner); + +#define YY_DECL int yylex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK /*LINTED*/break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + yy_state_type yy_current_state; + char *yy_cp, *yy_bp; + int yy_act; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( !yyg->yy_init ) + { + yyg->yy_init = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! yyg->yy_start ) + yyg->yy_start = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! 
YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); + } + + yy_load_buffer_state( yyscanner ); + } + + { +#line 44 "fts0blex.l" + + +#line 938 "fts0blex.cc" + + while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */ + { + yy_cp = yyg->yy_c_buf_p; + + /* Support of yytext. */ + *yy_cp = yyg->yy_hold_char; + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = yyg->yy_start; +yy_match: + do + { + YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 19 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + ++yy_cp; + } + while ( yy_current_state != 18 ); + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = yyg->yy_hold_char; + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + +case 1: +YY_RULE_SETUP +#line 46 "fts0blex.l" +/* Ignore whitespace */ ; + YY_BREAK +case 2: +YY_RULE_SETUP +#line 48 "fts0blex.l" +{ + val->oper = fts0bget_text(yyscanner)[0]; + + return(val->oper); +} + YY_BREAK +case 3: +YY_RULE_SETUP +#line 54 "fts0blex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_NUMB); +} + YY_BREAK +case 4: +YY_RULE_SETUP +#line 60 "fts0blex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_TERM); +} + YY_BREAK +case 5: +YY_RULE_SETUP +#line 66 "fts0blex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_TEXT); +} + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 72 "fts0blex.l" + + YY_BREAK +case 7: +YY_RULE_SETUP +#line 74 "fts0blex.l" +ECHO; + YY_BREAK +#line 1043 "fts0blex.cc" +case YY_STATE_EOF(INITIAL): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = yyg->yy_hold_char; + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source.
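+ * + * (Editorial aside, illustration only: tracing the rule actions above, a query string such as + * + * apple +banana -"some phrase" + * + * comes back from fts_blexer() as FTS_TERM ("apple"), '+' via case 2 in val->oper, FTS_TERM ("banana"), '-', and FTS_TEXT for the quoted phrase (quotes included) via case 5, with the intervening whitespace consumed by case 1.)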
+ */ + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner); + + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++yyg->yy_c_buf_p; + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_END_OF_FILE: + { + yyg->yy_did_buffer_switch_on_eof = 0; + + if ( yywrap( yyscanner ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! 
yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = + yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + yyg->yy_c_buf_p = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars]; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ + } /* end of user's declarations */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + char *source = yyg->yytext_ptr; + int number_to_move, i; + int ret_val; + + if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. */ + number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr - 1); + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0; + + else + { + int num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE; + + int yy_c_buf_p_offset = + (int) (yyg->yy_c_buf_p - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc( (void *) b->yy_ch_buf, + (yy_size_t) (b->yy_buf_size + 2) , yyscanner ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = NULL; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. 
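+ * (Editorial aside: YY_INPUT, defined near the top of this file, performs the actual read; in the non-interactive case it is essentially + * + * result = fread(buf, 1, max_size, yyin); + * + * retried on EINTR, so the call below appends up to num_to_read raw bytes to yy_ch_buf after the tail that was just moved down.)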
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + yyg->yy_n_chars, num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + if ( yyg->yy_n_chars == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart( yyin , yyscanner); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + int new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc( + (void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf, (yy_size_t) new_size , yyscanner ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + /* "- 2" to take care of EOB's */ + YY_CURRENT_BUFFER_LVALUE->yy_buf_size = (int) (new_size - 2); + } + + yyg->yy_n_chars += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR; + + yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (yyscan_t yyscanner) +{ + yy_state_type yy_current_state; + char *yy_cp; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_current_state = yyg->yy_start; + + for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp ) + { + YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 19 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner) +{ + int yy_is_jam; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */ + char *yy_cp = yyg->yy_c_buf_p; + + YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 19 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + yy_is_jam = (yy_current_state == 18); + + (void)yyg; + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (yyscan_t yyscanner) +#else + static int input (yyscan_t yyscanner) +#endif + +{ + int c; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + *yyg->yy_c_buf_p = yyg->yy_hold_char; + + if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. 
+ * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + /* This was really a NUL. */ + *yyg->yy_c_buf_p = '\0'; + + else + { /* need more input */ + int offset = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr); + ++yyg->yy_c_buf_p; + + switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart( yyin , yyscanner); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap( yyscanner ) ) + return 0; + + if ( ! yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(yyscanner); +#else + return input(yyscanner); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = yyg->yytext_ptr + offset; + break; + } + } + } + + c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */ + *yyg->yy_c_buf_p = '\0'; /* preserve yytext */ + yyg->yy_hold_char = *++yyg->yy_c_buf_p; + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * @param yyscanner The scanner object. + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); + } + + yy_init_buffer( YY_CURRENT_BUFFER, input_file , yyscanner); + yy_load_buffer_state( yyscanner ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * @param yyscanner The scanner object. + */ + void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (yyscanner); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state( yyscanner ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. + */ + yyg->yy_did_buffer_switch_on_eof = 1; +} + +static void yy_load_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + yyg->yy_hold_char = *yyg->yy_c_buf_p; +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. 
When in doubt, use @c YY_BUF_SIZE. + * @param yyscanner The scanner object. + * @return the allocated buffer state. + */ + YY_BUFFER_STATE yy_create_buffer (FILE * file, int size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc( (yy_size_t) (b->yy_buf_size + 2) , yyscanner ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer( b, file , yyscanner); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * @param yyscanner The scanner object. + */ + void yy_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree( (void *) b->yy_ch_buf , yyscanner ); + + yyfree( (void *) b , yyscanner ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. + */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner) + +{ + int oerrno = errno; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_flush_buffer( b , yyscanner); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * @param yyscanner The scanner object. + */ + void yy_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state( yyscanner ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * @param yyscanner The scanner object. + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(yyscanner); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. 
*/ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + yyg->yy_buffer_stack_top++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state( yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * @param yyscanner The scanner object. + */ +void yypop_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER , yyscanner); + YY_CURRENT_BUFFER_LVALUE = NULL; + if (yyg->yy_buffer_stack_top > 0) + --yyg->yy_buffer_stack_top; + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state( yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void yyensure_buffer_stack (yyscan_t yyscanner) +{ + yy_size_t num_to_alloc; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (!yyg->yy_buffer_stack) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; /* After all that talk, this was set to 1 anyways... */ + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + yyg->yy_buffer_stack_max = num_to_alloc; + yyg->yy_buffer_stack_top = 0; + return; + } + + if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + yy_size_t grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = yyg->yy_buffer_stack_max + grow_size; + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyrealloc + (yyg->yy_buffer_stack, + num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*)); + yyg->yy_buffer_stack_max = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return NULL; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner ); + if ( ! 
b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" ); + + b->yy_buf_size = (int) (size - 2); /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = NULL; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + yy_switch_to_buffer( b , yyscanner ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to yylex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * yy_scan_bytes() instead. + */ +YY_BUFFER_STATE yy_scan_string (const char * yystr , yyscan_t yyscanner) +{ + + return yy_scan_bytes( yystr, (int) strlen(yystr) , yyscanner); +} + +/** Setup the input buffer state to scan the given bytes. The next call to yylex() will + * scan from a @e copy of @a bytes. + * @param yybytes the byte buffer to scan + * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_bytes (const char * yybytes, int _yybytes_len , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = (yy_size_t) (_yybytes_len + 2); + buf = (char *) yyalloc( n , yyscanner ); + if ( ! buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = yy_scan_buffer( buf, n , yyscanner); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yynoreturn yy_fatal_error (const char* msg , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = yyg->yy_hold_char; \ + yyg->yy_c_buf_p = yytext + yyless_macro_arg; \ + yyg->yy_hold_char = *yyg->yy_c_buf_p; \ + *yyg->yy_c_buf_p = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the user-defined data for this scanner. + * @param yyscanner The scanner object. + */ +YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyextra; +} + +/** Get the current line number. + * @param yyscanner The scanner object. + */ +int yyget_lineno (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yylineno; +} + +/** Get the current column number. + * @param yyscanner The scanner object. + */ +int yyget_column (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! 
YY_CURRENT_BUFFER) + return 0; + + return yycolumn; +} + +/** Get the input stream. + * @param yyscanner The scanner object. + */ +FILE *yyget_in (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyin; +} + +/** Get the output stream. + * @param yyscanner The scanner object. + */ +FILE *yyget_out (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyout; +} + +/** Get the length of the current token. + * @param yyscanner The scanner object. + */ +int yyget_leng (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyleng; +} + +/** Get the current token. + * @param yyscanner The scanner object. + */ + +char *yyget_text (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yytext; +} + +/** Set the user-defined data. This data is never touched by the scanner. + * @param user_defined The data to be associated with this scanner. + * @param yyscanner The scanner object. + */ +void yyset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyextra = user_defined ; +} + +/** Set the current line number. + * @param _line_number line number + * @param yyscanner The scanner object. + */ +void yyset_lineno (int _line_number , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* lineno is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_lineno called with no buffer" ); + + yylineno = _line_number; +} + +/** Set the current column. + * @param _column_no column number + * @param yyscanner The scanner object. + */ +void yyset_column (int _column_no , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* column is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_column called with no buffer" ); + + yycolumn = _column_no; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param _in_str A readable stream. + * @param yyscanner The scanner object. + * @see yy_switch_to_buffer + */ +void yyset_in (FILE * _in_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyin = _in_str ; +} + +void yyset_out (FILE * _out_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyout = _out_str ; +} + +int yyget_debug (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yy_flex_debug; +} + +void yyset_debug (int _bdebug , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yy_flex_debug = _bdebug ; +} + +/* Accessor methods for yylval and yylloc */ + +/* User-visible API */ + +/* yylex_init is special because it creates the scanner itself, so it is + * the ONLY reentrant function that doesn't take the scanner as the last argument. + * That's why we explicitly handle the declaration, instead of using our macros. + */ +int yylex_init(yyscan_t* ptr_yy_globals) +{ + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), NULL ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. 
*/ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + return yy_init_globals ( *ptr_yy_globals ); +} + +/* yylex_init_extra has the same functionality as yylex_init, but follows the + * convention of taking the scanner as the last argument. Note however, that + * this is a *pointer* to a scanner, as it will be allocated by this call (and + * is the reason, too, why this function also must handle its own declaration). + * The user defined value in the first argument will be available to yyalloc in + * the yyextra field. + */ +int yylex_init_extra( YY_EXTRA_TYPE yy_user_defined, yyscan_t* ptr_yy_globals ) +{ + struct yyguts_t dummy_yyguts; + + yyset_extra (yy_user_defined, &dummy_yyguts); + + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), &dummy_yyguts ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in + yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + yyset_extra (yy_user_defined, *ptr_yy_globals); + + return yy_init_globals ( *ptr_yy_globals ); +} + +static int yy_init_globals (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from yylex_destroy(), so don't allocate here. + */ + + yyg->yy_buffer_stack = NULL; + yyg->yy_buffer_stack_top = 0; + yyg->yy_buffer_stack_max = 0; + yyg->yy_c_buf_p = NULL; + yyg->yy_init = 0; + yyg->yy_start = 0; + + yyg->yy_start_stack_ptr = 0; + yyg->yy_start_stack_depth = 0; + yyg->yy_start_stack = NULL; + +/* Defined in main.c */ +#ifdef YY_STDINIT + yyin = stdin; + yyout = stdout; +#else + yyin = NULL; + yyout = NULL; +#endif + + /* For future reference: Set errno on error, since we are called by + * yylex_init() + */ + return 0; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +int yylex_destroy (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer( YY_CURRENT_BUFFER , yyscanner ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(yyscanner); + } + + /* Destroy the stack itself. */ + yyfree(yyg->yy_buffer_stack , yyscanner); + yyg->yy_buffer_stack = NULL; + + /* Destroy the start condition stack. */ + yyfree( yyg->yy_start_stack , yyscanner ); + yyg->yy_start_stack = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * yylex() is called, initialization will occur. */ + yy_init_globals( yyscanner); + + /* Destroy the main struct (reentrant only). */ + yyfree ( yyscanner , yyscanner ); + yyscanner = NULL; + return 0; +} + +/* + * Internal utility routines. 
+ */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, const char * s2, int n , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + + int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (const char * s , yyscan_t yyscanner) +{ + int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *yyalloc (yy_size_t size , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + return malloc(size); +} + +void *yyrealloc (void * ptr, yy_size_t size , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return realloc(ptr, size); +} + +void yyfree (void * ptr , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#line 74 "fts0blex.l" + + diff --git a/storage/innobase/fts/fts0blex.l b/storage/innobase/fts/fts0blex.l new file mode 100644 index 00000000..cf19cd0f --- /dev/null +++ b/storage/innobase/fts/fts0blex.l @@ -0,0 +1,74 @@ +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/** + * @file fts/fts0blex.l + * FTS parser lexical analyzer + * + * Created 2007/5/9 Sunny Bains + */ + +%{ + +#include "fts0ast.h" +#include "fts0pars.h" + +/* Required for reentrant parser */ +#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner) +#define exit(A) ut_error + +%} + +%option noinput +%option nounput +%option noyywrap +%option nostdinit +%option reentrant +%option never-interactive + +%% + +[\t ]+ /* Ignore whitespace */ ; + +[*()+\-<>~@] { + val->oper = fts0bget_text(yyscanner)[0]; + + return(val->oper); +} + +[0-9]+ { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_NUMB); +} + +[^" \n*()+\-<>~@%]* { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_TERM); +} + +\"[^\"\n]*\" { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_TEXT); +} + +\n + +%% diff --git a/storage/innobase/fts/fts0config.cc b/storage/innobase/fts/fts0config.cc new file mode 100644 index 00000000..4566224e --- /dev/null +++ b/storage/innobase/fts/fts0config.cc @@ -0,0 +1,428 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fts/fts0config.cc +Full Text Search configuration table. + +Created 2007/5/9 Sunny Bains +***********************************************************************/ + +#include "trx0roll.h" +#include "row0sel.h" + +#include "fts0priv.h" + +/******************************************************************//** +Callback function for fetching the config value.
+@return always returns TRUE */ +static +ibool +fts_config_fetch_value( +/*===================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to + ib_vector_t */ +{ + sel_node_t* node = static_cast<sel_node_t*>(row); + fts_string_t* value = static_cast<fts_string_t*>(user_arg); + + dfield_t* dfield = que_node_get_val(node->select_list); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + void* data = dfield_get_data(dfield); + + ut_a(dtype_get_mtype(type) == DATA_VARCHAR); + + if (len != UNIV_SQL_NULL) { + ulint max_len = ut_min(value->f_len - 1, len); + + memcpy(value->f_str, data, max_len); + value->f_len = max_len; + value->f_str[value->f_len] = '\0'; + } + + return(TRUE); +} + +/******************************************************************//** +Get value from the config table. The caller must ensure that enough +space is allocated for value to hold the column contents. +@return DB_SUCCESS or error code */ +dberr_t +fts_config_get_value( +/*=================*/ + trx_t* trx, /*!< transaction */ + fts_table_t* fts_table, /*!< in: the indexed + FTS table */ + const char* name, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ +{ + pars_info_t* info; + que_t* graph; + dberr_t error; + ulint name_len = strlen(name); + char table_name[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + *value->f_str = '\0'; + ut_a(value->f_len > 0); + + pars_info_bind_function(info, "my_func", fts_config_fetch_value, + value); + + /* The len field of value must be set to the max bytes that + it can hold. On a successful read, the len field will be set + to the actual number of bytes copied to value. */ + pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len); + + fts_table->suffix = "CONFIG"; + fts_get_table_name(fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS SELECT value FROM $table_name" + " WHERE key = :name;\n" + "BEGIN\n" + "" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + trx->op_info = "getting FTS config value"; + + error = fts_eval_sql(trx, graph); + que_graph_free(graph); + return(error); +} + +/*********************************************************************//** +Create the config table name for retrieving index specific value. +@return index config parameter name */ +char* +fts_config_create_index_param_name( +/*===============================*/ + const char* param, /*!< in: base name of param */ + const dict_index_t* index) /*!< in: index for config */ +{ + ulint len; + char* name; + + /* The format of the config name is: name_<index_id>. */ + len = strlen(param); + + /* Caller is responsible for deleting name. */ + name = static_cast<char*>(ut_malloc_nokey( + len + FTS_AUX_MIN_TABLE_ID_LENGTH + 2)); + ::strcpy(name, param); + name[len] = '_'; + + fts_write_object_id(index->id, name + len + 1); + + return(name); +} + +/******************************************************************//** +Get value specific to an FTS index from the config table. The caller +must ensure that enough space is allocated for value to hold the +column contents.
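+ +(Editorial aside, illustration only: a typical caller preallocates the buffer and sets f_len to its capacity before the call, the same pattern the fts_config_get_*_ulint() helpers further down this file use: + + fts_string_t value; + + value.f_len = FTS_MAX_CONFIG_VALUE_LEN; + value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1)); + + error = fts_config_get_index_value(trx, index, param, &value); + + ut_free(value.f_str); + +On return, value.f_len holds the number of bytes actually copied and the fetch callback has NUL-terminated the string.)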
+@return DB_SUCCESS or error code */ +dberr_t +fts_config_get_index_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: index */ + const char* param, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ +{ + char* name; + dberr_t error; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, + index->table); + + /* We are responsible for free'ing name. */ + name = fts_config_create_index_param_name(param, index); + + error = fts_config_get_value(trx, &fts_table, name, value); + + ut_free(name); + + return(error); +} + +/******************************************************************//** +Set the value in the config table for name. +@return DB_SUCCESS or error code */ +dberr_t +fts_config_set_value( +/*=================*/ + trx_t* trx, /*!< transaction */ + fts_table_t* fts_table, /*!< in: the indexed + FTS table */ + const char* name, /*!< in: get config value for + this parameter name */ + const fts_string_t* + value) /*!< in: value to update */ +{ + pars_info_t* info; + que_t* graph; + dberr_t error; + undo_no_t undo_no; + undo_no_t n_rows_updated; + ulint name_len = strlen(name); + char table_name[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len); + pars_info_bind_varchar_literal(info, "value", + value->f_str, value->f_len); + + const bool dict_locked = fts_table->table->fts->dict_locked; + + fts_table->suffix = "CONFIG"; + fts_get_table_name(fts_table, table_name, dict_locked); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, info, + "BEGIN UPDATE $table_name SET value = :value" + " WHERE key = :name;"); + + trx->op_info = "setting FTS config value"; + + undo_no = trx->undo_no; + + error = fts_eval_sql(trx, graph); + + que_graph_free(graph); + + n_rows_updated = trx->undo_no - undo_no; + + /* Check if we need to do an insert. */ + if (n_rows_updated == 0) { + info = pars_info_create(); + + pars_info_bind_varchar_literal( + info, "name", (byte*) name, name_len); + + pars_info_bind_varchar_literal( + info, "value", value->f_str, value->f_len); + + fts_get_table_name(fts_table, table_name, dict_locked); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, info, + "BEGIN\n" + "INSERT INTO $table_name VALUES(:name, :value);"); + + trx->op_info = "inserting FTS config value"; + + error = fts_eval_sql(trx, graph); + + que_graph_free(graph); + } + + return(error); +} + +/******************************************************************//** +Set the value specific to an FTS index in the config table. +@return DB_SUCCESS or error code */ +dberr_t +fts_config_set_index_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: index */ + const char* param, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ +{ + char* name; + dberr_t error; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, + index->table); + + /* We are responsible for free'ing name. 
 */
+	name = fts_config_create_index_param_name(param, index);
+
+	error = fts_config_set_value(trx, &fts_table, name, value);
+
+	ut_free(name);
+
+	return(error);
+}
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+{
+	dberr_t		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+	error = fts_config_get_index_value(trx, index, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ib::error() << "(" << error << ") reading `" << name << "'";
+	} else {
+		*int_value = strtoul((char*) value.f_str, NULL, 10);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+{
+	dberr_t		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+	// FIXME: Get rid of snprintf
+	ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+	value.f_len = snprintf(
+		(char*) value.f_str, FTS_MAX_INT_LEN, ULINTPF, int_value);
+
+	error = fts_config_set_index_value(trx, index, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ib::error() << "(" << error << ") writing `" << name << "'";
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+#endif /* FTS_OPTIMIZE_DEBUG */
+
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed
+					FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+{
+	dberr_t		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+	error = fts_config_get_value(trx, fts_table, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ib::error() << "(" << error << ") reading `" << name << "'";
+	} else {
+		*int_value = strtoul((char*) value.f_str, NULL, 10);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
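The ulint getters and setters above store integers as decimal strings in the VARCHAR value column, converting with snprintf() on the way in and strtoul() on the way out. A round-trip sketch under assumed buffer limits (the source only requires FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN, which the ut_a() calls assert):

```cpp
#include <cassert>
#include <cstdio>
#include <cstdlib>

// Stand-ins for FTS_MAX_INT_LEN / FTS_MAX_CONFIG_VALUE_LEN; the real
// values are build-time constants, these numbers are illustrative.
const size_t MAX_INT_LEN = 21;        // 64-bit decimal digits + NUL
const size_t MAX_VALUE_LEN = 64;

// Store: the config row keeps the number as its decimal string.
size_t encode_ulint(char* buf, unsigned long v)
{
    return (size_t) std::snprintf(buf, MAX_INT_LEN, "%lu", v);
}

// Load: fts_config_get_*_ulint() parse the string back with strtoul().
unsigned long decode_ulint(const char* buf)
{
    return std::strtoul(buf, nullptr, 10);
}

int main()
{
    static_assert(MAX_INT_LEN < MAX_VALUE_LEN, "int must fit the value column");
    char value[MAX_VALUE_LEN];
    size_t len = encode_ulint(value, 1843);
    assert(len == 4 && decode_ulint(value) == 1843);
}
```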
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed
+					FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+{
+	dberr_t		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+	ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+	value.f_len = (ulint) snprintf(
+		(char*) value.f_str, FTS_MAX_INT_LEN, ULINTPF, int_value);
+
+	error = fts_config_set_value(trx, fts_table, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ib::error() << "(" << error << ") writing `" << name << "'";
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc
new file mode 100644
index 00000000..0775d939
--- /dev/null
+++ b/storage/innobase/fts/fts0fts.cc
@@ -0,0 +1,6182 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+Copyright (c) 2016, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0fts.cc
+Full Text Search interface
+***********************************************************************/
+
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "row0mysql.h"
+#include "row0upd.h"
+#include "dict0types.h"
+#include "dict0stats_bg.h"
+#include "row0sel.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+#include "fts0types.inl"
+#include "fts0vlc.h"
+#include "fts0plugin.h"
+#include "dict0stats.h"
+#include "btr0pcur.h"
+
+static const ulint FTS_MAX_ID_LEN = 32;
+
+/** Column name from the FTS config table */
+#define FTS_MAX_CACHE_SIZE_IN_MB	"cache_size_in_mb"
+
+/** Verify if a aux table name is a obsolete table
+by looking up the key word in the obsolete table names */
+#define FTS_IS_OBSOLETE_AUX_TABLE(table_name)			\
+	(strstr((table_name), "DOC_ID") != NULL			\
+	 || strstr((table_name), "ADDED") != NULL		\
+	 || strstr((table_name), "STOPWORDS") != NULL)
+
+/** This is maximum FTS cache for each table and would be
+a configurable variable */
+Atomic_relaxed<size_t> fts_max_cache_size;
+
+/** Whether the total memory used for FTS cache is exhausted, and we will
+need a sync to free some memory */
+bool fts_need_sync = false;
+
+/** Variable specifying the total memory allocated for FTS cache */
+Atomic_relaxed<size_t> fts_max_total_cache_size;
+
+/** This is FTS result cache limit for each query and would be
+a configurable variable */
+size_t fts_result_cache_limit;
+
+/**
Variable specifying the maximum FTS max token size */ +ulong fts_max_token_size; + +/** Variable specifying the minimum FTS max token size */ +ulong fts_min_token_size; + + +// FIXME: testing +static time_t elapsed_time; +static ulint n_nodes; + +#ifdef FTS_CACHE_SIZE_DEBUG +/** The cache size permissible lower limit (1K) */ +static const ulint FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB = 1; + +/** The cache size permissible upper limit (1G) */ +static const ulint FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB = 1024; +#endif + +/** Time to sleep after DEADLOCK error before retrying operation. */ +static const std::chrono::milliseconds FTS_DEADLOCK_RETRY_WAIT(100); + +/** InnoDB default stopword list: +There are different versions of stopwords, the stop words listed +below comes from "Google Stopword" list. Reference: +http://meta.wikimedia.org/wiki/Stop_word_list/google_stop_word_list. +The final version of InnoDB default stopword list is still pending +for decision */ +const char *fts_default_stopword[] = +{ + "a", + "about", + "an", + "are", + "as", + "at", + "be", + "by", + "com", + "de", + "en", + "for", + "from", + "how", + "i", + "in", + "is", + "it", + "la", + "of", + "on", + "or", + "that", + "the", + "this", + "to", + "was", + "what", + "when", + "where", + "who", + "will", + "with", + "und", + "the", + "www", + NULL +}; + +/** FTS auxiliary table suffixes that are common to all FT indexes. */ +const char* fts_common_tables[] = { + "BEING_DELETED", + "BEING_DELETED_CACHE", + "CONFIG", + "DELETED", + "DELETED_CACHE", + NULL +}; + +/** FTS auxiliary INDEX split intervals. */ +const fts_index_selector_t fts_index_selector[] = { + { 9, "INDEX_1" }, + { 65, "INDEX_2" }, + { 70, "INDEX_3" }, + { 75, "INDEX_4" }, + { 80, "INDEX_5" }, + { 85, "INDEX_6" }, + { 0 , NULL } +}; + +/** Default config values for FTS indexes on a table. */ +static const char* fts_config_table_insert_values_sql = + "PROCEDURE P() IS\n" + "BEGIN\n" + "\n" + "INSERT INTO $config_table VALUES('" + FTS_MAX_CACHE_SIZE_IN_MB "', '256');\n" + "" + "INSERT INTO $config_table VALUES('" + FTS_OPTIMIZE_LIMIT_IN_SECS "', '180');\n" + "" + "INSERT INTO $config_table VALUES ('" + FTS_SYNCED_DOC_ID "', '0');\n" + "" + "INSERT INTO $config_table VALUES ('" + FTS_TOTAL_DELETED_COUNT "', '0');\n" + "" /* Note: 0 == FTS_TABLE_STATE_RUNNING */ + "INSERT INTO $config_table VALUES ('" + FTS_TABLE_STATE "', '0');\n" + "END;\n"; + +/** FTS tokenize parmameter for plugin parser */ +struct fts_tokenize_param_t { + fts_doc_t* result_doc; /*!< Result doc for tokens */ + ulint add_pos; /*!< Added position for tokens */ +}; + +/** Run SYNC on the table, i.e., write out data from the cache to the +FTS auxiliary INDEX table and clear the cache at the end. +@param[in,out] sync sync state +@param[in] unlock_cache whether unlock cache lock when write node +@param[in] wait whether wait when a sync is in progress +@return DB_SUCCESS if all OK */ +static +dberr_t +fts_sync( + fts_sync_t* sync, + bool unlock_cache, + bool wait); + +/****************************************************************//** +Release all resources help by the words rb tree e.g., the node ilist. */ +static +void +fts_words_free( +/*===========*/ + ib_rbt_t* words) /*!< in: rb tree of words */ + MY_ATTRIBUTE((nonnull)); +#ifdef FTS_CACHE_SIZE_DEBUG +/****************************************************************//** +Read the max cache size parameter from the config table. 
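The fts_index_selector[] table above splits one logical inverted index across six auxiliary INDEX_1..INDEX_6 tables, bucketed by a word's leading character. The exact comparison lives elsewhere (the lookup goes through the index charset), so the sketch below rests on an assumption: each entry's value is read as an exclusive upper bound on the leading byte, which matches the ASCII-looking boundaries (65 = 'A', 70 = 'F', and so on) but is not confirmed by this file.

```cpp
#include <cstdio>

// Mirror of the fts_index_selector[] table above.
struct Selector { unsigned value; const char* suffix; };
const Selector selector[] = {
    { 9, "INDEX_1" }, { 65, "INDEX_2" }, { 70, "INDEX_3" },
    { 75, "INDEX_4" }, { 80, "INDEX_5" }, { 85, "INDEX_6" }, { 0, nullptr }
};

// Hypothetical bucket lookup. Assumption: each value acts as an exclusive
// upper bound on the word's leading byte (the real lookup compares
// collation weights through the index charset), and anything above the
// last bound falls into the last bucket.
const char* select_suffix(unsigned char first_byte)
{
    for (const Selector* s = selector; s->value; ++s)
        if (first_byte < s->value)
            return s->suffix;
    return "INDEX_6";                    // leading bytes >= 85 ('U')
}

int main()
{
    std::printf("%s %s %s\n",
                select_suffix('0'),      // 48 < 65  -> INDEX_2
                select_suffix('G'),      // 71 < 75  -> INDEX_4
                select_suffix('z'));     // above all bounds -> INDEX_6
}
```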
*/ +static +void +fts_update_max_cache_size( +/*======================*/ + fts_sync_t* sync); /*!< in: sync state */ +#endif + +/*********************************************************************//** +This function fetches the document just inserted right before +we commit the transaction, and tokenize the inserted text data +and insert into FTS auxiliary table and its cache. */ +static +void +fts_add_doc_by_id( +/*==============*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + doc_id_t doc_id); /*!< in: doc id */ + +/** Tokenize a document. +@param[in,out] doc document to tokenize +@param[out] result tokenization result +@param[in] parser pluggable parser */ +static +void +fts_tokenize_document( + fts_doc_t* doc, + fts_doc_t* result, + st_mysql_ftparser* parser); + +/** Continue to tokenize a document. +@param[in,out] doc document to tokenize +@param[in] add_pos add this position to all tokens from this tokenization +@param[out] result tokenization result +@param[in] parser pluggable parser */ +static +void +fts_tokenize_document_next( + fts_doc_t* doc, + ulint add_pos, + fts_doc_t* result, + st_mysql_ftparser* parser); + +/** Create the vector of fts_get_doc_t instances. +@param[in,out] cache fts cache +@return vector of fts_get_doc_t instances */ +static +ib_vector_t* +fts_get_docs_create( + fts_cache_t* cache); + +/** Free the FTS cache. +@param[in,out] cache to be freed */ +static +void +fts_cache_destroy(fts_cache_t* cache) +{ + mysql_mutex_destroy(&cache->lock); + mysql_mutex_destroy(&cache->init_lock); + mysql_mutex_destroy(&cache->deleted_lock); + mysql_mutex_destroy(&cache->doc_id_lock); + pthread_cond_destroy(&cache->sync->cond); + + if (cache->stopword_info.cached_stopword) { + rbt_free(cache->stopword_info.cached_stopword); + } + + if (cache->sync_heap->arg) { + mem_heap_free(static_cast(cache->sync_heap->arg)); + } + + mem_heap_free(cache->cache_heap); +} + +/** Get a character set based on precise type. +@param prtype precise type +@return the corresponding character set */ +UNIV_INLINE +CHARSET_INFO* +fts_get_charset(ulint prtype) +{ +#ifdef UNIV_DEBUG + switch (prtype & DATA_MYSQL_TYPE_MASK) { + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + case MYSQL_TYPE_VARCHAR: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ + + uint cs_num = (uint) dtype_get_charset_coll(prtype); + + if (CHARSET_INFO* cs = get_charset(cs_num, MYF(MY_WME))) { + return(cs); + } + + ib::fatal() << "Unable to find charset-collation " << cs_num; + return(NULL); +} + +/****************************************************************//** +This function loads the default InnoDB stopword list */ +static +void +fts_load_default_stopword( +/*======================*/ + fts_stopword_t* stopword_info) /*!< in: stopword info */ +{ + fts_string_t str; + mem_heap_t* heap; + ib_alloc_t* allocator; + ib_rbt_t* stop_words; + + allocator = stopword_info->heap; + heap = static_cast(allocator->arg); + + if (!stopword_info->cached_stopword) { + stopword_info->cached_stopword = rbt_create_arg_cmp( + sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp, + &my_charset_latin1); + } + + stop_words = stopword_info->cached_stopword; + + str.f_n_char = 0; + + for (ulint i = 0; fts_default_stopword[i]; ++i) { + char* word; + fts_tokenizer_word_t new_word; + + /* We are going to duplicate the value below. 
*/ + word = const_cast(fts_default_stopword[i]); + + new_word.nodes = ib_vector_create( + allocator, sizeof(fts_node_t), 4); + + str.f_len = strlen(word); + str.f_str = reinterpret_cast(word); + + fts_string_dup(&new_word.text, &str, heap); + + rbt_insert(stop_words, &new_word, &new_word); + } + + stopword_info->status = STOPWORD_FROM_DEFAULT; +} + +/****************************************************************//** +Callback function to read a single stopword value. +@return Always return TRUE */ +static +ibool +fts_read_stopword( +/*==============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ib_vector_t */ +{ + ib_alloc_t* allocator; + fts_stopword_t* stopword_info; + sel_node_t* sel_node; + que_node_t* exp; + ib_rbt_t* stop_words; + dfield_t* dfield; + fts_string_t str; + mem_heap_t* heap; + ib_rbt_bound_t parent; + dict_table_t* table; + + sel_node = static_cast(row); + table = sel_node->table_list->table; + stopword_info = static_cast(user_arg); + + stop_words = stopword_info->cached_stopword; + allocator = static_cast(stopword_info->heap); + heap = static_cast(allocator->arg); + + exp = sel_node->select_list; + + /* We only need to read the first column */ + dfield = que_node_get_val(exp); + + str.f_n_char = 0; + str.f_str = static_cast(dfield_get_data(dfield)); + str.f_len = dfield_get_len(dfield); + exp = que_node_get_next(exp); + ut_ad(exp); + + if (table->versioned()) { + dfield = que_node_get_val(exp); + ut_ad(dfield_get_type(dfield)->vers_sys_end()); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + if (table->versioned_by_id()) { + ut_ad(len == sizeof trx_id_max_bytes); + if (0 != memcmp(data, trx_id_max_bytes, len)) { + return true; + } + } else { + ut_ad(len == sizeof timestamp_max_bytes); + if (0 != memcmp(data, timestamp_max_bytes, len)) { + return true; + } + } + } + ut_ad(!que_node_get_next(exp)); + + /* Only create new node if it is a value not already existed */ + if (str.f_len != UNIV_SQL_NULL + && rbt_search(stop_words, &parent, &str) != 0) { + + fts_tokenizer_word_t new_word; + + new_word.nodes = ib_vector_create( + allocator, sizeof(fts_node_t), 4); + + new_word.text.f_str = static_cast( + mem_heap_alloc(heap, str.f_len + 1)); + + memcpy(new_word.text.f_str, str.f_str, str.f_len); + + new_word.text.f_n_char = 0; + new_word.text.f_len = str.f_len; + new_word.text.f_str[str.f_len] = 0; + + rbt_insert(stop_words, &new_word, &new_word); + } + + return(TRUE); +} + +/******************************************************************//** +Load user defined stopword from designated user table +@return whether the operation is successful */ +static +bool +fts_load_user_stopword( +/*===================*/ + fts_t* fts, /*!< in: FTS struct */ + const char* stopword_table_name, /*!< in: Stopword table + name */ + fts_stopword_t* stopword_info) /*!< in: Stopword info */ +{ + if (!fts->dict_locked) { + dict_sys.lock(SRW_LOCK_CALL); + } + + /* Validate the user table existence in the right format */ + bool ret= false; + const char* row_end; + stopword_info->charset = fts_valid_stopword_table(stopword_table_name, + &row_end); + if (!stopword_info->charset) { +cleanup: + if (!fts->dict_locked) { + dict_sys.unlock(); + } + + return ret; + } + + trx_t* trx = trx_create(); + trx->op_info = "Load user stopword table into FTS cache"; + + if (!stopword_info->cached_stopword) { + /* Create the stopword RB tree with the stopword column + charset. 
All comparison will use this charset */ + stopword_info->cached_stopword = rbt_create_arg_cmp( + sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp, + (void*)stopword_info->charset); + + } + + pars_info_t* info = pars_info_create(); + + pars_info_bind_id(info, "table_stopword", stopword_table_name); + pars_info_bind_id(info, "row_end", row_end); + + pars_info_bind_function(info, "my_func", fts_read_stopword, + stopword_info); + + que_t* graph = pars_sql( + info, + "PROCEDURE P() IS\n" + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT value, $row_end" + " FROM $table_stopword;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;" + "END;\n"); + + for (;;) { + dberr_t error = fts_eval_sql(trx, graph); + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + stopword_info->status = STOPWORD_USER_TABLE; + break; + } else { + fts_sql_rollback(trx); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "Lock wait timeout reading user" + " stopword table. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << "Error '" << error + << "' while reading user stopword" + " table."; + ret = FALSE; + break; + } + } + } + + que_graph_free(graph); + trx->free(); + ret = true; + goto cleanup; +} + +/******************************************************************//** +Initialize the index cache. */ +static +void +fts_index_cache_init( +/*=================*/ + ib_alloc_t* allocator, /*!< in: the allocator to use */ + fts_index_cache_t* index_cache) /*!< in: index cache */ +{ + ulint i; + + ut_a(index_cache->words == NULL); + + index_cache->words = rbt_create_arg_cmp( + sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp, + (void*) index_cache->charset); + + ut_a(index_cache->doc_stats == NULL); + + index_cache->doc_stats = ib_vector_create( + allocator, sizeof(fts_doc_stats_t), 4); + + for (i = 0; i < FTS_NUM_AUX_INDEX; ++i) { + ut_a(index_cache->ins_graph[i] == NULL); + ut_a(index_cache->sel_graph[i] == NULL); + } +} + +/*********************************************************************//** +Initialize FTS cache. */ +void +fts_cache_init( +/*===========*/ + fts_cache_t* cache) /*!< in: cache to initialize */ +{ + ulint i; + + /* Just to make sure */ + ut_a(cache->sync_heap->arg == NULL); + + cache->sync_heap->arg = mem_heap_create(1024); + + cache->total_size = 0; + cache->total_size_at_sync = 0; + + mysql_mutex_lock(&cache->deleted_lock); + cache->deleted_doc_ids = ib_vector_create( + cache->sync_heap, sizeof(doc_id_t), 4); + mysql_mutex_unlock(&cache->deleted_lock); + + /* Reset the cache data for all the FTS indexes. */ + for (i = 0; i < ib_vector_size(cache->indexes); ++i) { + fts_index_cache_t* index_cache; + + index_cache = static_cast( + ib_vector_get(cache->indexes, i)); + + fts_index_cache_init(cache->sync_heap, index_cache); + } +} + +/****************************************************************//** +Create a FTS cache. 
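fts_load_user_stopword() above shows the standard retry shape for the internal SQL engine: commit on success, roll back and retry indefinitely on DB_LOCK_WAIT_TIMEOUT (after resetting trx->error_state to DB_SUCCESS), and roll back and give up on any other error. A generic sketch of that loop, with hypothetical hooks standing in for fts_eval_sql / fts_sql_commit / fts_sql_rollback:

```cpp
#include <functional>
#include <iostream>

enum Status { SUCCESS, LOCK_WAIT_TIMEOUT, HARD_ERROR };

// Generic shape of the loop in fts_load_user_stopword() above: commit on
// success, roll back and retry forever on a lock-wait timeout, roll back
// and give up on anything else.
bool run_with_retry(const std::function<Status()>& exec,
                    const std::function<void()>& commit,
                    const std::function<void()>& rollback)
{
    for (;;) {
        Status s = exec();
        if (s == SUCCESS) { commit(); return true; }
        rollback();                        // always undo the failed attempt
        if (s != LOCK_WAIT_TIMEOUT) return false;
        std::cerr << "lock wait timeout, retrying\n";
    }
}

int main()
{
    int attempts = 0;
    bool ok = run_with_retry(
        [&] { return ++attempts < 3 ? LOCK_WAIT_TIMEOUT : SUCCESS; },
        [] {}, [] {});
    std::cout << ok << " after " << attempts << " attempts\n"; // 1 after 3
}
```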
*/ +fts_cache_t* +fts_cache_create( +/*=============*/ + dict_table_t* table) /*!< in: table owns the FTS cache */ +{ + mem_heap_t* heap; + fts_cache_t* cache; + + heap = static_cast(mem_heap_create(512)); + + cache = static_cast( + mem_heap_zalloc(heap, sizeof(*cache))); + + cache->cache_heap = heap; + + mysql_mutex_init(fts_cache_mutex_key, &cache->lock, nullptr); + mysql_mutex_init(fts_cache_init_mutex_key, &cache->init_lock, nullptr); + mysql_mutex_init(fts_delete_mutex_key, &cache->deleted_lock, nullptr); + mysql_mutex_init(fts_doc_id_mutex_key, &cache->doc_id_lock, nullptr); + + /* This is the heap used to create the cache itself. */ + cache->self_heap = ib_heap_allocator_create(heap); + + /* This is a transient heap, used for storing sync data. */ + cache->sync_heap = ib_heap_allocator_create(heap); + cache->sync_heap->arg = NULL; + + cache->sync = static_cast( + mem_heap_zalloc(heap, sizeof(fts_sync_t))); + + cache->sync->table = table; + pthread_cond_init(&cache->sync->cond, nullptr); + + /* Create the index cache vector that will hold the inverted indexes. */ + cache->indexes = ib_vector_create( + cache->self_heap, sizeof(fts_index_cache_t), 2); + + fts_cache_init(cache); + + cache->stopword_info.cached_stopword = NULL; + cache->stopword_info.charset = NULL; + + cache->stopword_info.heap = cache->self_heap; + + cache->stopword_info.status = STOPWORD_NOT_INIT; + + return(cache); +} + +/*******************************************************************//** +Add a newly create index into FTS cache */ +void +fts_add_index( +/*==========*/ + dict_index_t* index, /*!< FTS index to be added */ + dict_table_t* table) /*!< table */ +{ + fts_t* fts = table->fts; + fts_cache_t* cache; + fts_index_cache_t* index_cache; + + ut_ad(fts); + cache = table->fts->cache; + + mysql_mutex_lock(&cache->init_lock); + + ib_vector_push(fts->indexes, &index); + + index_cache = fts_find_index_cache(cache, index); + + if (!index_cache) { + /* Add new index cache structure */ + index_cache = fts_cache_index_cache_create(table, index); + } + + mysql_mutex_unlock(&cache->init_lock); +} + +/*******************************************************************//** +recalibrate get_doc structure after index_cache in cache->indexes changed */ +static +void +fts_reset_get_doc( +/*==============*/ + fts_cache_t* cache) /*!< in: FTS index cache */ +{ + fts_get_doc_t* get_doc; + ulint i; + + mysql_mutex_assert_owner(&cache->init_lock); + + ib_vector_reset(cache->get_docs); + + for (i = 0; i < ib_vector_size(cache->indexes); i++) { + fts_index_cache_t* ind_cache; + + ind_cache = static_cast( + ib_vector_get(cache->indexes, i)); + + get_doc = static_cast( + ib_vector_push(cache->get_docs, NULL)); + + memset(get_doc, 0x0, sizeof(*get_doc)); + + get_doc->index_cache = ind_cache; + get_doc->cache = cache; + } + + ut_ad(ib_vector_size(cache->get_docs) + == ib_vector_size(cache->indexes)); +} + +/*******************************************************************//** +Check an index is in the table->indexes list +@return TRUE if it exists */ +static +ibool +fts_in_dict_index( +/*==============*/ + dict_table_t* table, /*!< in: Table */ + dict_index_t* index_check) /*!< in: index to be checked */ +{ + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (index == index_check) { + return(TRUE); + } + } + + return(FALSE); +} + +/*******************************************************************//** +Check an index is in the 
fts->cache->indexes list +@return TRUE if it exists */ +static +ibool +fts_in_index_cache( +/*===============*/ + dict_table_t* table, /*!< in: Table */ + dict_index_t* index) /*!< in: index to be checked */ +{ + ulint i; + + for (i = 0; i < ib_vector_size(table->fts->cache->indexes); i++) { + fts_index_cache_t* index_cache; + + index_cache = static_cast( + ib_vector_get(table->fts->cache->indexes, i)); + + if (index_cache->index == index) { + return(TRUE); + } + } + + return(FALSE); +} + +/*******************************************************************//** +Check indexes in the fts->indexes is also present in index cache and +table->indexes list +@return TRUE if all indexes match */ +ibool +fts_check_cached_index( +/*===================*/ + dict_table_t* table) /*!< in: Table where indexes are dropped */ +{ + ulint i; + + if (!table->fts || !table->fts->cache) { + return(TRUE); + } + + ut_a(ib_vector_size(table->fts->indexes) + == ib_vector_size(table->fts->cache->indexes)); + + for (i = 0; i < ib_vector_size(table->fts->indexes); i++) { + dict_index_t* index; + + index = static_cast( + ib_vector_getp(table->fts->indexes, i)); + + if (!fts_in_index_cache(table, index)) { + return(FALSE); + } + + if (!fts_in_dict_index(table, index)) { + return(FALSE); + } + } + + return(TRUE); +} + +/** Clear all fts resources when there is no internal DOC_ID +and there are no new fts index to add. +@param[in,out] table table where fts is to be freed */ +void fts_clear_all(dict_table_t *table) +{ + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) || + !table->fts || + !ib_vector_is_empty(table->fts->indexes)) + return; + + for (const dict_index_t *index= dict_table_get_first_index(table); + index; index= dict_table_get_next_index(index)) + if (index->type & DICT_FTS) + return; + + fts_optimize_remove_table(table); + + table->fts->~fts_t(); + table->fts= nullptr; + DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS); +} + +/*******************************************************************//** +Drop auxiliary tables related to an FTS index +@return DB_SUCCESS or error number */ +dberr_t +fts_drop_index( +/*===========*/ + dict_table_t* table, /*!< in: Table where indexes are dropped */ + dict_index_t* index, /*!< in: Index to be dropped */ + trx_t* trx) /*!< in: Transaction for the drop */ +{ + ib_vector_t* indexes = table->fts->indexes; + dberr_t err = DB_SUCCESS; + + ut_a(indexes); + + if ((ib_vector_size(indexes) == 1 + && (index == static_cast( + ib_vector_getp(table->fts->indexes, 0))) + && DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) + || ib_vector_is_empty(indexes)) { + doc_id_t current_doc_id; + doc_id_t first_doc_id; + + DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS); + + current_doc_id = table->fts->cache->next_doc_id; + first_doc_id = table->fts->cache->first_doc_id; + fts_cache_clear(table->fts->cache); + fts_cache_destroy(table->fts->cache); + table->fts->cache = fts_cache_create(table); + table->fts->cache->next_doc_id = current_doc_id; + table->fts->cache->first_doc_id = first_doc_id; + } else { + fts_cache_t* cache = table->fts->cache; + fts_index_cache_t* index_cache; + + mysql_mutex_lock(&cache->init_lock); + + index_cache = fts_find_index_cache(cache, index); + + if (index_cache != NULL) { + if (index_cache->words) { + fts_words_free(index_cache->words); + rbt_free(index_cache->words); + } + + ib_vector_remove(cache->indexes, *(void**) index_cache); + } + + if (cache->get_docs) { + fts_reset_get_doc(cache); + } + + mysql_mutex_unlock(&cache->init_lock); + } + + err = 
fts_drop_index_tables(trx, *index); + + ib_vector_remove(indexes, (const void*) index); + + return(err); +} + +/****************************************************************//** +Create an FTS index cache. */ +CHARSET_INFO* +fts_index_get_charset( +/*==================*/ + dict_index_t* index) /*!< in: FTS index */ +{ + CHARSET_INFO* charset = NULL; + dict_field_t* field; + ulint prtype; + + field = dict_index_get_nth_field(index, 0); + prtype = field->col->prtype; + + charset = fts_get_charset(prtype); + +#ifdef FTS_DEBUG + /* Set up charset info for this index. Please note all + field of the FTS index should have the same charset */ + for (i = 1; i < index->n_fields; i++) { + CHARSET_INFO* fld_charset; + + field = dict_index_get_nth_field(index, i); + prtype = field->col->prtype; + + fld_charset = fts_get_charset(prtype); + + /* All FTS columns should have the same charset */ + if (charset) { + ut_a(charset == fld_charset); + } else { + charset = fld_charset; + } + } +#endif + + return(charset); + +} +/****************************************************************//** +Create an FTS index cache. +@return Index Cache */ +fts_index_cache_t* +fts_cache_index_cache_create( +/*=========================*/ + dict_table_t* table, /*!< in: table with FTS index */ + dict_index_t* index) /*!< in: FTS index */ +{ + ulint n_bytes; + fts_index_cache_t* index_cache; + fts_cache_t* cache = table->fts->cache; + + ut_a(cache != NULL); + + mysql_mutex_assert_owner(&cache->init_lock); + + /* Must not already exist in the cache vector. */ + ut_a(fts_find_index_cache(cache, index) == NULL); + + index_cache = static_cast( + ib_vector_push(cache->indexes, NULL)); + + memset(index_cache, 0x0, sizeof(*index_cache)); + + index_cache->index = index; + + index_cache->charset = fts_index_get_charset(index); + + n_bytes = sizeof(que_t*) * FTS_NUM_AUX_INDEX; + + index_cache->ins_graph = static_cast( + mem_heap_zalloc(static_cast( + cache->self_heap->arg), n_bytes)); + + index_cache->sel_graph = static_cast( + mem_heap_zalloc(static_cast( + cache->self_heap->arg), n_bytes)); + + fts_index_cache_init(cache->sync_heap, index_cache); + + if (cache->get_docs) { + fts_reset_get_doc(cache); + } + + return(index_cache); +} + +/****************************************************************//** +Release all resources help by the words rb tree e.g., the node ilist. */ +static +void +fts_words_free( +/*===========*/ + ib_rbt_t* words) /*!< in: rb tree of words */ +{ + const ib_rbt_node_t* rbt_node; + + /* Free the resources held by a word. */ + for (rbt_node = rbt_first(words); + rbt_node != NULL; + rbt_node = rbt_first(words)) { + + ulint i; + fts_tokenizer_word_t* word; + + word = rbt_value(fts_tokenizer_word_t, rbt_node); + + /* Free the ilists of this word. */ + for (i = 0; i < ib_vector_size(word->nodes); ++i) { + + fts_node_t* fts_node = static_cast( + ib_vector_get(word->nodes, i)); + + ut_free(fts_node->ilist); + fts_node->ilist = NULL; + } + + /* NOTE: We are responsible for free'ing the node */ + ut_free(rbt_remove_node(words, rbt_node)); + } +} + +/** Clear cache. 
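fts_words_free() above drains the words tree by repeatedly taking rbt_first(), freeing each word's separately allocated ilists, and only then removing the node; freeing the tree alone would leak those buffers. The same pattern, with a std::map standing in for the RB tree:

```cpp
#include <cstdlib>
#include <map>
#include <string>
#include <vector>

// Stand-in for fts_node_t: owns a separately allocated ilist buffer, so
// destroying the tree without freeing it first would leak.
struct Node { unsigned char* ilist; };

using WordTree = std::map<std::string, std::vector<Node>>;

// Same drain pattern as fts_words_free() above: keep taking the first
// entry, release every buffer it owns, then remove the entry itself.
void words_free(WordTree& words)
{
    while (!words.empty()) {
        auto first = words.begin();
        for (Node& n : first->second) {  // free the ilists of this word
            std::free(n.ilist);
            n.ilist = nullptr;
        }
        words.erase(first);              // then drop the tree node
    }
}

int main()
{
    WordTree w;
    w["apple"].push_back({static_cast<unsigned char*>(std::malloc(16))});
    w["pear"].push_back({static_cast<unsigned char*>(std::malloc(8))});
    words_free(w);                       // tree empty, nothing leaked
    return w.empty() ? 0 : 1;
}
```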
+@param[in,out] cache fts cache */ +void +fts_cache_clear( + fts_cache_t* cache) +{ + ulint i; + + for (i = 0; i < ib_vector_size(cache->indexes); ++i) { + ulint j; + fts_index_cache_t* index_cache; + + index_cache = static_cast( + ib_vector_get(cache->indexes, i)); + + fts_words_free(index_cache->words); + + rbt_free(index_cache->words); + + index_cache->words = NULL; + + for (j = 0; j < FTS_NUM_AUX_INDEX; ++j) { + + if (index_cache->ins_graph[j] != NULL) { + + que_graph_free(index_cache->ins_graph[j]); + + index_cache->ins_graph[j] = NULL; + } + + if (index_cache->sel_graph[j] != NULL) { + + que_graph_free(index_cache->sel_graph[j]); + + index_cache->sel_graph[j] = NULL; + } + } + + index_cache->doc_stats = NULL; + } + + fts_need_sync = false; + + cache->total_size = 0; + + mysql_mutex_lock(&cache->deleted_lock); + cache->deleted_doc_ids = NULL; + mysql_mutex_unlock(&cache->deleted_lock); + + mem_heap_free(static_cast(cache->sync_heap->arg)); + cache->sync_heap->arg = NULL; +} + +/*********************************************************************//** +Search the index specific cache for a particular FTS index. +@return the index cache else NULL */ +UNIV_INLINE +fts_index_cache_t* +fts_get_index_cache( +/*================*/ + fts_cache_t* cache, /*!< in: cache to search */ + const dict_index_t* index) /*!< in: index to search for */ +{ +#ifdef SAFE_MUTEX + ut_ad(mysql_mutex_is_owner(&cache->lock) + || mysql_mutex_is_owner(&cache->init_lock)); +#endif /* SAFE_MUTEX */ + + for (ulint i = 0; i < ib_vector_size(cache->indexes); ++i) { + fts_index_cache_t* index_cache; + + index_cache = static_cast( + ib_vector_get(cache->indexes, i)); + + if (index_cache->index == index) { + + return(index_cache); + } + } + + return(NULL); +} + +#ifdef FTS_DEBUG +/*********************************************************************//** +Search the index cache for a get_doc structure. +@return the fts_get_doc_t item else NULL */ +static +fts_get_doc_t* +fts_get_index_get_doc( +/*==================*/ + fts_cache_t* cache, /*!< in: cache to search */ + const dict_index_t* index) /*!< in: index to search for */ +{ + ulint i; + + mysql_mutex_assert_owner(&cache->init_lock); + + for (i = 0; i < ib_vector_size(cache->get_docs); ++i) { + fts_get_doc_t* get_doc; + + get_doc = static_cast( + ib_vector_get(cache->get_docs, i)); + + if (get_doc->index_cache->index == index) { + + return(get_doc); + } + } + + return(NULL); +} +#endif + +/**********************************************************************//** +Find an existing word, or if not found, create one and return it. +@return specified word token */ +static +fts_tokenizer_word_t* +fts_tokenizer_word_get( +/*===================*/ + fts_cache_t* cache, /*!< in: cache */ + fts_index_cache_t* + index_cache, /*!< in: index cache */ + fts_string_t* text) /*!< in: node text */ +{ + fts_tokenizer_word_t* word; + ib_rbt_bound_t parent; + + mysql_mutex_assert_owner(&cache->lock); + + /* If it is a stopword, do not index it */ + if (!fts_check_token(text, + cache->stopword_info.cached_stopword, + index_cache->charset)) { + + return(NULL); + } + + /* Check if we found a match, if not then add word to tree. 
*/ + if (rbt_search(index_cache->words, &parent, text) != 0) { + mem_heap_t* heap; + fts_tokenizer_word_t new_word; + + heap = static_cast(cache->sync_heap->arg); + + new_word.nodes = ib_vector_create( + cache->sync_heap, sizeof(fts_node_t), 4); + + fts_string_dup(&new_word.text, text, heap); + + parent.last = rbt_add_node( + index_cache->words, &parent, &new_word); + + /* Take into account the RB tree memory use and the vector. */ + cache->total_size += sizeof(new_word) + + sizeof(ib_rbt_node_t) + + text->f_len + + (sizeof(fts_node_t) * 4) + + sizeof(*new_word.nodes); + + ut_ad(rbt_validate(index_cache->words)); + } + + word = rbt_value(fts_tokenizer_word_t, parent.last); + + return(word); +} + +/**********************************************************************//** +Add the given doc_id/word positions to the given node's ilist. */ +void +fts_cache_node_add_positions( +/*=========================*/ + fts_cache_t* cache, /*!< in: cache */ + fts_node_t* node, /*!< in: word node */ + doc_id_t doc_id, /*!< in: doc id */ + ib_vector_t* positions) /*!< in: fts_token_t::positions */ +{ + ulint i; + byte* ptr; + byte* ilist; + ulint enc_len; + ulint last_pos; + byte* ptr_start; + doc_id_t doc_id_delta; + +#ifdef SAFE_MUTEX + if (cache) { + mysql_mutex_assert_owner(&cache->lock); + } +#endif /* SAFE_MUTEX */ + + ut_ad(doc_id >= node->last_doc_id); + + /* Calculate the space required to store the ilist. */ + doc_id_delta = doc_id - node->last_doc_id; + enc_len = fts_get_encoded_len(doc_id_delta); + + last_pos = 0; + for (i = 0; i < ib_vector_size(positions); i++) { + ulint pos = *(static_cast( + ib_vector_get(positions, i))); + + ut_ad(last_pos == 0 || pos > last_pos); + + enc_len += fts_get_encoded_len(pos - last_pos); + last_pos = pos; + } + + /* The 0x00 byte at the end of the token positions list. */ + enc_len++; + + if ((node->ilist_size_alloc - node->ilist_size) >= enc_len) { + /* No need to allocate more space, we can fit in the new + data at the end of the old one. */ + ilist = NULL; + ptr = node->ilist + node->ilist_size; + } else { + ulint new_size = node->ilist_size + enc_len; + + /* Over-reserve space by a fixed size for small lengths and + by 20% for lengths >= 48 bytes. */ + if (new_size < 16) { + new_size = 16; + } else if (new_size < 32) { + new_size = 32; + } else if (new_size < 48) { + new_size = 48; + } else { + new_size = new_size * 6 / 5; + } + + ilist = static_cast(ut_malloc_nokey(new_size)); + ptr = ilist + node->ilist_size; + + node->ilist_size_alloc = new_size; + if (cache) { + cache->total_size += new_size; + } + } + + ptr_start = ptr; + + /* Encode the new fragment. */ + ptr = fts_encode_int(doc_id_delta, ptr); + + last_pos = 0; + for (i = 0; i < ib_vector_size(positions); i++) { + ulint pos = *(static_cast( + ib_vector_get(positions, i))); + + ptr = fts_encode_int(pos - last_pos, ptr); + last_pos = pos; + } + + *ptr++ = 0; + + ut_a(enc_len == (ulint)(ptr - ptr_start)); + + if (ilist) { + /* Copy old ilist to the start of the new one and switch the + new one into place in the node. */ + if (node->ilist_size > 0) { + memcpy(ilist, node->ilist, node->ilist_size); + ut_free(node->ilist); + if (cache) { + cache->total_size -= node->ilist_size; + } + } + + node->ilist = ilist; + } + + node->ilist_size += enc_len; + + if (node->first_doc_id == FTS_NULL_DOC_ID) { + node->first_doc_id = doc_id; + } + + node->last_doc_id = doc_id; + ++node->doc_count; +} + +/**********************************************************************//** +Add document to the cache. 
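fts_cache_node_add_positions() above appends one fragment per document to a node's ilist: the doc id as a delta from the node's last doc id, then each position as a delta from the previous position, all variable-length encoded and closed with a 0x00 byte. The sketch below uses a common LEB128-style varint purely for illustration; the actual byte layout of fts_encode_int() lives in fts0vlc.h and differs, but the delta-plus-terminator framing is as above.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// LEB128-style varint: 7 payload bits per byte, high bit = "more follows".
// This encoding is an assumption for illustration; the real fts_encode_int()
// byte layout differs.
void encode_varint(std::vector<uint8_t>& out, uint64_t v)
{
    while (v >= 0x80) { out.push_back(uint8_t(v) | 0x80); v >>= 7; }
    out.push_back(uint8_t(v));
}

// One ilist fragment, framed as in the function above: delta-encoded doc id,
// delta-encoded ascending positions, then a 0x00 terminator byte.
void append_fragment(std::vector<uint8_t>& ilist,
                     uint64_t& last_doc_id, uint64_t doc_id,
                     const std::vector<uint64_t>& positions)
{
    assert(doc_id >= last_doc_id);               // mirrors the ut_ad() above
    encode_varint(ilist, doc_id - last_doc_id);
    uint64_t last_pos = 0;
    for (uint64_t pos : positions) {
        assert(last_pos == 0 || pos > last_pos); // positions ascend
        encode_varint(ilist, pos - last_pos);    // small gaps -> few bytes
        last_pos = pos;
    }
    ilist.push_back(0x00);                       // end of this doc's entry
    last_doc_id = doc_id;
}

int main()
{
    std::vector<uint8_t> ilist;
    uint64_t last_doc = 0;
    append_fragment(ilist, last_doc, 1000, {3, 17, 90});
    append_fragment(ilist, last_doc, 1002, {5}); // doc delta 2: one byte
    assert(last_doc == 1002);
}
```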
*/ +static +void +fts_cache_add_doc( +/*==============*/ + fts_cache_t* cache, /*!< in: cache */ + fts_index_cache_t* + index_cache, /*!< in: index cache */ + doc_id_t doc_id, /*!< in: doc id to add */ + ib_rbt_t* tokens) /*!< in: document tokens */ +{ + const ib_rbt_node_t* node; + ulint n_words; + fts_doc_stats_t* doc_stats; + + if (!tokens) { + return; + } + + mysql_mutex_assert_owner(&cache->lock); + + n_words = rbt_size(tokens); + + for (node = rbt_first(tokens); node; node = rbt_first(tokens)) { + + fts_tokenizer_word_t* word; + fts_node_t* fts_node = NULL; + fts_token_t* token = rbt_value(fts_token_t, node); + + /* Find and/or add token to the cache. */ + word = fts_tokenizer_word_get( + cache, index_cache, &token->text); + + if (!word) { + ut_free(rbt_remove_node(tokens, node)); + continue; + } + + if (ib_vector_size(word->nodes) > 0) { + fts_node = static_cast( + ib_vector_last(word->nodes)); + } + + if (fts_node == NULL || fts_node->synced + || fts_node->ilist_size > FTS_ILIST_MAX_SIZE + || doc_id < fts_node->last_doc_id) { + + fts_node = static_cast( + ib_vector_push(word->nodes, NULL)); + + memset(fts_node, 0x0, sizeof(*fts_node)); + + cache->total_size += sizeof(*fts_node); + } + + fts_cache_node_add_positions( + cache, fts_node, doc_id, token->positions); + + ut_free(rbt_remove_node(tokens, node)); + } + + ut_a(rbt_empty(tokens)); + + /* Add to doc ids processed so far. */ + doc_stats = static_cast( + ib_vector_push(index_cache->doc_stats, NULL)); + + doc_stats->doc_id = doc_id; + doc_stats->word_count = n_words; + + /* Add the doc stats memory usage too. */ + cache->total_size += sizeof(*doc_stats); + + if (doc_id > cache->sync->max_doc_id) { + cache->sync->max_doc_id = doc_id; + } +} + +/** Drop a table. +@param trx transaction +@param table_name FTS_ table name +@param rename whether to rename before dropping +@return error code +@retval DB_SUCCESS if the table was dropped +@retval DB_FAIL if the table did not exist */ +static dberr_t fts_drop_table(trx_t *trx, const char *table_name, bool rename) +{ + if (dict_table_t *table= dict_table_open_on_name(table_name, true, + DICT_ERR_IGNORE_TABLESPACE)) + { + table->release(); + if (rename) + { + mem_heap_t *heap= mem_heap_create(FN_REFLEN); + char *tmp= dict_mem_create_temporary_tablename(heap, table->name.m_name, + table->id); + dberr_t err= row_rename_table_for_mysql(table->name.m_name, tmp, trx, + false); + mem_heap_free(heap); + if (err != DB_SUCCESS) + { + ib::error() << "Unable to rename table " << table_name << ": " << err; + return err; + } + } + if (dberr_t err= trx->drop_table(*table)) + { + ib::error() << "Unable to drop table " << table->name << ": " << err; + return err; + } + +#ifdef UNIV_DEBUG + for (auto &p : trx->mod_tables) + { + if (p.first == table) + p.second.set_aux_table(); + } +#endif /* UNIV_DEBUG */ + return DB_SUCCESS; + } + + return DB_FAIL; +} + +/****************************************************************//** +Rename a single auxiliary table due to database name change. 
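fts_cache_add_doc() above appends a token's postings to the word's last node unless something forces a fresh node: the node was already synced to disk, its ilist outgrew FTS_ILIST_MAX_SIZE, or the incoming doc id would make the delta negative. The predicate, isolated (the size cap here is an illustrative number, not the real constant):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative cap standing in for FTS_ILIST_MAX_SIZE (a build-time
// constant in the real code; this number is made up).
const uint64_t ILIST_MAX = 7500;

struct Node {
    bool     synced;        // already written out by a previous SYNC
    uint64_t ilist_size;
    uint64_t last_doc_id;
};

// The reuse rule from fts_cache_add_doc() above: append to the word's
// last node unless one of these conditions forces a fresh node.
bool need_new_node(const Node* last, uint64_t doc_id)
{
    return last == nullptr
        || last->synced                  // frozen by a previous SYNC
        || last->ilist_size > ILIST_MAX  // fragment grew too large
        || doc_id < last->last_doc_id;   // deltas must stay non-negative
}

int main()
{
    Node n{false, 100, 42};
    assert(!need_new_node(&n, 43));      // in-order doc id: keep appending
    assert(need_new_node(&n, 41));       // out-of-order: new node
    n.synced = true;
    assert(need_new_node(&n, 43));       // synced node is never reused
}
```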
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_rename_one_aux_table( +/*=====================*/ + const char* new_name, /*!< in: new parent tbl name */ + const char* fts_table_old_name, /*!< in: old aux tbl name */ + trx_t* trx) /*!< in: transaction */ +{ + char fts_table_new_name[MAX_TABLE_NAME_LEN]; + ulint new_db_name_len = dict_get_db_name_len(new_name); + ulint old_db_name_len = dict_get_db_name_len(fts_table_old_name); + ulint table_new_name_len = strlen(fts_table_old_name) + + new_db_name_len - old_db_name_len; + + /* Check if the new and old database names are the same, if so, + nothing to do */ + ut_ad((new_db_name_len != old_db_name_len) + || strncmp(new_name, fts_table_old_name, old_db_name_len) != 0); + + /* Get the database name from "new_name", and table name + from the fts_table_old_name */ + strncpy(fts_table_new_name, new_name, new_db_name_len); + strncpy(fts_table_new_name + new_db_name_len, + strchr(fts_table_old_name, '/'), + table_new_name_len - new_db_name_len); + fts_table_new_name[table_new_name_len] = 0; + + return row_rename_table_for_mysql( + fts_table_old_name, fts_table_new_name, trx, false); +} + +/****************************************************************//** +Rename auxiliary tables for all fts index for a table. This(rename) +is due to database name change +@return DB_SUCCESS or error code */ +dberr_t +fts_rename_aux_tables( +/*==================*/ + dict_table_t* table, /*!< in: user Table */ + const char* new_name, /*!< in: new table name */ + trx_t* trx) /*!< in: transaction */ +{ + ulint i; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table); + + dberr_t err = DB_SUCCESS; + char old_table_name[MAX_FULL_NAME_LEN]; + + /* Rename common auxiliary tables */ + for (i = 0; fts_common_tables[i] != NULL; ++i) { + fts_table.suffix = fts_common_tables[i]; + fts_get_table_name(&fts_table, old_table_name, true); + + err = fts_rename_one_aux_table(new_name, old_table_name, trx); + + if (err != DB_SUCCESS) { + return(err); + } + } + + fts_t* fts = table->fts; + + /* Rename index specific auxiliary tables */ + for (i = 0; fts->indexes != 0 && i < ib_vector_size(fts->indexes); + ++i) { + dict_index_t* index; + + index = static_cast( + ib_vector_getp(fts->indexes, i)); + + FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index); + + for (ulint j = 0; j < FTS_NUM_AUX_INDEX; ++j) { + fts_table.suffix = fts_get_suffix(j); + fts_get_table_name(&fts_table, old_table_name, true); + + err = fts_rename_one_aux_table( + new_name, old_table_name, trx); + + DBUG_EXECUTE_IF("fts_rename_failure", + err = DB_DEADLOCK; + fts_sql_rollback(trx);); + + if (err != DB_SUCCESS) { + return(err); + } + } + } + + return(DB_SUCCESS); +} + +/** Lock an internal FTS_ table, before fts_drop_table() */ +static dberr_t fts_lock_table(trx_t *trx, const char *table_name) +{ + ut_ad(purge_sys.must_wait_FTS()); + + if (dict_table_t *table= dict_table_open_on_name(table_name, false, + DICT_ERR_IGNORE_TABLESPACE)) + { + dberr_t err= lock_table_for_trx(table, trx, LOCK_X); + /* Wait for purge threads to stop using the table. */ + for (uint n= 15; table->get_ref_count() > 1; ) + { + if (!--n) + { + err= DB_LOCK_WAIT_TIMEOUT; + goto fail; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } +fail: + table->release(); + return err; + } + return DB_SUCCESS; +} + +/** Lock the internal FTS_ tables for an index, before fts_drop_index_tables(). 
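fts_lock_table() above, after taking the X lock, waits for purge threads to release the table by polling the reference count: up to 15 polls, 50ms apart, then DB_LOCK_WAIT_TIMEOUT. The same bounded-wait shape, with get_ref_count() as a hypothetical stand-in:

```cpp
#include <chrono>
#include <thread>

enum Err { OK, LOCK_WAIT_TIMEOUT };

// Shape of the wait in fts_lock_table() above: poll the reference count
// up to 15 times with 50ms sleeps so purge threads can drop their
// references, then give up (~750ms budget).
template <typename Table>
Err wait_for_sole_reference(const Table& table)
{
    for (unsigned n = 15; table.get_ref_count() > 1; ) {
        if (!--n)
            return LOCK_WAIT_TIMEOUT;
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
    }
    return OK;
}

// Toy table whose extra references drain a little on every poll.
struct FakeTable {
    mutable int refs = 3;
    int get_ref_count() const { return refs > 1 ? refs-- : refs; }
};

int main()
{
    FakeTable t;
    return wait_for_sole_reference(t) == OK ? 0 : 1;
}
```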
+@param trx transaction +@param index fulltext index */ +dberr_t fts_lock_index_tables(trx_t *trx, const dict_index_t &index) +{ + ut_ad(index.type & DICT_FTS); + fts_table_t fts_table; + char table_name[MAX_FULL_NAME_LEN]; + FTS_INIT_INDEX_TABLE(&fts_table, nullptr, FTS_INDEX_TABLE, (&index)); + for (const fts_index_selector_t *s= fts_index_selector; s->suffix; s++) + { + fts_table.suffix= s->suffix; + fts_get_table_name(&fts_table, table_name, false); + if (dberr_t err= fts_lock_table(trx, table_name)) + return err; + } + return DB_SUCCESS; +} + +/** Lock the internal common FTS_ tables, before fts_drop_common_tables(). +@param trx transaction +@param table table containing FULLTEXT INDEX +@return DB_SUCCESS or error code */ +dberr_t fts_lock_common_tables(trx_t *trx, const dict_table_t &table) +{ + fts_table_t fts_table; + char table_name[MAX_FULL_NAME_LEN]; + + FTS_INIT_FTS_TABLE(&fts_table, nullptr, FTS_COMMON_TABLE, (&table)); + + for (const char **suffix= fts_common_tables; *suffix; suffix++) + { + fts_table.suffix= *suffix; + fts_get_table_name(&fts_table, table_name, false); + if (dberr_t err= fts_lock_table(trx, table_name)) + return err; + } + return DB_SUCCESS; +} + +/** This function make sure that table doesn't +have any other reference count. +@param table_name table name */ +static void fts_table_no_ref_count(const char *table_name) +{ + dict_table_t *table= dict_table_open_on_name( + table_name, true, DICT_ERR_IGNORE_TABLESPACE); + if (!table) + return; + + while (table->get_ref_count() > 1) + { + dict_sys.unlock(); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + dict_sys.lock(SRW_LOCK_CALL); + } + + table->release(); +} + +/** Stop the purge thread and check n_ref_count of all auxiliary +and common table associated with the fts table. +@param table parent FTS table +@param already_stopped True indicates purge threads were + already stopped*/ +void purge_sys_t::stop_FTS(const dict_table_t &table, bool already_stopped) +{ + if (!already_stopped) + purge_sys.stop_FTS(); + + dict_sys.lock(SRW_LOCK_CALL); + + fts_table_t fts_table; + char table_name[MAX_FULL_NAME_LEN]; + + FTS_INIT_FTS_TABLE(&fts_table, nullptr, FTS_COMMON_TABLE, (&table)); + + for (const char **suffix= fts_common_tables; *suffix; suffix++) + { + fts_table.suffix= *suffix; + fts_get_table_name(&fts_table, table_name, true); + fts_table_no_ref_count(table_name); + } + + if (table.fts) + { + if (auto indexes= table.fts->indexes) + { + for (ulint i= 0;i < ib_vector_size(indexes); ++i) + { + const dict_index_t *index= static_cast( + ib_vector_getp(indexes, i)); + FTS_INIT_INDEX_TABLE(&fts_table, nullptr, FTS_INDEX_TABLE, index); + for (const fts_index_selector_t *s= fts_index_selector; + s->suffix; s++) + { + fts_table.suffix= s->suffix; + fts_get_table_name(&fts_table, table_name, true); + fts_table_no_ref_count(table_name); + } + } + } + } + + dict_sys.unlock(); +} + +/** Lock the internal FTS_ tables for table, before fts_drop_tables(). 
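The lock, drop and stop_FTS paths above all enumerate the same aux-table universe: five common suffixes per table plus six INDEX_* suffixes per FULLTEXT index. A sketch of that double loop, with a callback standing in for fts_get_table_name() plus the per-table work:

```cpp
#include <cstddef>
#include <functional>
#include <iostream>

// The two suffix sets from earlier in this file.
const char* common_suffixes[] = {
    "BEING_DELETED", "BEING_DELETED_CACHE", "CONFIG",
    "DELETED", "DELETED_CACHE", nullptr
};
const char* index_suffixes[] = {
    "INDEX_1", "INDEX_2", "INDEX_3", "INDEX_4", "INDEX_5", "INDEX_6", nullptr
};

// Walk every aux-table suffix for a table with n_fts_indexes FULLTEXT
// indexes: the same double loop the lock/drop/stop_FTS paths above run.
void for_each_aux_suffix(std::size_t n_fts_indexes,
                         const std::function<void(const char*)>& fn)
{
    for (const char** s = common_suffixes; *s; ++s)
        fn(*s);                          // one common set per table
    for (std::size_t i = 0; i < n_fts_indexes; ++i)
        for (const char** s = index_suffixes; *s; ++s)
            fn(*s);                      // one set per FULLTEXT index
}

int main()
{
    std::size_t count = 0;
    for_each_aux_suffix(2, [&](const char*) { ++count; });
    std::cout << count << '\n';          // 5 + 2 * 6 = 17 aux tables
}
```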
+@param trx transaction +@param table table containing FULLTEXT INDEX +@return DB_SUCCESS or error code */ +dberr_t fts_lock_tables(trx_t *trx, const dict_table_t &table) +{ + if (dberr_t err= fts_lock_common_tables(trx, table)) + return err; + + if (!table.fts) + return DB_SUCCESS; + + auto indexes= table.fts->indexes; + if (!indexes) + return DB_SUCCESS; + + for (ulint i= 0; i < ib_vector_size(indexes); ++i) + if (dberr_t err= + fts_lock_index_tables(trx, *static_cast + (ib_vector_getp(indexes, i)))) + return err; + return DB_SUCCESS; +} + +/** Drops the common ancillary tables needed for supporting an FTS index +on the given table. +@param trx transaction to drop fts common table +@param fts_table table with an FTS index +@param rename whether to rename before dropping +@return DB_SUCCESS or error code */ +static dberr_t fts_drop_common_tables(trx_t *trx, fts_table_t *fts_table, + bool rename) +{ + dberr_t error= DB_SUCCESS; + + for (ulint i= 0; fts_common_tables[i]; ++i) + { + char table_name[MAX_FULL_NAME_LEN]; + + fts_table->suffix= fts_common_tables[i]; + fts_get_table_name(fts_table, table_name, true); + + if (dberr_t err= fts_drop_table(trx, table_name, rename)) + { + if (trx->state != TRX_STATE_ACTIVE) + return err; + /* We only return the status of the last error. */ + if (err != DB_FAIL) + error= err; + } + } + + return error; +} + +/****************************************************************//** +Drops FTS auxiliary tables for an FTS index +@return DB_SUCCESS or error code */ +dberr_t fts_drop_index_tables(trx_t *trx, const dict_index_t &index) +{ + ulint i; + fts_table_t fts_table; + dberr_t error = DB_SUCCESS; + + FTS_INIT_INDEX_TABLE(&fts_table, nullptr, FTS_INDEX_TABLE, (&index)); + + for (i = 0; i < FTS_NUM_AUX_INDEX; ++i) { + dberr_t err; + char table_name[MAX_FULL_NAME_LEN]; + + fts_table.suffix = fts_get_suffix(i); + fts_get_table_name(&fts_table, table_name, true); + + err = fts_drop_table(trx, table_name, false); + + /* We only return the status of the last error. */ + if (err != DB_SUCCESS && err != DB_FAIL) { + error = err; + } + } + + return(error); +} + +/****************************************************************//** +Drops FTS ancillary tables needed for supporting an FTS index +on the given table. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_drop_all_index_tables( +/*======================*/ + trx_t* trx, /*!< in: transaction */ + const fts_t* fts) /*!< in: fts instance */ +{ + dberr_t error= DB_SUCCESS; + auto indexes= fts->indexes; + if (!indexes) + return DB_SUCCESS; + + for (ulint i= 0; i < ib_vector_size(indexes); ++i) + if (dberr_t err= fts_drop_index_tables(trx, + *static_cast + (ib_vector_getp(indexes, i)))) + error= err; + return error; +} + +/** Drop the internal FTS_ tables for table. +@param trx transaction +@param table table containing FULLTEXT INDEX +@return DB_SUCCESS or error code */ +dberr_t fts_drop_tables(trx_t *trx, const dict_table_t &table) +{ + dberr_t error; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, (&table)); + + error = fts_drop_common_tables(trx, &fts_table, false); + + if (error == DB_SUCCESS && table.fts) { + error = fts_drop_all_index_tables(trx, table.fts); + } + + return(error); +} + +/** Create dict_table_t object for FTS Aux tables. 
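Note the error folding in fts_drop_common_tables() and fts_drop_index_tables() above: a missing table (DB_FAIL) is deliberately ignored so a partially dropped set can be cleaned up again, and only the last genuine error is reported. The rule, isolated with local stand-ins for dberr_t:

```cpp
#include <cassert>
#include <vector>

enum Err { DB_SUCCESS, DB_FAIL, DB_ERROR };   // local stand-ins for dberr_t

// Folding rule from the drop loops above: a missing table (DB_FAIL) is
// ignorable so the cleanup stays idempotent; only the last real failure
// is reported to the caller.
Err fold_drop_results(const std::vector<Err>& per_table)
{
    Err error = DB_SUCCESS;
    for (Err err : per_table)
        if (err != DB_SUCCESS && err != DB_FAIL)
            error = err;                      // remember last genuine error
    return error;
}

int main()
{
    assert(fold_drop_results({DB_SUCCESS, DB_FAIL, DB_SUCCESS}) == DB_SUCCESS);
    assert(fold_drop_results({DB_FAIL, DB_ERROR, DB_FAIL}) == DB_ERROR);
}
```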
+@param[in] aux_table_name FTS Aux table name +@param[in] table table object of FTS Index +@param[in] n_cols number of columns for FTS Aux table +@return table object for FTS Aux table */ +static +dict_table_t* +fts_create_in_mem_aux_table( + const char* aux_table_name, + const dict_table_t* table, + ulint n_cols) +{ + dict_table_t* new_table = dict_table_t::create( + {aux_table_name,strlen(aux_table_name)}, + nullptr, n_cols, 0, table->flags, + table->space_id == TRX_SYS_SPACE + ? 0 : table->space_id == SRV_TMP_SPACE_ID + ? DICT_TF2_TEMPORARY : DICT_TF2_USE_FILE_PER_TABLE); + + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + ut_ad(table->data_dir_path != NULL); + new_table->data_dir_path = mem_heap_strdup( + new_table->heap, table->data_dir_path); + } + + return(new_table); +} + +/** Function to create on FTS common table. +@param[in,out] trx InnoDB transaction +@param[in] table Table that has FTS Index +@param[in] fts_table_name FTS AUX table name +@param[in] fts_suffix FTS AUX table suffix +@param[in,out] heap temporary memory heap +@return table object if created, else NULL */ +static +dict_table_t* +fts_create_one_common_table( + trx_t* trx, + const dict_table_t* table, + const char* fts_table_name, + const char* fts_suffix, + mem_heap_t* heap) +{ + dict_table_t* new_table; + dberr_t error; + bool is_config = strcmp(fts_suffix, "CONFIG") == 0; + + if (!is_config) { + + new_table = fts_create_in_mem_aux_table( + fts_table_name, table, FTS_DELETED_TABLE_NUM_COLS); + + dict_mem_table_add_col( + new_table, heap, "doc_id", DATA_INT, DATA_UNSIGNED, + FTS_DELETED_TABLE_COL_LEN); + } else { + /* Config table has different schema. */ + new_table = fts_create_in_mem_aux_table( + fts_table_name, table, FTS_CONFIG_TABLE_NUM_COLS); + + dict_mem_table_add_col( + new_table, heap, "key", DATA_VARCHAR, 0, + FTS_CONFIG_TABLE_KEY_COL_LEN); + + dict_mem_table_add_col( + new_table, heap, "value", DATA_VARCHAR, DATA_NOT_NULL, + FTS_CONFIG_TABLE_VALUE_COL_LEN); + } + + dict_table_add_system_columns(new_table, heap); + error = row_create_table_for_mysql(new_table, trx); + + if (error == DB_SUCCESS) { + + dict_index_t* index = dict_mem_index_create( + new_table, "FTS_COMMON_TABLE_IND", + DICT_UNIQUE|DICT_CLUSTERED, 1); + + if (!is_config) { + dict_mem_index_add_field(index, "doc_id", 0); + } else { + dict_mem_index_add_field(index, "key", 0); + } + + error = row_create_index_for_mysql(index, trx, NULL, + FIL_ENCRYPTION_DEFAULT, + FIL_DEFAULT_ENCRYPTION_KEY); + if (error == DB_SUCCESS) { + return new_table; + } + } + + ib::warn() << "Failed to create FTS common table " << fts_table_name; + trx->error_state = error; + return NULL; +} + +/** Creates the common auxiliary tables needed for supporting an FTS index +on the given table. +The following tables are created. 
+CREATE TABLE $FTS_PREFIX_DELETED + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_DELETED_CACHE + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_BEING_DELETED + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_BEING_DELETED_CACHE + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_CONFIG + (key CHAR(50), value CHAR(200), UNIQUE CLUSTERED INDEX on key) +@param[in,out] trx transaction +@param[in,out] table table with FTS index +@param[in] skip_doc_id_index Skip index on doc id +@return DB_SUCCESS if succeed */ +dberr_t +fts_create_common_tables( + trx_t* trx, + dict_table_t* table, + bool skip_doc_id_index) +{ + dberr_t error; + que_t* graph; + fts_table_t fts_table; + mem_heap_t* heap = mem_heap_create(1024); + pars_info_t* info; + char fts_name[MAX_FULL_NAME_LEN]; + char full_name[sizeof(fts_common_tables) / sizeof(char*)] + [MAX_FULL_NAME_LEN]; + + dict_index_t* index = NULL; + + FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table); + + error = fts_drop_common_tables(trx, &fts_table, true); + + if (error != DB_SUCCESS) { + + goto func_exit; + } + + /* Create the FTS tables that are common to an FTS index. */ + for (ulint i = 0; fts_common_tables[i] != NULL; ++i) { + + fts_table.suffix = fts_common_tables[i]; + fts_get_table_name(&fts_table, full_name[i], true); + dict_table_t* common_table = fts_create_one_common_table( + trx, table, full_name[i], fts_table.suffix, heap); + + if (!common_table) { + trx->error_state = DB_SUCCESS; + error = DB_ERROR; + goto func_exit; + } + + mem_heap_empty(heap); + } + + /* Write the default settings to the config table. */ + info = pars_info_create(); + + fts_table.suffix = "CONFIG"; + fts_get_table_name(&fts_table, fts_name, true); + pars_info_bind_id(info, "config_table", fts_name); + + graph = pars_sql( + info, fts_config_table_insert_values_sql); + + error = fts_eval_sql(trx, graph); + + que_graph_free(graph); + + if (error != DB_SUCCESS || skip_doc_id_index) { + + goto func_exit; + } + + if (table->versioned()) { + index = dict_mem_index_create(table, FTS_DOC_ID_INDEX_NAME, + DICT_UNIQUE, 2); + dict_mem_index_add_field(index, FTS_DOC_ID_COL_NAME, 0); + dict_mem_index_add_field(index, table->cols[table->vers_end].name(*table), 0); + } else { + index = dict_mem_index_create(table, FTS_DOC_ID_INDEX_NAME, + DICT_UNIQUE, 1); + dict_mem_index_add_field(index, FTS_DOC_ID_COL_NAME, 0); + } + + error = row_create_index_for_mysql(index, trx, NULL, + FIL_ENCRYPTION_DEFAULT, + FIL_DEFAULT_ENCRYPTION_KEY); + +func_exit: + mem_heap_free(heap); + + return(error); +} + +/** Create one FTS auxiliary index table for an FTS index. 
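One detail of fts_create_common_tables() above: on system-versioned tables, FTS_DOC_ID_INDEX is built over two fields, FTS_DOC_ID plus the row-end column, presumably so history rows can carry the same doc id as the current row without violating uniqueness; plain tables index FTS_DOC_ID alone. The field-list choice, isolated:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Field list for FTS_DOC_ID_INDEX as chosen in fts_create_common_tables()
// above. The "so history rows can reuse a doc id" rationale is an
// inference, not stated in the source.
std::vector<std::string> doc_id_index_fields(bool versioned,
                                             const std::string& row_end_col)
{
    std::vector<std::string> fields{"FTS_DOC_ID"};
    if (versioned)
        fields.push_back(row_end_col);   // e.g. the hidden row_end column
    return fields;
}

int main()
{
    assert(doc_id_index_fields(false, "row_end").size() == 1);
    assert(doc_id_index_fields(true, "row_end").size() == 2);
}
```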
+@param[in,out] trx transaction
+@param[in] index the index instance
+@param[in] fts_table fts_table structure
+@param[in,out] heap temporary memory heap
+@see row_merge_create_fts_sort_index()
+@return table object if created, else NULL */
+static
+dict_table_t*
+fts_create_one_index_table(
+ trx_t* trx,
+ const dict_index_t* index,
+ const fts_table_t* fts_table,
+ mem_heap_t* heap)
+{
+ dict_field_t* field;
+ dict_table_t* new_table;
+ char table_name[MAX_FULL_NAME_LEN];
+ dberr_t error;
+ CHARSET_INFO* charset;
+
+ ut_ad(index->type & DICT_FTS);
+
+ fts_get_table_name(fts_table, table_name, true);
+
+ new_table = fts_create_in_mem_aux_table(
+ table_name, fts_table->table,
+ FTS_AUX_INDEX_TABLE_NUM_COLS);
+
+ field = dict_index_get_nth_field(index, 0);
+ charset = fts_get_charset(field->col->prtype);
+
+ dict_mem_table_add_col(new_table, heap, "word",
+ charset == &my_charset_latin1
+ ? DATA_VARCHAR : DATA_VARMYSQL,
+ field->col->prtype,
+ FTS_MAX_WORD_LEN_IN_CHAR
+ * unsigned(field->col->mbmaxlen));
+
+ dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ FTS_INDEX_FIRST_DOC_ID_LEN);
+
+ dict_mem_table_add_col(new_table, heap, "last_doc_id", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ FTS_INDEX_LAST_DOC_ID_LEN);
+
+ dict_mem_table_add_col(new_table, heap, "doc_count", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ FTS_INDEX_DOC_COUNT_LEN);
+
+ /* The precise type calculation is as follows:
+ least significant byte: MySQL type code (not applicable for sys cols)
+ second least : DATA_NOT_NULL | DATA_BINARY_TYPE
+ third least : the MySQL charset-collation code (DATA_MTYPE_MAX) */
+
+ dict_mem_table_add_col(
+ new_table, heap, "ilist", DATA_BLOB,
+ (DATA_MTYPE_MAX << 16) | DATA_UNSIGNED | DATA_NOT_NULL,
+ FTS_INDEX_ILIST_LEN);
+
+ dict_table_add_system_columns(new_table, heap);
+ error = row_create_table_for_mysql(new_table, trx);
+
+ if (error == DB_SUCCESS) {
+ dict_index_t* index = dict_mem_index_create(
+ new_table, "FTS_INDEX_TABLE_IND",
+ DICT_UNIQUE|DICT_CLUSTERED, 2);
+ dict_mem_index_add_field(index, "word", 0);
+ dict_mem_index_add_field(index, "first_doc_id", 0);
+
+ error = row_create_index_for_mysql(index, trx, NULL,
+ FIL_ENCRYPTION_DEFAULT,
+ FIL_DEFAULT_ENCRYPTION_KEY);
+
+ if (error == DB_SUCCESS) {
+ return new_table;
+ }
+ }
+
+ ib::warn() << "Failed to create FTS index table " << table_name;
+ trx->error_state = error;
+ return NULL;
+}
+
+/** Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table.
+
+All FTS AUX Index tables have the following schema.
+CREATE TABLE $FTS_PREFIX_INDEX_[1-6](
+ word VARCHAR(FTS_MAX_WORD_LEN),
+ first_doc_id BIGINT UNSIGNED NOT NULL,
+ last_doc_id BIGINT UNSIGNED NOT NULL,
+ doc_count INT UNSIGNED NOT NULL,
+ ilist VARBINARY NOT NULL,
+ UNIQUE CLUSTERED INDEX ON (word, first_doc_id))
+@param[in,out] trx dictionary transaction
+@param[in] index fulltext index
+@param[in] id table id
+@return DB_SUCCESS or error code */
+dberr_t
+fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id)
+{
+ ulint i;
+ fts_table_t fts_table;
+ dberr_t error = DB_SUCCESS;
+ mem_heap_t* heap = mem_heap_create(1024);
+
+ fts_table.type = FTS_INDEX_TABLE;
+ fts_table.index_id = index->id;
+ fts_table.table_id = id;
+ fts_table.table = index->table;
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX && error == DB_SUCCESS; ++i) {
+ dict_table_t* new_table;
+
+ /* Create the FTS auxiliary tables that are specific
+ to an FTS index.
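+One table is created per suffix returned by fts_get_suffix(i), i.e.
+the INDEX_1 .. INDEX_6 partitions between which fts_select_index()
+later distributes words.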
*/
+ fts_table.suffix = fts_get_suffix(i);
+
+ new_table = fts_create_one_index_table(
+ trx, index, &fts_table, heap);
+
+ if (new_table == NULL) {
+ error = DB_FAIL;
+ break;
+ }
+
+ mem_heap_empty(heap);
+ }
+
+ mem_heap_free(heap);
+
+ return(error);
+}
+
+/******************************************************************//**
+Calculate the new state of a row given the existing state and a new event.
+@return new state of row */
+static
+fts_row_state
+fts_trx_row_get_new_state(
+/*======================*/
+ fts_row_state old_state, /*!< in: existing state of row */
+ fts_row_state event) /*!< in: new event */
+{
+ /* The rules for transforming states:
+
+ I = inserted
+ M = modified
+ D = deleted
+ N = nothing
+
+ M+D -> D:
+
+ If the row existed before the transaction started and it is modified
+ during the transaction, followed by a deletion of the row, only the
+ deletion will be signaled.
+
+ M+ -> M:
+
+ If the row existed before the transaction started and it is modified
+ more than once during the transaction, only the last modification
+ will be signaled.
+
+ IM*D -> N:
+
+ If a new row is added during the transaction (and possibly modified
+ after its initial insertion) but it is deleted before the end of the
+ transaction, nothing will be signaled.
+
+ IM* -> I:
+
+ If a new row is added during the transaction and modified after its
+ initial insertion, only the addition will be signaled.
+
+ M*DI -> M:
+
+ If the row existed before the transaction started and it is deleted,
+ then re-inserted, only a modification will be signaled. Note that
+ this case is only possible if the table is using the row's primary
+ key for FTS row ids, since those can be re-inserted by the user,
+ which is not true for InnoDB generated row ids.
+
+ It is easily seen that the above rules decompose such that we do not
+ need to store the row's entire history of events. Instead, we can
+ store just one state for the row and update that when new events
+ arrive. Then we can implement the above rules as a two-dimensional
+ look-up table, and get checking of invalid combinations "for free"
+ in the process. */
+
+ /* The lookup table for transforming states. old_state is the
+ Y-axis, event is the X-axis. */
+ static const fts_row_state table[4][4] = {
+ /* I M D N */
+ /* I */ { FTS_INVALID, FTS_INSERT, FTS_NOTHING, FTS_INVALID },
+ /* M */ { FTS_INVALID, FTS_MODIFY, FTS_DELETE, FTS_INVALID },
+ /* D */ { FTS_MODIFY, FTS_INVALID, FTS_INVALID, FTS_INVALID },
+ /* N */ { FTS_INVALID, FTS_INVALID, FTS_INVALID, FTS_INVALID }
+ };
+
+ fts_row_state result;
+
+ ut_a(old_state < FTS_INVALID);
+ ut_a(event < FTS_INVALID);
+
+ result = table[(int) old_state][(int) event];
+ ut_a(result != FTS_INVALID);
+
+ return(result);
+}
+
+/******************************************************************//**
+Create a savepoint instance.
+@return savepoint instance */
+static
+fts_savepoint_t*
+fts_savepoint_create(
+/*=================*/
+ ib_vector_t* savepoints, /*!< in/out: vector of savepoints */
+ const char* name, /*!< in: savepoint name */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_push(savepoints, NULL));
+
+ memset(savepoint, 0x0, sizeof(*savepoint));
+
+ if (name) {
+ savepoint->name = mem_heap_strdup(heap, name);
+ }
+
+ savepoint->tables = rbt_create(
+ sizeof(fts_trx_table_t*), fts_trx_table_cmp);
+
+ return(savepoint);
+}
+
+/******************************************************************//**
+Create an FTS trx.
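+As a brief structural sketch (see the code below): the returned
+fts_trx_t holds two vectors of savepoints, ftt->savepoints for the
+whole transaction and ftt->last_stmt for the current statement, and
+each savepoint owns an rb-tree of fts_trx_table_t entries.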
+@return FTS trx */
+fts_trx_t*
+fts_trx_create(
+/*===========*/
+ trx_t* trx) /*!< in/out: InnoDB
+ transaction */
+{
+ fts_trx_t* ftt;
+ ib_alloc_t* heap_alloc;
+ mem_heap_t* heap = mem_heap_create(1024);
+ trx_named_savept_t* savep;
+
+ ut_a(trx->fts_trx == NULL);
+
+ ftt = static_cast<fts_trx_t*>(mem_heap_alloc(heap, sizeof(fts_trx_t)));
+ ftt->trx = trx;
+ ftt->heap = heap;
+
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ ftt->savepoints = static_cast<ib_vector_t*>(ib_vector_create(
+ heap_alloc, sizeof(fts_savepoint_t), 4));
+
+ ftt->last_stmt = static_cast<ib_vector_t*>(ib_vector_create(
+ heap_alloc, sizeof(fts_savepoint_t), 4));
+
+ /* The default instance has no name and no heap. */
+ fts_savepoint_create(ftt->savepoints, NULL, NULL);
+ fts_savepoint_create(ftt->last_stmt, NULL, NULL);
+
+ /* Copy the savepoints that were already set before. */
+ for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ savep != NULL;
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
+
+ fts_savepoint_take(ftt, savep->name);
+ }
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Create an FTS trx table.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_create(
+/*=================*/
+ fts_trx_t* fts_trx, /*!< in: FTS trx */
+ dict_table_t* table) /*!< in: table */
+{
+ fts_trx_table_t* ftt;
+
+ ftt = static_cast<fts_trx_table_t*>(
+ mem_heap_zalloc(fts_trx->heap, sizeof *ftt));
+
+ ftt->table = table;
+ ftt->fts_trx = fts_trx;
+
+ ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Clone an FTS trx table.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_clone(
+/*=================*/
+ const fts_trx_table_t* ftt_src) /*!< in: FTS trx */
+{
+ fts_trx_table_t* ftt;
+
+ ftt = static_cast<fts_trx_table_t*>(
+ mem_heap_alloc(ftt_src->fts_trx->heap, sizeof(*ftt)));
+
+ memset(ftt, 0x0, sizeof(*ftt));
+
+ ftt->table = ftt_src->table;
+ ftt->fts_trx = ftt_src->fts_trx;
+
+ ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+ /* Copy the rb tree values to the new savepoint. */
+ rbt_merge_uniq(ftt->rows, ftt_src->rows);
+
+ /* These are only added on commit. At this stage we only have
+ the updated row state. */
+ ut_a(ftt_src->added_doc_ids == NULL);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Initialize the FTS trx instance.
+@return FTS trx instance */
+static
+fts_trx_table_t*
+fts_trx_init(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: FTS table instance */
+ ib_vector_t* savepoints) /*!< in: Savepoints */
+{
+ fts_trx_table_t* ftt;
+ ib_rbt_bound_t parent;
+ ib_rbt_t* tables;
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
+
+ tables = savepoint->tables;
+ rbt_search_cmp(tables, &parent, &table->id, fts_trx_table_id_cmp, NULL);
+
+ if (parent.result == 0) {
+ fts_trx_table_t** fttp;
+
+ fttp = rbt_value(fts_trx_table_t*, parent.last);
+ ftt = *fttp;
+ } else {
+ ftt = fts_trx_table_create(trx->fts_trx, table);
+ rbt_add_node(tables, &parent, &ftt);
+ }
+
+ ut_a(ftt->table == table);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table.
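+For example, per the transition table in fts_trx_row_get_new_state():
+a row already recorded as FTS_MODIFY that receives an FTS_DELETE event
+collapses to FTS_DELETE, while FTS_INSERT followed by FTS_DELETE
+collapses to FTS_NOTHING and the row is dropped from the tree.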
*/ +static +void +fts_trx_table_add_op( +/*=================*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + doc_id_t doc_id, /*!< in: doc id */ + fts_row_state state, /*!< in: state of the row */ + ib_vector_t* fts_indexes) /*!< in: FTS indexes affected */ +{ + ib_rbt_t* rows; + ib_rbt_bound_t parent; + + rows = ftt->rows; + rbt_search(rows, &parent, &doc_id); + + /* Row id found, update state, and if new state is FTS_NOTHING, + we delete the row from our tree. */ + if (parent.result == 0) { + fts_trx_row_t* row = rbt_value(fts_trx_row_t, parent.last); + + row->state = fts_trx_row_get_new_state(row->state, state); + + if (row->state == FTS_NOTHING) { + if (row->fts_indexes) { + ib_vector_free(row->fts_indexes); + } + + ut_free(rbt_remove_node(rows, parent.last)); + row = NULL; + } else if (row->fts_indexes != NULL) { + ib_vector_free(row->fts_indexes); + row->fts_indexes = fts_indexes; + } + + } else { /* Row-id not found, create a new one. */ + fts_trx_row_t row; + + row.doc_id = doc_id; + row.state = state; + row.fts_indexes = fts_indexes; + + rbt_add_node(rows, &parent, &row); + } +} + +/******************************************************************//** +Notify the FTS system about an operation on an FTS-indexed table. */ +void +fts_trx_add_op( +/*===========*/ + trx_t* trx, /*!< in: InnoDB transaction */ + dict_table_t* table, /*!< in: table */ + doc_id_t doc_id, /*!< in: new doc id */ + fts_row_state state, /*!< in: state of the row */ + ib_vector_t* fts_indexes) /*!< in: FTS indexes affected + (NULL=all) */ +{ + fts_trx_table_t* tran_ftt; + fts_trx_table_t* stmt_ftt; + + if (!trx->fts_trx) { + trx->fts_trx = fts_trx_create(trx); + } + + tran_ftt = fts_trx_init(trx, table, trx->fts_trx->savepoints); + stmt_ftt = fts_trx_init(trx, table, trx->fts_trx->last_stmt); + + fts_trx_table_add_op(tran_ftt, doc_id, state, fts_indexes); + fts_trx_table_add_op(stmt_ftt, doc_id, state, fts_indexes); +} + +/******************************************************************//** +Fetch callback that converts a textual document id to a binary value and +stores it in the given place. +@return always returns NULL */ +static +ibool +fts_fetch_store_doc_id( +/*===================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: doc_id_t* to store + doc_id in */ +{ + int n_parsed; + sel_node_t* node = static_cast(row); + doc_id_t* doc_id = static_cast(user_arg); + dfield_t* dfield = que_node_get_val(node->select_list); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + char buf[32]; + + ut_a(dtype_get_mtype(type) == DATA_VARCHAR); + ut_a(len > 0 && len < sizeof(buf)); + + memcpy(buf, dfield_get_data(dfield), len); + buf[len] = '\0'; + + n_parsed = sscanf(buf, FTS_DOC_ID_FORMAT, doc_id); + ut_a(n_parsed == 1); + + return(FALSE); +} + +#ifdef FTS_CACHE_SIZE_DEBUG +/******************************************************************//** +Get the max cache size in bytes. If there is an error reading the +value we simply print an error message here and return the default +value to the caller. +@return max cache size in bytes */ +static +ulint +fts_get_max_cache_size( +/*===================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table) /*!< in: table instance */ +{ + dberr_t error; + fts_string_t value; + ulong cache_size_in_mb; + + /* Set to the default value. */ + cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB; + + /* We set the length of value to the max bytes it can hold. 
This
+information is used by the callback that reads the value. */
+ value.f_n_char = 0;
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = ut_malloc_nokey(value.f_len + 1);
+
+ error = fts_config_get_value(
+ trx, fts_table, FTS_MAX_CACHE_SIZE_IN_MB, &value);
+
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+ value.f_str[value.f_len] = 0;
+ cache_size_in_mb = strtoul((char*) value.f_str, NULL, 10);
+
+ if (cache_size_in_mb > FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB) {
+
+ ib::warn() << "FTS max cache size ("
+ << cache_size_in_mb << ") out of range."
+ " Minimum value is "
+ << FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB
+ << "MB and the maximum value is "
+ << FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB
+ << "MB, setting cache size to upper limit";
+
+ cache_size_in_mb = FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB;
+
+ } else if (cache_size_in_mb
+ < FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB) {
+
+ ib::warn() << "FTS max cache size ("
+ << cache_size_in_mb << ") out of range."
+ " Minimum value is "
+ << FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB
+ << "MB and the maximum value is "
+ << FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB
+ << "MB, setting cache size to lower limit";
+
+ cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+ }
+ } else {
+ ib::error() << "(" << error << ") reading max"
+ " cache config value from config table "
+ << fts_table->table->name;
+ }
+
+ ut_free(value.f_str);
+
+ return(cache_size_in_mb * 1024 * 1024);
+}
+#endif
+
+/*********************************************************************//**
+Get the next available document id.
+@return DB_SUCCESS if OK */
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t* doc_id) /*!< out: new document id */
+{
+ fts_cache_t* cache = table->fts->cache;
+
+ /* If the Doc ID system has not yet been initialized, we
+ will consult the CONFIG table and user table to re-establish
+ the initial value of the Doc ID */
+ if (cache->first_doc_id == FTS_NULL_DOC_ID) {
+ fts_init_doc_id(table);
+ }
+
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ *doc_id = FTS_NULL_DOC_ID;
+ return(DB_SUCCESS);
+ }
+
+ DEBUG_SYNC_C("get_next_FTS_DOC_ID");
+ mysql_mutex_lock(&cache->doc_id_lock);
+ *doc_id = cache->next_doc_id++;
+ mysql_mutex_unlock(&cache->doc_id_lock);
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+This function fetches the Doc ID from the CONFIG table and compares it
+with the supplied Doc ID; the larger of the two is stored back in the
+CONFIG table.
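+For example (illustrative numbers): if the CONFIG row holds 101, i.e.
+synced_doc_id 100, since the stored value is the actual value + 1, and
+cmp_doc_id is 150, then cache->synced_doc_id becomes 150 and 151 is
+written back via fts_update_sync_doc_id().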
+@return DB_SUCCESS if OK */ +static MY_ATTRIBUTE((nonnull)) +dberr_t +fts_cmp_set_sync_doc_id( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + doc_id_t cmp_doc_id, /*!< in: Doc ID to compare */ + ibool read_only, /*!< in: TRUE if read the + synced_doc_id only */ + doc_id_t* doc_id) /*!< out: larger document id + after comparing "cmp_doc_id" + to the one stored in CONFIG + table */ +{ + if (srv_read_only_mode) { + return DB_READ_ONLY; + } + + trx_t* trx; + pars_info_t* info; + dberr_t error; + fts_table_t fts_table; + que_t* graph = NULL; + fts_cache_t* cache = table->fts->cache; + char table_name[MAX_FULL_NAME_LEN]; + ut_a(table->fts->doc_col != ULINT_UNDEFINED); + + fts_table.suffix = "CONFIG"; + fts_table.table_id = table->id; + fts_table.type = FTS_COMMON_TABLE; + fts_table.table = table; + + trx= trx_create(); +retry: + trx_start_internal(trx); + + trx->op_info = "update the next FTS document id"; + + info = pars_info_create(); + + pars_info_bind_function( + info, "my_func", fts_fetch_store_doc_id, doc_id); + + fts_get_table_name(&fts_table, table_name); + pars_info_bind_id(info, "config_table", table_name); + + graph = fts_parse_sql( + &fts_table, info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS SELECT value FROM $config_table" + " WHERE key = 'synced_doc_id' FOR UPDATE;\n" + "BEGIN\n" + "" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + *doc_id = 0; + + error = fts_eval_sql(trx, graph); + + que_graph_free(graph); + + // FIXME: We need to retry deadlock errors + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (read_only) { + /* InnoDB stores actual synced_doc_id value + 1 in + FTS_CONFIG table. Reduce the value by 1 while reading + after startup. */ + if (*doc_id) *doc_id -= 1; + goto func_exit; + } + + if (cmp_doc_id == 0 && *doc_id) { + cache->synced_doc_id = *doc_id - 1; + } else { + cache->synced_doc_id = ut_max(cmp_doc_id, *doc_id); + } + + mysql_mutex_lock(&cache->doc_id_lock); + /* For each sync operation, we will add next_doc_id by 1, + so to mark a sync operation */ + if (cache->next_doc_id < cache->synced_doc_id + 1) { + cache->next_doc_id = cache->synced_doc_id + 1; + } + mysql_mutex_unlock(&cache->doc_id_lock); + + if (cmp_doc_id && cmp_doc_id >= *doc_id) { + error = fts_update_sync_doc_id( + table, cache->synced_doc_id, trx); + } + + *doc_id = cache->next_doc_id; + +func_exit: + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + } else { + *doc_id = 0; + + ib::error() << "(" << error << ") while getting next doc id " + "for table " << table->name; + fts_sql_rollback(trx); + + if (error == DB_DEADLOCK || error == DB_LOCK_WAIT_TIMEOUT) { + DEBUG_SYNC_C("fts_cmp_set_sync_doc_id_retry"); + std::this_thread::sleep_for(FTS_DEADLOCK_RETRY_WAIT); + goto retry; + } + } + + trx->free(); + + return(error); +} + +/** Update the last document id. This function could create a new +transaction to update the last document id. 
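+The update itself is the single internal SQL statement built below,
+with :doc_id bound to doc_id + 1:
+UPDATE $table_name SET value = :doc_id WHERE key = 'synced_doc_id';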
+@param table table to be updated +@param doc_id last document id +@param trx update trx or null +@retval DB_SUCCESS if OK */ +dberr_t +fts_update_sync_doc_id( + const dict_table_t* table, + doc_id_t doc_id, + trx_t* trx) +{ + byte id[FTS_MAX_ID_LEN]; + pars_info_t* info; + fts_table_t fts_table; + ulint id_len; + que_t* graph = NULL; + dberr_t error; + ibool local_trx = FALSE; + fts_cache_t* cache = table->fts->cache; + char fts_name[MAX_FULL_NAME_LEN]; + + if (srv_read_only_mode) { + return DB_READ_ONLY; + } + + fts_table.suffix = "CONFIG"; + fts_table.table_id = table->id; + fts_table.type = FTS_COMMON_TABLE; + fts_table.table = table; + + if (!trx) { + trx = trx_create(); + trx_start_internal(trx); + + trx->op_info = "setting last FTS document id"; + local_trx = TRUE; + } + + info = pars_info_create(); + + id_len = (ulint) snprintf( + (char*) id, sizeof(id), FTS_DOC_ID_FORMAT, doc_id + 1); + + pars_info_bind_varchar_literal(info, "doc_id", id, id_len); + + fts_get_table_name(&fts_table, fts_name, + table->fts->dict_locked); + pars_info_bind_id(info, "table_name", fts_name); + + graph = fts_parse_sql( + &fts_table, info, + "BEGIN" + " UPDATE $table_name SET value = :doc_id" + " WHERE key = 'synced_doc_id';"); + + error = fts_eval_sql(trx, graph); + + que_graph_free(graph); + + if (local_trx) { + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + cache->synced_doc_id = doc_id; + } else { + ib::error() << "(" << error << ") while" + " updating last doc id for table" + << table->name; + + fts_sql_rollback(trx); + } + trx->free(); + } + + return(error); +} + +/*********************************************************************//** +Create a new fts_doc_ids_t. +@return new fts_doc_ids_t */ +fts_doc_ids_t* +fts_doc_ids_create(void) +/*====================*/ +{ + fts_doc_ids_t* fts_doc_ids; + mem_heap_t* heap = mem_heap_create(512); + + fts_doc_ids = static_cast( + mem_heap_alloc(heap, sizeof(*fts_doc_ids))); + + fts_doc_ids->self_heap = ib_heap_allocator_create(heap); + + fts_doc_ids->doc_ids = static_cast(ib_vector_create( + fts_doc_ids->self_heap, sizeof(doc_id_t), 32)); + + return(fts_doc_ids); +} + +/*********************************************************************//** +Do commit-phase steps necessary for the insertion of a new row. */ +void +fts_add( +/*====*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + fts_trx_row_t* row) /*!< in: row */ +{ + dict_table_t* table = ftt->table; + doc_id_t doc_id = row->doc_id; + + ut_a(row->state == FTS_INSERT || row->state == FTS_MODIFY); + + fts_add_doc_by_id(ftt, doc_id); + + mysql_mutex_lock(&table->fts->cache->deleted_lock); + ++table->fts->cache->added; + mysql_mutex_unlock(&table->fts->cache->deleted_lock); + + if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + && doc_id >= table->fts->cache->next_doc_id) { + table->fts->cache->next_doc_id = doc_id + 1; + } +} + +/*********************************************************************//** +Do commit-phase steps necessary for the deletion of a row. 
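+In essence the doc id is recorded for a later purge by OPTIMIZE via:
+INSERT INTO $deleted VALUES (:doc_id);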
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_delete( +/*=======*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + fts_trx_row_t* row) /*!< in: row */ +{ + que_t* graph; + fts_table_t fts_table; + doc_id_t write_doc_id; + dict_table_t* table = ftt->table; + doc_id_t doc_id = row->doc_id; + trx_t* trx = ftt->fts_trx->trx; + pars_info_t* info = pars_info_create(); + fts_cache_t* cache = table->fts->cache; + + /* we do not index Documents whose Doc ID value is 0 */ + if (doc_id == FTS_NULL_DOC_ID) { + ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)); + return DB_SUCCESS; + } + + ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY); + + FTS_INIT_FTS_TABLE(&fts_table, "DELETED", FTS_COMMON_TABLE, table); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, doc_id); + fts_bind_doc_id(info, "doc_id", &write_doc_id); + + /* It is possible we update a record that has not yet been sync-ed + into cache from last crash (delete Doc will not initialize the + sync). Avoid any added counter accounting until the FTS cache + is re-established and sync-ed */ + if (table->fts->added_synced + && doc_id > cache->synced_doc_id) { + mysql_mutex_lock(&table->fts->cache->deleted_lock); + + /* The Doc ID could belong to those left in + ADDED table from last crash. So need to check + if it is less than first_doc_id when we initialize + the Doc ID system after reboot */ + if (doc_id >= table->fts->cache->first_doc_id + && table->fts->cache->added > 0) { + --table->fts->cache->added; + } + + mysql_mutex_unlock(&table->fts->cache->deleted_lock); + + /* Only if the row was really deleted. */ + ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY); + } + + /* Note the deleted document for OPTIMIZE to purge. */ + char table_name[MAX_FULL_NAME_LEN]; + + trx->op_info = "adding doc id to FTS DELETED"; + + fts_table.suffix = "DELETED"; + + fts_get_table_name(&fts_table, table_name); + pars_info_bind_id(info, "deleted", table_name); + + graph = fts_parse_sql(&fts_table, info, + "BEGIN INSERT INTO $deleted VALUES (:doc_id);"); + + dberr_t error = fts_eval_sql(trx, graph); + que_graph_free(graph); + + /* Increment the total deleted count, this is used to calculate the + number of documents indexed. */ + if (error == DB_SUCCESS) { + mysql_mutex_lock(&table->fts->cache->deleted_lock); + + ++table->fts->cache->deleted; + + mysql_mutex_unlock(&table->fts->cache->deleted_lock); + } + + return(error); +} + +/*********************************************************************//** +Do commit-phase steps necessary for the modification of a row. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_modify( +/*=======*/ + fts_trx_table_t* ftt, /*!< in: FTS trx table */ + fts_trx_row_t* row) /*!< in: row */ +{ + dberr_t error; + + ut_a(row->state == FTS_MODIFY); + + error = fts_delete(ftt, row); + + if (error == DB_SUCCESS) { + fts_add(ftt, row); + } + + return(error); +} + +/*********************************************************************//** +The given transaction is about to be committed; do whatever is necessary +from the FTS system's POV. 
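+Each row recorded in the rb-tree is dispatched on its final state:
+FTS_INSERT -> fts_add(), FTS_MODIFY -> fts_modify(),
+FTS_DELETE -> fts_delete().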
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_commit_table( +/*=============*/ + fts_trx_table_t* ftt) /*!< in: FTS table to commit*/ +{ + if (srv_read_only_mode) { + return DB_READ_ONLY; + } + + const ib_rbt_node_t* node; + ib_rbt_t* rows; + dberr_t error = DB_SUCCESS; + fts_cache_t* cache = ftt->table->fts->cache; + trx_t* trx = trx_create(); + + trx_start_internal(trx); + + rows = ftt->rows; + + ftt->fts_trx->trx = trx; + + if (cache->get_docs == NULL) { + mysql_mutex_lock(&cache->init_lock); + if (cache->get_docs == NULL) { + cache->get_docs = fts_get_docs_create(cache); + } + mysql_mutex_unlock(&cache->init_lock); + } + + for (node = rbt_first(rows); + node != NULL && error == DB_SUCCESS; + node = rbt_next(rows, node)) { + + fts_trx_row_t* row = rbt_value(fts_trx_row_t, node); + + switch (row->state) { + case FTS_INSERT: + fts_add(ftt, row); + break; + + case FTS_MODIFY: + error = fts_modify(ftt, row); + break; + + case FTS_DELETE: + error = fts_delete(ftt, row); + break; + + default: + ut_error; + } + } + + fts_sql_commit(trx); + + trx->free(); + + return(error); +} + +/*********************************************************************//** +The given transaction is about to be committed; do whatever is necessary +from the FTS system's POV. +@return DB_SUCCESS or error code */ +dberr_t +fts_commit( +/*=======*/ + trx_t* trx) /*!< in: transaction */ +{ + const ib_rbt_node_t* node; + dberr_t error; + ib_rbt_t* tables; + fts_savepoint_t* savepoint; + + savepoint = static_cast( + ib_vector_last(trx->fts_trx->savepoints)); + tables = savepoint->tables; + + for (node = rbt_first(tables), error = DB_SUCCESS; + node != NULL && error == DB_SUCCESS; + node = rbt_next(tables, node)) { + + fts_trx_table_t** ftt; + + ftt = rbt_value(fts_trx_table_t*, node); + + error = fts_commit_table(*ftt); + } + + return(error); +} + +/*********************************************************************//** +Initialize a document. */ +void +fts_doc_init( +/*=========*/ + fts_doc_t* doc) /*!< in: doc to initialize */ +{ + mem_heap_t* heap = mem_heap_create(32); + + memset(doc, 0, sizeof(*doc)); + + doc->self_heap = ib_heap_allocator_create(heap); +} + +/*********************************************************************//** +Free document. */ +void +fts_doc_free( +/*=========*/ + fts_doc_t* doc) /*!< in: document */ +{ + mem_heap_t* heap = static_cast(doc->self_heap->arg); + + if (doc->tokens) { + rbt_free(doc->tokens); + } + + ut_d(memset(doc, 0, sizeof(*doc))); + + mem_heap_free(heap); +} + +/*********************************************************************//** +Callback function for fetch that stores the text of an FTS document, +converting each column to UTF-16. 
+@return always FALSE */ +ibool +fts_query_expansion_fetch_doc( +/*==========================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ +{ + que_node_t* exp; + sel_node_t* node = static_cast(row); + fts_doc_t* result_doc = static_cast(user_arg); + dfield_t* dfield; + ulint len; + ulint doc_len; + fts_doc_t doc; + CHARSET_INFO* doc_charset = NULL; + ulint field_no = 0; + + len = 0; + + fts_doc_init(&doc); + doc.found = TRUE; + + exp = node->select_list; + doc_len = 0; + + doc_charset = result_doc->charset; + + /* Copy each indexed column content into doc->text.f_str */ + while (exp) { + dfield = que_node_get_val(exp); + len = dfield_get_len(dfield); + + /* NULL column */ + if (len == UNIV_SQL_NULL) { + exp = que_node_get_next(exp); + continue; + } + + if (!doc_charset) { + doc_charset = fts_get_charset(dfield->type.prtype); + } + + doc.charset = doc_charset; + + if (dfield_is_ext(dfield)) { + /* We ignore columns that are stored externally, this + could result in too many words to search */ + exp = que_node_get_next(exp); + continue; + } else { + doc.text.f_n_char = 0; + + doc.text.f_str = static_cast( + dfield_get_data(dfield)); + + doc.text.f_len = len; + } + + if (field_no == 0) { + fts_tokenize_document(&doc, result_doc, + result_doc->parser); + } else { + fts_tokenize_document_next(&doc, doc_len, result_doc, + result_doc->parser); + } + + exp = que_node_get_next(exp); + + doc_len += (exp) ? len + 1 : len; + + field_no++; + } + + ut_ad(doc_charset); + + if (!result_doc->charset) { + result_doc->charset = doc_charset; + } + + fts_doc_free(&doc); + + return(FALSE); +} + +/*********************************************************************//** +fetch and tokenize the document. */ +static +void +fts_fetch_doc_from_rec( +/*===================*/ + fts_get_doc_t* get_doc, /*!< in: FTS index's get_doc struct */ + dict_index_t* clust_index, /*!< in: cluster index */ + btr_pcur_t* pcur, /*!< in: cursor whose position + has been stored */ + rec_offs* offsets, /*!< in: offsets */ + fts_doc_t* doc) /*!< out: fts doc to hold parsed + documents */ +{ + dict_index_t* index; + const rec_t* clust_rec; + const dict_field_t* ifield; + ulint clust_pos; + ulint doc_len = 0; + st_mysql_ftparser* parser; + + if (!get_doc) { + return; + } + + index = get_doc->index_cache->index; + parser = get_doc->index_cache->index->parser; + + clust_rec = btr_pcur_get_rec(pcur); + ut_ad(!page_rec_is_comp(clust_rec) + || rec_get_status(clust_rec) == REC_STATUS_ORDINARY); + + for (ulint i = 0; i < index->n_fields; i++) { + ifield = dict_index_get_nth_field(index, i); + clust_pos = dict_col_get_clust_pos(ifield->col, clust_index); + + if (!get_doc->index_cache->charset) { + get_doc->index_cache->charset = fts_get_charset( + ifield->col->prtype); + } + + if (rec_offs_nth_extern(offsets, clust_pos)) { + doc->text.f_str = + btr_rec_copy_externally_stored_field( + clust_rec, offsets, + btr_pcur_get_block(pcur)->zip_size(), + clust_pos, &doc->text.f_len, + static_cast( + doc->self_heap->arg)); + } else { + doc->text.f_str = (byte*) rec_get_nth_field( + clust_rec, offsets, clust_pos, + &doc->text.f_len); + } + + doc->found = TRUE; + doc->charset = get_doc->index_cache->charset; + + /* Null Field */ + if (doc->text.f_len == UNIV_SQL_NULL || doc->text.f_len == 0) { + continue; + } + + if (!doc_len) { + fts_tokenize_document(doc, NULL, parser); + } else { + fts_tokenize_document_next(doc, doc_len, NULL, parser); + } + + doc_len += doc->text.f_len + 1; + } +} + +/** Fetch the data from tuple and 
tokenize the document.
+@param[in] get_doc FTS index's get_doc struct
+@param[in] tuple tuple should be arranged in table schema order
+@param[out] doc fts doc to hold parsed documents. */
+static
+void
+fts_fetch_doc_from_tuple(
+ fts_get_doc_t* get_doc,
+ const dtuple_t* tuple,
+ fts_doc_t* doc)
+{
+ dict_index_t* index;
+ st_mysql_ftparser* parser;
+ ulint doc_len = 0;
+ ulint processed_doc = 0;
+ ulint num_field;
+
+ if (get_doc == NULL) {
+ return;
+ }
+
+ index = get_doc->index_cache->index;
+ parser = get_doc->index_cache->index->parser;
+ num_field = dict_index_get_n_fields(index);
+
+ for (ulint i = 0; i < num_field; i++) {
+ const dict_field_t* ifield;
+ const dict_col_t* col;
+ ulint pos;
+
+ ifield = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ifield);
+ pos = dict_col_get_no(col);
+ const dfield_t* field = dtuple_get_nth_field(tuple, pos);
+
+ if (!get_doc->index_cache->charset) {
+ get_doc->index_cache->charset = fts_get_charset(
+ ifield->col->prtype);
+ }
+
+ ut_ad(!dfield_is_ext(field));
+
+ doc->text.f_str = (byte*) dfield_get_data(field);
+ doc->text.f_len = dfield_get_len(field);
+ doc->found = TRUE;
+ doc->charset = get_doc->index_cache->charset;
+
+ /* The field data is NULL. */
+ if (doc->text.f_len == UNIV_SQL_NULL || doc->text.f_len == 0) {
+ continue;
+ }
+
+ if (processed_doc == 0) {
+ fts_tokenize_document(doc, NULL, parser);
+ } else {
+ fts_tokenize_document_next(doc, doc_len, NULL, parser);
+ }
+
+ processed_doc++;
+ doc_len += doc->text.f_len + 1;
+ }
+}
+
+/** Fetch the document from a tuple, tokenize the text data and
+insert the text data into the FTS auxiliary table and its cache.
+Note that the tuple fields do not carry any information about
+externally stored fields; the tuple contains data converted
+directly from MySQL.
+@param[in] ftt FTS transaction table
+@param[in] doc_id doc id
+@param[in] tuple tuple from which data can be retrieved;
+ it must be arranged in table schema order. */
+void
+fts_add_doc_from_tuple(
+ fts_trx_table_t*ftt,
+ doc_id_t doc_id,
+ const dtuple_t* tuple)
+{
+ mtr_t mtr;
+ fts_cache_t* cache = ftt->table->fts->cache;
+
+ ut_ad(cache->get_docs);
+
+ if (!ftt->table->fts->added_synced) {
+ fts_init_index(ftt->table, FALSE);
+ }
+
+ mtr_start(&mtr);
+
+ ulint num_idx = ib_vector_size(cache->get_docs);
+
+ for (ulint i = 0; i < num_idx; ++i) {
+ fts_doc_t doc;
+ dict_table_t* table;
+ fts_get_doc_t* get_doc;
+
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(cache->get_docs, i));
+ table = get_doc->index_cache->index->table;
+
+ fts_doc_init(&doc);
+ fts_fetch_doc_from_tuple(
+ get_doc, tuple, &doc);
+
+ if (doc.found) {
+ mtr_commit(&mtr);
+ mysql_mutex_lock(&table->fts->cache->lock);
+
+ if (table->fts->cache->stopword_info.status
+ & STOPWORD_NOT_INIT) {
+ fts_load_stopword(table, NULL, NULL,
+ true, true);
+ }
+
+ fts_cache_add_doc(
+ table->fts->cache,
+ get_doc->index_cache,
+ doc_id, doc.tokens);
+
+ mysql_mutex_unlock(&table->fts->cache->lock);
+
+ if (cache->total_size > fts_max_cache_size / 5
+ || fts_need_sync) {
+ fts_sync(cache->sync, true, false);
+ }
+
+ mtr_start(&mtr);
+
+ }
+
+ fts_doc_free(&doc);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/*********************************************************************//**
+This function fetches the document inserted during the committing
+transaction, tokenizes the inserted text data and inserts it into the
+FTS auxiliary table and its cache.
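+As a rough note on the sync triggers seen below: a sync is requested
+once the cache has grown by more than fts_max_cache_size / 10 since
+the last sync, or when fts_need_sync is set.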
*/ +static +void +fts_add_doc_by_id( +/*==============*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + doc_id_t doc_id) /*!< in: doc id */ +{ + mtr_t mtr; + mem_heap_t* heap; + btr_pcur_t pcur; + dict_table_t* table; + dtuple_t* tuple; + dfield_t* dfield; + fts_get_doc_t* get_doc; + doc_id_t temp_doc_id; + dict_index_t* clust_index; + dict_index_t* fts_id_index; + ibool is_id_cluster; + fts_cache_t* cache = ftt->table->fts->cache; + + ut_ad(cache->get_docs); + + /* If Doc ID has been supplied by the user, then the table + might not yet be sync-ed */ + + if (!ftt->table->fts->added_synced) { + fts_init_index(ftt->table, FALSE); + } + + /* Get the first FTS index's get_doc */ + get_doc = static_cast( + ib_vector_get(cache->get_docs, 0)); + ut_ad(get_doc); + + table = get_doc->index_cache->index->table; + + heap = mem_heap_create(512); + + clust_index = dict_table_get_first_index(table); + fts_id_index = table->fts_doc_id_index; + + /* Check whether the index on FTS_DOC_ID is cluster index */ + is_id_cluster = (clust_index == fts_id_index); + + mtr_start(&mtr); + + /* Search based on Doc ID. Here, we'll need to consider the case + when there is no primary index on Doc ID */ + const ulint n_uniq = table->fts_n_uniq(); + tuple = dtuple_create(heap, n_uniq); + dfield = dtuple_get_nth_field(tuple, 0); + dfield->type.mtype = DATA_INT; + dfield->type.prtype = DATA_NOT_NULL | DATA_UNSIGNED | DATA_BINARY_TYPE; + + mach_write_to_8((byte*) &temp_doc_id, doc_id); + dfield_set_data(dfield, &temp_doc_id, sizeof(temp_doc_id)); + pcur.btr_cur.page_cur.index = fts_id_index; + + if (n_uniq == 2) { + ut_ad(table->versioned()); + ut_ad(fts_id_index->fields[1].col->vers_sys_end()); + dfield = dtuple_get_nth_field(tuple, 1); + dfield->type.mtype = fts_id_index->fields[1].col->mtype; + dfield->type.prtype = fts_id_index->fields[1].col->prtype; + if (table->versioned_by_id()) { + dfield_set_data(dfield, trx_id_max_bytes, + sizeof(trx_id_max_bytes)); + } else { + dfield_set_data(dfield, timestamp_max_bytes, + sizeof(timestamp_max_bytes)); + } + } + + /* If we have a match, add the data to doc structure */ + if (btr_pcur_open_with_no_init(tuple, PAGE_CUR_LE, + BTR_SEARCH_LEAF, &pcur, &mtr) + == DB_SUCCESS + && btr_pcur_get_low_match(&pcur) == n_uniq) { + const rec_t* rec; + btr_pcur_t* doc_pcur; + const rec_t* clust_rec; + btr_pcur_t clust_pcur; + rec_offs* offsets = NULL; + ulint num_idx = ib_vector_size(cache->get_docs); + + rec = btr_pcur_get_rec(&pcur); + + /* Doc could be deleted */ + if (page_rec_is_infimum(rec) + || rec_get_deleted_flag(rec, dict_table_is_comp(table))) { + + goto func_exit; + } + + if (is_id_cluster) { + clust_rec = rec; + doc_pcur = &pcur; + } else { + dtuple_t* clust_ref; + ulint n_fields; + + n_fields = dict_index_get_n_unique(clust_index); + + clust_ref = dtuple_create(heap, n_fields); + dict_index_copy_types(clust_ref, clust_index, n_fields); + + row_build_row_ref_in_tuple( + clust_ref, rec, fts_id_index, NULL); + clust_pcur.btr_cur.page_cur.index = clust_index; + + if (btr_pcur_open_with_no_init(clust_ref, + PAGE_CUR_LE, + BTR_SEARCH_LEAF, + &clust_pcur, &mtr) + != DB_SUCCESS) { + goto func_exit; + } + + doc_pcur = &clust_pcur; + clust_rec = btr_pcur_get_rec(&clust_pcur); + } + + offsets = rec_get_offsets(clust_rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + for (ulint i = 0; i < num_idx; ++i) { + fts_doc_t doc; + dict_table_t* table; + fts_get_doc_t* get_doc; + + get_doc = static_cast( + ib_vector_get(cache->get_docs, i)); + + table = 
get_doc->index_cache->index->table; + + fts_doc_init(&doc); + + fts_fetch_doc_from_rec( + get_doc, clust_index, doc_pcur, offsets, &doc); + + if (doc.found) { + + btr_pcur_store_position(doc_pcur, &mtr); + mtr_commit(&mtr); + + mysql_mutex_lock(&table->fts->cache->lock); + + if (table->fts->cache->stopword_info.status + & STOPWORD_NOT_INIT) { + fts_load_stopword(table, NULL, + NULL, true, true); + } + + fts_cache_add_doc( + table->fts->cache, + get_doc->index_cache, + doc_id, doc.tokens); + + bool need_sync = !cache->sync->in_progress + && (fts_need_sync + || (cache->total_size + - cache->total_size_at_sync) + > fts_max_cache_size / 10); + if (need_sync) { + cache->total_size_at_sync = + cache->total_size; + } + + mysql_mutex_unlock(&table->fts->cache->lock); + + DBUG_EXECUTE_IF( + "fts_instrument_sync", + fts_optimize_request_sync_table(table); + mysql_mutex_lock(&cache->lock); + if (cache->sync->in_progress) + my_cond_wait( + &cache->sync->cond, + &cache->lock.m_mutex); + mysql_mutex_unlock(&cache->lock); + ); + + DBUG_EXECUTE_IF( + "fts_instrument_sync_debug", + fts_sync(cache->sync, true, true); + ); + + DEBUG_SYNC_C("fts_instrument_sync_request"); + DBUG_EXECUTE_IF( + "fts_instrument_sync_request", + fts_optimize_request_sync_table(table); + ); + + if (need_sync) { + fts_optimize_request_sync_table(table); + } + + mtr_start(&mtr); + + if (i < num_idx - 1) { + if (doc_pcur->restore_position( + BTR_SEARCH_LEAF, &mtr) + != btr_pcur_t::SAME_ALL) { + ut_ad("invalid state" == 0); + i = num_idx - 1; + } + } + } + + fts_doc_free(&doc); + } + + if (!is_id_cluster) { + ut_free(doc_pcur->old_rec_buf); + } + } +func_exit: + mtr_commit(&mtr); + + ut_free(pcur.old_rec_buf); + + mem_heap_free(heap); +} + + +/*********************************************************************//** +Callback function to read a single ulint column. +return always returns TRUE */ +static +ibool +fts_read_ulint( +/*===========*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ulint */ +{ + sel_node_t* sel_node = static_cast(row); + ulint* value = static_cast(user_arg); + que_node_t* exp = sel_node->select_list; + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + + *value = mach_read_from_4(static_cast(data)); + + return(TRUE); +} + +/*********************************************************************//** +Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists +@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */ +doc_id_t +fts_get_max_doc_id( +/*===============*/ + dict_table_t* table) /*!< in: user table */ +{ + dict_index_t* index; + dict_field_t* dfield MY_ATTRIBUTE((unused)) = NULL; + doc_id_t doc_id = 0; + mtr_t mtr; + btr_pcur_t pcur; + + index = table->fts_doc_id_index; + + if (!index) { + return(0); + } + + ut_ad(!index->is_instant()); + + dfield = dict_index_get_nth_field(index, 0); + +#if 0 /* This can fail when renaming a column to FTS_DOC_ID_COL_NAME. 
*/
+ ut_ad(innobase_strcasecmp(FTS_DOC_ID_COL_NAME, dfield->name) == 0);
+#endif
+
+ mtr.start();
+
+ /* fetch the largest value in the index */
+ if (pcur.open_leaf(false, index, BTR_SEARCH_LEAF, &mtr) == DB_SUCCESS
+ && !page_is_empty(btr_pcur_get_page(&pcur))) {
+ const rec_t* rec = NULL;
+ constexpr ulint doc_id_len= 8;
+
+ do {
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!page_rec_is_user_rec(rec)) {
+ continue;
+ }
+
+ if (index->n_uniq == 1) {
+ break;
+ }
+
+ ut_ad(table->versioned());
+ ut_ad(index->n_uniq == 2);
+
+ const byte *data = rec + doc_id_len;
+ if (table->versioned_by_id()) {
+ if (0 == memcmp(data, trx_id_max_bytes,
+ sizeof trx_id_max_bytes)) {
+ break;
+ }
+ } else {
+ if (0 == memcmp(data, timestamp_max_bytes,
+ sizeof timestamp_max_bytes)) {
+ break;
+ }
+ }
+ } while (btr_pcur_move_to_prev(&pcur, &mtr));
+
+ if (!rec || rec_is_metadata(rec, *index)) {
+ goto func_exit;
+ }
+
+ doc_id = fts_read_doc_id(rec);
+ }
+
+func_exit:
+ mtr.commit();
+ return(doc_id);
+}
+
+/*********************************************************************//**
+Fetch document with the given document id.
+@return DB_SUCCESS if OK else error */
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+ fts_get_doc_t* get_doc, /*!< in: state */
+ doc_id_t doc_id, /*!< in: id of document to
+ fetch */
+ dict_index_t* index_to_use, /*!< in: caller supplied FTS index,
+ or NULL */
+ ulint option, /*!< in: search option: equal to
+ doc_id, or greater than it */
+ fts_sql_callback
+ callback, /*!< in: callback to read */
+ void* arg) /*!< in: callback arg */
+{
+ pars_info_t* info;
+ dberr_t error;
+ const char* select_str;
+ doc_id_t write_doc_id;
+ dict_index_t* index;
+ trx_t* trx = trx_create();
+ que_t* graph;
+
+ trx->op_info = "fetching indexed FTS document";
+
+ /* The FTS index can be supplied by the caller directly with
+ "index_to_use", otherwise, get it from "get_doc" */
+ index = (index_to_use) ? index_to_use : get_doc->index_cache->index;
+
+ if (get_doc && get_doc->get_document_graph) {
+ info = get_doc->get_document_graph->info;
+ } else {
+ info = pars_info_create();
+ }
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, doc_id);
+ fts_bind_doc_id(info, "doc_id", &write_doc_id);
+ pars_info_bind_function(info, "my_func", callback, arg);
+
+ select_str = fts_get_select_columns_str(index, info, info->heap);
+ pars_info_bind_id(info, "table_name", index->table->name.m_name);
+
+ if (!get_doc || !get_doc->get_document_graph) {
+ if (option == FTS_FETCH_DOC_BY_ID_EQUAL) {
+ graph = fts_parse_sql(
+ NULL,
+ info,
+ mem_heap_printf(info->heap,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT %s FROM $table_name"
+ " WHERE %s = :doc_id;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c %% NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;",
+ select_str, FTS_DOC_ID_COL_NAME));
+ } else {
+ ut_ad(option == FTS_FETCH_DOC_BY_ID_LARGE);
+
+ /* This is used for crash recovery of a table with
+ a hidden DOC ID or FTS indexes. We will scan the table
+ to re-process user table rows whose DOC ID or
+ FTS indexed documents have not been sync-ed to disk
+ during the recent crash.
+ In the case that all fulltext indexes are dropped
+ for a table, we will keep the "hidden" FTS_DOC_ID
+ column, and this scan is to retrieve the largest
+ DOC ID being used in the table to determine the
+ appropriate next DOC ID.
+ Where fulltext indexes exist, this
+ operation will re-tokenize any docs that have not
+ been sync-ed to disk, and re-prime the FTS
+ cache */
+ graph = fts_parse_sql(
+ NULL,
+ info,
+ mem_heap_printf(info->heap,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT %s, %s FROM $table_name"
+ " WHERE %s > :doc_id;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c %% NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;",
+ FTS_DOC_ID_COL_NAME,
+ select_str, FTS_DOC_ID_COL_NAME));
+ }
+ if (get_doc) {
+ get_doc->get_document_graph = graph;
+ }
+ } else {
+ graph = get_doc->get_document_graph;
+ }
+
+ error = fts_eval_sql(trx, graph);
+ fts_sql_commit(trx);
+ trx->free();
+
+ if (!get_doc) {
+ que_graph_free(graph);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+dberr_t
+fts_write_node(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: query graph */
+ fts_table_t* fts_table, /*!< in: aux table */
+ fts_string_t* word, /*!< in: word in UTF-8 */
+ fts_node_t* node) /*!< in: node columns */
+{
+ pars_info_t* info;
+ dberr_t error;
+ ib_uint32_t doc_count;
+ time_t start_time;
+ doc_id_t last_doc_id;
+ doc_id_t first_doc_id;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ ut_a(node->ilist != NULL);
+
+ if (*graph) {
+ info = (*graph)->info;
+ } else {
+ info = pars_info_create();
+
+ fts_get_table_name(fts_table, table_name);
+ pars_info_bind_id(info, "index_table_name", table_name);
+ }
+
+ pars_info_bind_varchar_literal(info, "token", word->f_str, word->f_len);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &first_doc_id, node->first_doc_id);
+ fts_bind_doc_id(info, "first_doc_id", &first_doc_id);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &last_doc_id, node->last_doc_id);
+ fts_bind_doc_id(info, "last_doc_id", &last_doc_id);
+
+ ut_a(node->last_doc_id >= node->first_doc_id);
+
+ /* Convert to "storage" byte order. */
+ mach_write_to_4((byte*) &doc_count, node->doc_count);
+ pars_info_bind_int4_literal(
+ info, "doc_count", (const ib_uint32_t*) &doc_count);
+
+ /* Set copy_name to FALSE since it's a static. */
+ pars_info_bind_literal(
+ info, "ilist", node->ilist, node->ilist_size,
+ DATA_BLOB, DATA_BINARY_TYPE);
+
+ if (!*graph) {
+
+ *graph = fts_parse_sql(
+ fts_table,
+ info,
+ "BEGIN\n"
+ "INSERT INTO $index_table_name VALUES"
+ " (:token, :first_doc_id,"
+ " :last_doc_id, :doc_count, :ilist);");
+ }
+
+ start_time = time(NULL);
+ error = fts_eval_sql(trx, *graph);
+ elapsed_time += time(NULL) - start_time;
+ ++n_nodes;
+
+ return(error);
+}
+
+/*********************************************************************//**
+Add rows to the DELETED_CACHE table.
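+The doc ids are sorted first and then inserted one at a time with the
+statement built below:
+INSERT INTO $table_name VALUES (:doc_id);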
+@return DB_SUCCESS if all went well else error code*/ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_sync_add_deleted_cache( +/*=======================*/ + fts_sync_t* sync, /*!< in: sync state */ + ib_vector_t* doc_ids) /*!< in: doc ids to add */ +{ + ulint i; + pars_info_t* info; + que_t* graph; + fts_table_t fts_table; + char table_name[MAX_FULL_NAME_LEN]; + doc_id_t dummy = 0; + dberr_t error = DB_SUCCESS; + ulint n_elems = ib_vector_size(doc_ids); + + ut_a(ib_vector_size(doc_ids) > 0); + + ib_vector_sort(doc_ids, fts_doc_id_cmp); + + info = pars_info_create(); + + fts_bind_doc_id(info, "doc_id", &dummy); + + FTS_INIT_FTS_TABLE( + &fts_table, "DELETED_CACHE", FTS_COMMON_TABLE, sync->table); + + fts_get_table_name(&fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + &fts_table, + info, + "BEGIN INSERT INTO $table_name VALUES (:doc_id);"); + + for (i = 0; i < n_elems && error == DB_SUCCESS; ++i) { + doc_id_t* update; + doc_id_t write_doc_id; + + update = static_cast(ib_vector_get(doc_ids, i)); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, *update); + fts_bind_doc_id(info, "doc_id", &write_doc_id); + + error = fts_eval_sql(sync->trx, graph); + } + + que_graph_free(graph); + + return(error); +} + +/** Write the words and ilist to disk. +@param[in,out] trx transaction +@param[in] index_cache index cache +@param[in] unlock_cache whether unlock cache when write node +@return DB_SUCCESS if all went well else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_sync_write_words( + trx_t* trx, + fts_index_cache_t* index_cache, + bool unlock_cache) +{ + fts_table_t fts_table; + ulint n_nodes = 0; + ulint n_words = 0; + const ib_rbt_node_t* rbt_node; + dberr_t error = DB_SUCCESS; + ibool print_error = FALSE; + dict_table_t* table = index_cache->index->table; + + FTS_INIT_INDEX_TABLE( + &fts_table, NULL, FTS_INDEX_TABLE, index_cache->index); + + n_words = rbt_size(index_cache->words); + + /* We iterate over the entire tree, even if there is an error, + since we want to free the memory used during caching. */ + for (rbt_node = rbt_first(index_cache->words); + rbt_node; + rbt_node = rbt_next(index_cache->words, rbt_node)) { + + ulint i; + ulint selected; + fts_tokenizer_word_t* word; + + word = rbt_value(fts_tokenizer_word_t, rbt_node); + + DBUG_EXECUTE_IF( + "fts_instrument_write_words_before_select_index", + std::this_thread::sleep_for( + std::chrono::milliseconds(300));); + + selected = fts_select_index( + index_cache->charset, word->text.f_str, + word->text.f_len); + + fts_table.suffix = fts_get_suffix(selected); + + /* We iterate over all the nodes even if there was an error */ + for (i = 0; i < ib_vector_size(word->nodes); ++i) { + + fts_node_t* fts_node = static_cast( + ib_vector_get(word->nodes, i)); + + if (fts_node->synced) { + continue; + } else { + fts_node->synced = true; + } + + /*FIXME: we need to handle the error properly. 
*/ + if (error == DB_SUCCESS) { + if (unlock_cache) { + mysql_mutex_unlock( + &table->fts->cache->lock); + } + + error = fts_write_node( + trx, + &index_cache->ins_graph[selected], + &fts_table, &word->text, fts_node); + + DEBUG_SYNC_C("fts_write_node"); + DBUG_EXECUTE_IF("fts_write_node_crash", + DBUG_SUICIDE();); + + DBUG_EXECUTE_IF( + "fts_instrument_sync_sleep", + std::this_thread::sleep_for( + std::chrono::seconds(1));); + + if (unlock_cache) { + mysql_mutex_lock( + &table->fts->cache->lock); + } + } + } + + n_nodes += ib_vector_size(word->nodes); + + if (UNIV_UNLIKELY(error != DB_SUCCESS) && !print_error) { + ib::error() << "(" << error << ") writing" + " word node to FTS auxiliary index table " + << table->name; + print_error = TRUE; + } + } + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + printf("Avg number of nodes: %lf\n", + (double) n_nodes / (double) (n_words > 1 ? n_words : 1)); + } + + return(error); +} + +/*********************************************************************//** +Begin Sync, create transaction, acquire locks, etc. */ +static +void +fts_sync_begin( +/*===========*/ + fts_sync_t* sync) /*!< in: sync state */ +{ + fts_cache_t* cache = sync->table->fts->cache; + + n_nodes = 0; + elapsed_time = 0; + + sync->start_time = time(NULL); + + sync->trx = trx_create(); + trx_start_internal(sync->trx); + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "FTS SYNC for table " << sync->table->name + << ", deleted count: " + << ib_vector_size(cache->deleted_doc_ids) + << " size: " << ib::bytes_iec{cache->total_size}; + } +} + +/*********************************************************************//** +Run SYNC on the table, i.e., write out data from the index specific +cache to the FTS aux INDEX table and FTS aux doc id stats table. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_sync_index( +/*===========*/ + fts_sync_t* sync, /*!< in: sync state */ + fts_index_cache_t* index_cache) /*!< in: index cache */ +{ + trx_t* trx = sync->trx; + + trx->op_info = "doing SYNC index"; + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "SYNC words: " << rbt_size(index_cache->words); + } + + ut_ad(rbt_validate(index_cache->words)); + + return(fts_sync_write_words(trx, index_cache, sync->unlock_cache)); +} + +/** Check if index cache has been synced completely +@param[in,out] index_cache index cache +@return true if index is synced, otherwise false. */ +static +bool +fts_sync_index_check( + fts_index_cache_t* index_cache) +{ + const ib_rbt_node_t* rbt_node; + + for (rbt_node = rbt_first(index_cache->words); + rbt_node != NULL; + rbt_node = rbt_next(index_cache->words, rbt_node)) { + + fts_tokenizer_word_t* word; + word = rbt_value(fts_tokenizer_word_t, rbt_node); + + fts_node_t* fts_node; + fts_node = static_cast(ib_vector_last(word->nodes)); + + if (!fts_node->synced) { + return(false); + } + } + + return(true); +} + +/** Reset synced flag in index cache when rollback +@param[in,out] index_cache index cache */ +static +void +fts_sync_index_reset( + fts_index_cache_t* index_cache) +{ + const ib_rbt_node_t* rbt_node; + + for (rbt_node = rbt_first(index_cache->words); + rbt_node != NULL; + rbt_node = rbt_next(index_cache->words, rbt_node)) { + + fts_tokenizer_word_t* word; + word = rbt_value(fts_tokenizer_word_t, rbt_node); + + fts_node_t* fts_node; + fts_node = static_cast(ib_vector_last(word->nodes)); + + fts_node->synced = false; + } +} + +/** Commit the SYNC, change state of processed doc ids etc. 
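+In outline: persist the max synced doc id to the CONFIG table, move
+the pending deleted doc ids into DELETED_CACHE, then clear and
+re-initialize the in-memory cache.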
+@param[in,out] sync sync state +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_sync_commit( + fts_sync_t* sync) +{ + dberr_t error; + trx_t* trx = sync->trx; + fts_cache_t* cache = sync->table->fts->cache; + doc_id_t last_doc_id; + + trx->op_info = "doing SYNC commit"; + + /* After each Sync, update the CONFIG table about the max doc id + we just sync-ed to index table */ + error = fts_cmp_set_sync_doc_id(sync->table, sync->max_doc_id, FALSE, + &last_doc_id); + + /* Get the list of deleted documents that are either in the + cache or were headed there but were deleted before the add + thread got to them. */ + + if (error == DB_SUCCESS && ib_vector_size(cache->deleted_doc_ids) > 0) { + + error = fts_sync_add_deleted_cache( + sync, cache->deleted_doc_ids); + } + + /* We need to do this within the deleted lock since fts_delete() can + attempt to add a deleted doc id to the cache deleted id array. */ + fts_cache_clear(cache); + DEBUG_SYNC_C("fts_deleted_doc_ids_clear"); + fts_cache_init(cache); + mysql_mutex_unlock(&cache->lock); + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + } else { + fts_sql_rollback(trx); + ib::error() << "(" << error << ") during SYNC of " + "table " << sync->table->name; + } + + if (UNIV_UNLIKELY(fts_enable_diag_print) && elapsed_time) { + ib::info() << "SYNC for table " << sync->table->name + << ": SYNC time: " + << (time(NULL) - sync->start_time) + << " secs: elapsed " + << static_cast(n_nodes) + / static_cast(elapsed_time) + << " ins/sec"; + } + + /* Avoid assertion in trx_t::free(). */ + trx->dict_operation_lock_mode = false; + trx->free(); + + return(error); +} + +/** Rollback a sync operation +@param[in,out] sync sync state */ +static +void +fts_sync_rollback( + fts_sync_t* sync) +{ + trx_t* trx = sync->trx; + fts_cache_t* cache = sync->table->fts->cache; + + for (ulint i = 0; i < ib_vector_size(cache->indexes); ++i) { + ulint j; + fts_index_cache_t* index_cache; + + index_cache = static_cast( + ib_vector_get(cache->indexes, i)); + + /* Reset synced flag so nodes will not be skipped + in the next sync, see fts_sync_write_words(). */ + fts_sync_index_reset(index_cache); + + for (j = 0; fts_index_selector[j].value; ++j) { + + if (index_cache->ins_graph[j] != NULL) { + + que_graph_free(index_cache->ins_graph[j]); + + index_cache->ins_graph[j] = NULL; + } + + if (index_cache->sel_graph[j] != NULL) { + + que_graph_free(index_cache->sel_graph[j]); + + index_cache->sel_graph[j] = NULL; + } + } + } + + mysql_mutex_unlock(&cache->lock); + + fts_sql_rollback(trx); + + /* Avoid assertion in trx_t::free(). */ + trx->dict_operation_lock_mode = false; + trx->free(); +} + +/** Run SYNC on the table, i.e., write out data from the cache to the +FTS auxiliary INDEX table and clear the cache at the end. +@param[in,out] sync sync state +@param[in] unlock_cache whether unlock cache lock when write node +@param[in] wait whether wait when a sync is in progress +@return DB_SUCCESS if all OK */ +static +dberr_t +fts_sync( + fts_sync_t* sync, + bool unlock_cache, + bool wait) +{ + if (srv_read_only_mode) { + return DB_READ_ONLY; + } + + ulint i; + dberr_t error = DB_SUCCESS; + fts_cache_t* cache = sync->table->fts->cache; + + mysql_mutex_lock(&cache->lock); + + if (cache->total_size == 0) { + mysql_mutex_unlock(&cache->lock); + return DB_SUCCESS; + } + + /* Check if cache is being synced. + Note: we release cache lock in fts_sync_write_words() to + avoid long wait for the lock by other threads. 
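+If another sync is already in progress the caller either returns
+immediately (wait == false) or blocks on sync->cond until that sync
+finishes.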
+	*/
+	if (sync->in_progress) {
+		if (!wait) {
+			mysql_mutex_unlock(&cache->lock);
+			return(DB_SUCCESS);
+		}
+		do {
+			my_cond_wait(&sync->cond, &cache->lock.m_mutex);
+		} while (sync->in_progress);
+	}
+
+	sync->unlock_cache = unlock_cache;
+	sync->in_progress = true;
+
+	DEBUG_SYNC_C("fts_sync_begin");
+	fts_sync_begin(sync);
+
+begin_sync:
+	const size_t fts_cache_size= fts_max_cache_size;
+	if (cache->total_size > fts_cache_size) {
+		/* Avoid the case where the sync never finishes
+		when insert/update keeps coming. */
+		ut_ad(sync->unlock_cache);
+		sync->unlock_cache = false;
+		ib::warn() << "Total InnoDB FTS size "
+			<< cache->total_size << " for the table "
+			<< cache->sync->table->name
+			<< " exceeds the innodb_ft_cache_size "
+			<< fts_cache_size;
+	}
+
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		if (index_cache->index->to_be_dropped) {
+			continue;
+		}
+
+		DBUG_EXECUTE_IF("fts_instrument_sync_before_syncing",
+				std::this_thread::sleep_for(
+					std::chrono::milliseconds(300)););
+		error = fts_sync_index(sync, index_cache);
+
+		if (error != DB_SUCCESS) {
+			goto end_sync;
+		}
+
+		if (!sync->unlock_cache
+		    && cache->total_size < fts_max_cache_size) {
+			/* Re-enable unlocking of the cache once its
+			total size drops below innodb_ft_cache_size */
+			sync->unlock_cache = true;
+		}
+	}
+
+	DBUG_EXECUTE_IF("fts_instrument_sync_interrupted",
+			sync->interrupted = true;
+			error = DB_INTERRUPTED;
+			goto end_sync;
+	);
+
+	/* Make sure all the caches are synced. */
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		if (index_cache->index->to_be_dropped
+		    || fts_sync_index_check(index_cache)) {
+			continue;
+		}
+
+		goto begin_sync;
+	}
+
+end_sync:
+	if (error == DB_SUCCESS && !sync->interrupted) {
+		error = fts_sync_commit(sync);
+	} else {
+		fts_sync_rollback(sync);
+	}
+
+	mysql_mutex_lock(&cache->lock);
+	ut_ad(sync->in_progress);
+	sync->interrupted = false;
+	sync->in_progress = false;
+	pthread_cond_broadcast(&sync->cond);
+	mysql_mutex_unlock(&cache->lock);
+
+	/* We need to check whether an optimize is required, for that
+	we make copies of the two variables that control the trigger. These
+	variables can change behind our back and we don't want to hold the
+	lock for longer than is needed. */
+	mysql_mutex_lock(&cache->deleted_lock);
+
+	cache->added = 0;
+	cache->deleted = 0;
+
+	mysql_mutex_unlock(&cache->deleted_lock);
+
+	return(error);
+}
+
+/** Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@param[in,out]	table	fts table
+@param[in]	wait	whether to wait for an existing sync to finish
+@return DB_SUCCESS on success, error code on failure. */
+dberr_t fts_sync_table(dict_table_t* table, bool wait)
+{
+	ut_ad(table->fts);
+
+	return table->space && !table->corrupted && table->fts->cache
+		? fts_sync(table->fts->cache->sync, !wait, wait)
+		: DB_SUCCESS;
+}
+
+/** Check if an FTS token is a stopword, shorter than fts_min_token_size,
+or longer than fts_max_token_size.
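+A usage sketch (hypothetical arguments; fts_add_token() below calls this
+with stopwords == NULL):
+@code
+	if (fts_check_token(&str, cache->stopword_info.cached_stopword,
+			    doc->charset)) {
+		// token is within the size limits and not a stopword
+	}
+@endcode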
+@param[in]	token		token string
+@param[in]	stopwords	stopwords rb tree
+@param[in]	cs		token charset
+@retval	true	if it is not a stopword and its length is in range
+@retval	false	if it is a stopword or its length is out of range */
+bool
+fts_check_token(
+	const fts_string_t*	token,
+	const ib_rbt_t*		stopwords,
+	const CHARSET_INFO*	cs)
+{
+	ut_ad(cs != NULL || stopwords == NULL);
+
+	ib_rbt_bound_t	parent;
+
+	return(token->f_n_char >= fts_min_token_size
+	       && token->f_n_char <= fts_max_token_size
+	       && (stopwords == NULL
+		   || rbt_search(stopwords, &parent, token) != 0));
+}
+
+/** Add the token and its start position to the token's list of positions.
+@param[in,out]	result_doc	result doc rb tree
+@param[in]	str		token string
+@param[in]	position	token position */
+static
+void
+fts_add_token(
+	fts_doc_t*	result_doc,
+	fts_string_t	str,
+	ulint		position)
+{
+	/* Ignore strings whose character count is less than
+	"fts_min_token_size" or greater than "fts_max_token_size" */
+
+	if (fts_check_token(&str, NULL, result_doc->charset)) {
+
+		mem_heap_t*	heap;
+		fts_string_t	t_str;
+		fts_token_t*	token;
+		ib_rbt_bound_t	parent;
+		ulint		newlen;
+
+		heap = static_cast<mem_heap_t*>(result_doc->self_heap->arg);
+
+		t_str.f_n_char = str.f_n_char;
+
+		t_str.f_len = str.f_len * result_doc->charset->casedn_multiply() + 1;
+
+		t_str.f_str = static_cast<byte*>(
+			mem_heap_alloc(heap, t_str.f_len));
+
+		/* For binary collations, a case sensitive search is
+		performed. Hence don't convert to lower case. */
+		if (my_binary_compare(result_doc->charset)) {
+			memcpy(t_str.f_str, str.f_str, str.f_len);
+			t_str.f_str[str.f_len]= 0;
+			newlen= str.f_len;
+		} else {
+			newlen = innobase_fts_casedn_str(
+				result_doc->charset, (char*) str.f_str, str.f_len,
+				(char*) t_str.f_str, t_str.f_len);
+		}
+
+		t_str.f_len = newlen;
+		t_str.f_str[newlen] = 0;
+
+		/* Add the word to the document statistics. If the word
+		hasn't been seen before we create a new entry for it. */
+		if (rbt_search(result_doc->tokens, &parent, &t_str) != 0) {
+			fts_token_t	new_token;
+
+			new_token.text.f_len = newlen;
+			new_token.text.f_str = t_str.f_str;
+			new_token.text.f_n_char = t_str.f_n_char;
+
+			new_token.positions = ib_vector_create(
+				result_doc->self_heap, sizeof(ulint), 32);
+
+			parent.last = rbt_add_node(
+				result_doc->tokens, &parent, &new_token);
+
+			ut_ad(rbt_validate(result_doc->tokens));
+		}
+
+		token = rbt_value(fts_token_t, parent.last);
+		ib_vector_push(token->positions, &position);
+	}
+}
+
+/********************************************************************
+Process next token from document starting at the given position, i.e., add
+the token's start position to the token's list of positions.
+@return number of characters handled in this call */
+static
+ulint
+fts_process_token(
+/*==============*/
+	fts_doc_t*	doc,		/* in/out: document to
+					tokenize */
+	fts_doc_t*	result,		/* out: if provided, save
+					result here */
+	ulint		start_pos,	/*!< in: start position in text */
+	ulint		add_pos)	/*!< in: add this position to all
+					tokens from this tokenization */
+{
+	ulint		ret;
+	fts_string_t	str;
+	ulint		position;
+	fts_doc_t*	result_doc;
+	byte		buf[FTS_MAX_WORD_LEN + 1];
+
+	str.f_str = buf;
+
+	/* Determine where to save the result. */
+	result_doc = (result != NULL) ? result : doc;
+
+	/* The length of a string in characters is set here only.
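+	innobase_mysql_fts_get_token() fills in str.f_n_char; str.f_len is
+	the byte length of the token that was found.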
+	*/
+
+	ret = innobase_mysql_fts_get_token(
+		doc->charset, doc->text.f_str + start_pos,
+		doc->text.f_str + doc->text.f_len, &str);
+
+	position = start_pos + ret - str.f_len + add_pos;
+
+	fts_add_token(result_doc, str, position);
+
+	return(ret);
+}
+
+/*************************************************************//**
+Get token char size by charset
+@return token size */
+ulint
+fts_get_token_size(
+/*===============*/
+	const CHARSET_INFO*	cs,	/*!< in: Character set */
+	const char*		token,	/*!< in: token */
+	ulint			len)	/*!< in: token length */
+{
+	char*	start;
+	char*	end;
+	ulint	size = 0;
+
+	/* const_cast is for reinterpret_cast below, or it will fail. */
+	start = const_cast<char*>(token);
+	end = start + len;
+	while (start < end) {
+		int	ctype;
+		int	mbl;
+
+		mbl = cs->ctype(
+			&ctype,
+			reinterpret_cast<uchar*>(start),
+			reinterpret_cast<uchar*>(end));
+
+		size++;
+
+		start += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
+	}
+
+	return(size);
+}
+
+/*************************************************************//**
+FTS plugin parser 'mysql_parser' callback function for document tokenize.
+Refer to 'st_mysql_ftparser_param' for more detail.
+@return always returns 0 */
+int
+fts_tokenize_document_internal(
+/*===========================*/
+	MYSQL_FTPARSER_PARAM*	param,	/*!< in: parser parameter */
+	const char*		doc,	/*!< in/out: document */
+	int			len)	/*!< in: document length */
+{
+	fts_string_t	str;
+	byte		buf[FTS_MAX_WORD_LEN + 1];
+	/* JAN: TODO: MySQL 5.7
+	MYSQL_FTPARSER_BOOLEAN_INFO bool_info =
+		{ FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0 };
+	*/
+	MYSQL_FTPARSER_BOOLEAN_INFO	bool_info =
+		{ FT_TOKEN_WORD, 0, 0, 0, 0, ' ', 0};
+
+	ut_ad(len >= 0);
+
+	str.f_str = buf;
+
+	for (ulint i = 0, inc = 0; i < static_cast<ulint>(len); i += inc) {
+		inc = innobase_mysql_fts_get_token(
+			const_cast<CHARSET_INFO*>(param->cs),
+			(uchar*)(doc) + i,
+			(uchar*)(doc) + len,
+			&str);
+
+		if (str.f_len > 0) {
+			/* JAN: TODO: MySQL 5.7
+			bool_info.position =
+				static_cast<int>(i + inc - str.f_len);
+			ut_ad(bool_info.position >= 0);
+			*/
+
+			/* Stop when add word fails */
+			if (param->mysql_add_word(
+				    param,
+				    reinterpret_cast<char*>(str.f_str),
+				    static_cast<int>(str.f_len),
+				    &bool_info)) {
+				break;
+			}
+		}
+	}
+
+	return(0);
+}
+
+/******************************************************************//**
+FTS plugin parser 'mysql_add_word' callback function for document tokenize.
+Refer to 'st_mysql_ftparser_param' for more detail.
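+It is installed as the mysql_add_word hook, as in fts_tokenize_by_parser()
+below:
+@code
+	param.mysql_add_word = fts_tokenize_add_word_for_parser;
+@endcode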
+@return always returns 0 */
+static
+int
+fts_tokenize_add_word_for_parser(
+/*=============================*/
+	MYSQL_FTPARSER_PARAM*	param,		/* in: parser parameter */
+	const char*		word,		/* in: token word */
+	int			word_len,	/* in: word len */
+	MYSQL_FTPARSER_BOOLEAN_INFO*)
+{
+	fts_string_t		str;
+	fts_tokenize_param_t*	fts_param;
+	fts_doc_t*		result_doc;
+	ulint			position;
+
+	fts_param = static_cast<fts_tokenize_param_t*>(param->mysql_ftparam);
+	result_doc = fts_param->result_doc;
+	ut_ad(result_doc != NULL);
+
+	str.f_str = (byte*)(word);
+	str.f_len = ulint(word_len);
+	str.f_n_char = fts_get_token_size(
+		const_cast<CHARSET_INFO*>(param->cs), word, str.f_len);
+
+	/* JAN: TODO: MySQL 5.7 FTS
+	ut_ad(boolean_info->position >= 0);
+	position = boolean_info->position + fts_param->add_pos;
+	*/
+	position = fts_param->add_pos++;
+
+	fts_add_token(result_doc, str, position);
+
+	return(0);
+}
+
+/******************************************************************//**
+Parse a document using an external / user supplied parser */
+static
+void
+fts_tokenize_by_parser(
+/*===================*/
+	fts_doc_t*		doc,		/* in/out: document to tokenize */
+	st_mysql_ftparser*	parser,		/* in: plugin fts parser */
+	fts_tokenize_param_t*	fts_param)	/* in: fts tokenize param */
+{
+	MYSQL_FTPARSER_PARAM	param;
+
+	ut_a(parser);
+
+	/* Set the parameters for param */
+	param.mysql_parse = fts_tokenize_document_internal;
+	param.mysql_add_word = fts_tokenize_add_word_for_parser;
+	param.mysql_ftparam = fts_param;
+	param.cs = doc->charset;
+	param.doc = reinterpret_cast<char*>(doc->text.f_str);
+	param.length = static_cast<int>(doc->text.f_len);
+	param.mode= MYSQL_FTPARSER_SIMPLE_MODE;
+
+	PARSER_INIT(parser, &param);
+	parser->parse(&param);
+	PARSER_DEINIT(parser, &param);
+}
+
+/** Tokenize a document.
+@param[in,out]	doc	document to tokenize
+@param[out]	result	tokenization result
+@param[in]	parser	pluggable parser */
+static
+void
+fts_tokenize_document(
+	fts_doc_t*		doc,
+	fts_doc_t*		result,
+	st_mysql_ftparser*	parser)
+{
+	ut_a(!doc->tokens);
+	ut_a(doc->charset);
+
+	doc->tokens = rbt_create_arg_cmp(sizeof(fts_token_t),
+					 innobase_fts_text_cmp,
+					 (void*) doc->charset);
+
+	if (parser != NULL) {
+		fts_tokenize_param_t	fts_param;
+		fts_param.result_doc = (result != NULL) ? result : doc;
+		fts_param.add_pos = 0;
+
+		fts_tokenize_by_parser(doc, parser, &fts_param);
+	} else {
+		ulint	inc;
+
+		for (ulint i = 0; i < doc->text.f_len; i += inc) {
+			inc = fts_process_token(doc, result, i, 0);
+			ut_a(inc > 0);
+		}
+	}
+}
+
+/** Continue to tokenize a document.
+@param[in,out]	doc	document to tokenize
+@param[in]	add_pos	add this position to all tokens from this tokenization
+@param[out]	result	tokenization result
+@param[in]	parser	pluggable parser */
+static
+void
+fts_tokenize_document_next(
+	fts_doc_t*		doc,
+	ulint			add_pos,
+	fts_doc_t*		result,
+	st_mysql_ftparser*	parser)
+{
+	ut_a(doc->tokens);
+
+	if (parser) {
+		fts_tokenize_param_t	fts_param;
+
+		fts_param.result_doc = (result != NULL) ? result : doc;
+		fts_param.add_pos = add_pos;
+
+		fts_tokenize_by_parser(doc, parser, &fts_param);
+	} else {
+		ulint	inc;
+
+		for (ulint i = 0; i < doc->text.f_len; i += inc) {
+			inc = fts_process_token(doc, result, i, add_pos);
+			ut_a(inc > 0);
+		}
+	}
+}
+
+/** Create the vector of fts_get_doc_t instances.
+@param[in,out]	cache	fts cache
+@return vector of fts_get_doc_t instances */
+static
+ib_vector_t*
+fts_get_docs_create(
+	fts_cache_t*	cache)
+{
+	ib_vector_t*	get_docs;
+
+	mysql_mutex_assert_owner(&cache->init_lock);
+
+	/* We need one instance of fts_get_doc_t per index.
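+	The vector lives in cache->self_heap and is created with room for
+	four entries, which covers the common case.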
+	*/
+	get_docs = ib_vector_create(cache->self_heap, sizeof(fts_get_doc_t), 4);
+
+	/* Create the get_doc instance, we need one of these
+	per FTS index. */
+	for (ulint i = 0; i < ib_vector_size(cache->indexes); ++i) {
+
+		dict_index_t**	index;
+		fts_get_doc_t*	get_doc;
+
+		index = static_cast<dict_index_t**>(
+			ib_vector_get(cache->indexes, i));
+
+		get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_push(get_docs, NULL));
+
+		memset(get_doc, 0x0, sizeof(*get_doc));
+
+		get_doc->index_cache = fts_get_index_cache(cache, *index);
+		get_doc->cache = cache;
+
+		/* Must find the index cache. */
+		ut_a(get_doc->index_cache != NULL);
+	}
+
+	return(get_docs);
+}
+
+/********************************************************************
+Release any resources held by the fts_get_doc_t instances. */
+static
+void
+fts_get_docs_clear(
+/*===============*/
+	ib_vector_t*	get_docs)	/*!< in: Doc retrieval vector */
+{
+	ulint	i;
+
+	/* Release the get doc graphs if any. */
+	for (i = 0; i < ib_vector_size(get_docs); ++i) {
+
+		fts_get_doc_t*	get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_get(get_docs, i));
+
+		if (get_doc->get_document_graph != NULL) {
+
+			ut_a(get_doc->index_cache);
+
+			que_graph_free(get_doc->get_document_graph);
+			get_doc->get_document_graph = NULL;
+		}
+	}
+}
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID */
+doc_id_t
+fts_init_doc_id(
+/*============*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	doc_id_t	max_doc_id = 0;
+
+	mysql_mutex_lock(&table->fts->cache->lock);
+
+	/* Return if the table is already initialized for DOC ID */
+	if (table->fts->cache->first_doc_id != FTS_NULL_DOC_ID) {
+		mysql_mutex_unlock(&table->fts->cache->lock);
+		return(0);
+	}
+
+	DEBUG_SYNC_C("fts_initialize_doc_id");
+
+	/* Then compare this value with the ID value stored in the CONFIG
+	table. The larger one will be our new initial Doc ID */
+	fts_cmp_set_sync_doc_id(table, 0, FALSE, &max_doc_id);
+
+	/* If DICT_TF2_FTS_ADD_DOC_ID is set, we are in the process of
+	creating the index (and adding the doc id column); there is no
+	need to recover documents */
+	if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		fts_init_index((dict_table_t*) table, TRUE);
+	}
+
+	table->fts->added_synced = true;
+
+	table->fts->cache->first_doc_id = max_doc_id;
+
+	mysql_mutex_unlock(&table->fts->cache->lock);
+
+	ut_ad(max_doc_id > 0);
+
+	return(max_doc_id);
+}
+
+#ifdef FTS_MULT_INDEX
+/*********************************************************************//**
+Check if the index is in the affected set.
+@return TRUE if index is updated */
+static
+ibool
+fts_is_index_updated(
+/*=================*/
+	const ib_vector_t*	fts_indexes,	/*!< in: affected FTS indexes */
+	const fts_get_doc_t*	get_doc)	/*!< in: info for reading
+						document */
+{
+	ulint		i;
+	dict_index_t*	index = get_doc->index_cache->index;
+
+	for (i = 0; i < ib_vector_size(fts_indexes); ++i) {
+		const dict_index_t*	updated_fts_index;
+
+		updated_fts_index = static_cast<const dict_index_t*>(
+			ib_vector_getp_const(fts_indexes, i));
+
+		ut_a(updated_fts_index != NULL);
+
+		if (updated_fts_index == index) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+#endif
+
+/*********************************************************************//**
+Fetch COUNT(*) from the specified table.
+@return the number of rows in the table */ +ulint +fts_get_rows_count( +/*===============*/ + fts_table_t* fts_table) /*!< in: fts table to read */ +{ + trx_t* trx; + pars_info_t* info; + que_t* graph; + dberr_t error; + ulint count = 0; + char table_name[MAX_FULL_NAME_LEN]; + + trx = trx_create(); + trx->op_info = "fetching FT table rows count"; + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_read_ulint, &count); + + fts_get_table_name(fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT COUNT(*)" + " FROM $table_name;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + + break; /* Exit the loop. */ + } else { + fts_sql_rollback(trx); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "lock wait timeout reading" + " FTS table. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << "(" << error + << ") while reading FTS table " + << table_name; + + break; /* Exit the loop. */ + } + } + } + + que_graph_free(graph); + + trx->free(); + + return(count); +} + +#ifdef FTS_CACHE_SIZE_DEBUG +/*********************************************************************//** +Read the max cache size parameter from the config table. */ +static +void +fts_update_max_cache_size( +/*======================*/ + fts_sync_t* sync) /*!< in: sync state */ +{ + trx_t* trx; + fts_table_t fts_table; + + trx = trx_create(); + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, sync->table); + + /* The size returned is in bytes. */ + sync->max_cache_size = fts_get_max_cache_size(trx, &fts_table); + + fts_sql_commit(trx); + + trx->free(); +} +#endif /* FTS_CACHE_SIZE_DEBUG */ + +/*********************************************************************//** +Free the modified rows of a table. */ +UNIV_INLINE +void +fts_trx_table_rows_free( +/*====================*/ + ib_rbt_t* rows) /*!< in: rbt of rows to free */ +{ + const ib_rbt_node_t* node; + + for (node = rbt_first(rows); node; node = rbt_first(rows)) { + fts_trx_row_t* row; + + row = rbt_value(fts_trx_row_t, node); + + if (row->fts_indexes != NULL) { + /* This vector shouldn't be using the + heap allocator. */ + ut_a(row->fts_indexes->allocator->arg == NULL); + + ib_vector_free(row->fts_indexes); + row->fts_indexes = NULL; + } + + ut_free(rbt_remove_node(rows, node)); + } + + ut_a(rbt_empty(rows)); + rbt_free(rows); +} + +/*********************************************************************//** +Free an FTS savepoint instance. */ +UNIV_INLINE +void +fts_savepoint_free( +/*===============*/ + fts_savepoint_t* savepoint) /*!< in: savepoint instance */ +{ + const ib_rbt_node_t* node; + ib_rbt_t* tables = savepoint->tables; + + /* Nothing to free! */ + if (tables == NULL) { + return; + } + + for (node = rbt_first(tables); node; node = rbt_first(tables)) { + fts_trx_table_t* ftt; + fts_trx_table_t** fttp; + + fttp = rbt_value(fts_trx_table_t*, node); + ftt = *fttp; + + /* This can be NULL if a savepoint was released. */ + if (ftt->rows != NULL) { + fts_trx_table_rows_free(ftt->rows); + ftt->rows = NULL; + } + + /* This can be NULL if a savepoint was released. 
*/
+		if (ftt->added_doc_ids != NULL) {
+			fts_doc_ids_free(ftt->added_doc_ids);
+			ftt->added_doc_ids = NULL;
+		}
+
+		/* Free the docs-added statement graph, if one was created. */
+		if (ftt->docs_added_graph) {
+			que_graph_free(ftt->docs_added_graph);
+		}
+
+		/* NOTE: We are responsible for freeing the node */
+		ut_free(rbt_remove_node(tables, node));
+	}
+
+	ut_a(rbt_empty(tables));
+	rbt_free(tables);
+	savepoint->tables = NULL;
+}
+
+/*********************************************************************//**
+Free an FTS trx. */
+void
+fts_trx_free(
+/*=========*/
+	fts_trx_t*	fts_trx)	/* in, own: FTS trx */
+{
+	ulint	i;
+
+	for (i = 0; i < ib_vector_size(fts_trx->savepoints); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(fts_trx->savepoints, i));
+
+		/* The default savepoint name must be NULL. */
+		if (i == 0) {
+			ut_a(savepoint->name == NULL);
+		}
+
+		fts_savepoint_free(savepoint);
+	}
+
+	for (i = 0; i < ib_vector_size(fts_trx->last_stmt); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(fts_trx->last_stmt, i));
+
+		/* The default savepoint name must be NULL. */
+		if (i == 0) {
+			ut_a(savepoint->name == NULL);
+		}
+
+		fts_savepoint_free(savepoint);
+	}
+
+	if (fts_trx->heap) {
+		mem_heap_free(fts_trx->heap);
+	}
+}
+
+/*********************************************************************//**
+Extract the doc id from the FTS hidden column.
+@return doc id that was extracted from rec */
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table */
+	dtuple_t*	row)	/*!< in: row whose FTS doc id we
+				want to extract.*/
+{
+	dfield_t*	field;
+	doc_id_t	doc_id = 0;
+
+	ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+	field = dtuple_get_nth_field(row, table->fts->doc_col);
+
+	ut_a(dfield_get_len(field) == sizeof(doc_id));
+	ut_a(dfield_get_type(field)->mtype == DATA_INT);
+
+	doc_id = fts_read_doc_id(
+		static_cast<const byte*>(dfield_get_data(field)));
+
+	return(doc_id);
+}
+
+/** Extract the doc id from the record that belongs to index.
+@param[in]	rec	record containing FTS_DOC_ID
+@param[in]	index	index of rec
+@param[in]	offsets	rec_get_offsets(rec,index)
+@return doc id that was extracted from rec */
+doc_id_t
+fts_get_doc_id_from_rec(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	const rec_offs*		offsets)
+{
+	ulint f = dict_col_get_index_pos(
+		&index->table->cols[index->table->fts->doc_col], index);
+	ulint len;
+	doc_id_t doc_id = mach_read_from_8(
+		rec_get_nth_field(rec, offsets, f, &len));
+	ut_ad(len == 8);
+	return doc_id;
+}
+
+/*********************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache else NULL */
+fts_index_cache_t*
+fts_find_index_cache(
+/*=================*/
+	const fts_cache_t*	cache,	/*!< in: cache to search */
+	const dict_index_t*	index)	/*!< in: index to search for */
+{
+	/* We cast away the const because our internal function takes a
+	non-const cache arg and returns a non-const pointer. */
+	return(static_cast<fts_index_cache_t*>(
+		fts_get_index_cache((fts_cache_t*) cache, index)));
+}
+
+/*********************************************************************//**
+Search cache for word.
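+The caller must hold the cache lock. A lookup sketch (hypothetical):
+@code
+	const ib_vector_t* nodes = fts_cache_find_word(index_cache, &text);
+	if (nodes) {
+		// the word is in the cache
+	}
+@endcode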
+@return the word node vector if found else NULL */
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+	const fts_index_cache_t*index_cache,	/*!< in: cache to search */
+	const fts_string_t*	text)		/*!< in: word to search for */
+{
+	ib_rbt_bound_t		parent;
+	const ib_vector_t*	nodes = NULL;
+
+	mysql_mutex_assert_owner(&index_cache->index->table->fts->cache->lock);
+
+	/* Lookup the word in the rb tree */
+	if (rbt_search(index_cache->words, &parent, text) == 0) {
+		const fts_tokenizer_word_t*	word;
+
+		word = rbt_value(fts_tokenizer_word_t, parent.last);
+
+		nodes = word->nodes;
+	}
+
+	return(nodes);
+}
+
+/*********************************************************************//**
+Append deleted doc ids to vector. */
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+	fts_cache_t*	cache,	/*!< in: cache to use */
+	ib_vector_t*	vector)	/*!< in: append to this vector */
+{
+  mysql_mutex_lock(&cache->deleted_lock);
+
+  if (cache->deleted_doc_ids)
+    for (ulint i= 0; i < ib_vector_size(cache->deleted_doc_ids); ++i)
+    {
+      doc_id_t *update= static_cast<doc_id_t*>(
+        ib_vector_get(cache->deleted_doc_ids, i));
+      ib_vector_push(vector, &update);
+    }
+
+  mysql_mutex_unlock(&cache->deleted_lock);
+}
+
+/*********************************************************************//**
+Add the FTS document id hidden column. */
+void
+fts_add_doc_id_column(
+/*==================*/
+	dict_table_t*	table,	/*!< in/out: Table with FTS index */
+	mem_heap_t*	heap)	/*!< in: temporary memory heap, or NULL */
+{
+	dict_mem_table_add_col(
+		table, heap,
+		FTS_DOC_ID_COL_NAME,
+		DATA_INT,
+		dtype_form_prtype(
+			DATA_NOT_NULL | DATA_UNSIGNED
+			| DATA_BINARY_TYPE | DATA_FTS_DOC_ID, 0),
+		sizeof(doc_id_t));
+	DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_HAS_DOC_ID);
+}
+
+/** Add new fts doc id to the update vector.
+@param[in]	table		the table that contains the FTS index.
+@param[in,out]	ufield		the fts doc id field in the update vector.
+				No new memory is allocated for this in this
+				function.
+@param[in,out]	next_doc_id	the fts doc id that has been added to the
+				update vector. If 0, a new fts doc id is
+				automatically generated. The memory provided
+				for this argument will be used by the update
+				vector. Ensure that the life time of this
+				memory matches that of the update vector.
+@return the fts doc id used in the update vector */
+doc_id_t
+fts_update_doc_id(
+	dict_table_t*	table,
+	upd_field_t*	ufield,
+	doc_id_t*	next_doc_id)
+{
+	doc_id_t	doc_id;
+	dberr_t		error = DB_SUCCESS;
+
+	if (*next_doc_id) {
+		doc_id = *next_doc_id;
+	} else {
+		/* Get the new document id that will be added. */
+		error = fts_get_next_doc_id(table, &doc_id);
+	}
+
+	if (error == DB_SUCCESS) {
+		dict_index_t*	clust_index;
+		dict_col_t*	col = dict_table_get_nth_col(
+			table, table->fts->doc_col);
+
+		ufield->exp = NULL;
+
+		ufield->new_val.len = sizeof(doc_id);
+
+		clust_index = dict_table_get_first_index(table);
+
+		ufield->field_no = static_cast<uint16_t>(
+			dict_col_get_clust_pos(col, clust_index))
+			& dict_index_t::MAX_N_FIELDS;
+		dict_col_copy_type(col, dfield_get_type(&ufield->new_val));
+
+		/* It is possible that we are updating a record that has
+		not yet been sync-ed since the last crash. */
+
+		/* Convert to storage byte order. */
+		ut_a(doc_id != FTS_NULL_DOC_ID);
+		fts_write_doc_id((byte*) next_doc_id, doc_id);
+
+		ufield->new_val.data = next_doc_id;
+		ufield->new_val.ext = 0;
+	}
+
+	return(doc_id);
+}
+
+/** fts_t constructor.
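+Instances are placement-new'ed into a heap, as done by fts_create() below:
+@code
+	fts_t* fts = static_cast<fts_t*>(mem_heap_alloc(heap, sizeof(*fts)));
+	new(fts) fts_t(table, heap);
+@endcode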
+@param[in]	table	table with FTS indexes
+@param[in,out]	heap	memory heap where 'this' is stored */
+fts_t::fts_t(
+	const dict_table_t*	table,
+	mem_heap_t*		heap)
+	:
+	added_synced(0), dict_locked(0),
+	add_wq(NULL),
+	cache(NULL),
+	doc_col(ULINT_UNDEFINED), in_queue(false), sync_message(false),
+	fts_heap(heap)
+{
+	ut_a(table->fts == NULL);
+
+	ib_alloc_t*	heap_alloc = ib_heap_allocator_create(fts_heap);
+
+	indexes = ib_vector_create(heap_alloc, sizeof(dict_index_t*), 4);
+
+	dict_table_get_all_fts_indexes(table, indexes);
+}
+
+/** fts_t destructor. */
+fts_t::~fts_t()
+{
+	ut_ad(add_wq == NULL);
+
+	if (cache) {
+		fts_cache_clear(cache);
+		fts_cache_destroy(cache);
+	}
+
+	/* There is no need to call ib_vector_free() on this->indexes
+	because it is stored in this->fts_heap. */
+	mem_heap_free(fts_heap);
+}
+
+/*********************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+fts_t*
+fts_create(
+/*=======*/
+	dict_table_t*	table)	/*!< in/out: table with FTS indexes */
+{
+	fts_t*		fts;
+	mem_heap_t*	heap;
+
+	heap = mem_heap_create(512);
+
+	fts = static_cast<fts_t*>(mem_heap_alloc(heap, sizeof(*fts)));
+
+	new(fts) fts_t(table, heap);
+
+	return(fts);
+}
+
+/*********************************************************************//**
+Copy an FTS savepoint. */
+UNIV_INLINE
+void
+fts_savepoint_copy(
+/*===============*/
+	const fts_savepoint_t*	src,	/*!< in: source savepoint */
+	fts_savepoint_t*	dst)	/*!< out: destination savepoint */
+{
+	const ib_rbt_node_t*	node;
+	const ib_rbt_t*		tables;
+
+	tables = src->tables;
+
+	for (node = rbt_first(tables); node; node = rbt_next(tables, node)) {
+
+		fts_trx_table_t*	ftt_dst;
+		const fts_trx_table_t**	ftt_src;
+
+		ftt_src = rbt_value(const fts_trx_table_t*, node);
+
+		ftt_dst = fts_trx_table_clone(*ftt_src);
+
+		rbt_insert(dst->tables, &ftt_dst, &ftt_dst);
+	}
+}
+
+/*********************************************************************//**
+Take an FTS savepoint. */
+void
+fts_savepoint_take(
+/*===============*/
+	fts_trx_t*	fts_trx,	/*!< in: fts transaction */
+	const char*	name)		/*!< in: savepoint name */
+{
+	mem_heap_t*		heap;
+	fts_savepoint_t*	savepoint;
+	fts_savepoint_t*	last_savepoint;
+
+	ut_a(name != NULL);
+
+	heap = fts_trx->heap;
+
+	/* The implied savepoint must exist. */
+	ut_a(ib_vector_size(fts_trx->savepoints) > 0);
+
+	last_savepoint = static_cast<fts_savepoint_t*>(
+		ib_vector_last(fts_trx->savepoints));
+	savepoint = fts_savepoint_create(fts_trx->savepoints, name, heap);
+
+	if (last_savepoint->tables != NULL) {
+		fts_savepoint_copy(last_savepoint, savepoint);
+	}
+}
+
+/*********************************************************************//**
+Lookup a savepoint instance by name.
+@return ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+fts_savepoint_lookup(
+/*==================*/
+	ib_vector_t*	savepoints,	/*!< in: savepoints */
+	const char*	name)		/*!< in: savepoint name */
+{
+	ulint	i;
+
+	ut_a(ib_vector_size(savepoints) > 0);
+
+	for (i = 1; i < ib_vector_size(savepoints); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(savepoints, i));
+
+		if (strcmp(name, savepoint->name) == 0) {
+			return(i);
+		}
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/*********************************************************************//**
+Release the savepoint data identified by name. All savepoints created
+after the named savepoint are kept.
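+A caller sketch (hypothetical savepoint name; reached when the server
+releases a savepoint in a transaction with FTS changes):
+@code
+	fts_savepoint_release(trx, "sp1");
+@endcode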
+*/
+void
+fts_savepoint_release(
+/*==================*/
+	trx_t*		trx,	/*!< in: transaction */
+	const char*	name)	/*!< in: savepoint name */
+{
+	ut_a(name != NULL);
+
+	ib_vector_t*	savepoints = trx->fts_trx->savepoints;
+
+	ut_a(ib_vector_size(savepoints) > 0);
+
+	ulint i = fts_savepoint_lookup(savepoints, name);
+	if (i != ULINT_UNDEFINED) {
+		ut_a(i >= 1);
+
+		fts_savepoint_t*	savepoint;
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(savepoints, i));
+
+		if (i == ib_vector_size(savepoints) - 1) {
+			/* If the savepoint is the last, we save its
+			tables to the previous savepoint. */
+			fts_savepoint_t*	prev_savepoint;
+			prev_savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_get(savepoints, i - 1));
+
+			ib_rbt_t*	tables = savepoint->tables;
+			savepoint->tables = prev_savepoint->tables;
+			prev_savepoint->tables = tables;
+		}
+
+		fts_savepoint_free(savepoint);
+		ib_vector_remove(savepoints, *(void**)savepoint);
+
+		/* Make sure we don't delete the implied savepoint. */
+		ut_a(ib_vector_size(savepoints) > 0);
+	}
+}
+
+/**********************************************************************//**
+Refresh last statement savepoint. */
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+	trx_t*		trx)	/*!< in: transaction */
+{
+
+	fts_trx_t*		fts_trx;
+	fts_savepoint_t*	savepoint;
+
+	fts_trx = trx->fts_trx;
+
+	savepoint = static_cast<fts_savepoint_t*>(
+		ib_vector_pop(fts_trx->last_stmt));
+	fts_savepoint_free(savepoint);
+
+	ut_ad(ib_vector_is_empty(fts_trx->last_stmt));
+	savepoint = fts_savepoint_create(fts_trx->last_stmt, NULL, NULL);
+}
+
+/********************************************************************
+Undo the Doc ID add/delete operations of the last statement */
+static
+void
+fts_undo_last_stmt(
+/*===============*/
+	fts_trx_table_t*	s_ftt,	/*!< in: Transaction FTS table */
+	fts_trx_table_t*	l_ftt)	/*!< in: last stmt FTS table */
+{
+	ib_rbt_t*		s_rows;
+	ib_rbt_t*		l_rows;
+	const ib_rbt_node_t*	node;
+
+	l_rows = l_ftt->rows;
+	s_rows = s_ftt->rows;
+
+	for (node = rbt_first(l_rows);
+	     node;
+	     node = rbt_next(l_rows, node)) {
+		fts_trx_row_t*	l_row = rbt_value(fts_trx_row_t, node);
+		ib_rbt_bound_t	parent;
+
+		rbt_search(s_rows, &parent, &(l_row->doc_id));
+
+		if (parent.result == 0) {
+			fts_trx_row_t*	s_row = rbt_value(
+				fts_trx_row_t, parent.last);
+
+			switch (l_row->state) {
+			case FTS_INSERT:
+				ut_free(rbt_remove_node(s_rows, parent.last));
+				break;
+
+			case FTS_DELETE:
+				if (s_row->state == FTS_NOTHING) {
+					s_row->state = FTS_INSERT;
+				} else if (s_row->state == FTS_DELETE) {
+					ut_free(rbt_remove_node(
+							s_rows, parent.last));
+				}
+				break;
+
+			/* FIXME: Check if FTS_MODIFY needs to be addressed */
+			case FTS_MODIFY:
+			case FTS_NOTHING:
+				break;
+			default:
+				ut_error;
+			}
+		}
+	}
+}
+
+/**********************************************************************//**
+Roll back the FTS changes of the last statement. */
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	ib_vector_t*		savepoints;
+	fts_savepoint_t*	savepoint;
+	fts_savepoint_t*	last_stmt;
+	fts_trx_t*		fts_trx;
+	ib_rbt_bound_t		parent;
+	const ib_rbt_node_t*	node;
+	ib_rbt_t*		l_tables;
+	ib_rbt_t*		s_tables;
+
+	fts_trx = trx->fts_trx;
+	savepoints = fts_trx->savepoints;
+
+	savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
+	last_stmt = static_cast<fts_savepoint_t*>(
+		ib_vector_last(fts_trx->last_stmt));
+
+	l_tables = last_stmt->tables;
+	s_tables = savepoint->tables;
+
+	for (node = rbt_first(l_tables);
+	     node;
+	     node = rbt_next(l_tables, node)) {
+
+		fts_trx_table_t**	l_ftt;
+
+		l_ftt = rbt_value(fts_trx_table_t*, node);
+
+		rbt_search_cmp(
+			s_tables, &parent, &(*l_ftt)->table->id,
+			fts_trx_table_id_cmp, NULL);
+
+		if (parent.result == 0) {
+			fts_trx_table_t**	s_ftt;
+
+			s_ftt = rbt_value(fts_trx_table_t*, parent.last);
+
+			fts_undo_last_stmt(*s_ftt, *l_ftt);
+		}
+	}
+}
+
+/**********************************************************************//**
+Rollback to the savepoint identified by name. */
+void
+fts_savepoint_rollback(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction */
+	const char*	name)	/*!< in: savepoint name */
+{
+	ulint		i;
+	ib_vector_t*	savepoints;
+
+	ut_a(name != NULL);
+
+	savepoints = trx->fts_trx->savepoints;
+
+	/* We pop all savepoints from the top of the stack up to
+	and including the instance that was found. */
+	i = fts_savepoint_lookup(savepoints, name);
+
+	if (i != ULINT_UNDEFINED) {
+		fts_savepoint_t*	savepoint;
+
+		ut_a(i > 0);
+
+		while (ib_vector_size(savepoints) > i) {
+			fts_savepoint_t*	savepoint;
+
+			savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_pop(savepoints));
+
+			if (savepoint->name != NULL) {
+				/* Since name was allocated on the heap, the
+				memory will be released when the transaction
+				completes. */
+				savepoint->name = NULL;
+
+				fts_savepoint_free(savepoint);
+			}
+		}
+
+		/* Pop all elements from the top of the stack that may
+		have been released. We have to be careful that we don't
+		delete the implied savepoint. */
+
+		for (savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_last(savepoints));
+		     ib_vector_size(savepoints) > 1
+		     && savepoint->name == NULL;
+		     savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_last(savepoints))) {
+
+			ib_vector_pop(savepoints);
+		}
+
+		/* Make sure we don't delete the implied savepoint. */
+		ut_a(ib_vector_size(savepoints) > 0);
+
+		/* Restore the savepoint. */
+		fts_savepoint_take(trx->fts_trx, name);
+	}
+}
+
+bool fts_check_aux_table(const char *name,
+                         table_id_t *table_id,
+                         index_id_t *index_id)
+{
+  ulint len= strlen(name);
+  const char* ptr;
+  const char* end= name + len;
+
+  ut_ad(len <= MAX_FULL_NAME_LEN);
+  ptr= static_cast<const char*>(memchr(name, '/', len));
+  IF_WIN(if (!ptr) ptr= static_cast<const char*>(memchr(name, '\\', len)), );
+
+  if (!ptr)
+    return false;
+
+  /* We will start the match after the '/' */
+  ++ptr;
+  len= end - ptr;
+
+  /* All auxiliary tables are prefixed with "FTS_" and the name
+  length will be at the very least greater than 20 bytes. */
+  if (len > 24 && !memcmp(ptr, "FTS_", 4))
+  {
+    /* Skip the prefix. */
+    ptr+= 4;
+    len-= 4;
+
+    const char *table_id_ptr= ptr;
+    /* Skip the table id. */
+    ptr= static_cast<const char*>(memchr(ptr, '_', len));
+
+    if (!ptr)
+      return false;
+
+    /* Skip the underscore. */
+    ++ptr;
+    ut_ad(end > ptr);
+    len= end - ptr;
+
+    sscanf(table_id_ptr, UINT64PFx, table_id);
+    /* First search the common table suffix array.
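+    fts_common_tables[] is NULL-terminated, so the loop below stops at
+    the end of the array.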
+    */
+    for (ulint i = 0; fts_common_tables[i]; ++i)
+    {
+      if (!strncmp(ptr, fts_common_tables[i], len))
+        return true;
+    }
+
+    /* Could be obsolete common tables. */
+    if ((len == 5 && !memcmp(ptr, "ADDED", len)) ||
+        (len == 9 && !memcmp(ptr, "STOPWORDS", len)))
+      return true;
+
+    const char* index_id_ptr= ptr;
+    /* Skip the index id. */
+    ptr= static_cast<const char*>(memchr(ptr, '_', len));
+    if (!ptr)
+      return false;
+
+    sscanf(index_id_ptr, UINT64PFx, index_id);
+
+    /* Skip the underscore. */
+    ++ptr;
+    ut_a(end > ptr);
+    len= end - ptr;
+
+    if (len <= 4)
+      return false;
+
+    len-= 4; /* .ibd suffix */
+
+    if (len > 7)
+      return false;
+
+    /* Search the FT index specific array. */
+    for (ulint i = 0; i < FTS_NUM_AUX_INDEX; ++i)
+    {
+      if (!memcmp(ptr, "INDEX_", len - 1))
+        return true;
+    }
+
+    /* Other FT index specific table(s). */
+    if (len == 6 && !memcmp(ptr, "DOC_ID", len))
+      return true;
+  }
+
+  return false;
+}
+
+/**********************************************************************//**
+Check whether a user supplied stopword table is of the right format.
+The caller is responsible for holding dictionary locks.
+@param stopword_table_name	table name
+@param row_end	name of the system-versioning end column, or "value"
+@return the stopword column charset
+@retval NULL if the table does not exist or qualify */
+CHARSET_INFO*
+fts_valid_stopword_table(
+/*=====================*/
+	const char*	stopword_table_name,	/*!< in: Stopword table
+						name */
+	const char**	row_end)	/* row_end value of system-versioned table */
+{
+	dict_table_t*	table;
+	dict_col_t*	col = NULL;
+
+	if (!stopword_table_name) {
+		return(NULL);
+	}
+
+	table = dict_sys.load_table(
+		{stopword_table_name, strlen(stopword_table_name)});
+
+	if (!table) {
+		ib::error() << "User stopword table " << stopword_table_name
+			<< " does not exist.";
+
+		return(NULL);
+	} else {
+		if (strcmp(dict_table_get_col_name(table, 0), "value")) {
+			ib::error() << "Invalid column name for stopword"
+				" table " << stopword_table_name << ". Its"
+				" first column must be named 'value'.";
+
+			return(NULL);
+		}
+
+		col = dict_table_get_nth_col(table, 0);
+
+		if (col->mtype != DATA_VARCHAR
+		    && col->mtype != DATA_VARMYSQL) {
+			ib::error() << "Invalid column type for stopword"
+				" table " << stopword_table_name << ". Its"
+				" first column must be of varchar type";
+
+			return(NULL);
+		}
+	}
+
+	ut_ad(col);
+	ut_ad(!table->versioned() || col->ind != table->vers_end);
+
+	if (row_end) {
+		*row_end = table->versioned()
+			? dict_table_get_col_name(table, table->vers_end)
+			: "value"; /* for fts_load_user_stopword() */
+	}
+
+	return(fts_get_charset(col->prtype));
+}
+
+/**********************************************************************//**
+This function loads the stopwords into the FTS cache. It also
+records/fetches the stopword configuration to/from the FTS CONFIG
+table, depending on whether we are creating or reloading the
+FTS.
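+A reload sketch (mirroring the call in fts_init_index() below):
+@code
+	fts_load_stopword(table, NULL, NULL, true, true);
+@endcode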
+@return true if load operation is successful */
+bool
+fts_load_stopword(
+/*==============*/
+	const dict_table_t*
+			table,			/*!< in: Table with FTS */
+	trx_t*		trx,			/*!< in: Transactions */
+	const char*	session_stopword_table,	/*!< in: Session stopword table
+						name */
+	bool		stopword_is_on,		/*!< in: Whether stopword
+						option is turned on/off */
+	bool		reload)			/*!< in: Whether it is
+						for reloading FTS table */
+{
+	fts_table_t	fts_table;
+	fts_string_t	str;
+	dberr_t		error = DB_SUCCESS;
+	ulint		use_stopword;
+	fts_cache_t*	cache;
+	const char*	stopword_to_use = NULL;
+	ibool		new_trx = FALSE;
+	byte		str_buffer[MAX_FULL_NAME_LEN + 1];
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, table);
+
+	cache = table->fts->cache;
+
+	if (!reload && !(cache->stopword_info.status & STOPWORD_NOT_INIT)) {
+		return true;
+	}
+
+	if (!trx) {
+		trx = trx_create();
+#ifdef UNIV_DEBUG
+		trx->start_line = __LINE__;
+		trx->start_file = __FILE__;
+#endif
+		trx_start_internal_low(trx, !high_level_read_only);
+		trx->op_info = "upload FTS stopword";
+		new_trx = TRUE;
+	}
+
+	/* First check whether stopword filtering is turned off */
+	if (reload) {
+		error = fts_config_get_ulint(
+			trx, &fts_table, FTS_USE_STOPWORD, &use_stopword);
+	} else {
+		use_stopword = (ulint) stopword_is_on;
+
+		error = fts_config_set_ulint(
+			trx, &fts_table, FTS_USE_STOPWORD, use_stopword);
+	}
+
+	if (error != DB_SUCCESS) {
+		goto cleanup;
+	}
+
+	/* If stopword filtering is turned off, there is no need to load
+	the stopwords into the cache, but we still need to initialize */
+	if (!use_stopword) {
+		cache->stopword_info.status = STOPWORD_OFF;
+		goto cleanup;
+	}
+
+	if (reload) {
+		/* Fetch the stopword table name from the FTS CONFIG
+		table */
+		str.f_n_char = 0;
+		str.f_str = str_buffer;
+		str.f_len = sizeof(str_buffer) - 1;
+
+		error = fts_config_get_value(
+			trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+
+		if (error != DB_SUCCESS) {
+			goto cleanup;
+		}
+
+		if (*str.f_str) {
+			stopword_to_use = (const char*) str.f_str;
+		}
+	} else {
+		stopword_to_use = session_stopword_table;
+	}
+
+	if (stopword_to_use
+	    && fts_load_user_stopword(table->fts, stopword_to_use,
+				      &cache->stopword_info)) {
+		/* Save the stopword table name to the CONFIG
+		table */
+		if (!reload) {
+			str.f_n_char = 0;
+			str.f_str = (byte*) stopword_to_use;
+			str.f_len = strlen(stopword_to_use);
+
+			error = fts_config_set_value(
+				trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+		}
+	} else {
+		/* Load the system default stopword list */
+		fts_load_default_stopword(&cache->stopword_info);
+	}
+
+cleanup:
+	if (new_trx) {
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+		} else {
+			fts_sql_rollback(trx);
+		}
+
+		trx->free();
+	}
+
+	if (!cache->stopword_info.cached_stopword) {
+		cache->stopword_info.cached_stopword = rbt_create_arg_cmp(
+			sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp,
+			&my_charset_latin1);
+	}
+
+	return error == DB_SUCCESS;
+}
+
+/**********************************************************************//**
+Callback function used when we initialize the FTS at startup
+time. It recovers the maximum Doc ID present in the current table.
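+It is passed as the read callback to fts_doc_fetch_by_doc_id(); see
+fts_init_index() below.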
+Tested by innodb_fts.crash_recovery
+@return always returns TRUE */
+static
+ibool
+fts_init_get_doc_id(
+/*================*/
+	void*	row,		/*!< in: sel_node_t* */
+	void*	user_arg)	/*!< in: table with fts */
+{
+	doc_id_t	doc_id = FTS_NULL_DOC_ID;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	que_node_t*	exp = node->select_list;
+	dict_table_t*	table = static_cast<dict_table_t*>(user_arg);
+	fts_cache_t*	cache = table->fts->cache;
+
+	ut_ad(ib_vector_is_empty(cache->get_docs));
+
+	/* Copy each indexed column content into doc->text.f_str */
+	if (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		dtype_t*	type = dfield_get_type(dfield);
+		void*		data = dfield_get_data(dfield);
+
+		ut_a(dtype_get_mtype(type) == DATA_INT);
+
+		doc_id = static_cast<doc_id_t>(mach_read_from_8(
+			static_cast<byte*>(data)));
+
+		exp = que_node_get_next(que_node_get_next(exp));
+		if (exp) {
+			ut_ad(table->versioned());
+			dfield = que_node_get_val(exp);
+			type = dfield_get_type(dfield);
+			ut_ad(type->vers_sys_end());
+			data = dfield_get_data(dfield);
+			ulint len = dfield_get_len(dfield);
+			if (table->versioned_by_id()) {
+				ut_ad(len == sizeof trx_id_max_bytes);
+				if (0 != memcmp(data, trx_id_max_bytes, len)) {
+					return true;
+				}
+			} else {
+				ut_ad(len == sizeof timestamp_max_bytes);
+				if (0 != memcmp(data, timestamp_max_bytes, len)) {
+					return true;
+				}
+			}
+			ut_ad(!(exp = que_node_get_next(exp)));
+		}
+		ut_ad(!exp);
+
+		if (doc_id >= cache->next_doc_id) {
+			cache->next_doc_id = doc_id + 1;
+		}
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Callback function used when we initialize the FTS at startup
+time. It recovers Doc IDs that have not been sync-ed to the auxiliary
+tables and need to be brought back into the FTS index.
+@return always returns TRUE */
+static
+ibool
+fts_init_recover_doc(
+/*=================*/
+	void*	row,		/*!< in: sel_node_t* */
+	void*	user_arg)	/*!< in: fts cache */
+{
+
+	fts_doc_t	doc;
+	ulint		doc_len = 0;
+	ulint		field_no = 0;
+	fts_get_doc_t*	get_doc = static_cast<fts_get_doc_t*>(user_arg);
+	doc_id_t	doc_id = FTS_NULL_DOC_ID;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	que_node_t*	exp = node->select_list;
+	fts_cache_t*	cache = get_doc->cache;
+	st_mysql_ftparser*	parser = get_doc->index_cache->index->parser;
+
+	fts_doc_init(&doc);
+	doc.found = TRUE;
+
+	ut_ad(cache);
+
+	/* Copy each indexed column content into doc->text.f_str */
+	while (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		ulint		len = dfield_get_len(dfield);
+
+		if (field_no == 0) {
+			dtype_t*	type = dfield_get_type(dfield);
+			void*		data = dfield_get_data(dfield);
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+
+			doc_id = static_cast<doc_id_t>(mach_read_from_8(
+				static_cast<byte*>(data)));
+
+			field_no++;
+			exp = que_node_get_next(exp);
+			continue;
+		}
+
+		if (len == UNIV_SQL_NULL) {
+			exp = que_node_get_next(exp);
+			continue;
+		}
+
+		ut_ad(get_doc);
+
+		if (!get_doc->index_cache->charset) {
+			get_doc->index_cache->charset = fts_get_charset(
+				dfield->type.prtype);
+		}
+
+		doc.charset = get_doc->index_cache->charset;
+
+		if (dfield_is_ext(dfield)) {
+			dict_table_t*	table = cache->sync->table;
+
+			doc.text.f_str = btr_copy_externally_stored_field(
+				&doc.text.f_len,
+				static_cast<byte*>(dfield_get_data(dfield)),
+				table->space->zip_size(), len,
+				static_cast<mem_heap_t*>(doc.self_heap->arg));
+		} else {
+			doc.text.f_str = static_cast<byte*>(
+				dfield_get_data(dfield));
+
+			doc.text.f_len = len;
+		}
+
+		if (field_no == 1) {
+			fts_tokenize_document(&doc, NULL, parser);
+		} else {
+			fts_tokenize_document_next(&doc, doc_len, NULL, parser);
+		}
+
+		exp = que_node_get_next(exp);
+
+		doc_len += (exp) ? len + 1 : len;
+
+		field_no++;
+	}
+
+	fts_cache_add_doc(cache, get_doc->index_cache, doc_id, doc.tokens);
+
+	fts_doc_free(&doc);
+
+	cache->added++;
+
+	if (doc_id >= cache->next_doc_id) {
+		cache->next_doc_id = doc_id + 1;
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+This function brings the FTS index in sync when the FTS index is first
+used. There may be documents that were not sync-ed to the auxiliary
+tables before the last abnormal server shutdown; we need to bring such
+documents back into the FTS cache before any further operations */
+void
+fts_init_index(
+/*===========*/
+	dict_table_t*	table,		/*!< in: Table with FTS */
+	bool		has_cache_lock)	/*!< in: Whether we already have
+					cache lock */
+{
+	dict_index_t*	index;
+	doc_id_t	start_doc;
+	fts_get_doc_t*	get_doc = NULL;
+	fts_cache_t*	cache = table->fts->cache;
+	bool		need_init = false;
+
+	/* First check that cache->get_docs is initialized */
+	if (!has_cache_lock) {
+		mysql_mutex_lock(&cache->lock);
+	}
+
+	mysql_mutex_lock(&cache->init_lock);
+	if (cache->get_docs == NULL) {
+		cache->get_docs = fts_get_docs_create(cache);
+	}
+	mysql_mutex_unlock(&cache->init_lock);
+
+	if (table->fts->added_synced) {
+		goto func_exit;
+	}
+
+	need_init = true;
+
+	start_doc = cache->synced_doc_id;
+
+	if (!start_doc) {
+		fts_cmp_set_sync_doc_id(table, 0, TRUE, &start_doc);
+		cache->synced_doc_id = start_doc;
+	}
+
+	/* No FTS index: this is the case when the previous FTS index was
+	dropped, and we re-initialize the Doc ID system for subsequent
+	insertions */
+	if (ib_vector_is_empty(cache->get_docs)) {
+		index = table->fts_doc_id_index;
+
+		ut_a(index);
+
+		fts_doc_fetch_by_doc_id(NULL, start_doc, index,
+					FTS_FETCH_DOC_BY_ID_LARGE,
+					fts_init_get_doc_id, table);
+	} else {
+		if (table->fts->cache->stopword_info.status
+		    & STOPWORD_NOT_INIT) {
+			fts_load_stopword(table, NULL, NULL, true, true);
+		}
+
+		for (ulint i = 0; i < ib_vector_size(cache->get_docs); ++i) {
+			get_doc = static_cast<fts_get_doc_t*>(
+				ib_vector_get(cache->get_docs, i));
+
+			index = get_doc->index_cache->index;
+
+			fts_doc_fetch_by_doc_id(NULL, start_doc, index,
+						FTS_FETCH_DOC_BY_ID_LARGE,
+						fts_init_recover_doc, get_doc);
+		}
+	}
+
+	table->fts->added_synced = true;
+
+	fts_get_docs_clear(cache->get_docs);
+
+func_exit:
+	if (!has_cache_lock) {
+		mysql_mutex_unlock(&cache->lock);
+	}
+
+	if (need_init) {
+		dict_sys.lock(SRW_LOCK_CALL);
+		/* Register the table with the optimize thread. */
+		fts_optimize_add_table(table);
+		dict_sys.unlock();
+	}
+}
diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
new file mode 100644
index 00000000..fe31767d
--- /dev/null
+++ b/storage/innobase/fts/fts0opt.cc
@@ -0,0 +1,3054 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0opt.cc
+Full Text Search optimize thread
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+
+***********************************************************************/
+
+#include "fts0fts.h"
+#include "row0sel.h"
+#include "que0types.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+#include "ut0wqueue.h"
+#include "srv0start.h"
+#include "ut0list.h"
+#include "zlib.h"
+#include "fts0opt.h"
+#include "fts0vlc.h"
+#include "wsrep.h"
+
+#ifdef WITH_WSREP
+extern Atomic_relaxed<bool> wsrep_sst_disable_writes;
+#else
+constexpr bool wsrep_sst_disable_writes= false;
+#endif
+
+/** The FTS optimize thread's work queue. */
+ib_wqueue_t* fts_optimize_wq;
+static void fts_optimize_callback(void *);
+static void timer_callback(void*);
+static tpool::timer* timer;
+
+static tpool::task_group task_group(1);
+static tpool::task task(fts_optimize_callback, 0, &task_group);
+
+/** FTS optimize thread, for MDL acquisition */
+static THD *fts_opt_thd;
+
+/** The FTS vector to store fts_slot_t */
+static ib_vector_t* fts_slots;
+
+/** Default optimize interval in secs. */
+static const ulint FTS_OPTIMIZE_INTERVAL_IN_SECS = 300;
+
+/** Set when the server is shutting down and the optimize thread
+should exit */
+static bool fts_opt_start_shutdown = false;
+
+/** Condition variable for shutting down the optimize thread.
+Protected by fts_optimize_wq->mutex. */
+static pthread_cond_t fts_opt_shutdown_cond;
+
+/** Initial size of nodes in fts_word_t. */
+static const ulint FTS_WORD_NODES_INIT_SIZE = 64;
+
+/** Last time we checked whether the system needs a sync */
+static time_t	last_check_sync_time;
+
+/** FTS optimize thread message types. */
+enum fts_msg_type_t {
+	FTS_MSG_STOP,		/*!< Stop optimizing and exit thread */
+
+	FTS_MSG_ADD_TABLE,	/*!< Add table to the optimize thread's
+				work queue */
+
+	FTS_MSG_DEL_TABLE,	/*!< Remove a table from the optimize
+				threads work queue */
+	FTS_MSG_SYNC_TABLE	/*!< Sync fts cache of a table */
+};
+
+/** Compressed list of words that have been read from the FTS INDEX
+and that need to be optimized. */
+struct fts_zip_t {
+	lint		status;		/*!< Status of (un)/zip operation */
+
+	ulint		n_words;	/*!< Number of words compressed */
+
+	ulint		block_sz;	/*!< Size of a block in bytes */
+
+	ib_vector_t*	blocks;		/*!< Vector of compressed blocks */
+
+	ib_alloc_t*	heap_alloc;	/*!< Heap to use for allocations */
+
+	ulint		pos;		/*!< Offset into blocks */
+
+	ulint		last_big_block;	/*!< Offset of last block in the
+					blocks array that is of size
+					block_sz. Blocks beyond this offset
+					are of size FTS_MAX_WORD_LEN */
+
+	z_streamp	zp;		/*!< ZLib state */
+
+					/*!< The value of the last word read
+					from the FTS INDEX table. This is
+					used to discard duplicates */
+
+	fts_string_t	word;		/*!< UTF-8 string */
+
+	ulint		max_words;	/*!< maximum number of words to read
+					in one pass */
+};
+
+/** Prepared statements used during optimize */
+struct fts_optimize_graph_t {
+	/*!< Delete a word from FTS INDEX */
+	que_t*		delete_nodes_graph;
+	/*!< Insert a word into FTS INDEX */
+	que_t*		write_nodes_graph;
+	/*!< COMMIT a transaction */
+	que_t*		commit_graph;
+	/*!< Read the nodes from FTS_INDEX */
+	que_t*		read_nodes_graph;
+};
+
+/** Used by fts_optimize() to store state. */
+struct fts_optimize_t {
+	trx_t*		trx;		/*!< The transaction used for all SQL */
+
+	ib_alloc_t*	self_heap;	/*!< Heap to use for allocations */
+
+	char*		name_prefix;	/*!< FTS table name prefix */
+
+	fts_table_t	fts_index_table;/*!< Common table definition */
+
+	/*!< Common table definition */
+	fts_table_t	fts_common_table;
+
+	dict_table_t*	table;		/*!< Table that has to be queried */
+
+	dict_index_t*	index;		/*!< The FTS index to be optimized */
+
+	fts_doc_ids_t*	to_delete;	/*!< doc ids to delete, we check against
+					this vector and purge the matching
+					entries during the optimizing
+					process. The vector entries are
+					sorted on doc id */
+
+	ulint		del_pos;	/*!< Offset within to_delete vector,
+					this is used to keep track of where
+					we are up to in the vector */
+
+	ibool		done;		/*!< TRUE when optimize finishes */
+
+	ib_vector_t*	words;		/*!< Word + Nodes read from FTS_INDEX,
+					it contains instances of fts_word_t */
+
+	fts_zip_t*	zip;		/*!< Words read from the FTS_INDEX */
+
+	fts_optimize_graph_t
+			graph;		/*!< Prepared statements used
+					during optimize */
+
+	ulint		n_completed;	/*!< Number of FTS indexes that have
+					been optimized */
+	ibool		del_list_regenerated;
+					/*!< BEING_DELETED list regenerated */
+};
+
+/** Used by the optimize, to keep state during compacting nodes. */
+struct fts_encode_t {
+	doc_id_t	src_last_doc_id;/*!< Last doc id read from src node */
+	byte*		src_ilist_ptr;	/*!< Current ptr within src ilist */
+};
+
+/** We use this information to determine when to start the optimize
+cycle for a table. */
+struct fts_slot_t {
+	/** table, or NULL if the slot is unused */
+	dict_table_t*	table;
+
+	/** whether this slot is being processed */
+	bool		running;
+
+	ulint		added;		/*!< Number of doc ids added since the
+					last time this table was optimized */
+
+	ulint		deleted;	/*!< Number of doc ids deleted since the
+					last time this table was optimized */
+
+	/** time(NULL) of completing fts_optimize_table_bk() */
+	time_t		last_run;
+
+	/** time(NULL) of latest successful fts_optimize_table() */
+	time_t		completed;
+};
+
+/** A table remove message for the FTS optimize thread. */
+struct fts_msg_del_t
+{
+  /** the table to remove */
+  dict_table_t *table;
+  /** condition variable to signal message consumption */
+  pthread_cond_t *cond;
+};
+
+/** The FTS optimize message work queue message type. */
+struct fts_msg_t {
+	fts_msg_type_t	type;		/*!< Message type */
+
+	void*		ptr;		/*!< The message contents */
+
+	mem_heap_t*	heap;		/*!< The heap used to allocate this
+					message, the message consumer will
+					free the heap. */
+};
+
+/** The number of words to read and optimize in a single pass. */
+ulong	fts_num_word_optimize;
+
+/** Whether to enable additional FTS diagnostic printout. */
+char	fts_enable_diag_print;
+
+/** ZLib compressed block size.*/
+static ulint FTS_ZIP_BLOCK_SIZE	= 1024;
+
+/** The amount of time optimizing in a single pass, in seconds. */
+static ulint	fts_optimize_time_limit;
+
+/** Defined in fts0fts.cc */
+extern const char* fts_common_tables[];
+
+/** SQL Statement for changing state of rows to be deleted from FTS Index. */
+static const char* fts_init_delete_sql =
+	"BEGIN\n"
+	"\n"
+	"INSERT INTO $BEING_DELETED\n"
+	"SELECT doc_id FROM $DELETED;\n"
+	"\n"
+	"INSERT INTO $BEING_DELETED_CACHE\n"
+	"SELECT doc_id FROM $DELETED_CACHE;\n";
+
+static const char* fts_delete_doc_ids_sql =
+	"BEGIN\n"
+	"\n"
+	"DELETE FROM $DELETED WHERE doc_id = :doc_id1;\n"
+	"DELETE FROM $DELETED_CACHE WHERE doc_id = :doc_id2;\n";
+
+static const char* fts_end_delete_sql =
+	"BEGIN\n"
+	"\n"
+	"DELETE FROM $BEING_DELETED;\n"
+	"DELETE FROM $BEING_DELETED_CACHE;\n";
+
+/**********************************************************************//**
+Initialize fts_zip_t. */
+static
+void
+fts_zip_initialize(
+/*===============*/
+	fts_zip_t*	zip)	/*!< out: zip instance to initialize */
+{
+	zip->pos = 0;
+	zip->n_words = 0;
+
+	zip->status = Z_OK;
+
+	zip->last_big_block = 0;
+
+	zip->word.f_len = 0;
+	*zip->word.f_str = 0;
+
+	ib_vector_reset(zip->blocks);
+
+	memset(zip->zp, 0, sizeof(*zip->zp));
+}
+
+/**********************************************************************//**
+Create an instance of fts_zip_t.
+@return a new instance of fts_zip_t */
+static
+fts_zip_t*
+fts_zip_create(
+/*===========*/
+	mem_heap_t*	heap,		/*!< in: heap */
+	ulint		block_sz,	/*!< in: size of a zip block.*/
+	ulint		max_words)	/*!< in: max words to read */
+{
+	fts_zip_t*	zip;
+
+	zip = static_cast<fts_zip_t*>(mem_heap_zalloc(heap, sizeof(*zip)));
+
+	zip->word.f_str = static_cast<byte*>(
+		mem_heap_zalloc(heap, FTS_MAX_WORD_LEN + 1));
+
+	zip->block_sz = block_sz;
+
+	zip->heap_alloc = ib_heap_allocator_create(heap);
+
+	zip->blocks = ib_vector_create(zip->heap_alloc, sizeof(void*), 128);
+
+	zip->max_words = max_words;
+
+	zip->zp = static_cast<z_stream*>(
+		mem_heap_zalloc(heap, sizeof(*zip->zp)));
+
+	return(zip);
+}
+
+/**********************************************************************//**
+Initialize an instance of fts_zip_t. */
+static
+void
+fts_zip_init(
+/*=========*/
+
+	fts_zip_t*	zip)	/*!< in: zip instance to init */
+{
+	memset(zip->zp, 0, sizeof(*zip->zp));
+
+	zip->word.f_len = 0;
+	*zip->word.f_str = '\0';
+}
+
+/**********************************************************************//**
+Initialize a fts_word_t instance.
+@return the initialized instance */
+static
+fts_word_t*
+fts_word_init(
+/*==========*/
+	fts_word_t*	word,	/*!< in: word to initialize */
+	byte*		utf8,	/*!< in: UTF-8 string */
+	ulint		len)	/*!< in: length of string in bytes */
+{
+	mem_heap_t*	heap = mem_heap_create(sizeof(fts_node_t));
+
+	memset(word, 0, sizeof(*word));
+
+	word->text.f_len = len;
+	word->text.f_str = static_cast<byte*>(mem_heap_alloc(heap, len + 1));
+
+	/* Need to copy the NUL character too. */
+	memcpy(word->text.f_str, utf8, word->text.f_len);
+	word->text.f_str[word->text.f_len] = 0;
+
+	word->heap_alloc = ib_heap_allocator_create(heap);
+
+	word->nodes = ib_vector_create(
+		word->heap_alloc, sizeof(fts_node_t), FTS_WORD_NODES_INIT_SIZE);
+
+	return(word);
+}
+
+/**********************************************************************
+Read the FTS INDEX row.
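+The caller has already consumed the leading word column; the remaining
+columns are DOC_COUNT, FIRST_DOC_ID, LAST_DOC_ID and ILIST, in that
+order (see the switch statement below).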
+
+/**********************************************************************//**
+Read the FTS INDEX row.
+@return fts_node_t instance */
+static
+fts_node_t*
+fts_optimize_read_node(
+/*===================*/
+	fts_word_t*	word,		/*!< in: word whose nodes are read */
+	que_node_t*	exp)		/*!< in: select list of the query */
+{
+	int		i;
+	fts_node_t*	node = static_cast<fts_node_t*>(
+		ib_vector_push(word->nodes, NULL));
+
+	/* Start from 1 since the first node has been read by the caller */
+	for (i = 1; exp; exp = que_node_get_next(exp), ++i) {
+
+		dfield_t*	dfield = que_node_get_val(exp);
+		byte*		data = static_cast<byte*>(
+			dfield_get_data(dfield));
+		ulint		len = dfield_get_len(dfield);
+
+		ut_a(len != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT */
+		switch (i) {
+		case 1: /* DOC_COUNT */
+			node->doc_count = mach_read_from_4(data);
+			break;
+
+		case 2: /* FIRST_DOC_ID */
+			node->first_doc_id = fts_read_doc_id(data);
+			break;
+
+		case 3: /* LAST_DOC_ID */
+			node->last_doc_id = fts_read_doc_id(data);
+			break;
+
+		case 4: /* ILIST */
+			node->ilist_size_alloc = node->ilist_size = len;
+			node->ilist = static_cast<byte*>(ut_malloc_nokey(len));
+			memcpy(node->ilist, data, len);
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	/* Make sure all columns were read. */
+	ut_a(i == 5);
+
+	return(node);
+}
+
+/**********************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record.
+@return TRUE to continue, FALSE if the result cache limit was reached */
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to fts_fetch_t */
+{
+	fts_word_t*	word;
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
+	ib_vector_t*	words = static_cast<ib_vector_t*>(fetch->read_arg);
+	que_node_t*	exp = sel_node->select_list;
+	dfield_t*	dfield = que_node_get_val(exp);
+	void*		data = dfield_get_data(dfield);
+	ulint		dfield_len = dfield_get_len(dfield);
+	fts_node_t*	node;
+	bool		is_word_init = false;
+
+	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
+
+	if (ib_vector_size(words) == 0) {
+
+		word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+		fts_word_init(word, (byte*) data, dfield_len);
+		is_word_init = true;
+	}
+
+	word = static_cast<fts_word_t*>(ib_vector_last(words));
+
+	if (dfield_len != word->text.f_len
+	    || memcmp(word->text.f_str, data, dfield_len)) {
+
+		word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+		fts_word_init(word, (byte*) data, dfield_len);
+		is_word_init = true;
+	}
+
+	node = fts_optimize_read_node(word, que_node_get_next(exp));
+
+	fetch->total_memory += node->ilist_size;
+	if (is_word_init) {
+		fetch->total_memory += sizeof(fts_word_t)
+			+ sizeof(ib_alloc_t) + sizeof(ib_vector_t) + dfield_len
+			+ sizeof(fts_node_t) * FTS_WORD_NODES_INIT_SIZE;
+	} else if (ib_vector_size(words) > FTS_WORD_NODES_INIT_SIZE) {
+		fetch->total_memory += sizeof(fts_node_t);
+	}
+
+	if (fetch->total_memory >= fts_result_cache_limit) {
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
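+
+/* Illustration only, not used by this file: the callback above starts
+a new fts_word_t exactly when the incoming word differs from the last
+one, relying on rows for the same word arriving consecutively. A
+minimal sketch of that grouping idiom over a sorted stream; group_t
+and add_row() are invented names. */
+#if 0
+# include <string>
+# include <vector>
+
+struct group_t {
+	std::string	key;
+	int		rows;
+};
+
+/* Append one row to the last group, opening a new group on the first
+row or whenever the key differs, like fts_word_init() above. */
+static void add_row(std::vector<group_t>& groups, const std::string& key)
+{
+	if (groups.empty() || groups.back().key != key) {
+		groups.push_back(group_t{key, 0});
+	}
+
+	++groups.back().rows;
+}
+#endif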
+
+/**********************************************************************//**
+Read the rows from the FTS index.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: prepared statement */
+	fts_table_t*	fts_table,	/*!< in: table of the FTS INDEX */
+	const fts_string_t*
+			word,		/*!< in: the word to fetch */
+	fts_fetch_t*	fetch)		/*!< in: fetch callback.*/
+{
+	pars_info_t*	info;
+	dberr_t		error;
+	char		table_name[MAX_FULL_NAME_LEN];
+
+	trx->op_info = "fetching FTS index nodes";
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		ulint	selected;
+
+		info = pars_info_create();
+
+		ut_a(fts_table->type == FTS_INDEX_TABLE);
+
+		selected = fts_select_index(fts_table->charset,
+					    word->f_str, word->f_len);
+
+		fts_table->suffix = fts_get_suffix(selected);
+
+		fts_get_table_name(fts_table, table_name);
+
+		pars_info_bind_id(info, "table_name", table_name);
+	}
+
+	pars_info_bind_function(info, "my_func", fetch->read_record, fetch);
+	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+	if (!*graph) {
+
+		*graph = fts_parse_sql(
+			fts_table,
+			info,
+			"DECLARE FUNCTION my_func;\n"
+			"DECLARE CURSOR c IS"
+			" SELECT word, doc_count, first_doc_id, last_doc_id,"
+			" ilist\n"
+			" FROM $table_name\n"
+			" WHERE word LIKE :word\n"
+			" ORDER BY first_doc_id;\n"
+			"BEGIN\n"
+			"\n"
+			"OPEN c;\n"
+			"WHILE 1 = 1 LOOP\n"
+			"  FETCH c INTO my_func();\n"
+			"  IF c % NOTFOUND THEN\n"
+			"    EXIT;\n"
+			"  END IF;\n"
+			"END LOOP;\n"
+			"CLOSE c;");
+	}
+
+	for (;;) {
+		error = fts_eval_sql(trx, *graph);
+
+		if (UNIV_LIKELY(error == DB_SUCCESS)) {
+			fts_sql_commit(trx);
+
+			break;			/* Exit the loop. */
+		} else {
+			fts_sql_rollback(trx);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				ib::warn() << "lock wait timeout reading"
+					" FTS index. Retrying!";
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				ib::error() << "(" << error
+					<< ") while reading FTS index.";
+
+				break;		/* Exit the loop. */
+			}
+		}
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Read a word from the zip buffer.
+@return pointer to the uncompressed word, or NULL at end of stream */
+static
+byte*
+fts_zip_read_word(
+/*==============*/
+	fts_zip_t*	zip,		/*!< in: Zip state + data */
+	fts_string_t*	word)		/*!< out: uncompressed word */
+{
+	short		len = 0;
+	void*		null = NULL;
+	byte*		ptr = word->f_str;
+	int		flush = Z_NO_FLUSH;
+
+	/* Either there was an error or we are at the Z_STREAM_END. */
+	if (zip->status != Z_OK) {
+		return(NULL);
+	}
+
+	zip->zp->next_out = reinterpret_cast<byte*>(&len);
+	zip->zp->avail_out = sizeof(len);
+
+	while (zip->status == Z_OK && zip->zp->avail_out > 0) {
+
+		/* Finished decompressing block. */
+		if (zip->zp->avail_in == 0) {
+
+			/* Free the block that's been decompressed. */
+			if (zip->pos > 0) {
+				ulint	prev = zip->pos - 1;
+
+				ut_a(zip->pos < ib_vector_size(zip->blocks));
+
+				ut_free(ib_vector_getp(zip->blocks, prev));
+				ib_vector_set(zip->blocks, prev, &null);
+			}
+
+			/* Any more blocks to decompress.
*/ + if (zip->pos < ib_vector_size(zip->blocks)) { + + zip->zp->next_in = static_cast( + ib_vector_getp( + zip->blocks, zip->pos)); + + if (zip->pos > zip->last_big_block) { + zip->zp->avail_in = + FTS_MAX_WORD_LEN; + } else { + zip->zp->avail_in = + static_cast(zip->block_sz); + } + + ++zip->pos; + } else { + flush = Z_FINISH; + } + } + + switch (zip->status = inflate(zip->zp, flush)) { + case Z_OK: + if (zip->zp->avail_out == 0 && len > 0) { + + ut_a(len <= FTS_MAX_WORD_LEN); + ptr[len] = 0; + + zip->zp->next_out = ptr; + zip->zp->avail_out = uInt(len); + + word->f_len = ulint(len); + len = 0; + } + break; + + case Z_BUF_ERROR: /* No progress possible. */ + case Z_STREAM_END: + inflateEnd(zip->zp); + break; + + case Z_STREAM_ERROR: + default: + ut_error; + } + } + + /* All blocks must be freed at end of inflate. */ + if (zip->status != Z_OK) { + for (ulint i = 0; i < ib_vector_size(zip->blocks); ++i) { + if (ib_vector_getp(zip->blocks, i)) { + ut_free(ib_vector_getp(zip->blocks, i)); + ib_vector_set(zip->blocks, i, &null); + } + } + } + + if (ptr != NULL) { + ut_ad(word->f_len == strlen((char*) ptr)); + } + + return(zip->status == Z_OK || zip->status == Z_STREAM_END ? ptr : NULL); +} + +/**********************************************************************//** +Callback function to fetch and compress the word in an FTS +INDEX record. +@return FALSE on EOF */ +static +ibool +fts_fetch_index_words( +/*==================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ib_vector_t */ +{ + sel_node_t* sel_node = static_cast(row); + fts_zip_t* zip = static_cast(user_arg); + que_node_t* exp = sel_node->select_list; + dfield_t* dfield = que_node_get_val(exp); + + ut_a(dfield_get_len(dfield) <= FTS_MAX_WORD_LEN); + + uint16 len = uint16(dfield_get_len(dfield)); + void* data = dfield_get_data(dfield); + + /* Skip the duplicate words. */ + if (zip->word.f_len == len && !memcmp(zip->word.f_str, data, len)) { + return(TRUE); + } + + memcpy(zip->word.f_str, data, len); + zip->word.f_len = len; + + ut_a(zip->zp->avail_in == 0); + ut_a(zip->zp->next_in == NULL); + + /* The string is prefixed by len. */ + /* FIXME: This is not byte order agnostic (InnoDB data files + with FULLTEXT INDEX are not portable between little-endian and + big-endian systems!) */ + zip->zp->next_in = reinterpret_cast(&len); + zip->zp->avail_in = sizeof(len); + + /* Compress the word, create output blocks as necessary. */ + while (zip->zp->avail_in > 0) { + + /* No space left in output buffer, create a new one. */ + if (zip->zp->avail_out == 0) { + byte* block; + + block = static_cast( + ut_malloc_nokey(zip->block_sz)); + + ib_vector_push(zip->blocks, &block); + + zip->zp->next_out = block; + zip->zp->avail_out = static_cast(zip->block_sz); + } + + switch (zip->status = deflate(zip->zp, Z_NO_FLUSH)) { + case Z_OK: + if (zip->zp->avail_in == 0) { + zip->zp->next_in = static_cast(data); + zip->zp->avail_in = uInt(len); + ut_a(len <= FTS_MAX_WORD_LEN); + len = 0; + } + continue; + + case Z_STREAM_END: + case Z_BUF_ERROR: + case Z_STREAM_ERROR: + default: + ut_error; + } + } + + /* All data should have been compressed. */ + ut_a(zip->zp->avail_in == 0); + zip->zp->next_in = NULL; + + ++zip->n_words; + + return(zip->n_words >= zip->max_words ? FALSE : TRUE); +} + +/**********************************************************************//** +Finish Zip deflate. 
*/ +static +void +fts_zip_deflate_end( +/*================*/ + fts_zip_t* zip) /*!< in: instance that should be closed*/ +{ + ut_a(zip->zp->avail_in == 0); + ut_a(zip->zp->next_in == NULL); + + zip->status = deflate(zip->zp, Z_FINISH); + + ut_a(ib_vector_size(zip->blocks) > 0); + zip->last_big_block = ib_vector_size(zip->blocks) - 1; + + /* Allocate smaller block(s), since this is trailing data. */ + while (zip->status == Z_OK) { + byte* block; + + ut_a(zip->zp->avail_out == 0); + + block = static_cast( + ut_malloc_nokey(FTS_MAX_WORD_LEN + 1)); + + ib_vector_push(zip->blocks, &block); + + zip->zp->next_out = block; + zip->zp->avail_out = FTS_MAX_WORD_LEN; + + zip->status = deflate(zip->zp, Z_FINISH); + } + + ut_a(zip->status == Z_STREAM_END); + + zip->status = deflateEnd(zip->zp); + ut_a(zip->status == Z_OK); + + /* Reset the ZLib data structure. */ + memset(zip->zp, 0, sizeof(*zip->zp)); +} + +/**********************************************************************//** +Read the words from the FTS INDEX. +@return DB_SUCCESS if all OK, DB_TABLE_NOT_FOUND if no more indexes + to search else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_index_fetch_words( +/*==================*/ + fts_optimize_t* optim, /*!< in: optimize scratch pad */ + const fts_string_t* word, /*!< in: get words greater than this + word */ + ulint n_words)/*!< in: max words to read */ +{ + pars_info_t* info; + que_t* graph; + ulint selected; + fts_zip_t* zip = NULL; + dberr_t error = DB_SUCCESS; + mem_heap_t* heap = static_cast(optim->self_heap->arg); + ibool inited = FALSE; + + optim->trx->op_info = "fetching FTS index words"; + + if (optim->zip == NULL) { + optim->zip = fts_zip_create(heap, FTS_ZIP_BLOCK_SIZE, n_words); + } else { + fts_zip_initialize(optim->zip); + } + + for (selected = fts_select_index( + optim->fts_index_table.charset, word->f_str, word->f_len); + selected < FTS_NUM_AUX_INDEX; + selected++) { + + char table_name[MAX_FULL_NAME_LEN]; + + optim->fts_index_table.suffix = fts_get_suffix(selected); + + info = pars_info_create(); + + pars_info_bind_function( + info, "my_func", fts_fetch_index_words, optim->zip); + + pars_info_bind_varchar_literal( + info, "word", word->f_str, word->f_len); + + fts_get_table_name(&optim->fts_index_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + &optim->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT word\n" + " FROM $table_name\n" + " WHERE word > :word\n" + " ORDER BY word;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + zip = optim->zip; + + for (;;) { + int err; + + if (!inited && ((err = deflateInit(zip->zp, 9)) + != Z_OK)) { + ib::error() << "ZLib deflateInit() failed: " + << err; + + error = DB_ERROR; + break; + } else { + inited = TRUE; + error = fts_eval_sql(optim->trx, graph); + } + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + //FIXME fts_sql_commit(optim->trx); + break; + } else { + //FIXME fts_sql_rollback(optim->trx); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "Lock wait timeout" + " reading document. Retrying!"; + + /* We need to reset the ZLib state. */ + inited = FALSE; + deflateEnd(zip->zp); + fts_zip_init(zip); + + optim->trx->error_state = DB_SUCCESS; + } else { + ib::error() << "(" << error + << ") while reading document."; + + break; /* Exit the loop. 
*/ + } + } + } + + que_graph_free(graph); + + /* Check if max word to fetch is exceeded */ + if (optim->zip->n_words >= n_words) { + break; + } + } + + if (error == DB_SUCCESS && zip->status == Z_OK && zip->n_words > 0) { + + /* All data should have been read. */ + ut_a(zip->zp->avail_in == 0); + + fts_zip_deflate_end(zip); + } else { + deflateEnd(zip->zp); + } + + return(error); +} + +/**********************************************************************//** +Callback function to fetch the doc id from the record. +@return always returns TRUE */ +static +ibool +fts_fetch_doc_ids( +/*==============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ib_vector_t */ +{ + que_node_t* exp; + int i = 0; + sel_node_t* sel_node = static_cast(row); + fts_doc_ids_t* fts_doc_ids = static_cast(user_arg); + doc_id_t* update = static_cast( + ib_vector_push(fts_doc_ids->doc_ids, NULL)); + + for (exp = sel_node->select_list; + exp; + exp = que_node_get_next(exp), ++i) { + + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(len != UNIV_SQL_NULL); + + /* Note: The column numbers below must match the SELECT. */ + switch (i) { + case 0: /* DOC_ID */ + *update = fts_read_doc_id( + static_cast(data)); + break; + + default: + ut_error; + } + } + + return(TRUE); +} + +/**********************************************************************//** +Read the rows from a FTS common auxiliary table. +@return DB_SUCCESS or error code */ +dberr_t +fts_table_fetch_doc_ids( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: table */ + fts_doc_ids_t* doc_ids) /*!< in: For collecting doc ids */ +{ + dberr_t error; + que_t* graph; + pars_info_t* info = pars_info_create(); + ibool alloc_bk_trx = FALSE; + char table_name[MAX_FULL_NAME_LEN]; + + ut_a(fts_table->suffix != NULL); + ut_a(fts_table->type == FTS_COMMON_TABLE); + + if (!trx) { + trx = trx_create(); + alloc_bk_trx = TRUE; + } + + trx->op_info = "fetching FTS doc ids"; + + pars_info_bind_function(info, "my_func", fts_fetch_doc_ids, doc_ids); + + fts_get_table_name(fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT doc_id FROM $table_name;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + error = fts_eval_sql(trx, graph); + fts_sql_commit(trx); + que_graph_free(graph); + + if (error == DB_SUCCESS) { + ib_vector_sort(doc_ids->doc_ids, fts_doc_id_cmp); + } + + if (alloc_bk_trx) { + trx->free(); + } + + return(error); +} + +/**********************************************************************//** +Do a binary search for a doc id in the array +@return +ve index if found -ve index where it should be inserted + if not found */ +int +fts_bsearch( +/*========*/ + doc_id_t* array, /*!< in: array to sort */ + int lower, /*!< in: the array lower bound */ + int upper, /*!< in: the array upper bound */ + doc_id_t doc_id) /*!< in: the doc id to search for */ +{ + int orig_size = upper; + + if (upper == 0) { + /* Nothing to search */ + return(-1); + } else { + while (lower < upper) { + int i = (lower + upper) >> 1; + + if (doc_id > array[i]) { + lower = i + 1; + } else if (doc_id < array[i]) { + upper = i - 1; + } else { + return(i); /* Found. 
*/ + } + } + } + + if (lower == upper && lower < orig_size) { + if (doc_id == array[lower]) { + return(lower); + } else if (lower == 0) { + return(-1); + } + } + + /* Not found. */ + return( (lower == 0) ? -1 : -(lower)); +} + +/**********************************************************************//** +Search in the to delete array whether any of the doc ids within +the [first, last] range are to be deleted +@return +ve index if found -ve index where it should be inserted + if not found */ +static +int +fts_optimize_lookup( +/*================*/ + ib_vector_t* doc_ids, /*!< in: array to search */ + ulint lower, /*!< in: lower limit of array */ + doc_id_t first_doc_id, /*!< in: doc id to lookup */ + doc_id_t last_doc_id) /*!< in: doc id to lookup */ +{ + int pos; + int upper = static_cast(ib_vector_size(doc_ids)); + doc_id_t* array = (doc_id_t*) doc_ids->data; + + pos = fts_bsearch(array, static_cast(lower), upper, first_doc_id); + + ut_a(abs(pos) <= upper + 1); + + if (pos < 0) { + + int i = abs(pos); + + /* If i is 1, it could be first_doc_id is less than + either the first or second array item, do a + double check */ + if (i == 1 && array[0] <= last_doc_id + && first_doc_id < array[0]) { + pos = 0; + } else if (i < upper && array[i] <= last_doc_id) { + + /* Check if the "next" doc id is within the + first & last doc id of the node. */ + pos = i; + } + } + + return(pos); +} + +/**********************************************************************//** +Encode the word pos list into the node +@return DB_SUCCESS or error code*/ +static MY_ATTRIBUTE((nonnull)) +dberr_t +fts_optimize_encode_node( +/*=====================*/ + fts_node_t* node, /*!< in: node to fill*/ + doc_id_t doc_id, /*!< in: doc id to encode */ + fts_encode_t* enc) /*!< in: encoding state.*/ +{ + byte* dst; + ulint enc_len; + ulint pos_enc_len; + doc_id_t doc_id_delta; + dberr_t error = DB_SUCCESS; + const byte* src = enc->src_ilist_ptr; + + if (node->first_doc_id == 0) { + ut_a(node->last_doc_id == 0); + + node->first_doc_id = doc_id; + } + + /* Calculate the space required to store the ilist. */ + ut_ad(doc_id > node->last_doc_id); + doc_id_delta = doc_id - node->last_doc_id; + enc_len = fts_get_encoded_len(static_cast(doc_id_delta)); + + /* Calculate the size of the encoded pos array. */ + while (*src) { + fts_decode_vlc(&src); + } + + /* Skip the 0x00 byte at the end of the word positions list. */ + ++src; + + /* Number of encoded pos bytes to copy. */ + pos_enc_len = ulint(src - enc->src_ilist_ptr); + + /* Total number of bytes required for copy. */ + enc_len += pos_enc_len; + + /* Check we have enough space in the destination buffer for + copying the document word list. */ + if (!node->ilist) { + ulint new_size; + + ut_a(node->ilist_size == 0); + + new_size = enc_len > FTS_ILIST_MAX_SIZE + ? enc_len : FTS_ILIST_MAX_SIZE; + + node->ilist = static_cast(ut_malloc_nokey(new_size)); + node->ilist_size_alloc = new_size; + + } else if ((node->ilist_size + enc_len) > node->ilist_size_alloc) { + ulint new_size = node->ilist_size + enc_len; + byte* ilist = static_cast(ut_malloc_nokey(new_size)); + + memcpy(ilist, node->ilist, node->ilist_size); + + ut_free(node->ilist); + + node->ilist = ilist; + node->ilist_size_alloc = new_size; + } + + src = enc->src_ilist_ptr; + dst = node->ilist + node->ilist_size; + + /* Encode the doc id. Cast to ulint, the delta should be small and + therefore no loss of precision. */ + dst = fts_encode_int(doc_id_delta, dst); + + /* Copy the encoded pos array. 
*/ + memcpy(dst, src, pos_enc_len); + + node->last_doc_id = doc_id; + + /* Data copied upto here. */ + node->ilist_size += enc_len; + enc->src_ilist_ptr += pos_enc_len; + + ut_a(node->ilist_size <= node->ilist_size_alloc); + + return(error); +} + +/**********************************************************************//** +Optimize the data contained in a node. +@return DB_SUCCESS or error code*/ +static MY_ATTRIBUTE((nonnull)) +dberr_t +fts_optimize_node( +/*==============*/ + ib_vector_t* del_vec, /*!< in: vector of doc ids to delete*/ + int* del_pos, /*!< in: offset into above vector */ + fts_node_t* dst_node, /*!< in: node to fill*/ + fts_node_t* src_node, /*!< in: source node for data*/ + fts_encode_t* enc) /*!< in: encoding state */ +{ + ulint copied; + dberr_t error = DB_SUCCESS; + doc_id_t doc_id = enc->src_last_doc_id; + + if (!enc->src_ilist_ptr) { + enc->src_ilist_ptr = src_node->ilist; + } + + copied = ulint(enc->src_ilist_ptr - src_node->ilist); + + /* While there is data in the source node and space to copy + into in the destination node. */ + while (copied < src_node->ilist_size + && dst_node->ilist_size < FTS_ILIST_MAX_SIZE) { + + doc_id_t delta; + doc_id_t del_doc_id = FTS_NULL_DOC_ID; + + delta = fts_decode_vlc( + (const byte**)&enc->src_ilist_ptr); + +test_again: + /* Check whether the doc id is in the delete list, if + so then we skip the entries but we need to track the + delta for decoding the entries following this document's + entries. */ + if (*del_pos >= 0 && *del_pos < (int) ib_vector_size(del_vec)) { + doc_id_t* update; + + update = (doc_id_t*) ib_vector_get( + del_vec, ulint(*del_pos)); + + del_doc_id = *update; + } + + if (enc->src_ilist_ptr == src_node->ilist && doc_id == 0) { + ut_a(delta == src_node->first_doc_id); + } + + doc_id += delta; + + if (del_doc_id > 0 && doc_id == del_doc_id) { + + ++*del_pos; + + /* Skip the entries for this document. */ + while (*enc->src_ilist_ptr) { + fts_decode_vlc((const byte**)&enc->src_ilist_ptr); + } + + /* Skip the end of word position marker. */ + ++enc->src_ilist_ptr; + + } else { + + /* DOC ID already becomes larger than + del_doc_id, check the next del_doc_id */ + if (del_doc_id > 0 && doc_id > del_doc_id) { + del_doc_id = 0; + ++*del_pos; + delta = 0; + goto test_again; + } + + /* Decode and copy the word positions into + the dest node. */ + fts_optimize_encode_node(dst_node, doc_id, enc); + + ++dst_node->doc_count; + + ut_a(dst_node->last_doc_id == doc_id); + } + + /* Bytes copied so for from source. */ + copied = ulint(enc->src_ilist_ptr - src_node->ilist); + } + + if (copied >= src_node->ilist_size) { + ut_a(doc_id == src_node->last_doc_id); + } + + enc->src_last_doc_id = doc_id; + + return(error); +} + +/**********************************************************************//** +Determine the starting pos within the deleted doc id vector for a word. +@return delete position */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +int +fts_optimize_deleted_pos( +/*=====================*/ + fts_optimize_t* optim, /*!< in: optimize state data */ + fts_word_t* word) /*!< in: the word data to check */ +{ + int del_pos; + ib_vector_t* del_vec = optim->to_delete->doc_ids; + + /* Get the first and last dict ids for the word, we will use + these values to determine which doc ids need to be removed + when we coalesce the nodes. This way we can reduce the numer + of elements that need to be searched in the deleted doc ids + vector and secondly we can remove the doc ids during the + coalescing phase. 
*/
+	if (ib_vector_size(del_vec) > 0) {
+		fts_node_t*	node;
+		doc_id_t	last_id;
+		doc_id_t	first_id;
+		ulint		size = ib_vector_size(word->nodes);
+
+		node = (fts_node_t*) ib_vector_get(word->nodes, 0);
+		first_id = node->first_doc_id;
+
+		node = (fts_node_t*) ib_vector_get(word->nodes, size - 1);
+		last_id = node->last_doc_id;
+
+		ut_a(first_id <= last_id);
+
+		del_pos = fts_optimize_lookup(
+			del_vec, optim->del_pos, first_id, last_id);
+	} else {
+
+		del_pos = -1; /* Note that there is nothing to delete. */
+	}
+
+	return(del_pos);
+}
+
+#define FTS_DEBUG_PRINT
+/**********************************************************************//**
+Compact the nodes for a word, we also remove any doc ids during the
+compaction pass.
+@return vector of optimized nodes */
+static
+ib_vector_t*
+fts_optimize_word(
+/*==============*/
+	fts_optimize_t*	optim,	/*!< in: optimize state data */
+	fts_word_t*	word)	/*!< in: the word to optimize */
+{
+	fts_encode_t	enc;
+	ib_vector_t*	nodes;
+	ulint		i = 0;
+	int		del_pos;
+	fts_node_t*	dst_node = NULL;
+	ib_vector_t*	del_vec = optim->to_delete->doc_ids;
+	ulint		size = ib_vector_size(word->nodes);
+
+	del_pos = fts_optimize_deleted_pos(optim, word);
+	nodes = ib_vector_create(word->heap_alloc, sizeof(*dst_node), 128);
+
+	enc.src_last_doc_id = 0;
+	enc.src_ilist_ptr = NULL;
+
+	while (i < size) {
+		ulint		copied;
+		fts_node_t*	src_node;
+
+		src_node = (fts_node_t*) ib_vector_get(word->nodes, i);
+
+		if (dst_node == NULL
+		    || dst_node->last_doc_id > src_node->first_doc_id) {
+
+			dst_node = static_cast<fts_node_t*>(
+				ib_vector_push(nodes, NULL));
+			memset(dst_node, 0, sizeof(*dst_node));
+		}
+
+		/* Copy from the src to the dst node. */
+		fts_optimize_node(del_vec, &del_pos, dst_node, src_node, &enc);
+
+		ut_a(enc.src_ilist_ptr != NULL);
+
+		/* Determine the number of bytes copied to dst_node. */
+		copied = ulint(enc.src_ilist_ptr - src_node->ilist);
+
+		/* Can't copy more than what's in the vlc array. */
+		ut_a(copied <= src_node->ilist_size);
+
+		/* We are done with this node, release the resources. */
+		if (copied == src_node->ilist_size) {
+
+			enc.src_last_doc_id = 0;
+			enc.src_ilist_ptr = NULL;
+
+			ut_free(src_node->ilist);
+
+			src_node->ilist = NULL;
+			src_node->ilist_size = src_node->ilist_size_alloc = 0;
+
+			src_node = NULL;
+
+			++i; /* Get next source node to OPTIMIZE. */
+		}
+
+		if (dst_node->ilist_size >= FTS_ILIST_MAX_SIZE || i >= size) {
+
+			dst_node = NULL;
+		}
+	}
+
+	/* All dst nodes created should have been added to the vector. */
+	ut_a(dst_node == NULL);
+
+	/* Return the OPTIMIZED nodes. */
+	return(nodes);
+}
+
+/**********************************************************************//**
+Update the FTS index table. This is a delete followed by an insert.
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_write_word( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: table of FTS index */ + fts_string_t* word, /*!< in: word data to write */ + ib_vector_t* nodes) /*!< in: the nodes to write */ +{ + ulint i; + pars_info_t* info; + que_t* graph; + ulint selected; + dberr_t error = DB_SUCCESS; + char table_name[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + ut_ad(fts_table->charset); + + pars_info_bind_varchar_literal( + info, "word", word->f_str, word->f_len); + + selected = fts_select_index(fts_table->charset, + word->f_str, word->f_len); + + fts_table->suffix = fts_get_suffix(selected); + fts_get_table_name(fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, + info, + "BEGIN DELETE FROM $table_name WHERE word = :word;"); + + error = fts_eval_sql(trx, graph); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ib::error() << "(" << error << ") during optimize," + " when deleting a word from the FTS index."; + } + + que_graph_free(graph); + graph = NULL; + + /* Even if the operation needs to be rolled back and redone, + we iterate over the nodes in order to free the ilist. */ + for (i = 0; i < ib_vector_size(nodes); ++i) { + + fts_node_t* node = (fts_node_t*) ib_vector_get(nodes, i); + + if (error == DB_SUCCESS) { + /* Skip empty node. */ + if (node->ilist == NULL) { + ut_ad(node->ilist_size == 0); + continue; + } + + error = fts_write_node( + trx, &graph, fts_table, word, node); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ib::error() << "(" << error << ")" + " during optimize, while adding a" + " word to the FTS index."; + } + } + + ut_free(node->ilist); + node->ilist = NULL; + node->ilist_size = node->ilist_size_alloc = 0; + } + + if (graph != NULL) { + que_graph_free(graph); + } + + return(error); +} + +/**********************************************************************//** +Free fts_optimizer_word_t instanace.*/ +void +fts_word_free( +/*==========*/ + fts_word_t* word) /*!< in: instance to free.*/ +{ + mem_heap_t* heap = static_cast(word->heap_alloc->arg); + +#ifdef UNIV_DEBUG + memset(word, 0, sizeof(*word)); +#endif /* UNIV_DEBUG */ + + mem_heap_free(heap); +} + +/**********************************************************************//** +Optimize the word ilist and rewrite data to the FTS index. +@return status one of RESTART, EXIT, ERROR */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_compact( +/*=================*/ + fts_optimize_t* optim, /*!< in: optimize state data */ + dict_index_t* index, /*!< in: current FTS being optimized */ + time_t start_time) /*!< in: optimize start time */ +{ + ulint i; + dberr_t error = DB_SUCCESS; + ulint size = ib_vector_size(optim->words); + + for (i = 0; i < size && error == DB_SUCCESS && !optim->done; ++i) { + fts_word_t* word; + ib_vector_t* nodes; + trx_t* trx = optim->trx; + + word = (fts_word_t*) ib_vector_get(optim->words, i); + + /* nodes is allocated from the word heap and will be destroyed + when the word is freed. We however have to be careful about + the ilist, that needs to be freed explicitly. */ + nodes = fts_optimize_word(optim, word); + + /* Update the data on disk. 
*/ + error = fts_optimize_write_word( + trx, &optim->fts_index_table, &word->text, nodes); + + if (error == DB_SUCCESS) { + /* Write the last word optimized to the config table, + we use this value for restarting optimize. */ + error = fts_config_set_index_value( + optim->trx, index, + FTS_LAST_OPTIMIZED_WORD, &word->text); + } + + /* Free the word that was optimized. */ + fts_word_free(word); + + ulint interval = ulint(time(NULL) - start_time); + + if (fts_optimize_time_limit > 0 + && (lint(interval) < 0 + || interval > fts_optimize_time_limit)) { + + optim->done = TRUE; + } + } + + return(error); +} + +/**********************************************************************//** +Create an instance of fts_optimize_t. Also create a new +background transaction.*/ +static +fts_optimize_t* +fts_optimize_create( +/*================*/ + dict_table_t* table) /*!< in: table with FTS indexes */ +{ + fts_optimize_t* optim; + mem_heap_t* heap = mem_heap_create(128); + + optim = (fts_optimize_t*) mem_heap_zalloc(heap, sizeof(*optim)); + + optim->self_heap = ib_heap_allocator_create(heap); + + optim->to_delete = fts_doc_ids_create(); + + optim->words = ib_vector_create( + optim->self_heap, sizeof(fts_word_t), 256); + + optim->table = table; + + optim->trx = trx_create(); + trx_start_internal(optim->trx); + + optim->fts_common_table.table_id = table->id; + optim->fts_common_table.type = FTS_COMMON_TABLE; + optim->fts_common_table.table = table; + + optim->fts_index_table.table_id = table->id; + optim->fts_index_table.type = FTS_INDEX_TABLE; + optim->fts_index_table.table = table; + + /* The common prefix for all this parent table's aux tables. */ + char table_id[FTS_AUX_MIN_TABLE_ID_LENGTH]; + const size_t table_id_len = 1 + + size_t(fts_get_table_id(&optim->fts_common_table, table_id)); + dict_sys.freeze(SRW_LOCK_CALL); + /* Include the separator as well. */ + const size_t dbname_len = table->name.dblen() + 1; + ut_ad(dbname_len > 1); + const size_t prefix_name_len = dbname_len + 4 + table_id_len; + char* prefix_name = static_cast( + ut_malloc_nokey(prefix_name_len)); + memcpy(prefix_name, table->name.m_name, dbname_len); + dict_sys.unfreeze(); + memcpy(prefix_name + dbname_len, "FTS_", 4); + memcpy(prefix_name + dbname_len + 4, table_id, table_id_len); + optim->name_prefix =prefix_name; + + return(optim); +} + +#ifdef FTS_OPTIMIZE_DEBUG +/**********************************************************************//** +Get optimize start time of an FTS index. +@return DB_SUCCESS if all OK else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_get_index_start_time( +/*==============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + time_t* start_time) /*!< out: time in secs */ +{ + return(fts_config_get_index_ulint( + trx, index, FTS_OPTIMIZE_START_TIME, + (ulint*) start_time)); +} + +/**********************************************************************//** +Set the optimize start time of an FTS index. 
+@return DB_SUCCESS if all OK else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_set_index_start_time( +/*==============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + time_t start_time) /*!< in: start time */ +{ + return(fts_config_set_index_ulint( + trx, index, FTS_OPTIMIZE_START_TIME, + (ulint) start_time)); +} + +/**********************************************************************//** +Get optimize end time of an FTS index. +@return DB_SUCCESS if all OK else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_get_index_end_time( +/*============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + time_t* end_time) /*!< out: time in secs */ +{ + return(fts_config_get_index_ulint( + trx, index, FTS_OPTIMIZE_END_TIME, (ulint*) end_time)); +} + +/**********************************************************************//** +Set the optimize end time of an FTS index. +@return DB_SUCCESS if all OK else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_set_index_end_time( +/*============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + time_t end_time) /*!< in: end time */ +{ + return(fts_config_set_index_ulint( + trx, index, FTS_OPTIMIZE_END_TIME, (ulint) end_time)); +} +#endif + +/**********************************************************************//** +Free the optimize prepared statements.*/ +static +void +fts_optimize_graph_free( +/*====================*/ + fts_optimize_graph_t* graph) /*!< in/out: The graph instances + to free */ +{ + if (graph->commit_graph) { + que_graph_free(graph->commit_graph); + graph->commit_graph = NULL; + } + + if (graph->write_nodes_graph) { + que_graph_free(graph->write_nodes_graph); + graph->write_nodes_graph = NULL; + } + + if (graph->delete_nodes_graph) { + que_graph_free(graph->delete_nodes_graph); + graph->delete_nodes_graph = NULL; + } + + if (graph->read_nodes_graph) { + que_graph_free(graph->read_nodes_graph); + graph->read_nodes_graph = NULL; + } +} + +/**********************************************************************//** +Free all optimize resources. */ +static +void +fts_optimize_free( +/*==============*/ + fts_optimize_t* optim) /*!< in: table with on FTS index */ +{ + mem_heap_t* heap = static_cast(optim->self_heap->arg); + + trx_commit_for_mysql(optim->trx); + optim->trx->free(); + optim->trx = NULL; + + fts_doc_ids_free(optim->to_delete); + fts_optimize_graph_free(&optim->graph); + + ut_free(optim->name_prefix); + + /* This will free the heap from which optim itself was allocated. */ + mem_heap_free(heap); +} + +/**********************************************************************//** +Get the max time optimize should run in millisecs. +@return max optimize time limit in millisecs. */ +static +ulint +fts_optimize_get_time_limit( +/*========================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table) /*!< in: aux table */ +{ + ulint time_limit = 0; + + fts_config_get_ulint( + trx, fts_table, + FTS_OPTIMIZE_LIMIT_IN_SECS, &time_limit); + + /* FIXME: This is returning milliseconds, while the variable + is being stored and interpreted as seconds! */ + return(time_limit * 1000); +} + +/**********************************************************************//** +Run OPTIMIZE on the given table. Note: this can take a very long time +(hours). 
*/ +static +void +fts_optimize_words( +/*===============*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index, /*!< in: current FTS being optimized */ + fts_string_t* word) /*!< in: the starting word to optimize */ +{ + fts_fetch_t fetch; + que_t* graph = NULL; + CHARSET_INFO* charset = optim->fts_index_table.charset; + + ut_a(!optim->done); + + /* Get the time limit from the config table. */ + fts_optimize_time_limit = fts_optimize_get_time_limit( + optim->trx, &optim->fts_common_table); + + const time_t start_time = time(NULL); + + /* Setup the callback to use for fetching the word ilist etc. */ + fetch.read_arg = optim->words; + fetch.read_record = fts_optimize_index_fetch_node; + + while (!optim->done) { + dberr_t error; + trx_t* trx = optim->trx; + ulint selected; + + ut_a(ib_vector_size(optim->words) == 0); + + selected = fts_select_index(charset, word->f_str, word->f_len); + + /* Read the index records to optimize. */ + fetch.total_memory = 0; + error = fts_index_fetch_nodes( + trx, &graph, &optim->fts_index_table, word, + &fetch); + ut_ad(fetch.total_memory < fts_result_cache_limit); + + if (error == DB_SUCCESS) { + /* There must be some nodes to read. */ + ut_a(ib_vector_size(optim->words) > 0); + + /* Optimize the nodes that were read and write + back to DB. */ + error = fts_optimize_compact(optim, index, start_time); + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + } + + ib_vector_reset(optim->words); + + if (error == DB_SUCCESS) { + if (!optim->done) { + if (!fts_zip_read_word(optim->zip, word)) { + optim->done = TRUE; + } else if (selected + != fts_select_index( + charset, word->f_str, + word->f_len) + && graph) { + que_graph_free(graph); + graph = NULL; + } + } + } else if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "Lock wait timeout during optimize." + " Retrying!"; + + trx->error_state = DB_SUCCESS; + } else if (error == DB_DEADLOCK) { + ib::warn() << "Deadlock during optimize. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + optim->done = TRUE; /* Exit the loop. */ + } + } + + if (graph != NULL) { + que_graph_free(graph); + } +} + +/**********************************************************************//** +Optimize is complete. Set the completion time, and reset the optimize +start string for this FTS index to "". +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_index_completed( +/*=========================*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index) /*!< in: table with one FTS index */ +{ + fts_string_t word; + dberr_t error; + byte buf[sizeof(ulint)]; +#ifdef FTS_OPTIMIZE_DEBUG + time_t end_time = time(NULL); + + error = fts_optimize_set_index_end_time(optim->trx, index, end_time); +#endif + + /* If we've reached the end of the index then set the start + word to the empty string. */ + + word.f_len = 0; + word.f_str = buf; + *word.f_str = '\0'; + + error = fts_config_set_index_value( + optim->trx, index, FTS_LAST_OPTIMIZED_WORD, &word); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ib::error() << "(" << error << ") while updating" + " last optimized word!"; + } + + return(error); +} + + +/**********************************************************************//** +Read the list of words from the FTS auxiliary index that will be +optimized in this pass. 
+@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_index_read_words( +/*==========================*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index, /*!< in: table with one FTS index */ + fts_string_t* word) /*!< in: buffer to use */ +{ + dberr_t error = DB_SUCCESS; + + if (optim->del_list_regenerated) { + word->f_len = 0; + } else { + + /* Get the last word that was optimized from + the config table. */ + error = fts_config_get_index_value( + optim->trx, index, FTS_LAST_OPTIMIZED_WORD, word); + } + + /* If record not found then we start from the top. */ + if (error == DB_RECORD_NOT_FOUND) { + word->f_len = 0; + error = DB_SUCCESS; + } + + while (error == DB_SUCCESS) { + + error = fts_index_fetch_words( + optim, word, fts_num_word_optimize); + + if (error == DB_SUCCESS) { + /* Reset the last optimized word to '' if no + more words could be read from the FTS index. */ + if (optim->zip->n_words == 0) { + word->f_len = 0; + *word->f_str = 0; + } + + break; + } + } + + return(error); +} + +/**********************************************************************//** +Run OPTIMIZE on the given FTS index. Note: this can take a very long +time (hours). +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_index( +/*===============*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index) /*!< in: table with one FTS index */ +{ + fts_string_t word; + dberr_t error; + byte str[FTS_MAX_WORD_LEN + 1]; + + /* Set the current index that we have to optimize. */ + optim->fts_index_table.index_id = index->id; + optim->fts_index_table.charset = fts_index_get_charset(index); + + optim->done = FALSE; /* Optimize until !done */ + + /* We need to read the last word optimized so that we start from + the next word. */ + word.f_str = str; + + /* We set the length of word to the size of str since we + need to pass the max len info to the fts_get_config_value() function. */ + word.f_len = sizeof(str) - 1; + + memset(word.f_str, 0x0, word.f_len); + + /* Read the words that will be optimized in this pass. */ + error = fts_optimize_index_read_words(optim, index, &word); + + if (error == DB_SUCCESS) { + int zip_error; + + ut_a(optim->zip->pos == 0); + ut_a(optim->zip->zp->total_in == 0); + ut_a(optim->zip->zp->total_out == 0); + + zip_error = inflateInit(optim->zip->zp); + ut_a(zip_error == Z_OK); + + word.f_len = 0; + word.f_str = str; + + /* Read the first word to optimize from the Zip buffer. */ + if (!fts_zip_read_word(optim->zip, &word)) { + + optim->done = TRUE; + } else { + fts_optimize_words(optim, index, &word); + } + + /* If we couldn't read any records then optimize is + complete. Increment the number of indexes that have + been optimized and set FTS index optimize state to + completed. */ + if (error == DB_SUCCESS && optim->zip->n_words == 0) { + + error = fts_optimize_index_completed(optim, index); + + if (error == DB_SUCCESS) { + ++optim->n_completed; + } + } + } + + return(error); +} + +/**********************************************************************//** +Delete the document ids in the delete, and delete cache tables. 
+@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_purge_deleted_doc_ids( +/*===============================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + ulint i; + pars_info_t* info; + que_t* graph; + doc_id_t* update; + doc_id_t write_doc_id; + dberr_t error = DB_SUCCESS; + char deleted[MAX_FULL_NAME_LEN]; + char deleted_cache[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + ut_a(ib_vector_size(optim->to_delete->doc_ids) > 0); + + update = static_cast( + ib_vector_get(optim->to_delete->doc_ids, 0)); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, *update); + + /* This is required for the SQL parser to work. It must be able + to find the following variables. So we do it twice. */ + fts_bind_doc_id(info, "doc_id1", &write_doc_id); + fts_bind_doc_id(info, "doc_id2", &write_doc_id); + + /* Make sure the following two names are consistent with the name + used in the fts_delete_doc_ids_sql */ + optim->fts_common_table.suffix = fts_common_tables[3]; + fts_get_table_name(&optim->fts_common_table, deleted); + pars_info_bind_id(info, fts_common_tables[3], deleted); + + optim->fts_common_table.suffix = fts_common_tables[4]; + fts_get_table_name(&optim->fts_common_table, deleted_cache); + pars_info_bind_id(info, fts_common_tables[4], deleted_cache); + + graph = fts_parse_sql(NULL, info, fts_delete_doc_ids_sql); + + /* Delete the doc ids that were copied at the start. */ + for (i = 0; i < ib_vector_size(optim->to_delete->doc_ids); ++i) { + + update = static_cast(ib_vector_get( + optim->to_delete->doc_ids, i)); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, *update); + + fts_bind_doc_id(info, "doc_id1", &write_doc_id); + + fts_bind_doc_id(info, "doc_id2", &write_doc_id); + + error = fts_eval_sql(optim->trx, graph); + + // FIXME: Check whether delete actually succeeded! + if (error != DB_SUCCESS) { + + fts_sql_rollback(optim->trx); + break; + } + } + + que_graph_free(graph); + + return(error); +} + +/**********************************************************************//** +Delete the document ids in the pending delete, and delete tables. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_purge_deleted_doc_id_snapshot( +/*=======================================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + dberr_t error; + que_t* graph; + pars_info_t* info; + char being_deleted[MAX_FULL_NAME_LEN]; + char being_deleted_cache[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + /* Make sure the following two names are consistent with the name + used in the fts_end_delete_sql */ + optim->fts_common_table.suffix = fts_common_tables[0]; + fts_get_table_name(&optim->fts_common_table, being_deleted); + pars_info_bind_id(info, fts_common_tables[0], being_deleted); + + optim->fts_common_table.suffix = fts_common_tables[1]; + fts_get_table_name(&optim->fts_common_table, being_deleted_cache); + pars_info_bind_id(info, fts_common_tables[1], being_deleted_cache); + + /* Delete the doc ids that were copied to delete pending state at + the start of optimize. 
*/
+	graph = fts_parse_sql(NULL, info, fts_end_delete_sql);
+
+	error = fts_eval_sql(optim->trx, graph);
+	que_graph_free(graph);
+
+	return(error);
+}
+
+/**********************************************************************//**
+Get the number of doc ids still in the BEING_DELETED table, i.e. those
+left over from a previous optimize run.
+@return number of rows in the BEING_DELETED table */
+static
+ulint
+fts_optimize_being_deleted_count(
+/*=============================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, "BEING_DELETED", FTS_COMMON_TABLE,
+			   optim->table);
+
+	return(fts_get_rows_count(&fts_table));
+}
+
+/*********************************************************************//**
+Copy the deleted doc ids that will be purged during this optimize run
+to the being deleted FTS auxiliary tables. The transaction is committed
+upon successful copy and rolled back on DB_DUPLICATE_KEY error.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_create_deleted_doc_id_snapshot(
+/*========================================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	dberr_t		error;
+	que_t*		graph;
+	pars_info_t*	info;
+	char		being_deleted[MAX_FULL_NAME_LEN];
+	char		deleted[MAX_FULL_NAME_LEN];
+	char		being_deleted_cache[MAX_FULL_NAME_LEN];
+	char		deleted_cache[MAX_FULL_NAME_LEN];
+
+	info = pars_info_create();
+
+	/* Make sure the following four names are consistent with the names
+	used in fts_init_delete_sql */
+	optim->fts_common_table.suffix = fts_common_tables[0];
+	fts_get_table_name(&optim->fts_common_table, being_deleted);
+	pars_info_bind_id(info, fts_common_tables[0], being_deleted);
+
+	optim->fts_common_table.suffix = fts_common_tables[3];
+	fts_get_table_name(&optim->fts_common_table, deleted);
+	pars_info_bind_id(info, fts_common_tables[3], deleted);
+
+	optim->fts_common_table.suffix = fts_common_tables[1];
+	fts_get_table_name(&optim->fts_common_table, being_deleted_cache);
+	pars_info_bind_id(info, fts_common_tables[1], being_deleted_cache);
+
+	optim->fts_common_table.suffix = fts_common_tables[4];
+	fts_get_table_name(&optim->fts_common_table, deleted_cache);
+	pars_info_bind_id(info, fts_common_tables[4], deleted_cache);
+
+	/* Move doc_ids that are to be deleted to state being deleted. */
+	graph = fts_parse_sql(NULL, info, fts_init_delete_sql);
+
+	error = fts_eval_sql(optim->trx, graph);
+
+	que_graph_free(graph);
+
+	if (error != DB_SUCCESS) {
+		fts_sql_rollback(optim->trx);
+	} else {
+		fts_sql_commit(optim->trx);
+	}
+
+	optim->del_list_regenerated = TRUE;
+
+	return(error);
+}
+
+/*********************************************************************//**
+Read in the document ids that are to be purged during optimize. The
+transaction is committed upon a successful read.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_read_deleted_doc_id_snapshot(
+/*======================================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	dberr_t		error;
+
+	optim->fts_common_table.suffix = "BEING_DELETED";
+
+	/* Read the doc_ids to delete. */
+	error = fts_table_fetch_doc_ids(
+		optim->trx, &optim->fts_common_table, optim->to_delete);
+
+	if (error == DB_SUCCESS) {
+
+		optim->fts_common_table.suffix = "BEING_DELETED_CACHE";
+
+		/* Read additional doc_ids to delete.
*/ + error = fts_table_fetch_doc_ids( + optim->trx, &optim->fts_common_table, optim->to_delete); + } + + if (error != DB_SUCCESS) { + + fts_doc_ids_free(optim->to_delete); + optim->to_delete = NULL; + } + + return(error); +} + +/*********************************************************************//** +Optimze all the FTS indexes, skipping those that have already been +optimized, since the FTS auxiliary indexes are not guaranteed to be +of the same cardinality. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_indexes( +/*=================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + ulint i; + dberr_t error = DB_SUCCESS; + fts_t* fts = optim->table->fts; + + /* Optimize the FTS indexes. */ + for (i = 0; i < ib_vector_size(fts->indexes); ++i) { + dict_index_t* index; + +#ifdef FTS_OPTIMIZE_DEBUG + time_t end_time; + time_t start_time; + + /* Get the start and end optimize times for this index. */ + error = fts_optimize_get_index_start_time( + optim->trx, index, &start_time); + + if (error != DB_SUCCESS) { + break; + } + + error = fts_optimize_get_index_end_time( + optim->trx, index, &end_time); + + if (error != DB_SUCCESS) { + break; + } + + /* Start time will be 0 only for the first time or after + completing the optimization of all FTS indexes. */ + if (start_time == 0) { + start_time = time(NULL); + + error = fts_optimize_set_index_start_time( + optim->trx, index, start_time); + } + + /* Check if this index needs to be optimized or not. */ + if (difftime(end_time, start_time) < 0) { + error = fts_optimize_index(optim, index); + + if (error != DB_SUCCESS) { + break; + } + } else { + ++optim->n_completed; + } +#endif + index = static_cast( + ib_vector_getp(fts->indexes, i)); + error = fts_optimize_index(optim, index); + } + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + + return(error); +} + +/*********************************************************************//** +Cleanup the snapshot tables and the master deleted table. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_purge_snapshot( +/*========================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + dberr_t error; + + /* Delete the doc ids from the master deleted tables, that were + in the snapshot that was taken at the start of optimize. */ + error = fts_optimize_purge_deleted_doc_ids(optim); + + if (error == DB_SUCCESS) { + /* Destroy the deleted doc id snapshot. */ + error = fts_optimize_purge_deleted_doc_id_snapshot(optim); + } + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + + return(error); +} + +/*********************************************************************//** +Reset the start time to 0 so that a new optimize can be started. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_reset_start_time( +/*==========================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + dberr_t error = DB_SUCCESS; +#ifdef FTS_OPTIMIZE_DEBUG + fts_t* fts = optim->table->fts; + + /* Optimization should have been completed for all indexes. */ + ut_a(optim->n_completed == ib_vector_size(fts->indexes)); + + for (uint i = 0; i < ib_vector_size(fts->indexes); ++i) { + dict_index_t* index; + + time_t start_time = 0; + + /* Reset the start time to 0 for this index. 
*/ + error = fts_optimize_set_index_start_time( + optim->trx, index, start_time); + + index = static_cast( + ib_vector_getp(fts->indexes, i)); + } +#endif + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + + return(error); +} + +/*********************************************************************//** +Run OPTIMIZE on the given table by a background thread. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull)) +dberr_t +fts_optimize_table_bk( +/*==================*/ + fts_slot_t* slot) /*!< in: table to optimiza */ +{ + const time_t now = time(NULL); + const ulint interval = ulint(now - slot->last_run); + + /* Avoid optimizing tables that were optimized recently. */ + if (slot->last_run > 0 + && lint(interval) >= 0 + && interval < FTS_OPTIMIZE_INTERVAL_IN_SECS) { + + return(DB_SUCCESS); + } + + dict_table_t* table = slot->table; + dberr_t error; + + if (table->is_accessible() + && table->fts && table->fts->cache + && table->fts->cache->deleted >= FTS_OPTIMIZE_THRESHOLD) { + error = fts_optimize_table(table); + + slot->last_run = time(NULL); + + if (error == DB_SUCCESS) { + slot->running = false; + slot->completed = slot->last_run; + } + } else { + /* Note time this run completed. */ + slot->last_run = now; + error = DB_SUCCESS; + } + + return(error); +} +/*********************************************************************//** +Run OPTIMIZE on the given table. +@return DB_SUCCESS if all OK */ +dberr_t +fts_optimize_table( +/*===============*/ + dict_table_t* table) /*!< in: table to optimiza */ +{ + if (srv_read_only_mode) { + return DB_READ_ONLY; + } + + dberr_t error = DB_SUCCESS; + fts_optimize_t* optim = NULL; + fts_t* fts = table->fts; + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "FTS start optimize " << table->name; + } + + optim = fts_optimize_create(table); + + // FIXME: Call this only at the start of optimize, currently we + // rely on DB_DUPLICATE_KEY to handle corrupting the snapshot. + + /* Check whether there are still records in BEING_DELETED table */ + if (fts_optimize_being_deleted_count(optim) == 0) { + /* Take a snapshot of the deleted document ids, they are copied + to the BEING_ tables. */ + error = fts_optimize_create_deleted_doc_id_snapshot(optim); + } + + /* A duplicate error is OK, since we don't erase the + doc ids from the being deleted state until all FTS + indexes have been optimized. */ + if (error == DB_DUPLICATE_KEY) { + error = DB_SUCCESS; + } + + if (error == DB_SUCCESS) { + + /* These document ids will be filtered out during the + index optimization phase. They are in the snapshot that we + took above, at the start of the optimize. */ + error = fts_optimize_read_deleted_doc_id_snapshot(optim); + + if (error == DB_SUCCESS) { + + /* Commit the read of being deleted + doc ids transaction. */ + fts_sql_commit(optim->trx); + + /* We would do optimization only if there + are deleted records to be cleaned up */ + if (ib_vector_size(optim->to_delete->doc_ids) > 0) { + error = fts_optimize_indexes(optim); + } + + } else { + ut_a(optim->to_delete == NULL); + } + + /* Only after all indexes have been optimized can we + delete the (snapshot) doc ids in the pending delete, + and master deleted tables. 
*/ + if (error == DB_SUCCESS + && optim->n_completed == ib_vector_size(fts->indexes)) { + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "FTS_OPTIMIZE: Completed" + " Optimize, cleanup DELETED table"; + } + + if (ib_vector_size(optim->to_delete->doc_ids) > 0) { + + /* Purge the doc ids that were in the + snapshot from the snapshot tables and + the master deleted table. */ + error = fts_optimize_purge_snapshot(optim); + } + + if (error == DB_SUCCESS) { + /* Reset the start time of all the FTS indexes + so that optimize can be restarted. */ + error = fts_optimize_reset_start_time(optim); + } + } + } + + fts_optimize_free(optim); + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "FTS end optimize " << table->name; + } + + return(error); +} + +/********************************************************************//** +Add the table to add to the OPTIMIZER's list. +@return new message instance */ +static +fts_msg_t* +fts_optimize_create_msg( +/*====================*/ + fts_msg_type_t type, /*!< in: type of message */ + void* ptr) /*!< in: message payload */ +{ + mem_heap_t* heap; + fts_msg_t* msg; + + heap = mem_heap_create(sizeof(*msg) + sizeof(ib_list_node_t) + 16); + msg = static_cast(mem_heap_alloc(heap, sizeof(*msg))); + + msg->ptr = ptr; + msg->type = type; + msg->heap = heap; + + return(msg); +} + +/** Add message to wqueue, signal thread pool*/ +static void add_msg(fts_msg_t *msg) +{ + ib_wqueue_add(fts_optimize_wq, msg, msg->heap, true); + srv_thread_pool->submit_task(&task); +} + +/** +Called by "idle" timer. Submits optimize task, which +will only recalculate is_sync_needed, in case the queue is empty. +*/ +static void timer_callback(void*) +{ + srv_thread_pool->submit_task(&task); +} + +/** Add the table to add to the OPTIMIZER's list. +@param[in] table table to add */ +void fts_optimize_add_table(dict_table_t* table) +{ + fts_msg_t* msg; + + if (!fts_optimize_wq) { + return; + } + + /* Make sure table with FTS index cannot be evicted */ + dict_sys.prevent_eviction(table); + + msg = fts_optimize_create_msg(FTS_MSG_ADD_TABLE, table); + + mysql_mutex_lock(&fts_optimize_wq->mutex); + + add_msg(msg); + + table->fts->in_queue = true; + + mysql_mutex_unlock(&fts_optimize_wq->mutex); +} + +/**********************************************************************//** +Remove the table from the OPTIMIZER's list. We do wait for +acknowledgement from the consumer of the message. */ +void +fts_optimize_remove_table( +/*======================*/ + dict_table_t* table) /*!< in: table to remove */ +{ + if (!fts_optimize_wq) + return; + + if (fts_opt_start_shutdown) + { + ib::info() << "Try to remove table " << table->name + << " after FTS optimize thread exiting."; + while (fts_optimize_wq) + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + return; + } + + mysql_mutex_lock(&fts_optimize_wq->mutex); + + if (table->fts->in_queue) + { + fts_msg_t *msg= fts_optimize_create_msg(FTS_MSG_DEL_TABLE, nullptr); + pthread_cond_t cond; + pthread_cond_init(&cond, nullptr); + msg->ptr= new(mem_heap_alloc(msg->heap, sizeof(fts_msg_del_t))) + fts_msg_del_t{table, &cond}; + add_msg(msg); + my_cond_wait(&cond, &fts_optimize_wq->mutex.m_mutex); + pthread_cond_destroy(&cond); + ut_ad(!table->fts->in_queue); + } + + mysql_mutex_unlock(&fts_optimize_wq->mutex); +} + +/** Send sync fts cache for the table. 
+
+/** Request a sync of the FTS cache for the table.
+@param[in]	table	table to sync */
+void
+fts_optimize_request_sync_table(
+	dict_table_t*	table)
+{
+	/* If the optimize system is not yet initialized, return. */
+	if (!fts_optimize_wq) {
+		return;
+	}
+
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+
+	/* The FTS optimizer thread has already exited. */
+	if (fts_opt_start_shutdown) {
+		ib::info() << "Try to sync table " << table->name
+			   << " after FTS optimize thread exiting.";
+	} else if (table->fts->sync_message) {
+		/* If the table already has a SYNC message in
+		the fts_optimize_wq queue then ignore it. */
+	} else {
+		add_msg(fts_optimize_create_msg(FTS_MSG_SYNC_TABLE, table));
+		table->fts->sync_message = true;
+		DBUG_EXECUTE_IF("fts_optimize_wq_count_check",
+				DBUG_ASSERT(fts_optimize_wq->length <= 1000););
+	}
+
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+}
+
+/** Add a table to fts_slots if it doesn't already exist. */
+static bool fts_optimize_new_table(dict_table_t* table)
+{
+	ut_ad(table);
+
+	ulint		i;
+	fts_slot_t*	slot;
+	fts_slot_t*	empty = NULL;
+
+	/* Search for duplicates; also find a free slot if one exists. */
+	for (i = 0; i < ib_vector_size(fts_slots); ++i) {
+
+		slot = static_cast<fts_slot_t*>(ib_vector_get(fts_slots, i));
+
+		if (!slot->table) {
+			empty = slot;
+		} else if (slot->table == table) {
+			/* Already exists in our optimize queue. */
+			return false;
+		}
+	}
+
+	slot = empty ? empty : static_cast<fts_slot_t*>(
+		ib_vector_push(fts_slots, NULL));
+
+	memset(slot, 0x0, sizeof(*slot));
+
+	slot->table = table;
+	return true;
+}
+
+/** Remove a table from fts_slots if it exists.
+@param remove	removal request naming the table to drop from fts_slots */
+static bool fts_optimize_del_table(fts_msg_del_t *remove)
+{
+	const dict_table_t* table = remove->table;
+	ut_ad(table);
+	for (ulint i = 0; i < ib_vector_size(fts_slots); ++i) {
+		fts_slot_t*	slot;
+
+		slot = static_cast<fts_slot_t*>(ib_vector_get(fts_slots, i));
+
+		if (slot->table == table) {
+			if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+				ib::info() << "FTS Optimize Removing table "
+					   << table->name;
+			}
+
+			mysql_mutex_lock(&fts_optimize_wq->mutex);
+			table->fts->in_queue = false;
+			pthread_cond_signal(remove->cond);
+			mysql_mutex_unlock(&fts_optimize_wq->mutex);
+			slot->table = NULL;
+			return true;
+		}
+	}
+
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+	pthread_cond_signal(remove->cond);
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+	return false;
+}
+
+/**********************************************************************//**
+Calculate how many tables in fts_slots need to be optimized.
+@return no. of tables to optimize */
+static ulint fts_optimize_how_many()
+{
+	ulint	n_tables = 0;
+	const time_t current_time = time(NULL);
+
+	for (ulint i = 0; i < ib_vector_size(fts_slots); ++i) {
+		const fts_slot_t* slot = static_cast<const fts_slot_t*>(
+			ib_vector_get_const(fts_slots, i));
+		if (!slot->table) {
+			continue;
+		}
+
+		const time_t end = slot->running
+			? slot->last_run : slot->completed;
+		ulint interval = ulint(current_time - end);
+
+		if (lint(interval) < 0
+		    || interval >= FTS_OPTIMIZE_INTERVAL_IN_SECS) {
+			++n_tables;
+		}
+	}
+
+	return(n_tables);
+}
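fts_optimize_new_table() above prefers to recycle a vacated slot over pushing a new one, so fts_slots stays bounded by the peak number of FTS tables rather than growing with every ADD/DEL cycle. The same find-duplicate-or-reuse idiom in miniature, with std::vector standing in for ib_vector_t (names are illustrative):

#include <vector>

struct Slot { const void* table = nullptr; };

/* Return false if `table` is already tracked; otherwise store it,
preferring a previously vacated slot over growing the vector. */
static bool add_slot(std::vector<Slot>& slots, const void* table)
{
	Slot* empty = nullptr;
	for (Slot& s : slots) {
		if (!s.table) {
			empty = &s;		/* remember a free slot */
		} else if (s.table == table) {
			return false;		/* duplicate */
		}
	}
	if (!empty) {
		slots.push_back(Slot());
		empty = &slots.back();
	}
	empty->table = table;
	return true;
}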
+
+/**********************************************************************//**
+Check if the total memory used by all FTS tables exceeds the maximum limit.
+@return true if a sync is needed, false otherwise */
+static bool fts_is_sync_needed()
+{
+	ulint	total_memory = 0;
+	const time_t now = time(NULL);
+	double time_diff = difftime(now, last_check_sync_time);
+
+	if (fts_need_sync || (time_diff >= 0 && time_diff < 5)) {
+		return(false);
+	}
+
+	last_check_sync_time = now;
+
+	for (ulint i = 0; i < ib_vector_size(fts_slots); ++i) {
+		const fts_slot_t* slot = static_cast<const fts_slot_t*>(
+			ib_vector_get_const(fts_slots, i));
+
+		if (!slot->table) {
+			continue;
+		}
+
+		if (slot->table->fts && slot->table->fts->cache) {
+			total_memory += slot->table->fts->cache->total_size;
+		}
+
+		if (total_memory > fts_max_total_cache_size) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
+
+/** Sync the fts cache of a table.
+@param[in,out]	table		table to be synced
+@param[in]	process_message	whether we are processing a message
+				from fts_optimize_wq */
+static void fts_optimize_sync_table(dict_table_t *table,
+				    bool process_message= false)
+{
+	MDL_ticket* mdl_ticket= nullptr;
+	dict_table_t *sync_table= dict_acquire_mdl_shared(table, fts_opt_thd,
+							  &mdl_ticket);
+
+	if (!sync_table)
+		return;
+
+	if (sync_table->fts && sync_table->fts->cache && sync_table->is_accessible())
+	{
+		fts_sync_table(sync_table, false);
+		if (process_message)
+		{
+			mysql_mutex_lock(&fts_optimize_wq->mutex);
+			sync_table->fts->sync_message = false;
+			mysql_mutex_unlock(&fts_optimize_wq->mutex);
+		}
+	}
+
+	DBUG_EXECUTE_IF("ib_optimize_wq_hang",
+			std::this_thread::sleep_for(std::chrono::seconds(6)););
+
+	if (mdl_ticket)
+		dict_table_close(sync_table, false, fts_opt_thd, mdl_ticket);
+}
+
+/**********************************************************************//**
+Optimize all FTS tables. */
+static void fts_optimize_callback(void *)
+{
+	ut_ad(!srv_read_only_mode);
+
+	static ulint current;
+	static bool done;
+	static ulint n_optimize;
+
+	if (!fts_optimize_wq || done) {
+		/* Possibly a timer-initiated callback; it can arrive
+		after FTS_MSG_STOP. */
+		return;
+	}
+
+	static ulint n_tables = ib_vector_size(fts_slots);
+
+	while (!done && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
+		/* If there is no message in the queue and we have tables
+		to optimize then optimize the tables. */
+
+		if (!done
+		    && ib_wqueue_is_empty(fts_optimize_wq)
+		    && n_tables > 0
+		    && n_optimize > 0) {
+
+			/* The queue is empty but we have tables
+			to optimize. */
+			if (UNIV_UNLIKELY(wsrep_sst_disable_writes)) {
+retry_later:
+				if (fts_is_sync_needed()) {
+					fts_need_sync = true;
+				}
+				if (n_tables) {
+					timer->set_time(5000, 0);
+				}
+				return;
+			}
+
+			fts_slot_t* slot = static_cast<fts_slot_t*>(
+				ib_vector_get(fts_slots, current));
+
+			/* Handle the case of empty slots. */
+			if (slot->table) {
+				slot->running = true;
+				fts_optimize_table_bk(slot);
+			}
+
+			/* Wrap around the counter. */
+			if (++current >= ib_vector_size(fts_slots)) {
+				n_optimize = fts_optimize_how_many();
+				current = 0;
+			}
+		} else if (n_optimize == 0
+			   || !ib_wqueue_is_empty(fts_optimize_wq)) {
+			fts_msg_t* msg = static_cast<fts_msg_t*>
+				(ib_wqueue_nowait(fts_optimize_wq));
+			/* Timeout? */
+			if (!msg) {
+				goto retry_later;
+			}
+
+			switch (msg->type) {
+			case FTS_MSG_STOP:
+				done = true;
+				break;
+
+			case FTS_MSG_ADD_TABLE:
+				ut_a(!done);
+				if (fts_optimize_new_table(
+					    static_cast<dict_table_t*>(
+						    msg->ptr))) {
+					++n_tables;
+				}
+				break;
+
+			case FTS_MSG_DEL_TABLE:
+				if (fts_optimize_del_table(
+					    static_cast<fts_msg_del_t*>(
+						    msg->ptr))) {
+					--n_tables;
+				}
+				break;
+
+			case FTS_MSG_SYNC_TABLE:
+				if (UNIV_UNLIKELY(wsrep_sst_disable_writes)) {
+					add_msg(msg);
+					goto retry_later;
+				}
+
+				DBUG_EXECUTE_IF(
+					"fts_instrument_msg_sync_sleep",
+					std::this_thread::sleep_for(
+						std::chrono::milliseconds(
+							300)););
+
+				fts_optimize_sync_table(
+					static_cast<dict_table_t*>(msg->ptr),
+					true);
+				break;
+
+			default:
+				ut_error;
+			}
+
+			mem_heap_free(msg->heap);
+			n_optimize = done ? 0 : fts_optimize_how_many();
+		}
+	}
+
+	/* The server is being shut down; sync the data from the FTS cache
+	to disk if needed. */
+	if (n_tables > 0) {
+		for (ulint i = 0; i < ib_vector_size(fts_slots); i++) {
+			fts_slot_t* slot = static_cast<fts_slot_t*>(
+				ib_vector_get(fts_slots, i));
+
+			if (slot->table) {
+				fts_optimize_sync_table(slot->table);
+			}
+		}
+	}
+
+	ib_vector_free(fts_slots);
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+	fts_slots = NULL;
+	pthread_cond_broadcast(&fts_opt_shutdown_cond);
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+
+	ib::info() << "FTS optimize thread exiting.";
+}
+
+/**********************************************************************//**
+Start up the optimize thread and create the work queue. */
+void
+fts_optimize_init(void)
+/*===================*/
+{
+	mem_heap_t*	heap;
+	ib_alloc_t*	heap_alloc;
+
+	ut_ad(!srv_read_only_mode);
+
+	/* For now we only support one optimize thread. */
+	ut_a(!fts_optimize_wq);
+
+	/* Create the FTS optimize work queue. */
+	fts_optimize_wq = ib_wqueue_create();
+	timer = srv_thread_pool->create_timer(timer_callback);
+
+	/* Create the FTS vector to store fts_slot_t. */
+	heap = mem_heap_create(sizeof(dict_table_t*) * 64);
+	heap_alloc = ib_heap_allocator_create(heap);
+	fts_slots = ib_vector_create(heap_alloc, sizeof(fts_slot_t), 4);
+
+	fts_opt_thd = innobase_create_background_thd("InnoDB FTS optimizer");
+	/* Add FTS tables to fts_slots that may have been skipped
+	during dict_load_table_one() because fts_optimize_thread
+	wasn't started yet. */
+	dict_sys.freeze(SRW_LOCK_CALL);
+	for (dict_table_t* table = UT_LIST_GET_FIRST(dict_sys.table_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
+		if (!table->fts || !dict_table_has_fts_index(table)) {
+			continue;
+		}
+
+		/* fts_optimize_thread is not started yet, so there is no
+		need to acquire fts_optimize_wq->mutex when adding the FTS
+		table to the fts slots. */
+		ut_ad(!table->can_be_evicted);
+		fts_optimize_new_table(table);
+		table->fts->in_queue = true;
+	}
+	dict_sys.unfreeze();
+
+	pthread_cond_init(&fts_opt_shutdown_cond, nullptr);
+	last_check_sync_time = time(NULL);
+}
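fts_optimize_callback() above is, in effect, a small state machine driven by FTS_MSG_* messages: ADD/DEL maintain the slot bookkeeping, SYNC flushes one table's cache, and STOP flips `done` so that fts_optimize_shutdown() below can wait for fts_slots to be freed. A compressed sketch of just that dispatch step, with illustrative names:

enum MsgType { MSG_ADD_TABLE, MSG_DEL_TABLE, MSG_SYNC_TABLE, MSG_STOP };

struct Msg { MsgType type; };

/* Returns false once a STOP message has been seen, mirroring how the
real callback leaves its processing loop. */
static bool dispatch(const Msg& msg, unsigned& n_tables, bool& done)
{
	switch (msg.type) {
	case MSG_ADD_TABLE:	++n_tables; break;
	case MSG_DEL_TABLE:	--n_tables; break;
	case MSG_SYNC_TABLE:	/* flush one table's FTS cache */ break;
	case MSG_STOP:		done = true; break;
	}
	return !done;
}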
+
+/** Shut down the fts optimize thread. */
+void
+fts_optimize_shutdown()
+{
+	ut_ad(!srv_read_only_mode);
+
+	/* If there is ongoing activity on the dictionary, such as
+	srv_master_evict_from_table_cache(), wait for it. */
+	dict_sys.freeze(SRW_LOCK_CALL);
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+	/* Tell the FTS optimizer system that we are exiting from
+	the optimizer thread; messages sent after this will not be
+	processed. */
+	fts_opt_start_shutdown = true;
+	dict_sys.unfreeze();
+
+	/* We tell the OPTIMIZE thread to switch to the done state, but
+	we cannot delete the work queue here, because the add thread
+	needs to deregister the FTS tables. */
+	timer->disarm();
+	task_group.cancel_pending(&task);
+
+	add_msg(fts_optimize_create_msg(FTS_MSG_STOP, nullptr));
+
+	while (fts_slots) {
+		my_cond_wait(&fts_opt_shutdown_cond,
+			     &fts_optimize_wq->mutex.m_mutex);
+	}
+
+	destroy_background_thd(fts_opt_thd);
+	fts_opt_thd = NULL;
+	pthread_cond_destroy(&fts_opt_shutdown_cond);
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+
+	ib_wqueue_free(fts_optimize_wq);
+	fts_optimize_wq = NULL;
+
+	delete timer;
+	timer = NULL;
+}
+
+/** Sync the table during the commit phase.
+@param[in]	table	table to be synced */
+void fts_sync_during_ddl(dict_table_t* table)
+{
+	if (!fts_optimize_wq)
+		return;
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+	const auto sync_message= table->fts->sync_message;
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+	if (!sync_message)
+		return;
+
+	fts_sync_table(table, false);
+
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+	table->fts->sync_message = false;
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+}
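The file added next is the Bison-generated boolean-mode query parser; its grammar source, fts0pars.y, follows later in this patch. As a rough illustration of how its entry points (defined in the hand-written epilogue at the end of the generated file) are driven, here is a hedged sketch; in the server this setup happens inside the FTS query code, and the fts_ast_state_t preparation is abbreviated:

#include <cstring>

/* Sketch only: assumes `state` was prepared the way the FTS query code
prepares one (heap, list head, etc.). TRUE selects the boolean-mode
lexer (fts0blex); FALSE would select the phrase lexer (fts0tlex). */
static int parse_boolean_query_sketch(fts_ast_state_t* state)
{
	const char* query = "+apple -\"apple pie\" banana*";

	state->lexer = fts_lexer_create(
		TRUE,
		reinterpret_cast<const byte*>(query),
		ulint(strlen(query)));

	/* ftsparse() reduces rules such as term/prefix/text in the
	grammar below and builds the AST reachable from state->root. */
	int err = fts_parse(state);

	fts_lexer_free(state->lexer);

	return err;	/* 0 on success; ftserror() reported any failure */
}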
diff --git a/storage/innobase/fts/fts0pars.cc b/storage/innobase/fts/fts0pars.cc
new file mode 100644
index 00000000..cb51784a
--- /dev/null
+++ b/storage/innobase/fts/fts0pars.cc
@@ -0,0 +1,2007 @@
+/* A Bison parser, made by GNU Bison 2.5.  */
+
+/* Bison implementation for Yacc-like parsers in C
+
+      Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+   simplifying the original so-called "semantic" parser.  */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+   infringing on user name space.  This should be done even for local
+   variables, as they might otherwise be expanded by user macros.
+   There are some unavoidable exceptions within include files to
+   define necessary library symbols; they are noted "INFRINGES ON
+   USER NAME SPACE" below.  */
+
+/* Identify Bison output.  */
+#define YYBISON 1
+
+/* Bison version.  */
+#define YYBISON_VERSION "2.5"
+
+/* Skeleton name.  */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers.  */
+#define YYPURE 1
+
+/* Push parsers.  */
+#define YYPUSH 0
+
+/* Pull parsers.  */
+#define YYPULL 1
+
+/* Using locations.  */
+#define YYLSP_NEEDED 0
+
+/* Substitute the variable and function names.  */
+#define yyparse ftsparse
+#define yylex   ftslex
+#define yyerror ftserror
+#define yylval  ftslval
+#define yychar  ftschar
+#define yydebug ftsdebug
+#define yynerrs ftsnerrs
+
+
+/* Copy the first part of user declarations.  */
+
+/* Line 268 of yacc.c  */
+#line 26 "fts0pars.y"
+
+#include "ha_prototypes.h"
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+#include
+extern int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern int fts_blexer(YYSTYPE*, yyscan_t);
+extern int fts_tlexer(YYSTYPE*, yyscan_t);
+#ifdef __GNUC__
+# pragma GCC diagnostic ignored "-Wpragmas"
+# pragma GCC diagnostic ignored "-Wunknown-warning-option"
+# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+extern int ftserror(const char* p);
+/* Required for reentrant parser */
+#define ftslex fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+#define YYTOKENFREE(token) fts_ast_string_free((token))
+
+
+typedef int (*fts_scanner)(YYSTYPE* val, yyscan_t yyscanner);
+
+struct fts_lexer_t {
+	fts_scanner	scanner;
+	void*		yyscanner;
+};
+
+
+
+/* Line 268 of yacc.c  */
+#line 115 "fts0pars.cc"
+
+/* Enabling traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages.  */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+/* Enabling the token table.  */
+#ifndef YYTOKEN_TABLE
+# define YYTOKEN_TABLE 0
+#endif
+
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     FTS_OPER = 258,
+     FTS_TEXT = 259,
+     FTS_TERM = 260,
+     FTS_NUMB = 261
+   };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 293 of yacc.c  */
+#line 61 "fts0pars.y"
+
+	int			oper;
+	fts_ast_string_t*	token;
+	fts_ast_node_t*		node;
+
+
+
+/* Line 293 of yacc.c  */
+#line 165 "fts0pars.cc"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+/* Copy the second part of user declarations.  */
+
+
+/* Line 343 of yacc.c  */
+#line 177 "fts0pars.cc"
+
+#ifdef short
+# undef short
+#endif
+
+#ifdef YYTYPE_UINT8
+typedef YYTYPE_UINT8 yytype_uint8;
+#else
+typedef unsigned char yytype_uint8;
+#endif
+
+#ifdef YYTYPE_INT8
+typedef YYTYPE_INT8 yytype_int8;
+#elif (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+typedef signed char yytype_int8;
+#else
+typedef short int yytype_int8;
+#endif
+
+#ifdef YYTYPE_UINT16
+typedef YYTYPE_UINT16 yytype_uint16;
+#else
+typedef unsigned short int yytype_uint16;
+#endif
+
+#ifdef YYTYPE_INT16
+typedef YYTYPE_INT16 yytype_int16;
+#else
+typedef short int yytype_int16;
+#endif
+
+#ifndef YYSIZE_T
+# ifdef __SIZE_TYPE__
+#  define YYSIZE_T __SIZE_TYPE__
+# elif defined size_t
+#  define YYSIZE_T size_t
+# elif !
defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned int +# endif +#endif + +#define YYSIZE_MAXIMUM ((YYSIZE_T) -1) + +#ifndef YY_ +# if defined YYENABLE_NLS && YYENABLE_NLS +# if ENABLE_NLS +# include /* INFRINGES ON USER NAME SPACE */ +# define YY_(msgid) dgettext ("bison-runtime", msgid) +# endif +# endif +# ifndef YY_ +# define YY_(msgid) msgid +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YYUSE(e) ((void) (e)) +#else +# define YYUSE(e) /* empty */ +#endif + +/* Identity function, used to suppress warnings about constant conditions. */ +#ifndef lint +# define YYID(n) (n) +#else +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static int +YYID (int yyi) +#else +static int +YYID (yyi) + int yyi; +#endif +{ + return yyi; +} +#endif + +#if ! defined yyoverflow || YYERROR_VERBOSE + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include /* INFRINGES ON USER NAME SPACE */ +# elif defined _MSC_VER +# include /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's `empty if-body' warning. */ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0)) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined EXIT_SUCCESS \ + && ! ((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void free (void *); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* ! defined yyoverflow || YYERROR_VERBOSE */ + + +#if (! defined yyoverflow \ + && (! defined __cplusplus \ + || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. 
*/ +union yyalloc +{ + yytype_int16 yyss_alloc; + YYSTYPE yyvs_alloc; +}; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. */ +# define YYSTACK_BYTES(N) \ + ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +# define YYCOPY_NEEDED 1 + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack_alloc, Stack) \ + do \ + { \ + YYSIZE_T yynewbytes; \ + YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \ + Stack = &yyptr->Stack_alloc; \ + yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / sizeof (*yyptr); \ + } \ + while (YYID (0)) + +#endif + +#if defined YYCOPY_NEEDED && YYCOPY_NEEDED +/* Copy COUNT objects from FROM to TO. The source and destination do + not overlap. */ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(To, From, Count) \ + __builtin_memcpy (To, From, (Count) * sizeof (*(From))) +# else +# define YYCOPY(To, From, Count) \ + do \ + { \ + YYSIZE_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (To)[yyi] = (From)[yyi]; \ + } \ + while (YYID (0)) +# endif +# endif +#endif /* !YYCOPY_NEEDED */ + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 3 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 52 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 16 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 8 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 24 +/* YYNRULES -- Number of states. */ +#define YYNSTATES 33 + +/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */ +#define YYUNDEFTOK 2 +#define YYMAXUTOK 261 + +#define YYTRANSLATE(YYX) \ + ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK) + +/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. */ +static const yytype_uint8 yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 12, 13, 14, 7, 2, 8, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 10, 2, 11, 2, 15, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 9, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6 +}; + +#if YYDEBUG +/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in + YYRHS. */ +static const yytype_uint8 yyprhs[] = +{ + 0, 0, 3, 5, 6, 9, 12, 16, 21, 23, + 25, 28, 32, 36, 39, 44, 47, 49, 51, 53, + 55, 57, 59, 61, 64 +}; + +/* YYRHS -- A `-1'-separated list of the rules' RHS. 
*/ +static const yytype_int8 yyrhs[] = +{ + 17, 0, -1, 18, -1, -1, 18, 20, -1, 18, + 19, -1, 12, 18, 13, -1, 21, 12, 18, 13, + -1, 22, -1, 23, -1, 22, 14, -1, 23, 15, + 6, -1, 21, 22, 14, -1, 21, 22, -1, 21, + 23, 15, 6, -1, 21, 23, -1, 8, -1, 7, + -1, 9, -1, 10, -1, 11, -1, 5, -1, 6, + -1, 14, 22, -1, 4, -1 +}; + +/* YYRLINE[YYN] -- source line where rule number YYN was defined. */ +static const yytype_uint8 yyrline[] = +{ + 0, 79, 79, 85, 89, 99, 111, 119, 129, 133, + 137, 141, 146, 152, 157, 164, 170, 174, 178, 182, + 186, 191, 196, 202, 207 +}; +#endif + +#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE +/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. */ +static const char *const yytname[] = +{ + "$end", "error", "$undefined", "FTS_OPER", "FTS_TEXT", "FTS_TERM", + "FTS_NUMB", "'+'", "'-'", "'~'", "'<'", "'>'", "'('", "')'", "'*'", + "'@'", "$accept", "query", "expr_lst", "sub_expr", "expr", "prefix", + "term", "text", 0 +}; +#endif + +# ifdef YYPRINT +/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to + token YYLEX-NUM. */ +static const yytype_uint16 yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 43, 45, 126, + 60, 62, 40, 41, 42, 64 +}; +# endif + +/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const yytype_uint8 yyr1[] = +{ + 0, 16, 17, 18, 18, 18, 19, 19, 20, 20, + 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, + 21, 22, 22, 22, 23 +}; + +/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */ +static const yytype_uint8 yyr2[] = +{ + 0, 2, 1, 0, 2, 2, 3, 4, 1, 1, + 2, 3, 3, 2, 4, 2, 1, 1, 1, 1, + 1, 1, 1, 2, 1 +}; + +/* YYDEFACT[STATE-NAME] -- Default reduction number in state STATE-NUM. + Performed when YYTABLE doesn't specify something else to do. Zero + means the default is an error. */ +static const yytype_uint8 yydefact[] = +{ + 3, 0, 2, 1, 24, 21, 22, 17, 16, 18, + 19, 20, 3, 0, 5, 4, 0, 8, 9, 0, + 23, 3, 13, 15, 10, 0, 6, 0, 12, 0, + 11, 7, 14 +}; + +/* YYDEFGOTO[NTERM-NUM]. */ +static const yytype_int8 yydefgoto[] = +{ + -1, 1, 2, 14, 15, 16, 17, 18 +}; + +/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. */ +#define YYPACT_NINF -5 +static const yytype_int8 yypact[] = +{ + -5, 38, 18, -5, -5, -5, -5, -5, -5, -5, + -5, -5, -5, 31, -5, -5, 29, 30, 32, -4, + -5, -5, 34, 35, -5, 40, -5, 7, -5, 43, + -5, -5, -5 +}; + +/* YYPGOTO[NTERM-NUM]. */ +static const yytype_int8 yypgoto[] = +{ + -5, -5, 19, -5, -5, -5, 26, 36 +}; + +/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule which + number is the opposite. If YYTABLE_NINF, syntax error. */ +#define YYTABLE_NINF -1 +static const yytype_uint8 yytable[] = +{ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 26, + 13, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 31, 13, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 19, 13, 4, 5, 6, 5, 6, 3, 20, + 27, 21, 22, 13, 24, 13, 30, 25, 28, 32, + 29, 0, 23 +}; + +#define yypact_value_is_default(yystate) \ + ((yystate) == (-5)) + +#define yytable_value_is_error(yytable_value) \ + YYID (0) + +static const yytype_int8 yycheck[] = +{ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 12, 14, 4, 5, 6, 5, 6, 0, 13, + 21, 12, 16, 14, 14, 14, 6, 15, 14, 6, + 15, -1, 16 +}; + +/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. 
*/ +static const yytype_uint8 yystos[] = +{ + 0, 17, 18, 0, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 14, 19, 20, 21, 22, 23, 18, + 22, 12, 22, 23, 14, 15, 13, 18, 14, 15, + 6, 13, 6 +}; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) +#define YYEMPTY (-2) +#define YYEOF 0 + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +/* Like YYERROR except do call yyerror. This remains here temporarily + to ease the transition to the new meaning of YYERROR, for GCC. + Once GCC version 2 has supplanted version 1, this can go. However, + YYFAIL appears to be in use. Nevertheless, it is formally deprecated + in Bison 2.4.2's NEWS entry, where a plan to phase it out is + discussed. */ + +#define YYFAIL goto yyerrlab +#if defined YYFAIL + /* This is here to suppress warnings from the GCC cpp's + -Wunused-macros. Normally we don't worry about that warning, but + some users do, and we want to make it easy for users to remove + YYFAIL uses, which will produce warnings from Bison 2.5. */ +#endif + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ +do \ + if (yychar == YYEMPTY && yylen == 1) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + YYPOPSTACK (1); \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ +while (YYID (0)) + + +#define YYTERROR 1 +#define YYERRCODE 256 + +#define YYERRCLEANUP \ +do \ + switch (yylastchar) \ + { \ + case FTS_NUMB: \ + case FTS_TEXT: \ + case FTS_TERM: \ + YYTOKENFREE(yylval.token); \ + break; \ + default: \ + break; \ + } \ +while (YYID (0)) + +/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N]. + If N is 0, then set CURRENT to the empty location which ends + the previous symbol: RHS[0] (always defined). */ + +#define YYRHSLOC(Rhs, K) ((Rhs)[K]) +#ifndef YYLLOC_DEFAULT +# define YYLLOC_DEFAULT(Current, Rhs, N) \ + do \ + if (YYID (N)) \ + { \ + (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \ + (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \ + (Current).last_line = YYRHSLOC (Rhs, N).last_line; \ + (Current).last_column = YYRHSLOC (Rhs, N).last_column; \ + } \ + else \ + { \ + (Current).first_line = (Current).last_line = \ + YYRHSLOC (Rhs, 0).last_line; \ + (Current).first_column = (Current).last_column = \ + YYRHSLOC (Rhs, 0).last_column; \ + } \ + while (YYID (0)) +#endif + + +/* This macro is provided for backward compatibility. */ + +#ifndef YY_LOCATION_PRINT +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +#endif + + +/* YYLEX -- calling `yylex' with the right arguments. */ + +#ifdef YYLEX_PARAM +# define YYLEX yylex (&yylval, YYLEX_PARAM) +#else +# define YYLEX yylex (&yylval) +#endif + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (YYID (0)) + +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Type, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (YYID (0)) + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. 
| +`--------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_value_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + if (!yyvaluep) + return; +# ifdef YYPRINT + if (yytype < YYNTOKENS) + YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep); +# else + YYUSE (yyoutput); +# endif + switch (yytype) + { + default: + break; + } +} + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + if (yytype < YYNTOKENS) + YYFPRINTF (yyoutput, "token %s (", yytname[yytype]); + else + YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]); + + yy_symbol_value_print (yyoutput, yytype, yyvaluep); + YYFPRINTF (yyoutput, ")"); +} + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop) +#else +static void +yy_stack_print (yybottom, yytop) + yytype_int16 *yybottom; + yytype_int16 *yytop; +#endif +{ + YYFPRINTF (stderr, "Stack now"); + for (; yybottom <= yytop; yybottom++) + { + int yybot = *yybottom; + YYFPRINTF (stderr, " %d", yybot); + } + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (YYID (0)) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. | +`------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_reduce_print (YYSTYPE *yyvsp, int yyrule) +#else +static void +yy_reduce_print (yyvsp, yyrule) + YYSTYPE *yyvsp; + int yyrule; +#endif +{ + int yynrhs = yyr2[yyrule]; + int yyi; + unsigned long int yylno = yyrline[yyrule]; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n", + yyrule - 1, yylno); + /* The symbols being reduced. */ + for (yyi = 0; yyi < yynrhs; yyi++) + { + YYFPRINTF (stderr, " $%d = ", yyi + 1); + yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], + &(yyvsp[(yyi + 1) - (yynrhs)]) + ); + YYFPRINTF (stderr, "\n"); + } +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (yyvsp, Rule); \ +} while (YYID (0)) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. 
*/ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + +#if YYERROR_VERBOSE + +# ifndef yystrlen +# if defined __GLIBC__ && defined _STRING_H +# define yystrlen strlen +# else +/* Return the length of YYSTR. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static YYSIZE_T +yystrlen (const char *yystr) +#else +static YYSIZE_T +yystrlen (yystr) + const char *yystr; +#endif +{ + YYSIZE_T yylen; + for (yylen = 0; yystr[yylen]; yylen++) + continue; + return yylen; +} +# endif +# endif + +# ifndef yystpcpy +# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE +# define yystpcpy stpcpy +# else +/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in + YYDEST. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static char * +yystpcpy (char *yydest, const char *yysrc) +#else +static char * +yystpcpy (yydest, yysrc) + char *yydest; + const char *yysrc; +#endif +{ + char *yyd = yydest; + const char *yys = yysrc; + + while ((*yyd++ = *yys++) != '\0') + continue; + + return yyd - 1; +} +# endif +# endif + +# ifndef yytnamerr +/* Copy to YYRES the contents of YYSTR after stripping away unnecessary + quotes and backslashes, so that it's suitable for yyerror. The + heuristic is that double-quoting is unnecessary unless the string + contains an apostrophe, a comma, or backslash (other than + backslash-backslash). YYSTR is taken from yytname. If YYRES is + null, do not copy; instead, return the length of what the result + would have been. */ +static YYSIZE_T +yytnamerr (char *yyres, const char *yystr) +{ + if (*yystr == '"') + { + YYSIZE_T yyn = 0; + char const *yyp = yystr; + + for (;;) + switch (*++yyp) + { + case '\'': + case ',': + goto do_not_strip_quotes; + + case '\\': + if (*++yyp != '\\') + goto do_not_strip_quotes; + /* Fall through. */ + default: + if (yyres) + yyres[yyn] = *yyp; + yyn++; + break; + + case '"': + if (yyres) + yyres[yyn] = '\0'; + return yyn; + } + do_not_strip_quotes: ; + } + + if (! yyres) + return yystrlen (yystr); + + return yystpcpy (yyres, yystr) - yyres; +} +# endif + +/* Copy into *YYMSG, which is of size *YYMSG_ALLOC, an error message + about the unexpected token YYTOKEN for the state stack whose top is + YYSSP. + + Return 0 if *YYMSG was successfully written. Return 1 if *YYMSG is + not large enough to hold the message. In that case, also set + *YYMSG_ALLOC to the required number of bytes. Return 2 if the + required number of bytes is too large to store. */ +static int +yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg, + yytype_int16 *yyssp, int yytoken) +{ + YYSIZE_T yysize0 = yytnamerr (0, yytname[yytoken]); + YYSIZE_T yysize = yysize0; + YYSIZE_T yysize1; + enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 }; + /* Internationalized format string. */ + const char *yyformat = 0; + /* Arguments of yyformat. */ + char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM]; + /* Number of reported tokens (one for the "unexpected", one per + "expected"). */ + int yycount = 0; + + /* There are many possibilities here to consider: + - Assume YYFAIL is not used. It's too flawed to consider. 
See + + for details. YYERROR is fine as it does not invoke this + function. + - If this state is a consistent state with a default action, then + the only way this function was invoked is if the default action + is an error action. In that case, don't check for expected + tokens because there are none. + - The only way there can be no lookahead present (in yychar) is if + this state is a consistent state with a default action. Thus, + detecting the absence of a lookahead is sufficient to determine + that there is no unexpected or expected token to report. In that + case, just report a simple "syntax error". + - Don't assume there isn't a lookahead just because this state is a + consistent state with a default action. There might have been a + previous inconsistent state, consistent state with a non-default + action, or user semantic action that manipulated yychar. + - Of course, the expected token list depends on states to have + correct lookahead information, and it depends on the parser not + to perform extra reductions after fetching a lookahead from the + scanner and before detecting a syntax error. Thus, state merging + (from LALR or IELR) and default reductions corrupt the expected + token list. However, the list is correct for canonical LR with + one exception: it will still contain any token that will not be + accepted due to an error action in a later state. + */ + if (yytoken != YYEMPTY) + { + int yyn = yypact[*yyssp]; + yyarg[yycount++] = yytname[yytoken]; + if (!yypact_value_is_default (yyn)) + { + /* Start YYX at -YYN if negative to avoid negative indexes in + YYCHECK. In other words, skip the first -YYN actions for + this state because they are default actions. */ + int yyxbegin = yyn < 0 ? -yyn : 0; + /* Stay within bounds of both yycheck and yytname. */ + int yychecklim = YYLAST - yyn + 1; + int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS; + int yyx; + + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR + && !yytable_value_is_error (yytable[yyx + yyn])) + { + if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM) + { + yycount = 1; + yysize = yysize0; + break; + } + yyarg[yycount++] = yytname[yyx]; + yysize1 = yysize + yytnamerr (0, yytname[yyx]); + if (! (yysize <= yysize1 + && yysize1 <= YYSTACK_ALLOC_MAXIMUM)) + return 2; + yysize = yysize1; + } + } + } + + switch (yycount) + { +# define YYCASE_(N, S) \ + case N: \ + yyformat = S; \ + break + YYCASE_(0, YY_("syntax error")); + YYCASE_(1, YY_("syntax error, unexpected %s")); + YYCASE_(2, YY_("syntax error, unexpected %s, expecting %s")); + YYCASE_(3, YY_("syntax error, unexpected %s, expecting %s or %s")); + YYCASE_(4, YY_("syntax error, unexpected %s, expecting %s or %s or %s")); + YYCASE_(5, YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s")); +# undef YYCASE_ + } + + yysize1 = yysize + yystrlen (yyformat); + if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM)) + return 2; + yysize = yysize1; + + if (*yymsg_alloc < yysize) + { + *yymsg_alloc = 2 * yysize; + if (! (yysize <= *yymsg_alloc + && *yymsg_alloc <= YYSTACK_ALLOC_MAXIMUM)) + *yymsg_alloc = YYSTACK_ALLOC_MAXIMUM; + return 1; + } + + /* Avoid sprintf, as that infringes on the user's name space. + Don't have undefined behavior even if the translation + produced a string with the wrong number of "%s"s. 
*/ + { + char *yyp = *yymsg; + int yyi = 0; + while ((*yyp = *yyformat) != '\0') + if (*yyp == '%' && yyformat[1] == 's' && yyi < yycount) + { + yyp += yytnamerr (yyp, yyarg[yyi++]); + yyformat += 2; + } + else + { + yyp++; + yyformat++; + } + } + return 0; +} +#endif /* YYERROR_VERBOSE */ + +/*-----------------------------------------------. +| Release the memory associated to this symbol. | +`-----------------------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep) +#else +static void +yydestruct (yymsg, yytype, yyvaluep) + const char *yymsg; + int yytype; + YYSTYPE *yyvaluep; +#endif +{ + YYUSE (yyvaluep); + + if (!yymsg) + yymsg = "Deleting"; + YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp); + + switch (yytype) + { + + default: + break; + } +} + + +/* Prevent warnings from -Wmissing-prototypes. */ +#ifdef YYPARSE_PARAM +#if defined __STDC__ || defined __cplusplus +int yyparse (void *YYPARSE_PARAM); +#else +int yyparse (); +#endif +#else /* ! YYPARSE_PARAM */ +#if defined __STDC__ || defined __cplusplus +int yyparse (void); +#else +int yyparse (); +#endif +#endif /* ! YYPARSE_PARAM */ + + +/*----------. +| yyparse. | +`----------*/ + +#ifdef YYPARSE_PARAM +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void *YYPARSE_PARAM) +#else +int +yyparse (YYPARSE_PARAM) + void *YYPARSE_PARAM; +#endif +#else /* ! YYPARSE_PARAM */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void) +#else +int +yyparse () + +#endif +#endif +{ +/* The lookahead symbol. */ +int yychar; +/* The backup of yychar when there is an error and we're in yyerrlab. */ +int yylastchar; + +/* The semantic value of the lookahead symbol. */ +YYSTYPE yylval; + + /* Number of syntax errors so far. */ + int yynerrs; + + int yystate; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus; + + /* The stacks and their tools: + `yyss': related to states. + `yyvs': related to semantic values. + + Refer to the stacks thru separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* The state stack. */ + yytype_int16 yyssa[YYINITDEPTH]; + yytype_int16 *yyss; + yytype_int16 *yyssp; + + /* The semantic value stack. */ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs; + YYSTYPE *yyvsp; + + YYSIZE_T yystacksize; + + int yyn; + int yyresult; + /* Lookahead token as an internal (translated) token number. */ + int yytoken; + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + +#if YYERROR_VERBOSE + /* Buffer for error messages, and its allocated size. */ + char yymsgbuf[128]; + char *yymsg = yymsgbuf; + YYSIZE_T yymsg_alloc = sizeof yymsgbuf; +#endif + +#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N)) + + /* The number of symbols on the RHS of the reduced rule. + Keep to zero when no symbol should be popped. */ + int yylen = 0; + + yytoken = 0; + yyss = yyssa; + yyvs = yyvsa; + yystacksize = YYINITDEPTH; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yystate = 0; + yyerrstatus = 0; + yynerrs = 0; + yychar = YYEMPTY; /* Cause a token to be read. */ + + /* Initialize stack pointers. + Waste one element of value and location stack + so that they stay on the same level as the state stack. + The wasted elements are never initialized. 
*/ + yyssp = yyss; + yyvsp = yyvs; + + goto yysetstate; + +/*------------------------------------------------------------. +| yynewstate -- Push a new state, which is found in yystate. | +`------------------------------------------------------------*/ + yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. So pushing a state here evens the stacks. */ + yyssp++; + + yysetstate: + *yyssp = yystate; + + if (yyss + yystacksize - 1 <= yyssp) + { + /* Get the current used size of the three stacks, in elements. */ + YYSIZE_T yysize = yyssp - yyss + 1; + +#ifdef yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + YYSTYPE *yyvs1 = yyvs; + yytype_int16 *yyss1 = yyss; + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. */ + yyoverflow (YY_("memory exhausted"), + &yyss1, yysize * sizeof (*yyssp), + &yyvs1, yysize * sizeof (*yyvsp), + &yystacksize); + + yyss = yyss1; + yyvs = yyvs1; + } +#else /* no yyoverflow */ +# ifndef YYSTACK_RELOCATE + goto yyexhaustedlab; +# else + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + goto yyexhaustedlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + yytype_int16 *yyss1 = yyss; + union yyalloc *yyptr = + (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize)); + if (! yyptr) + goto yyexhaustedlab; + YYSTACK_RELOCATE (yyss_alloc, yyss); + YYSTACK_RELOCATE (yyvs_alloc, yyvs); +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif +#endif /* no yyoverflow */ + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + YYDPRINTF ((stderr, "Stack size increased to %lu\n", + (unsigned long int) yystacksize)); + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } + + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + + if (yystate == YYFINAL) + YYACCEPT; + + goto yybackup; + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + + /* Do appropriate processing given the current state. Read a + lookahead token if we need one and don't already have one. */ + + /* First try to decide what to do without reference to lookahead token. */ + yyn = yypact[yystate]; + if (yypact_value_is_default (yyn)) + goto yydefault; + + /* Not known => get a lookahead token if don't already have one. */ + + /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token: ")); + yychar = YYLEX; + } + + if (yychar <= YYEOF) + { + yychar = yytoken = YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yytable_value_is_error (yyn)) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + /* Shift the lookahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + + /* Discard the shifted token. 
*/ + yychar = YYEMPTY; + + yystate = yyn; + *++yyvsp = yylval; + + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- Do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + `$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. */ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 2: + +/* Line 1806 of yacc.c */ +#line 79 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (1)].node); + ((fts_ast_state_t*) state)->root = (yyval.node); + } + break; + + case 3: + +/* Line 1806 of yacc.c */ +#line 85 "fts0pars.y" + { + (yyval.node) = NULL; + } + break; + + case 4: + +/* Line 1806 of yacc.c */ +#line 89 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (2)].node); + + if (!(yyval.node)) { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(2) - (2)].node)); + } else { + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + } + break; + + case 5: + +/* Line 1806 of yacc.c */ +#line 99 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (2)].node); + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node)); + + if (!(yyval.node)) { + (yyval.node) = (yyvsp[(2) - (2)].node); + } else { + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + } + break; + + case 6: + +/* Line 1806 of yacc.c */ +#line 111 "fts0pars.y" + { + (yyval.node) = (yyvsp[(2) - (3)].node); + + if ((yyval.node)) { + (yyval.node) = fts_ast_create_node_subexp_list(state, (yyval.node)); + } + } + break; + + case 7: + +/* Line 1806 of yacc.c */ +#line 119 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node)); + + if ((yyvsp[(3) - (4)].node)) { + fts_ast_add_node((yyval.node), + fts_ast_create_node_subexp_list(state, (yyvsp[(3) - (4)].node))); + } + } + break; + + case 8: + +/* Line 1806 of yacc.c */ +#line 129 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (1)].node); + } + break; + + case 9: + +/* Line 1806 of yacc.c */ +#line 133 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (1)].node); + } + break; + + case 10: + +/* Line 1806 of yacc.c */ +#line 137 "fts0pars.y" + { + fts_ast_term_set_wildcard((yyvsp[(1) - (2)].node)); + } + break; + + case 11: + +/* Line 1806 of yacc.c */ +#line 141 "fts0pars.y" + { + fts_ast_text_set_distance((yyvsp[(1) - (3)].node), fts_ast_string_to_ul((yyvsp[(3) - (3)].token), 10)); + fts_ast_string_free((yyvsp[(3) - (3)].token)); + } + break; + + case 12: + +/* Line 1806 of yacc.c */ +#line 146 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (3)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (3)].node)); + fts_ast_term_set_wildcard((yyvsp[(2) - (3)].node)); + } + break; + + case 13: + +/* Line 1806 of yacc.c */ +#line 152 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + break; + + case 14: + +/* Line 
1806 of yacc.c */ +#line 157 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (4)].node)); + fts_ast_text_set_distance((yyvsp[(2) - (4)].node), fts_ast_string_to_ul((yyvsp[(4) - (4)].token), 10)); + fts_ast_string_free((yyvsp[(4) - (4)].token)); + } + break; + + case 15: + +/* Line 1806 of yacc.c */ +#line 164 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + break; + + case 16: + +/* Line 1806 of yacc.c */ +#line 170 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_IGNORE); + } + break; + + case 17: + +/* Line 1806 of yacc.c */ +#line 174 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_EXIST); + } + break; + + case 18: + +/* Line 1806 of yacc.c */ +#line 178 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_NEGATE); + } + break; + + case 19: + +/* Line 1806 of yacc.c */ +#line 182 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_DECR_RATING); + } + break; + + case 20: + +/* Line 1806 of yacc.c */ +#line 186 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_INCR_RATING); + } + break; + + case 21: + +/* Line 1806 of yacc.c */ +#line 191 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token)); + fts_ast_string_free((yyvsp[(1) - (1)].token)); + } + break; + + case 22: + +/* Line 1806 of yacc.c */ +#line 196 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token)); + fts_ast_string_free((yyvsp[(1) - (1)].token)); + } + break; + + case 23: + +/* Line 1806 of yacc.c */ +#line 202 "fts0pars.y" + { + (yyval.node) = (yyvsp[(2) - (2)].node); + } + break; + + case 24: + +/* Line 1806 of yacc.c */ +#line 207 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_text(state, (yyvsp[(1) - (1)].token)); + fts_ast_string_free((yyvsp[(1) - (1)].token)); + } + break; + + + +/* Line 1806 of yacc.c */ +#line 1663 "fts0pars.cc" + default: break; + } + /* User semantic actions sometimes alter yychar, and that requires + that yytoken be updated with the new translation. We take the + approach of translating immediately before every use of yytoken. + One alternative is translating here after every semantic action, + but that translation would be missed if the semantic action invokes + YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or + if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an + incorrect destructor might then be invoked immediately. In the + case of YYERROR or YYBACKUP, subsequent parser actions might lead + to an incorrect destructor call or verbose syntax error message + before the lookahead is translated. */ + YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc); + + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + + *++yyvsp = yyval; + + /* Now `shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + + yyn = yyr1[yyn]; + + yystate = yypgoto[yyn - YYNTOKENS] + *yyssp; + if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp) + yystate = yytable[yystate]; + else + yystate = yydefgoto[yyn - YYNTOKENS]; + + goto yynewstate; + + +/*------------------------------------. 
+| yyerrlab -- here on detecting error | +`------------------------------------*/ +yyerrlab: + /* Backup yychar, in case we would change it. */ + yylastchar = yychar; + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = yychar == YYEMPTY ? YYEMPTY : YYTRANSLATE (yychar); + + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; +#if ! YYERROR_VERBOSE + yyerror (YY_("syntax error")); +#else +# define YYSYNTAX_ERROR yysyntax_error (&yymsg_alloc, &yymsg, \ + yyssp, yytoken) + { + char const *yymsgp = YY_("syntax error"); + int yysyntax_error_status; + yysyntax_error_status = YYSYNTAX_ERROR; + if (yysyntax_error_status == 0) + yymsgp = yymsg; + else if (yysyntax_error_status == 1) + { + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); + yymsg = (char *) YYSTACK_ALLOC (yymsg_alloc); + if (!yymsg) + { + yymsg = yymsgbuf; + yymsg_alloc = sizeof yymsgbuf; + yysyntax_error_status = 2; + } + else + { + yysyntax_error_status = YYSYNTAX_ERROR; + yymsgp = yymsg; + } + } + yyerror (yymsgp); + if (yysyntax_error_status == 2) + goto yyexhaustedlab; + } +# undef YYSYNTAX_ERROR +#endif + } + + + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse lookahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* Return failure if at end of input. */ + if (yychar == YYEOF) + { + /* Since we don't need the token, we have to free it first. */ + YYERRCLEANUP; + YYABORT; + } + } + else + { + yydestruct ("Error: discarding", + yytoken, &yylval); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse lookahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + + /* Pacify compilers like GCC when the user code never invokes + YYERROR and the label yyerrorlab therefore never appears in user + code. */ + if (/*CONSTCOND*/ 0) + goto yyerrorlab; + + /* Do not reclaim the symbols of the rule which action triggered + this YYERROR. */ + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + for (;;) + { + yyn = yypact[yystate]; + if (!yypact_value_is_default (yyn)) + { + yyn += YYTERROR; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + { + /* Since we don't need the error token, we have to free it first. */ + YYERRCLEANUP; + YYABORT; + } + + + yydestruct ("Error: popping", + yystos[yystate], yyvsp); + YYPOPSTACK (1); + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + *++yyvsp = yylval; + + + /* Shift the error token. */ + YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. 
|
+`-----------------------------------*/
+yyabortlab:
+  yyresult = 1;
+  goto yyreturn;
+
+#if !defined(yyoverflow) || YYERROR_VERBOSE
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here.  |
+`-------------------------------------------------*/
+yyexhaustedlab:
+  yyerror (YY_("memory exhausted"));
+  yyresult = 2;
+  /* Fall through.  */
+#endif
+
+yyreturn:
+  if (yychar != YYEMPTY)
+    {
+      /* Make sure we have latest lookahead translation.  See comments at
+         user semantic actions for why this is necessary.  */
+      yytoken = YYTRANSLATE (yychar);
+      yydestruct ("Cleanup: discarding lookahead",
+                  yytoken, &yylval);
+    }
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYABORT or YYACCEPT.  */
+  YYPOPSTACK (yylen);
+  YY_STACK_PRINT (yyss, yyssp);
+  while (yyssp != yyss)
+    {
+      yydestruct ("Cleanup: popping",
+                  yystos[*yyssp], yyvsp);
+      YYPOPSTACK (1);
+    }
+#ifndef yyoverflow
+  if (yyss != yyssa)
+    YYSTACK_FREE (yyss);
+#endif
+#if YYERROR_VERBOSE
+  if (yymsg != yymsgbuf)
+    YYSTACK_FREE (yymsg);
+#endif
+  /* Make sure YYID is used.  */
+  return YYID (yyresult);
+}
+
+
+
+/* Line 2067 of yacc.c  */
+#line 212 "fts0pars.y"
+
+
+/********************************************************************
+*/
+int
+ftserror(
+/*=====*/
+	const char*	p)
+{
+	my_printf_error(ER_PARSE_ERROR, "%s", MYF(0), p);
+	return(0);
+}
+
+/********************************************************************
+Create a fts_lexer_t instance.*/
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,
+	const byte*	query,
+	ulint		query_len)
+{
+	fts_lexer_t*	fts_lexer = static_cast<fts_lexer_t*>(
+		ut_malloc_nokey(sizeof(fts_lexer_t)));
+
+	if (boolean_mode) {
+		fts0blex_init(&fts_lexer->yyscanner);
+		fts0b_scan_bytes(
+			reinterpret_cast<const char*>(query),
+			static_cast<int>(query_len),
+			fts_lexer->yyscanner);
+		fts_lexer->scanner = fts_blexer;
+		/* FIXME: Debugging */
+		/* fts0bset_debug(1 , fts_lexer->yyscanner); */
+	} else {
+		fts0tlex_init(&fts_lexer->yyscanner);
+		fts0t_scan_bytes(
+			reinterpret_cast<const char*>(query),
+			static_cast<int>(query_len),
+			fts_lexer->yyscanner);
+		fts_lexer->scanner = fts_tlexer;
+	}
+
+	return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer)
+{
+	if (fts_lexer->scanner == fts_blexer) {
+		fts0blex_destroy(fts_lexer->yyscanner);
+	} else {
+		fts0tlex_destroy(fts_lexer->yyscanner);
+	}
+
+	ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropriate scanner.*/
+int
+fts_lexer(
+/*======*/
+	YYSTYPE*	val,
+	fts_lexer_t*	fts_lexer)
+{
+	fts_scanner	func_ptr;
+
+	func_ptr = fts_lexer->scanner;
+
+	return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+	fts_ast_state_t*	state)
+{
+	return(ftsparse(state));
+}
+
diff --git a/storage/innobase/fts/fts0pars.y b/storage/innobase/fts/fts0pars.y
new file mode 100644
index 00000000..903c7280
--- /dev/null
+++ b/storage/innobase/fts/fts0pars.y
@@ -0,0 +1,293 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
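Editorial aside on the parser epilogue just above: fts_lexer_create() selects one of the two generated flex scanners and stores it as a function pointer in fts_lexer_t, and fts_lexer() simply forwards through that pointer. The following minimal, self-contained sketch shows the same dispatch pattern; the scanner bodies and types are hypothetical stand-ins, not the real yyscan_t machinery.

#include <cstdio>

/* Hypothetical stand-ins for the two generated scanners
   (the real ones are fts0blex/fts0tlex working on a yyscan_t). */
static int boolean_scan(int* tok) { *tok = 'B'; return 1; }
static int natural_scan(int* tok) { *tok = 'N'; return 1; }

/* Mirrors fts_lexer_struct: the scanner is chosen once at create time. */
struct lexer_t {
	int (*scanner)(int*);
};

static lexer_t lexer_create(bool boolean_mode)
{
	lexer_t	lex;
	/* Like selecting fts_blexer vs. fts_tlexer in fts_lexer_create(). */
	lex.scanner = boolean_mode ? boolean_scan : natural_scan;
	return lex;
}

int main()
{
	lexer_t	lex = lexer_create(true);
	int	tok;
	lex.scanner(&tok);		/* fts_lexer() forwards the same way */
	std::printf("%c\n", tok);	/* prints B */
	return 0;
}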
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0pars.y
+ * FTS parser: input file for the GNU Bison parser generator
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+#include "ha_prototypes.h"
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+#include 
+
+extern int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern int fts_blexer(YYSTYPE*, yyscan_t);
+extern int fts_tlexer(YYSTYPE*, yyscan_t);
+
+#ifdef __GNUC__
+# pragma GCC diagnostic ignored "-Wpragmas"
+# pragma GCC diagnostic ignored "-Wunknown-warning-option"
+# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+
+extern int ftserror(const char* p);
+
+/* Required for reentrant parser */
+#define ftslex	fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+
+typedef int (*fts_scanner)(YYSTYPE* val, yyscan_t yyscanner);
+
+struct fts_lexer_struct {
+	fts_scanner	scanner;
+	void*		yyscanner;
+};
+
+%}
+
+%union {
+	int			oper;
+	fts_ast_string_t*	token;
+	fts_ast_node_t*		node;
+};
+
+/* Enable re-entrant parser */
+%pure_parser
+
+%token<oper>	FTS_OPER
+%token<token>	FTS_TEXT FTS_TERM FTS_NUMB
+
+%type<node>	prefix term text expr sub_expr expr_lst query
+
+%nonassoc	'+' '-' '~' '<' '>'
+
+%%
+
+query	: expr_lst	{
+		$$ = $1;
+		((fts_ast_state_t*) state)->root = $$;
+	}
+	;
+
+expr_lst: /* Empty */	{
+		$$ = NULL;
+	}
+
+	| expr_lst expr	{
+		$$ = $1;
+
+		if (!$$) {
+			$$ = fts_ast_create_node_list(state, $2);
+		} else {
+			fts_ast_add_node($$, $2);
+		}
+	}
+
+	| expr_lst sub_expr	{
+		$$ = $1;
+		$$ = fts_ast_create_node_list(state, $1);
+
+		if (!$$) {
+			$$ = $2;
+		} else {
+			fts_ast_add_node($$, $2);
+		}
+	}
+	;
+
+sub_expr: '(' expr_lst ')'	{
+		$$ = $2;
+
+		if ($$) {
+			$$ = fts_ast_create_node_subexp_list(state, $$);
+		}
+	}
+
+	| prefix '(' expr_lst ')'	{
+		$$ = fts_ast_create_node_list(state, $1);
+
+		if ($3) {
+			fts_ast_add_node($$,
+				fts_ast_create_node_subexp_list(state, $3));
+		}
+	}
+	;
+
+expr	: term	{
+		$$ = $1;
+	}
+
+	| text	{
+		$$ = $1;
+	}
+
+	| term '*'	{
+		fts_ast_term_set_wildcard($1);
+	}
+
+	| text '@' FTS_NUMB	{
+		fts_ast_text_set_distance($1, fts_ast_string_to_ul($3, 10));
+		fts_ast_string_free($3);
+	}
+
+	| prefix term '*'	{
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+		fts_ast_term_set_wildcard($2);
+	}
+
+	| prefix term	{
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+	}
+
+	| prefix text '@' FTS_NUMB	{
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+		fts_ast_text_set_distance($2, fts_ast_string_to_ul($4, 10));
+		fts_ast_string_free($4);
+	}
+
+	| prefix text	{
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+	}
+	;
+
+prefix	: '-'	{
+		$$ = fts_ast_create_node_oper(state, FTS_IGNORE);
+	}
+
+	| '+'	{
+		$$ = fts_ast_create_node_oper(state, FTS_EXIST);
+	}
+
+	| '~'	{
+		$$ = fts_ast_create_node_oper(state, FTS_NEGATE);
+	}
+
+	| '<'	{
+		$$ =
fts_ast_create_node_oper(state, FTS_DECR_RATING);
+	}
+
+	| '>'	{
+		$$ = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+	}
+	;
+
+term	: FTS_TERM	{
+		$$ = fts_ast_create_node_term(state, $1);
+		fts_ast_string_free($1);
+	}
+
+	| FTS_NUMB	{
+		$$ = fts_ast_create_node_term(state, $1);
+		fts_ast_string_free($1);
+	}
+
+	/* Ignore leading '*' */
+	| '*' term	{
+		$$ = $2;
+	}
+	;
+
+text	: FTS_TEXT	{
+		$$ = fts_ast_create_node_text(state, $1);
+		fts_ast_string_free($1);
+	}
+	;
+%%
+
+/********************************************************************
+*/
+int
+ftserror(
+/*=====*/
+	const char*	p)
+{
+	fprintf(stderr, "%s\n", p);
+	return(0);
+}
+
+/********************************************************************
+Create a fts_lexer_t instance.*/
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,
+	const byte*	query,
+	ulint		query_len)
+{
+	fts_lexer_t*	fts_lexer = static_cast<fts_lexer_t*>(
+		ut_malloc_nokey(sizeof(fts_lexer_t)));
+
+	if (boolean_mode) {
+		fts0blex_init(&fts_lexer->yyscanner);
+		fts0b_scan_bytes((char*) query, (int) query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = fts_blexer;
+		/* FIXME: Debugging */
+		/* fts0bset_debug(1 , fts_lexer->yyscanner); */
+	} else {
+		fts0tlex_init(&fts_lexer->yyscanner);
+		fts0t_scan_bytes((char*) query, (int) query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = fts_tlexer;
+	}
+
+	return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer)
+{
+	if (fts_lexer->scanner == fts_blexer) {
+		fts0blex_destroy(fts_lexer->yyscanner);
+	} else {
+		fts0tlex_destroy(fts_lexer->yyscanner);
+	}
+
+	ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropriate scanner.*/
+int
+fts_lexer(
+/*======*/
+	YYSTYPE*	val,
+	fts_lexer_t*	fts_lexer)
+{
+	fts_scanner	func_ptr;
+
+	func_ptr = fts_lexer->scanner;
+
+	return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+	fts_ast_state_t*	state)
+{
+	return(ftsparse(state));
+}
diff --git a/storage/innobase/fts/fts0plugin.cc b/storage/innobase/fts/fts0plugin.cc
new file mode 100644
index 00000000..de99d170
--- /dev/null
+++ b/storage/innobase/fts/fts0plugin.cc
@@ -0,0 +1,283 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0plugin.cc
+Full Text Search plugin support.
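As a reading aid for the grammar above: each prefix operator maps onto one AST operator node ('+' to FTS_EXIST, '-' to FTS_IGNORE, '~' to FTS_NEGATE, '<' and '>' to the rating operators), a trailing '*' marks a term as a wildcard, and '@' FTS_NUMB attaches a proximity distance to a quoted text node. For example, under these rules the boolean query

    +apple -banana "cherry pie" @3

derives roughly as follows (a sketch of the shape only, not the exact node layout):

    query
    └─ expr_lst
       ├─ expr: prefix('+' → FTS_EXIST) list containing term(apple)
       ├─ expr: prefix('-' → FTS_IGNORE) list containing term(banana)
       └─ expr: text("cherry pie") with distance 3 set via
                fts_ast_text_set_distance()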
+
+Created 2013/06/04 Shaohua Wang
+***********************************************************************/
+
+#include "fts0ast.h"
+#include "fts0plugin.h"
+#include "fts0tokenize.h"
+
+#include "ft_global.h"
+
+/******************************************************************//**
+FTS default parser init
+@return 0 */
+static int fts_default_parser_init(MYSQL_FTPARSER_PARAM*) { return 0; }
+
+/******************************************************************//**
+FTS default parser deinit
+@return 0 */
+static int fts_default_parser_deinit(MYSQL_FTPARSER_PARAM*) { return 0; }
+
+/******************************************************************//**
+FTS default parser parse from ft_static.c in MYISAM.
+@return 0 if parse successfully, or return non-zero */
+static
+int
+fts_default_parser_parse(
+/*=====================*/
+	MYSQL_FTPARSER_PARAM	*param)	/*!< in: plugin parser param */
+{
+	return(param->mysql_parse(param, param->doc, param->length));
+}
+
+/* FTS default parser from ft_static.c in MYISAM. */
+struct st_mysql_ftparser fts_default_parser =
+{
+	MYSQL_FTPARSER_INTERFACE_VERSION,
+	fts_default_parser_parse,
+	fts_default_parser_init,
+	fts_default_parser_deinit
+};
+
+/******************************************************************//**
+Get an operator node from token boolean info
+@return node */
+static
+fts_ast_node_t*
+fts_query_get_oper_node(
+/*====================*/
+	MYSQL_FTPARSER_BOOLEAN_INFO*	info,	/*!< in: token info */
+	fts_ast_state_t*	state)	/*!< in/out: query parse state*/
+{
+	fts_ast_node_t*	oper_node = NULL;
+
+	if (info->yesno > 0) {
+		oper_node = fts_ast_create_node_oper(state, FTS_EXIST);
+	} else if (info->yesno < 0) {
+		oper_node = fts_ast_create_node_oper(state, FTS_IGNORE);
+	} else if (info->weight_adjust > 0) {
+		oper_node = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+	} else if (info->weight_adjust < 0) {
+		oper_node = fts_ast_create_node_oper(state, FTS_DECR_RATING);
+	} else if (info->wasign > 0) {
+		oper_node = fts_ast_create_node_oper(state, FTS_NEGATE);
+	}
+
+	return(oper_node);
+}
+
+/******************************************************************//**
+FTS plugin parser 'mysql_add_word' callback function for query parse.
+Refer to 'st_mysql_ftparser_param' for more detail.
+Note:
+a. Parse logic refers to 'ftb_query_add_word' from ft_boolean_search.c in MYISAM;
+b. Parse node or tree refers to fts0pars.y.
+@return 0 if add successfully, or return non-zero.
*/ +static +int +fts_query_add_word_for_parser( +/*==========================*/ + MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */ + const char* word, /*!< in: token */ + int word_len, /*!< in: token length */ + MYSQL_FTPARSER_BOOLEAN_INFO* info) /*!< in: token info */ +{ + fts_ast_state_t* state = + static_cast(param->mysql_ftparam); + fts_ast_node_t* cur_node = state->cur_node; + fts_ast_node_t* oper_node = NULL; + fts_ast_node_t* term_node = NULL; + fts_ast_node_t* node = NULL; + + switch (info->type) { + case FT_TOKEN_STOPWORD: + /* We only handler stopword in phrase */ + if (cur_node->type != FTS_AST_PARSER_PHRASE_LIST) { + break; + } + /* fall through */ + + case FT_TOKEN_WORD: + term_node = fts_ast_create_node_term_for_parser( + state, word, ulint(word_len)); + + if (info->trunc) { + fts_ast_term_set_wildcard(term_node); + } + + if (cur_node->type == FTS_AST_PARSER_PHRASE_LIST) { + /* Ignore operator inside phrase */ + fts_ast_add_node(cur_node, term_node); + } else { + ut_ad(cur_node->type == FTS_AST_LIST + || cur_node->type == FTS_AST_SUBEXP_LIST); + oper_node = fts_query_get_oper_node(info, state); + + if (oper_node) { + node = fts_ast_create_node_list(state, oper_node); + fts_ast_add_node(node, term_node); + fts_ast_add_node(cur_node, node); + } else { + fts_ast_add_node(cur_node, term_node); + } + } + + break; + + case FT_TOKEN_LEFT_PAREN: + /* Check parse error */ + if (cur_node->type != FTS_AST_LIST + && cur_node->type != FTS_AST_SUBEXP_LIST) { + return(1); + } + + /* Set operator */ + oper_node = fts_query_get_oper_node(info, state); + if (oper_node != NULL) { + node = fts_ast_create_node_list(state, oper_node); + fts_ast_add_node(cur_node, node); + node->go_up = true; + node->up_node = cur_node; + cur_node = node; + } + + if (info->quot) { + /* Phrase node */ + node = fts_ast_create_node_phrase_list(state); + } else { + /* Subexp list node */ + node = fts_ast_create_node_subexp_list(state, NULL); + } + + fts_ast_add_node(cur_node, node); + + node->up_node = cur_node; + state->cur_node = node; + state->depth += 1; + + break; + + case FT_TOKEN_RIGHT_PAREN: + info->quot = 0; + + if (cur_node->up_node != NULL) { + cur_node = cur_node->up_node; + + if (cur_node->go_up) { + ut_a(cur_node->up_node + && !(cur_node->up_node->go_up)); + cur_node = cur_node->up_node; + } + } + + state->cur_node = cur_node; + + if (state->depth > 0) { + state->depth--; + } else { + /* Parentheses mismatch */ + return(1); + } + + break; + + case FT_TOKEN_EOF: + default: + break; + } + + return(0); +} + +/******************************************************************//** +FTS plugin parser 'myql_parser' callback function for query parse. +Refer to 'st_mysql_ftparser_param' for more detail. +@return 0 if parse successfully */ +static +int +fts_parse_query_internal( +/*=====================*/ + MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */ + const char* query, /*!< in: query string */ + int len) /*!< in: query length */ +{ + MYSQL_FTPARSER_BOOLEAN_INFO info; + const CHARSET_INFO* cs = param->cs; + uchar** start = (uchar**)(&query); + uchar* end = (uchar*)(query + len); + FT_WORD w = {NULL, 0, 0}; + + info.prev = ' '; + info.quot = 0; + memset(&w, 0, sizeof(w)); + /* Note: We don't handle simple parser mode here, + but user supplied plugin parser should handler it. 
 */
+	while (fts_get_word(cs, start, end, &w, &info)) {
+		int	ret = param->mysql_add_word(
+				param,
+				reinterpret_cast<char*>(w.pos),
+				int(w.len), &info);
+		if (ret) {
+			return(ret);
+		}
+	}
+
+	return(0);
+}
+
+/******************************************************************//**
+FTS parse query by plugin parser.
+@return 0 if parse successfully, or return non-zero. */
+int
+fts_parse_by_parser(
+/*================*/
+	ibool		mode,		/*!< in: parse boolean mode */
+	uchar*		query_str,	/*!< in: query string */
+	ulint		query_len,	/*!< in: query string length */
+	st_mysql_ftparser*	parser,	/*!< in: fts plugin parser */
+	fts_ast_state_t*	state)	/*!< in/out: parser state */
+{
+	MYSQL_FTPARSER_PARAM	param;
+	int	ret;
+
+	ut_ad(parser);
+
+	/* Initialize the parser param */
+	param.mysql_parse = fts_parse_query_internal;
+	param.mysql_add_word = fts_query_add_word_for_parser;
+	param.mysql_ftparam = static_cast<void*>(state);
+	param.cs = state->charset;
+	param.doc = reinterpret_cast<char*>(query_str);
+	param.length = static_cast<int>(query_len);
+	param.flags = 0;
+	param.mode = mode ?
+		MYSQL_FTPARSER_FULL_BOOLEAN_INFO :
+		MYSQL_FTPARSER_SIMPLE_MODE;
+
+	PARSER_INIT(parser, &param);
+	ret = parser->parse(&param);
+	PARSER_DEINIT(parser, &param);
+
+	return(ret | state->depth);
+}
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
new file mode 100644
index 00000000..9c92a117
--- /dev/null
+++ b/storage/innobase/fts/fts0que.cc
@@ -0,0 +1,4612 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0que.cc
+Full Text Search functionality.
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+*******************************************************/
+
+#include "dict0dict.h"
+#include "ut0rbt.h"
+#include "row0sel.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0types.h"
+#include "fts0plugin.h"
+#include "fts0vlc.h"
+
+#include <iomanip>
+#include <vector>
+
+#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
+
+#define RANK_DOWNGRADE		(-1.0F)
+#define RANK_UPGRADE		(1.0F)
+
+/* Maximum number of words supported in a phrase or proximity search. */
+#define MAX_PROXIMITY_ITEM	128
+
+/* Memory used by rbt itself for create and node add */
+#define SIZEOF_RBT_CREATE	sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
+#define SIZEOF_RBT_NODE_ADD	sizeof(ib_rbt_node_t)
+
+/*Initial byte length for 'words' in fts_ranking_t */
+#define RANKING_WORDS_INIT_LEN	4
+
+// FIXME: Need to have a generic iterator that traverses the ilist.
+
+typedef std::vector<fts_string_t, ut_allocator<fts_string_t> > word_vector_t;
+
+struct fts_word_freq_t;
+
+/** State of an FTS query.
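Editorial sketch for fts_parse_by_parser() above: it packages the query into a parser param whose callbacks point back into this file, so the plugin's parse() tokenizes the document and reports every token through mysql_add_word. A minimal model of that round-trip, with stand-in types in place of MYSQL_FTPARSER_PARAM and st_mysql_ftparser:

#include <cstdio>

/* Stand-in for MYSQL_FTPARSER_PARAM: the add-word callback plus state. */
struct param_t {
	int (*add_word)(param_t*, const char*, int);
	void*	state;
};

/* Plays the role of fts_query_add_word_for_parser(): one call per token. */
static int report_word(param_t*, const char* w, int len)
{
	std::printf("token: %.*s\n", len, w);
	return 0;
}

/* Plays the role of the plugin's parse() entry: split and report. */
static int parse(param_t* p, const char* doc)
{
	for (const char* s = doc; *s; ) {
		const char* e = s;
		while (*e && *e != ' ') ++e;
		if (e > s && p->add_word(p, s, int(e - s))) {
			return 1;	/* non-zero from the callback aborts */
		}
		s = *e ? e + 1 : e;
	}
	return 0;
}

int main()
{
	param_t	p = { report_word, nullptr };
	return parse(&p, "apple banana cherry");
}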
*/ +struct fts_query_t { + mem_heap_t* heap; /*!< Heap to use for allocations */ + + trx_t* trx; /*!< The query transaction */ + + dict_index_t* index; /*!< The FTS index to search */ + /*!< FTS auxiliary common table def */ + + fts_table_t fts_common_table; + + fts_table_t fts_index_table;/*!< FTS auxiliary index table def */ + + size_t total_size; /*!< total memory size used by query */ + + fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be + filtered from the output */ + + fts_ast_node_t* root; /*!< Abstract syntax tree */ + + fts_ast_node_t* cur_node; /*!< Current tree node */ + + ib_rbt_t* word_map; /*!< Matched word map for + searching by word*/ + + word_vector_t* word_vector; /*!< Matched word vector for + searching by index */ + + ib_rbt_t* doc_ids; /*!< The current set of matching + doc ids, elements are of + type fts_ranking_t */ + + ib_rbt_t* intersection; /*!< The doc ids that were found in + doc_ids, this tree will become + the new doc_ids, elements are of type + fts_ranking_t */ + + /*!< Prepared statement to read the + nodes from the FTS INDEX */ + que_t* read_nodes_graph; + + fts_ast_oper_t oper; /*!< Current boolean mode operator */ + + /*!< TRUE if we want to collect the + word positions within the document */ + ibool collect_positions; + + ulint flags; /*!< Specify the full text search type, + such as boolean search, phrase + search, proximity search etc. */ + + ulint distance; /*!< The proximity distance of a + phrase search. */ + + /*!< These doc ids are used as a + boundary condition when searching the + FTS index rows */ + + doc_id_t lower_doc_id; /*!< Lowest doc id in doc_ids */ + + doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */ + + bool boolean_mode; /*!< TRUE if boolean mode query */ + + ib_vector_t* matched; /*!< Array of matching documents + (fts_match_t) to search for a phrase */ + + ib_vector_t** match_array; /*!< Used for proximity search, contains + position info for each matched word + in the word list */ + + ib_uint64_t total_docs; /*!< The total number of documents */ + + ulint total_words; /*!< The total number of words */ + + dberr_t error; /*!< Error code if any, that is + encountered during query processing */ + + ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per + document, its elements are of type + fts_word_freq_t */ + + ib_rbt_t* wildcard_words; /*!< words with wildcard */ + + bool multi_exist; /*!< multiple FTS_EXIST oper */ + byte visiting_sub_exp; /*!< count of nested + fts_ast_visit_sub_exp() */ + + st_mysql_ftparser* parser; /*!< fts plugin parser */ +}; + +/** For phrase matching, first we collect the documents and the positions +then we match. */ +struct fts_match_t { + doc_id_t doc_id; /*!< Document id */ + + ulint start; /*!< Start the phrase match from + this offset within the positions + vector. */ + + ib_vector_t* positions; /*!< Offsets of a word in a + document */ +}; + +/** For matching tokens in a phrase search. We use this data structure in +the callback that determines whether a document should be accepted or +rejected for a phrase search. */ +struct fts_select_t { + doc_id_t doc_id; /*!< The document id to match */ + + ulint min_pos; /*!< For found to be TRUE at least + one position must be greater than + min_pos. 
 */
+
+	ibool		found;		/*!< TRUE if found */
+
+	fts_word_freq_t*
+			word_freq;	/*!< Word frequency instance of the
+					current word being looked up in
+					the FTS index */
+};
+
+typedef std::vector<ulint, ut_allocator<ulint> > pos_vector_t;
+
+/** This structure defines a set of ranges for original documents, each of
+which has a minimum position and maximum position. Text in such range should
+contain all words in the proximity search. We will need to count the
+words in such range to make sure it is less than the specified distance
+of the proximity search */
+struct fts_proximity_t {
+	ulint		n_pos;		/*!< number of position set, defines
+					a range (min to max) containing all
+					matching words */
+	pos_vector_t	min_pos;	/*!< the minimum position (in bytes)
+					of the range */
+	pos_vector_t	max_pos;	/*!< the maximum position (in bytes)
+					of the range */
+};
+
+/** The match positions and tokens to match */
+struct fts_phrase_t {
+	fts_phrase_t(const dict_table_t* table)
+		:
+		found(false),
+		match(NULL),
+		tokens(NULL),
+		distance(0),
+		charset(NULL),
+		heap(NULL),
+		zip_size(table->space->zip_size()),
+		proximity_pos(NULL),
+		parser(NULL)
+	{
+	}
+
+	/** Match result */
+	ibool			found;
+
+	/** Positions within text */
+	const fts_match_t*	match;
+
+	/** Tokens to match */
+	const ib_vector_t*	tokens;
+
+	/** For matching on proximity distance. Can be 0 for exact match */
+	ulint			distance;
+
+	/** Phrase match charset */
+	CHARSET_INFO*		charset;
+
+	/** Heap for word processing */
+	mem_heap_t*		heap;
+
+	/** ROW_FORMAT=COMPRESSED page size, or 0 */
+	const ulint		zip_size;
+
+	/** Position info for proximity search verification. Records the
+	min and max position of words matched */
+	fts_proximity_t*	proximity_pos;
+
+	/** FTS plugin parser */
+	st_mysql_ftparser*	parser;
+};
+
+/** Parameter passed to fts phrase match by parser */
+struct fts_phrase_param_t {
+	fts_phrase_t*	phrase;		/*!< Match phrase instance */
+	ulint		token_index;	/*!< Index of token to match next */
+	mem_heap_t*	heap;		/*!< Heap for word processing */
+};
+
+/** For storing the frequency of a word/term in a document */
+struct fts_doc_freq_t {
+	doc_id_t	doc_id;		/*!< Document id */
+	ulint		freq;		/*!< Frequency of a word in a document */
+};
+
+/** To determine the word frequency per document. */
+struct fts_word_freq_t {
+	fts_string_t	word;		/*!< Word for which we need the freq,
+					it's allocated on the query heap */
+
+	ib_rbt_t*	doc_freqs;	/*!< RB Tree for storing per document
+					word frequencies. The elements are
+					of type fts_doc_freq_t */
+	ib_uint64_t	doc_count;	/*!< Total number of documents that
+					contain this word */
+	double		idf;		/*!< Inverse document frequency */
+};
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record.
+@return always TRUE */
+static
+ibool
+fts_query_index_fetch_nodes(
+/*========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg);	/*!< in: pointer to ib_vector_t */
+
+/********************************************************************
+Read and filter nodes.
+@return fts_node_t instance */ +static +dberr_t +fts_query_filter_doc_ids( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word, /*!< in: the current word */ + fts_word_freq_t* word_freq, /*!< in/out: word frequency */ + const fts_node_t* node, /*!< in: current FTS node */ + void* data, /*!< in: doc id ilist */ + ulint len, /*!< in: doc id ilist size */ + ibool calc_doc_count);/*!< in: whether to remember doc + count */ + +/** Process (nested) sub-expression, create a new result set to store the +sub-expression result by processing nodes under current sub-expression +list. Merge the sub-expression result with that of parent expression list. +@param[in,out] node current root node +@param[in,out] visitor callback function +@param[in,out] arg argument for callback +@return DB_SUCCESS if all go well */ +static +dberr_t +fts_ast_visit_sub_exp( + fts_ast_node_t* node, + fts_ast_callback visitor, + void* arg); + +#if 0 +/*****************************************************************//*** +Find a doc_id in a word's ilist. +@return TRUE if found. */ +static +ibool +fts_query_find_doc_id( +/*==================*/ + fts_select_t* select, /*!< in/out: search the doc id selected, + update the frequency if found. */ + void* data, /*!< in: doc id ilist */ + ulint len); /*!< in: doc id ilist size */ +#endif + +/*************************************************************//** +This function implements a simple "blind" query expansion search: +words in documents found in the first search pass will be used as +search arguments to search the document again, thus "expand" +the search result set. +@return DB_SUCCESS if success, otherwise the error code */ +static +dberr_t +fts_expand_query( +/*=============*/ + dict_index_t* index, /*!< in: FTS index to search */ + fts_query_t* query) /*!< in: query result, to be freed + by the client */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +This function finds documents that contain all words in a +phrase or proximity search. And if proximity search, verify +the words are close enough to each other, as in specified distance. +This function is called for phrase and proximity search. +@return TRUE if documents are found, FALSE if otherwise */ +static +ibool +fts_phrase_or_proximity_search( +/*===========================*/ + fts_query_t* query, /*!< in/out: query instance + query->doc_ids might be instantiated + with qualified doc IDs */ + ib_vector_t* tokens); /*!< in: Tokens contain words */ +/*************************************************************//** +This function checks whether words in result documents are close to +each other (within proximity range as specified by "distance"). +If "distance" is MAX_ULINT, then it will find all combinations of +positions of matching words and store min and max positions +in the "qualified_pos" for later verification. +@return true if words are close to each other, false if otherwise */ +static +bool +fts_proximity_get_positions( +/*========================*/ + fts_match_t** match, /*!< in: query instance */ + ulint num_match, /*!< in: number of matching + items */ + ulint distance, /*!< in: distance value + for proximity search */ + fts_proximity_t* qualified_pos); /*!< out: the position info + records ranges containing + all matching words. */ +#if 0 +/******************************************************************** +Get the total number of words in a documents. 
*/ +static +ulint +fts_query_terms_in_document( +/*========================*/ + /*!< out: DB_SUCCESS if all go well + else error code */ + fts_query_t* query, /*!< in: FTS query state */ + doc_id_t doc_id, /*!< in: the word to check */ + ulint* total); /*!< out: total words in document */ +#endif + +/******************************************************************** +Compare two fts_doc_freq_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_freq_doc_id_cmp( +/*================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1; + const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2; + + return((int) (fq1->doc_id - fq2->doc_id)); +} + +#if 0 +/*******************************************************************//** +Print the table used for calculating LCS. */ +static +void +fts_print_lcs_table( +/*================*/ + const ulint* table, /*!< in: array to print */ + ulint n_rows, /*!< in: total no. of rows */ + ulint n_cols) /*!< in: total no. of cols */ +{ + ulint i; + + for (i = 0; i < n_rows; ++i) { + ulint j; + + printf("\n"); + + for (j = 0; j < n_cols; ++j) { + + printf("%2lu ", FTS_ELEM(table, n_cols, i, j)); + } + } +} + +/******************************************************************** +Find the longest common subsequence between the query string and +the document. */ +static +ulint +fts_query_lcs( +/*==========*/ + /*!< out: LCS (length) between + two ilists */ + const ulint* p1, /*!< in: word positions of query */ + ulint len_p1, /*!< in: no. of elements in p1 */ + const ulint* p2, /*!< in: word positions within document */ + ulint len_p2) /*!< in: no. of elements in p2 */ +{ + int i; + ulint len = 0; + ulint r = len_p1; + ulint c = len_p2; + ulint size = (r + 1) * (c + 1) * sizeof(ulint); + ulint* table = (ulint*) ut_malloc_nokey(size); + + /* Traverse the table backwards, from the last row to the first and + also from the last column to the first. We compute the smaller + common subsequeces first, then use the caluclated values to determine + the longest common subsequence. The result will be in TABLE[0][0]. */ + for (i = r; i >= 0; --i) { + int j; + + for (j = c; j >= 0; --j) { + + if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) { + + FTS_ELEM(table, c, i, j) = 0; + + } else if (p1[i] == p2[j]) { + + FTS_ELEM(table, c, i, j) = FTS_ELEM( + table, c, i + 1, j + 1) + 1; + + } else { + + ulint value; + + value = ut_max( + FTS_ELEM(table, c, i + 1, j), + FTS_ELEM(table, c, i, j + 1)); + + FTS_ELEM(table, c, i, j) = value; + } + } + } + + len = FTS_ELEM(table, c, 0, 0); + + fts_print_lcs_table(table, r, c); + printf("\nLen=" ULINTPF "\n", len); + + ut_free(table); + + return(len); +} +#endif + +/*******************************************************************//** +Compare two fts_ranking_t instance on their rank value and doc ids in +descending order on the rank and ascending order on doc id. 
+@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */ +static +int +fts_query_compare_rank( +/*===================*/ + const void* p1, /*!< in: pointer to elem */ + const void* p2) /*!< in: pointer to elem */ +{ + const fts_ranking_t* r1 = (const fts_ranking_t*) p1; + const fts_ranking_t* r2 = (const fts_ranking_t*) p2; + + if (r2->rank < r1->rank) { + return(-1); + } else if (r2->rank == r1->rank) { + + if (r1->doc_id < r2->doc_id) { + return(1); + } else if (r1->doc_id > r2->doc_id) { + return(1); + } + + return(0); + } + + return(1); +} + +/*******************************************************************//** +Create words in ranking */ +static +void +fts_ranking_words_create( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking) /*!< in: ranking instance */ +{ + ranking->words = static_cast( + mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN)); + ranking->words_len = RANKING_WORDS_INIT_LEN; +} + +/* +The optimization here is using a char array(bitmap) to replace words rb tree +in fts_ranking_t. + +It can save lots of memory except in some cases of QUERY EXPANSION. + +'word_map' is used as a word dictionary, in which the key is a word, the value +is a number. In 'fts_ranking_words_add', we first check if the word is in 'word_map'. +if not, we add it into 'word_map', and give it a position(actually a number). +then we set the corresponding bit to '1' at the position in the char array 'words'. + +'word_vector' is a useful backup of 'word_map', and we can get a word by its position, +more quickly than searching by value in 'word_map'. we use 'word_vector' +in 'fts_query_calculate_ranking' and 'fts_expand_query'. In the two functions, we need +to scan the bitmap 'words', and get a word when a bit is '1', then we get word_freq +by the word. +*/ + +/*******************************************************************//** +Add a word into ranking */ +static +void +fts_ranking_words_add( +/*==================*/ + fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking, /*!< in: ranking instance */ + const fts_string_t* word) /*!< in: term/word to add */ +{ + ulint pos; + ulint byte_offset; + ulint bit_offset; + ib_rbt_bound_t parent; + + /* Note: we suppose the word map and vector are append-only. */ + ut_ad(query->word_vector->size() == rbt_size(query->word_map)); + + /* We use ib_rbt to simulate a map, f_n_char means position. */ + if (rbt_search(query->word_map, &parent, word) == 0) { + fts_string_t* result_word; + + result_word = rbt_value(fts_string_t, parent.last); + pos = result_word->f_n_char; + ut_ad(pos < rbt_size(query->word_map)); + } else { + /* Add the word to map. 
 */
+		fts_string_t	new_word;
+
+		pos = rbt_size(query->word_map);
+
+		fts_string_dup(&new_word, word, query->heap);
+		new_word.f_n_char = pos;
+
+		rbt_add_node(query->word_map, &parent, &new_word);
+		ut_ad(rbt_validate(query->word_map));
+		query->word_vector->push_back(new_word);
+	}
+
+	/* Check words len */
+	byte_offset = pos / CHAR_BIT;
+	if (byte_offset >= ranking->words_len) {
+		byte*	words = ranking->words;
+		ulint	words_len = ranking->words_len;
+
+		while (byte_offset >= words_len) {
+			words_len *= 2;
+		}
+
+		ranking->words = static_cast<byte*>(
+			mem_heap_zalloc(query->heap, words_len));
+		memcpy(ranking->words, words, ranking->words_len);
+		ranking->words_len = words_len;
+	}
+
+	/* Set ranking words */
+	ut_ad(byte_offset < ranking->words_len);
+	bit_offset = pos % CHAR_BIT;
+	ranking->words[byte_offset] = static_cast<byte>(
+		ranking->words[byte_offset] | 1 << bit_offset);
+}
+
+/*******************************************************************//**
+Get a word from a ranking
+@return true if it's successful */
+static
+bool
+fts_ranking_words_get_next(
+/*=======================*/
+	const fts_query_t*	query,	/*!< in: query instance */
+	fts_ranking_t*	ranking,/*!< in: ranking instance */
+	ulint*		pos,	/*!< in/out: word start pos */
+	fts_string_t*	word)	/*!< in/out: term/word to add */
+{
+	bool	ret = false;
+	ulint	max_pos = ranking->words_len * CHAR_BIT;
+
+	/* Search for next word */
+	while (*pos < max_pos) {
+		ulint byte_offset = *pos / CHAR_BIT;
+		ulint bit_offset = *pos % CHAR_BIT;
+
+		if (ranking->words[byte_offset] & (1 << bit_offset)) {
+			ret = true;
+			break;
+		}
+
+		*pos += 1;
+	};
+
+	/* Get next word from word vector */
+	if (ret) {
+		ut_ad(*pos < query->word_vector->size());
+		*word = query->word_vector->at((size_t)*pos);
+		*pos += 1;
+	}
+
+	return ret;
+}
+
+/*******************************************************************//**
+Add a word if it doesn't exist, to the term freq RB tree. We store
+a pointer to the word that is passed in as the argument.
+@return pointer to word */
+static
+fts_word_freq_t*
+fts_query_add_word_freq(
+/*====================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	word)	/*!< in: term/word to add */
+{
+	ib_rbt_bound_t		parent;
+
+	/* Lookup the word in our rb tree and add if it doesn't exist. */
+	if (rbt_search(query->word_freqs, &parent, word) != 0) {
+		fts_word_freq_t	word_freq;
+
+		memset(&word_freq, 0, sizeof(word_freq));
+
+		fts_string_dup(&word_freq.word, word, query->heap);
+
+		word_freq.doc_count = 0;
+
+		word_freq.doc_freqs = rbt_create(
+			sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
+
+		parent.last = rbt_add_node(
+			query->word_freqs, &parent, &word_freq);
+
+		query->total_size += word->f_len
+			+ SIZEOF_RBT_CREATE
+			+ SIZEOF_RBT_NODE_ADD
+			+ sizeof(fts_word_freq_t);
+	}
+
+	return(rbt_value(fts_word_freq_t, parent.last));
+}
+
+/*******************************************************************//**
+Add a doc id if it doesn't exist, to the doc freq RB tree.
+@return pointer to word */
+static
+fts_doc_freq_t*
+fts_query_add_doc_freq(
+/*===================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	ib_rbt_t*	doc_freqs,	/*!< in: rb tree of fts_doc_freq_t */
+	doc_id_t	doc_id)		/*!< in: doc id to add */
+{
+	ib_rbt_bound_t	parent;
+
+	/* Lookup the doc id in our rb tree and add if it doesn't exist.
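The bitmap scheme described in the long comment before fts_ranking_words_add() can be summarized in a few lines. The sketch below mirrors the same three pieces (word_map for word-to-position, word_vector for position-to-word, and a per-ranking byte array grown by doubling), using standard containers in place of the rb tree and heap allocator:

#include <climits>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main()
{
	std::map<std::string, size_t>	word_map;	/* word -> position */
	std::vector<std::string>	word_vector;	/* position -> word */
	std::vector<unsigned char>	words(4, 0);	/* one ranking's bitmap */

	const char*	doc_words[] = { "apple", "pie", "apple" };

	for (const char* w : doc_words) {
		size_t	pos;
		std::map<std::string, size_t>::iterator it = word_map.find(w);
		if (it == word_map.end()) {
			/* First sighting: assign the next position. */
			pos = word_map.size();
			word_map[w] = pos;
			word_vector.push_back(w);
		} else {
			pos = it->second;
		}
		size_t	byte_offset = pos / CHAR_BIT;
		while (byte_offset >= words.size()) {
			words.resize(words.size() * 2, 0);	/* grow by doubling */
		}
		words[byte_offset] |= (unsigned char) (1 << (pos % CHAR_BIT));
	}

	/* Walk the set bits back to words, like fts_ranking_words_get_next(). */
	for (size_t pos = 0; pos < word_vector.size(); ++pos) {
		if (words[pos / CHAR_BIT] & (1 << (pos % CHAR_BIT))) {
			std::printf("%s\n", word_vector[pos].c_str());
		}
	}
	return 0;
}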
*/ + if (rbt_search(doc_freqs, &parent, &doc_id) != 0) { + fts_doc_freq_t doc_freq; + + memset(&doc_freq, 0, sizeof(doc_freq)); + + doc_freq.freq = 0; + doc_freq.doc_id = doc_id; + + parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_doc_freq_t); + } + + return(rbt_value(fts_doc_freq_t, parent.last)); +} + +/*******************************************************************//** +Add the doc id to the query set only if it's not in the +deleted array. */ +static +void +fts_query_union_doc_id( +/*===================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: the doc id to add */ + fts_rank_t rank) /*!< in: if non-zero, it is the + rank associated with the doc_id */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + + /* Check if the doc id is deleted and it's not already in our set. */ + if (fts_bsearch(updates, 0, static_cast(size), doc_id) < 0 + && rbt_search(query->doc_ids, &parent, &doc_id) != 0) { + + fts_ranking_t ranking; + + ranking.rank = rank; + ranking.doc_id = doc_id; + fts_ranking_words_create(query, &ranking); + + rbt_add_node(query->doc_ids, &parent, &ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN; + } +} + +/*******************************************************************//** +Remove the doc id from the query set only if it's not in the +deleted set. */ +static +void +fts_query_remove_doc_id( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id) /*!< in: the doc id to add */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + + /* Check if the doc id is deleted and it's in our set. */ + if (fts_bsearch(updates, 0, static_cast(size), doc_id) < 0 + && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { + ut_free(rbt_remove_node(query->doc_ids, parent.last)); + + ut_ad(query->total_size >= + SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t)); + query->total_size -= SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } +} + +/*******************************************************************//** +Find the doc id in the query set but not in the deleted set, artificialy +downgrade or upgrade its ranking by a value and make/initialize its ranking +under or above its normal range 0 to 1. This is used for Boolean Search +operator such as Negation operator, which makes word's contribution to the +row's relevance to be negative */ +static +void +fts_query_change_ranking( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: the doc id to add */ + ibool downgrade) /*!< in: Whether to downgrade ranking */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + + /* Check if the doc id is deleted and it's in our set. */ + if (fts_bsearch(updates, 0, static_cast(size), doc_id) < 0 + && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, parent.last); + + ranking->rank += downgrade ? 
RANK_DOWNGRADE : RANK_UPGRADE; + + /* Allow at most 2 adjustment by RANK_DOWNGRADE (-0.5) + and RANK_UPGRADE (0.5) */ + if (ranking->rank >= 1.0F) { + ranking->rank = 1.0F; + } else if (ranking->rank <= -1.0F) { + ranking->rank = -1.0F; + } + } +} + +/*******************************************************************//** +Check the doc id in the query set only if it's not in the +deleted array. The doc ids that were found are stored in +another rb tree (fts_query_t::intersect). */ +static +void +fts_query_intersect_doc_id( +/*=======================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: the doc id to add */ + fts_rank_t rank) /*!< in: if non-zero, it is the + rank associated with the doc_id */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + fts_ranking_t* ranking= NULL; + + /* There are three types of intersect: + 1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'. + 2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect + if it matches 'b'. if the doc is also in doc_ids, then change the + doc's rank, and add 'a' in doc's words. + 3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect + if it matches 'b' and it's in doc_ids.(multi_exist = true). */ + + /* Check if the doc id is deleted and it's in our set */ + if (fts_bsearch(updates, 0, static_cast(size), doc_id) < 0) { + fts_ranking_t new_ranking; + + if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) { + if (query->multi_exist) { + return; + } else { + new_ranking.words = NULL; + } + } else { + ranking = rbt_value(fts_ranking_t, parent.last); + + /* We've just checked the doc id before */ + if (ranking->words == NULL) { + ut_ad(rbt_search(query->intersection, &parent, + ranking) == 0); + return; + } + + /* Merge rank */ + rank += ranking->rank; + if (rank >= 1.0F) { + rank = 1.0F; + } else if (rank <= -1.0F) { + rank = -1.0F; + } + + /* Take words */ + new_ranking.words = ranking->words; + new_ranking.words_len = ranking->words_len; + } + + new_ranking.rank = rank; + new_ranking.doc_id = doc_id; + + if (rbt_search(query->intersection, &parent, + &new_ranking) != 0) { + if (new_ranking.words == NULL) { + fts_ranking_words_create(query, &new_ranking); + + query->total_size += RANKING_WORDS_INIT_LEN; + } else { + /* Note that the intersection has taken + ownership of the ranking data. */ + ranking->words = NULL; + } + + rbt_add_node(query->intersection, + &parent, &new_ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } + } +} + +/*******************************************************************//** +Free the document ranking rb tree. 
*/ +static +void +fts_query_free_doc_ids( +/*===================*/ + fts_query_t* query, /*!< in: query instance */ + ib_rbt_t* doc_ids) /*!< in: rb tree to free */ +{ + const ib_rbt_node_t* node; + + for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + if (ranking->words) { + ranking->words = NULL; + } + + ut_free(rbt_remove_node(doc_ids, node)); + + ut_ad(query->total_size >= + SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t)); + query->total_size -= SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } + + rbt_free(doc_ids); + + ut_ad(query->total_size >= SIZEOF_RBT_CREATE); + query->total_size -= SIZEOF_RBT_CREATE; +} + +/*******************************************************************//** +Add the word to the documents "list" of matching words from +the query. We make a copy of the word from the query heap. */ +static +void +fts_query_add_word_to_document( +/*===========================*/ + fts_query_t* query, /*!< in: query to update */ + doc_id_t doc_id, /*!< in: the document to update */ + const fts_string_t* word) /*!< in: the token to add */ +{ + ib_rbt_bound_t parent; + fts_ranking_t* ranking = NULL; + + if (query->flags == FTS_OPT_RANKING) { + return; + } + + /* First we search the intersection RB tree as it could have + taken ownership of the words rb tree instance. */ + if (query->intersection + && rbt_search(query->intersection, &parent, &doc_id) == 0) { + + ranking = rbt_value(fts_ranking_t, parent.last); + } + + if (ranking == NULL + && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { + + ranking = rbt_value(fts_ranking_t, parent.last); + } + + if (ranking != NULL) { + fts_ranking_words_add(query, ranking, word); + } +} + +/*******************************************************************//** +Check the node ilist. */ +static +void +fts_query_check_node( +/*=================*/ + fts_query_t* query, /*!< in: query to update */ + const fts_string_t* token, /*!< in: the token to search */ + const fts_node_t* node) /*!< in: node to check */ +{ + /* Skip nodes whose doc ids are out range. */ + if (query->oper == FTS_EXIST + && ((query->upper_doc_id > 0 + && node->first_doc_id > query->upper_doc_id) + || (query->lower_doc_id > 0 + && node->last_doc_id < query->lower_doc_id))) { + + /* Ignore */ + + } else { + int ret; + ib_rbt_bound_t parent; + ulint ilist_size = node->ilist_size; + fts_word_freq_t*word_freqs; + + /* The word must exist. */ + ret = rbt_search(query->word_freqs, &parent, token); + ut_a(ret == 0); + + word_freqs = rbt_value(fts_word_freq_t, parent.last); + + query->error = fts_query_filter_doc_ids( + query, token, word_freqs, node, + node->ilist, ilist_size, TRUE); + } +} + +/*****************************************************************//** +Search index cache for word with wildcard match. +@return number of words matched */ +static +ulint +fts_cache_find_wildcard( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_index_cache_t*index_cache, /*!< in: cache to search */ + const fts_string_t* token) /*!< in: token to search */ +{ + ib_rbt_bound_t parent; + const ib_vector_t* nodes = NULL; + fts_string_t srch_text; + byte term[FTS_MAX_WORD_LEN + 1]; + ulint num_word = 0; + + srch_text.f_len = (token->f_str[token->f_len - 1] == '%') + ? 
token->f_len - 1 + : token->f_len; + + strncpy((char*) term, (char*) token->f_str, srch_text.f_len); + term[srch_text.f_len] = '\0'; + srch_text.f_str = term; + + /* Lookup the word in the rb tree */ + if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL, + innobase_fts_text_cmp_prefix) == 0) { + const fts_tokenizer_word_t* word; + ulint i; + const ib_rbt_node_t* cur_node; + ibool forward = FALSE; + + word = rbt_value(fts_tokenizer_word_t, parent.last); + cur_node = parent.last; + + while (innobase_fts_text_cmp_prefix( + index_cache->charset, &srch_text, &word->text) == 0) { + + nodes = word->nodes; + + for (i = 0; nodes && i < ib_vector_size(nodes); ++i) { + int ret; + const fts_node_t* node; + ib_rbt_bound_t freq_parent; + fts_word_freq_t* word_freqs; + + node = static_cast( + ib_vector_get_const(nodes, i)); + + ret = rbt_search(query->word_freqs, + &freq_parent, + &srch_text); + + ut_a(ret == 0); + + word_freqs = rbt_value( + fts_word_freq_t, + freq_parent.last); + + query->error = fts_query_filter_doc_ids( + query, &srch_text, + word_freqs, node, + node->ilist, node->ilist_size, TRUE); + + if (query->error != DB_SUCCESS) { + return(0); + } + } + + num_word++; + + if (!forward) { + cur_node = rbt_prev( + index_cache->words, cur_node); + } else { +cont_search: + cur_node = rbt_next( + index_cache->words, cur_node); + } + + if (!cur_node) { + break; + } + + word = rbt_value(fts_tokenizer_word_t, cur_node); + } + + if (!forward) { + forward = TRUE; + cur_node = parent.last; + goto cont_search; + } + } + + return(num_word); +} + +/*****************************************************************//** +Set difference. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_difference( +/*=================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* token) /*!< in: token to search */ +{ + ulint n_doc_ids= 0; + trx_t* trx = query->trx; + dict_table_t* table = query->index->table; + + ut_a(query->oper == FTS_IGNORE); + +#ifdef FTS_INTERNAL_DIAG_PRINT + { + ib::info out; + out << "DIFFERENCE: Searching: '"; + out.write(token->f_str, token->f_len); + out << "'"; + } +#endif + + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids); + } + + /* There is nothing we can substract from an empty set. */ + if (query->doc_ids && !rbt_empty(query->doc_ids)) { + ulint i; + fts_fetch_t fetch; + const ib_vector_t* nodes; + const fts_index_cache_t*index_cache; + que_t* graph = NULL; + fts_cache_t* cache = table->fts->cache; + dberr_t error; + + mysql_mutex_lock(&cache->lock); + + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache */ + ut_a(index_cache != NULL); + + /* Search the cache for a matching word first. */ + if (query->cur_node->term.wildcard + && query->flags != FTS_PROXIMITY + && query->flags != FTS_PHRASE) { + fts_cache_find_wildcard(query, index_cache, token); + } else { + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + mysql_mutex_unlock(&cache->lock); + + /* error is passed by 'query->error' */ + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + return(query->error); + } + + /* Setup the callback args for filtering and + consolidating the ilist. 
*/ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + } + + /* The size can't increase. */ + ut_a(rbt_size(query->doc_ids) <= n_doc_ids); + + return(query->error); +} + +/* Free the query intersection +@param query query instance */ +static void fts_query_free_intersection(fts_query_t* query) +{ + fts_query_free_doc_ids(query, query->intersection); + query->intersection = NULL; +} + +/*****************************************************************//** +Intersect the token doc ids with the current set. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_intersect( +/*================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* token) /*!< in: the token to search */ +{ + trx_t* trx = query->trx; + dict_table_t* table = query->index->table; + + ut_a(query->oper == FTS_EXIST); + +#ifdef FTS_INTERNAL_DIAG_PRINT + { + ib::info out; + out << "INTERSECT: Searching: '"; + out.write(token->f_str, token->f_len); + out << "'"; + } +#endif + + /* If the words set is not empty and multi exist is true, + we know the intersection set is empty in advance. */ + if (!(rbt_empty(query->doc_ids) && query->multi_exist)) { + ulint n_doc_ids = 0; + ulint i; + fts_fetch_t fetch; + const ib_vector_t* nodes; + const fts_index_cache_t*index_cache; + que_t* graph = NULL; + fts_cache_t* cache = table->fts->cache; + dberr_t error; + + ut_a(!query->intersection); + + n_doc_ids = rbt_size(query->doc_ids); + + /* Create the rb tree that will hold the doc ids of + the intersection. */ + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + + /* This is to avoid decompressing the ilist if the + node's ilist doc ids are out of range. */ + if (!rbt_empty(query->doc_ids) && query->multi_exist) { + const ib_rbt_node_t* node; + doc_id_t* doc_id; + + node = rbt_first(query->doc_ids); + doc_id = rbt_value(doc_id_t, node); + query->lower_doc_id = *doc_id; + + node = rbt_last(query->doc_ids); + doc_id = rbt_value(doc_id_t, node); + query->upper_doc_id = *doc_id; + + } else { + query->lower_doc_id = 0; + query->upper_doc_id = 0; + } + + /* Search the cache for a matching word first. */ + + mysql_mutex_lock(&cache->lock); + + /* Search for the index specific cache. */ + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache. */ + ut_a(index_cache != NULL); + + if (query->cur_node->term.wildcard) { + /* Wildcard search the index cache */ + fts_cache_find_wildcard(query, index_cache, token); + } else { + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + mysql_mutex_unlock(&cache->lock); + + /* error is passed by 'query->error' */ + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + fts_query_free_intersection(query); + return(query->error); + } + + /* Setup the callback args for filtering and + consolidating the ilist. 
*/ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + + if (query->error == DB_SUCCESS) { + /* Make the intesection (rb tree) the current doc id + set and free the old set. */ + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + + ut_a(!query->multi_exist || (query->multi_exist + && rbt_size(query->doc_ids) <= n_doc_ids)); + } else if (query->intersection) { + fts_query_free_intersection(query); + } + } + + return(query->error); +} + +/*****************************************************************//** +Query index cache. +@return DB_SUCCESS if all go well */ +static +dberr_t +fts_query_cache( +/*============*/ + fts_query_t* query, /*!< in/out: query instance */ + const fts_string_t* token) /*!< in: token to search */ +{ + const fts_index_cache_t*index_cache; + dict_table_t* table = query->index->table; + fts_cache_t* cache = table->fts->cache; + + /* Search the cache for a matching word first. */ + mysql_mutex_lock(&cache->lock); + + /* Search for the index specific cache. */ + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache. */ + ut_a(index_cache != NULL); + + if (query->cur_node->term.wildcard + && query->flags != FTS_PROXIMITY + && query->flags != FTS_PHRASE) { + /* Wildcard search the index cache */ + fts_cache_find_wildcard(query, index_cache, token); + } else { + const ib_vector_t* nodes; + ulint i; + + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + mysql_mutex_unlock(&cache->lock); + + return(query->error); +} + +/*****************************************************************//** +Set union. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_union( +/*============*/ + fts_query_t* query, /*!< in: query instance */ + fts_string_t* token) /*!< in: token to search */ +{ + fts_fetch_t fetch; + ulint n_doc_ids = 0; + trx_t* trx = query->trx; + que_t* graph = NULL; + dberr_t error; + + ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING || + query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING); + +#ifdef FTS_INTERNAL_DIAG_PRINT + { + ib::info out; + out << "UNION: Searching: '"; + out.write(token->f_str, token->f_len); + out << "'"; + } +#endif + + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids); + } + + if (token->f_len == 0) { + return(query->error); + } + + fts_query_cache(query, token); + + /* Setup the callback args for filtering and + consolidating the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + /* Read the nodes from disk. 
+ +/*****************************************************************//** +Set union. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_union( +/*============*/ + fts_query_t* query, /*!< in: query instance */ + fts_string_t* token) /*!< in: token to search */ +{ + fts_fetch_t fetch; + ulint n_doc_ids = 0; + trx_t* trx = query->trx; + que_t* graph = NULL; + dberr_t error; + + ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING || + query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING); + +#ifdef FTS_INTERNAL_DIAG_PRINT + { + ib::info out; + out << "UNION: Searching: '"; + out.write(token->f_str, token->f_len); + out << "'"; + } +#endif + + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids); + } + + if (token->f_len == 0) { + return(query->error); + } + + fts_query_cache(query, token); + + /* Setup the callback args for filtering and + consolidating the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + /* Read the nodes from disk. */ + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + + if (query->error == DB_SUCCESS) { + + /* The size can't decrease. */ + ut_a(rbt_size(query->doc_ids) >= n_doc_ids); + + /* Calculate the number of doc ids that were added to + the current doc id set. */ + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids; + } + } + + return(query->error); +} + +/*****************************************************************//** +Depending upon the current query operator process the doc id. +@return DB_SUCCESS if all go well, +or DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ +static +dberr_t +fts_query_process_doc_id( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: doc id to process */ + fts_rank_t rank) /*!< in: if non-zero, it is the + rank associated with the doc_id */ +{ + if (query->flags == FTS_OPT_RANKING) { + return(DB_SUCCESS); + } + + switch (query->oper) { + case FTS_NONE: + fts_query_union_doc_id(query, doc_id, rank); + break; + + case FTS_EXIST: + fts_query_intersect_doc_id(query, doc_id, rank); + break; + + case FTS_IGNORE: + fts_query_remove_doc_id(query, doc_id); + break; + + case FTS_NEGATE: + fts_query_change_ranking(query, doc_id, TRUE); + break; + + case FTS_DECR_RATING: + fts_query_union_doc_id(query, doc_id, rank); + fts_query_change_ranking(query, doc_id, TRUE); + break; + + case FTS_INCR_RATING: + fts_query_union_doc_id(query, doc_id, rank); + fts_query_change_ranking(query, doc_id, FALSE); + break; + + default: + ut_error; + } + + if (query->total_size > fts_result_cache_limit) { + return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + } else { + return(DB_SUCCESS); + } +}
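The dispatch in fts_query_process_doc_id() reads naturally as set algebra on the running result: a plain term unions, '+term' (FTS_EXIST) collects into a separate intersection set that later replaces the result, and '-term' (FTS_IGNORE) removes. A simplified sketch under those assumptions, with std::set standing in for the rb trees:

    #include <set>

    // Illustrative only: route one doc id according to the current
    // boolean-mode operator, as fts_query_process_doc_id() does.
    struct QueryState {
        std::set<unsigned long> result;        // stands in for query->doc_ids
        std::set<unsigned long> intersection;  // stands in for query->intersection
    };

    enum class Oper { Union, Intersect, Remove };

    void process_doc_id(QueryState& q, Oper oper, unsigned long doc_id)
    {
        switch (oper) {
        case Oper::Union:                  // plain term
            q.result.insert(doc_id);
            break;
        case Oper::Intersect:              // '+term'
            if (q.result.count(doc_id)) {  // keep only ids seen before
                q.intersection.insert(doc_id);
            }
            break;
        case Oper::Remove:                 // '-term'
            q.result.erase(doc_id);
            break;
        }
    }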
+ +/*****************************************************************//** +Merge two result sets. */ +static +dberr_t +fts_merge_doc_ids( +/*==============*/ + fts_query_t* query, /*!< in,out: query instance */ + const ib_rbt_t* doc_ids) /*!< in: result set to merge */ +{ + const ib_rbt_node_t* node; + + DBUG_ENTER("fts_merge_doc_ids"); + + ut_a(!query->intersection); + + /* To process FTS_EXIST operation (intersection), we need + to create a new result set for fts_query_intersect(). */ + if (query->oper == FTS_EXIST) { + + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + } + + /* Merge the elements to the result set. */ + for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) { + fts_ranking_t* ranking; + ulint pos = 0; + fts_string_t word; + + ranking = rbt_value(fts_ranking_t, node); + + query->error = fts_query_process_doc_id( + query, ranking->doc_id, ranking->rank); + + if (query->error != DB_SUCCESS) { + if (query->intersection) { + ut_a(query->oper == FTS_EXIST); + fts_query_free_intersection(query); + } + DBUG_RETURN(query->error); + } + + /* Merge words. Don't need to take operator into account. */ + ut_a(ranking->words); + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + fts_query_add_word_to_document(query, ranking->doc_id, + &word); + } + } + + /* If it is an intersection operation, reset query->doc_ids + to query->intersection and free the old result list. */ + if (query->oper == FTS_EXIST && query->intersection != NULL) { + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + } + + DBUG_RETURN(DB_SUCCESS); +} + +/*****************************************************************//** +Skip non-whitespace in a string. Move ptr to the next word boundary. +@return pointer to first whitespace character or end */ +UNIV_INLINE +byte* +fts_query_skip_word( +/*================*/ + byte* ptr, /*!< in: start of scan */ + const byte* end) /*!< in: pointer to end of string */ +{ + /* TODO: Does this have to be UTF-8 too ? */ + while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) { + ++ptr; + } + + return(ptr); +}
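The TODO in fts_query_skip_word() is warranted: byte-wise ispunct()/isspace() only classifies single-byte characters, and passing a value outside the unsigned char range to them is undefined behaviour. A hedged sketch of the usual defensive form (not the InnoDB code; multi-byte text would still need charset-aware classification):

    #include <cctype>

    // Illustrative only: taking the input as unsigned char keeps the
    // <cctype> calls well defined; a UTF-8 aware version would have to
    // decode whole characters instead of single bytes.
    const unsigned char* skip_word(const unsigned char* ptr,
                                   const unsigned char* end)
    {
        while (ptr < end && !std::ispunct(*ptr) && !std::isspace(*ptr)) {
            ++ptr;
        }
        return ptr;
    }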
+ +/*****************************************************************//** +Check whether the remaining terms in the phrase match the text. +@return TRUE if matched else FALSE */ +static +ibool +fts_query_match_phrase_terms( +/*=========================*/ + fts_phrase_t* phrase, /*!< in: phrase to match */ + byte** start, /*!< in/out: text to search, we can't + make this const because we need to + first convert the string to + lowercase */ + const byte* end, /*!< in: pointer to the end of + the string to search */ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint i; + byte* ptr = *start; + const ib_vector_t* tokens = phrase->tokens; + ulint distance = phrase->distance; + + /* We check only from the second term onwards, since the first + must have matched otherwise we wouldn't be here. */ + for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) { + fts_string_t match; + fts_string_t cmp_str; + const fts_string_t* token; + int result; + ulint ret; + + ret = innobase_mysql_fts_get_token( + phrase->charset, ptr, + const_cast<byte*>(end), &match); + + if (match.f_len > 0) { + /* Get next token to match. */ + token = static_cast<const fts_string_t*>( + ib_vector_get_const(tokens, i)); + + fts_string_dup(&cmp_str, &match, heap); + + result = innobase_fts_text_case_cmp( + phrase->charset, token, &cmp_str); + + /* Skip the rest of the tokens if this one doesn't + match and the proximity distance is exceeded. */ + if (result + && (distance == ULINT_UNDEFINED + || distance == 0)) { + + break; + } + + /* This token matched move to the next token. */ + if (result == 0) { + /* Advance the text to search by the length + of the last token. */ + ptr += ret; + + /* Advance to the next token. */ + ++i; + } else { + + ut_a(distance != ULINT_UNDEFINED); + + ptr = fts_query_skip_word(ptr, end); + } + + /* Distance can be 0 for exact matches. */ + if (distance != ULINT_UNDEFINED && distance > 0) { + --distance; + } + } else { + ptr += ret; + } + } + + *start = ptr; + + /* Can't be greater than the number of elements. */ + ut_a(i <= ib_vector_size(tokens)); + + /* This is the case for multiple words. */ + if (i == ib_vector_size(tokens)) { + phrase->found = TRUE; + } + + return(phrase->found); +} + +/*****************************************************************//** +Callback function to count the number of words in position ranges, +and see whether the word count is within the specified "phrase->distance" +@return true if the number of words is less than the "distance" */ +static +bool +fts_proximity_is_word_in_range( +/*===========================*/ + const fts_phrase_t* + phrase, /*!< in: phrase with the search info */ + byte* start, /*!< in: text to search */ + ulint total_len) /*!< in: length of text */ +{ + fts_proximity_t* proximity_pos = phrase->proximity_pos; + + ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size()); + ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size()); + + /* Search each matched position pair (with min and max positions) + and count the number of words in the range */ + for (ulint i = 0; i < proximity_pos->n_pos; i++) { + ulint cur_pos = proximity_pos->min_pos[i]; + ulint n_word = 0; + + ut_ad(proximity_pos->max_pos[i] <= total_len); + + /* Walk through words in the range and count them */ + while (cur_pos <= proximity_pos->max_pos[i]) { + ulint len; + fts_string_t str; + + len = innobase_mysql_fts_get_token( + phrase->charset, + start + cur_pos, + start + total_len, &str); + + if (len == 0) { + break; + } + + /* Advances position with "len" bytes */ + cur_pos += len; + + /* Record the number of words */ + if (str.f_n_char > 0) { + n_word++; + } + + if (n_word > phrase->distance) { + break; + } + } + + /* Check if the number of words is less than specified + "distance" */ + if (n_word && n_word <= phrase->distance) { + return(true); + } + } + + return(false); +}
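To make the range check concrete: for a proximity query such as '"apple pear" @ 5', each candidate span [min_pos, max_pos] covers one occurrence of every term, and the span qualifies when the tokenizer counts at most `distance` words inside it. A hedged standalone sketch, with a whitespace tokenizer standing in for innobase_mysql_fts_get_token():

    #include <cstddef>
    #include <sstream>
    #include <string>

    // Illustrative only: count the words in text[min_pos, max_pos) and
    // accept the span when the count stays within the distance, mirroring
    // the loop in fts_proximity_is_word_in_range().
    bool within_distance(const std::string& text, std::size_t min_pos,
                         std::size_t max_pos, std::size_t distance)
    {
        std::istringstream words(text.substr(min_pos, max_pos - min_pos));
        std::size_t n_word = 0;
        std::string w;

        while (words >> w && ++n_word <= distance) {}

        return n_word > 0 && n_word <= distance;
    }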
+ +/*****************************************************************//** +FTS plugin parser 'mysql_add_word' callback function for phrase match +Refer to 'st_mysql_ftparser_param' for more detail. +@return 0 if match, or return non-zero */ +static +int +fts_query_match_phrase_add_word_for_parser( +/*=======================================*/ + MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */ + const char* word, /*!< in: token */ + int word_len, /*!< in: token length */ + MYSQL_FTPARSER_BOOLEAN_INFO*) +{ + fts_phrase_param_t* phrase_param; + fts_phrase_t* phrase; + const ib_vector_t* tokens; + fts_string_t match; + fts_string_t cmp_str; + const fts_string_t* token; + int result; + mem_heap_t* heap; + + phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam); + heap = phrase_param->heap; + phrase = phrase_param->phrase; + tokens = phrase->tokens; + + /* In case plugin parser doesn't check return value */ + if (phrase_param->token_index == ib_vector_size(tokens)) { + return(1); + } + + match.f_str = (uchar *)(word); + match.f_len = ulint(word_len); + match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len); + + if (match.f_len > 0) { + /* Get next token to match. */ + ut_a(phrase_param->token_index < ib_vector_size(tokens)); + token = static_cast<const fts_string_t*>( + ib_vector_get_const(tokens, phrase_param->token_index)); + + fts_string_dup(&cmp_str, &match, heap); + + result = innobase_fts_text_case_cmp( + phrase->charset, token, &cmp_str); + + if (result == 0) { + phrase_param->token_index++; + } else { + return(1); + } + } + + /* Can't be greater than the number of elements. */ + ut_a(phrase_param->token_index <= ib_vector_size(tokens)); + + /* This is the case for multiple words. */ + if (phrase_param->token_index == ib_vector_size(tokens)) { + phrase->found = TRUE; + } + + return(static_cast<int>(phrase->found)); +} + +/*****************************************************************//** +Check whether the terms in the phrase match the text. +@return TRUE if matched else FALSE */ +static +ibool +fts_query_match_phrase_terms_by_parser( +/*===================================*/ + fts_phrase_param_t* phrase_param, /* in/out: phrase param */ + st_mysql_ftparser* parser, /* in: plugin fts parser */ + byte* text, /* in: text to check */ + ulint len) /* in: text length */ +{ + MYSQL_FTPARSER_PARAM param; + + ut_a(parser); + + /* Set parameters for param */ + param.mysql_parse = fts_tokenize_document_internal; + param.mysql_add_word = fts_query_match_phrase_add_word_for_parser; + param.mysql_ftparam = phrase_param; + param.cs = phrase_param->phrase->charset; + param.doc = reinterpret_cast<char*>(text); + param.length = static_cast<int>(len); + param.mode= MYSQL_FTPARSER_WITH_STOPWORDS; + + PARSER_INIT(parser, &param); + parser->parse(&param); + PARSER_DEINIT(parser, &param); + + return(phrase_param->phrase->found); +}
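fts_query_match_phrase() below anchors on the stored positions of the phrase's first token and then verifies the remaining tokens in document order. A hedged sketch of that anchor-and-verify shape over plain token vectors (illustrative only, not the InnoDB data structures):

    #include <cstddef>
    #include <string>
    #include <vector>

    // Illustrative only: check whether `phrase` occurs in `doc` starting
    // at any recorded position of its first token, the way the entries in
    // fts_match_t::positions seed fts_query_match_phrase().
    bool phrase_at_positions(const std::vector<std::string>& doc,
                             const std::vector<std::string>& phrase,
                             const std::vector<std::size_t>& first_token_pos)
    {
        for (std::size_t pos : first_token_pos) {
            if (pos + phrase.size() > doc.size()) {
                continue;
            }
            std::size_t i = 0;
            while (i < phrase.size() && doc[pos + i] == phrase[i]) {
                ++i;
            }
            if (i == phrase.size()) {
                return true;  // every term matched in sequence
            }
        }
        return false;
    }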
+ +/*****************************************************************//** +Callback function to fetch and search the document. +@return TRUE if matched else FALSE */ +static +ibool +fts_query_match_phrase( +/*===================*/ + fts_phrase_t* phrase, /*!< in: phrase to match */ + byte* start, /*!< in: text to search, we can't make + this const because we need to first + convert the string to lowercase */ + ulint cur_len, /*!< in: length of text */ + ulint prev_len, /*!< in: total length for searched + doc fields*/ + mem_heap_t* heap) /* heap */ +{ + ulint i; + const fts_string_t* first; + const byte* end = start + cur_len; + const ib_vector_t* tokens = phrase->tokens; + const ib_vector_t* positions = phrase->match->positions; + + ut_a(!phrase->found); + ut_a(phrase->match->doc_id > 0); + ut_a(ib_vector_size(tokens) > 0); + ut_a(ib_vector_size(positions) > 0); + + first = static_cast<const fts_string_t*>( + ib_vector_get_const(tokens, 0)); + + ut_a(phrase->match->start < ib_vector_size(positions)); + + for (i = phrase->match->start; i < ib_vector_size(positions); ++i) { + ulint pos; + byte* ptr = start; + + pos = *(ulint*) ib_vector_get_const(positions, i); + + if (pos == ULINT_UNDEFINED) { + break; + } + + if (pos < prev_len) { + continue; + } + + /* Document positions are calculated from the beginning + of the first field, need to save the length for each + searched field to adjust the doc position when searching + phrases. */ + pos -= prev_len; + ptr = start + pos; + + /* Within limits ? */ + if (ptr >= end) { + break; + } + + if (phrase->parser) { + fts_phrase_param_t phrase_param; + + phrase_param.phrase = phrase; + phrase_param.token_index = 0; + phrase_param.heap = heap; + + if (fts_query_match_phrase_terms_by_parser( + &phrase_param, + phrase->parser, + ptr, + ulint(end - ptr))) { + break; + } + } else { + fts_string_t match; + fts_string_t cmp_str; + ulint ret; + + match.f_str = ptr; + ret = innobase_mysql_fts_get_token( + phrase->charset, start + pos, + const_cast<byte*>(end), &match); + + if (match.f_len == 0) { + break; + } + + fts_string_dup(&cmp_str, &match, heap); + + if (innobase_fts_text_case_cmp( + phrase->charset, first, &cmp_str) == 0) { + + /* This is the case for the single word + in the phrase. */ + if (ib_vector_size(phrase->tokens) == 1) { + phrase->found = TRUE; + break; + } + + ptr += ret; + + /* Match the remaining terms in the phrase. */ + if (fts_query_match_phrase_terms(phrase, &ptr, + end, heap)) { + break; + } + } + } + } + + return(phrase->found); +} + +/*****************************************************************//** +Callback function to fetch and search the document. +@return whether the phrase is found */ +static +ibool +fts_query_fetch_document( +/*=====================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ +{ + + que_node_t* exp; + sel_node_t* node = static_cast<sel_node_t*>(row); + fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg); + ulint prev_len = 0; + ulint total_len = 0; + byte* document_text = NULL; + + exp = node->select_list; + + phrase->found = FALSE; + + /* For proximity search, we will need to get the whole document + from all fields, so first count the total length of the document + from all the fields */ + if (phrase->proximity_pos) { + while (exp) { + ulint field_len; + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + + if (dfield_is_ext(dfield)) { + ulint local_len = dfield_get_len(dfield); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + field_len = mach_read_from_4( + data + local_len + BTR_EXTERN_LEN + 4); + } else { + field_len = dfield_get_len(dfield); + } + + if (field_len != UNIV_SQL_NULL) { + total_len += field_len + 1; + } + + exp = que_node_get_next(exp); + } + + document_text = static_cast<byte*>(mem_heap_zalloc( + phrase->heap, total_len)); + + if (!document_text) { + return(FALSE); + } + } + + exp = node->select_list; + + while (exp) { + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + ulint cur_len; + + if (dfield_is_ext(dfield)) { + data = btr_copy_externally_stored_field( + &cur_len, data, phrase->zip_size, + dfield_get_len(dfield), phrase->heap); + } else { + cur_len = dfield_get_len(dfield); + } + + if (cur_len != UNIV_SQL_NULL && cur_len != 0) { + if (phrase->proximity_pos) { + ut_ad(prev_len + cur_len <= total_len); + memcpy(document_text + prev_len, data, cur_len); + } else { + /* For phrase search */ + phrase->found = + fts_query_match_phrase( + phrase, + static_cast<byte*>(data), + cur_len, prev_len, + phrase->heap); + } + + /* Document positions are calculated from the beginning + of the first field, need to save the length for each + searched field to adjust the doc position when searching + phrases. */ + prev_len += cur_len + 1; + } + + if (phrase->found) { + break; + } + + exp = que_node_get_next(exp); + } + + if (phrase->proximity_pos) { + ut_ad(prev_len <= total_len); + + phrase->found = fts_proximity_is_word_in_range( + phrase, document_text, total_len); + } + + return(phrase->found); +} + +#if 0 +/******************************************************************** +Callback function to check whether a record was found or not.
*/ +static +ibool +fts_query_select( +/*=============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ +{ + int i; + que_node_t* exp; + sel_node_t* node = row; + fts_select_t* select = user_arg; + + ut_a(select->word_freq); + ut_a(select->word_freq->doc_freqs); + + exp = node->select_list; + + for (i = 0; exp && !select->found; ++i) { + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + switch (i) { + case 0: /* DOC_COUNT */ + if (len != UNIV_SQL_NULL && len != 0) { + + select->word_freq->doc_count += + mach_read_from_4(data); + } + break; + + case 1: /* ILIST */ + if (len != UNIV_SQL_NULL && len != 0) { + + fts_query_find_doc_id(select, data, len); + } + break; + + default: + ut_error; + } + + exp = que_node_get_next(exp); + } + + return(FALSE); +} + +/******************************************************************** +Read the rows from the FTS index, that match word and where the +doc id is between first and last doc id. +@return DB_SUCCESS if all go well else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_find_term( +/*================*/ + fts_query_t* query, /*!< in: FTS query state */ + que_t** graph, /*!< in: prepared statement */ + const fts_string_t* word, /*!< in: the word to fetch */ + doc_id_t doc_id, /*!< in: doc id to match */ + ulint* min_pos,/*!< in/out: pos found must be + greater than this minimum value. */ + ibool* found) /*!< out: TRUE if found else FALSE */ +{ + pars_info_t* info; + dberr_t error; + fts_select_t select; + doc_id_t match_doc_id; + trx_t* trx = query->trx; + char table_name[MAX_FULL_NAME_LEN]; + + trx->op_info = "fetching FTS index matching nodes"; + + if (*graph) { + info = (*graph)->info; + } else { + ulint selected; + + info = pars_info_create(); + + selected = fts_select_index(*word->f_str); + query->fts_index_table.suffix = fts_get_suffix(selected); + + fts_get_table_name(&query->fts_index_table, table_name); + pars_info_bind_id(info, "index_table_name", table_name); + } + + select.found = FALSE; + select.doc_id = doc_id; + select.min_pos = *min_pos; + select.word_freq = fts_query_add_word_freq(query, word->f_str); + + pars_info_bind_function(info, "my_func", fts_query_select, &select); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &match_doc_id, doc_id); + + fts_bind_doc_id(info, "min_doc_id", &match_doc_id); + + fts_bind_doc_id(info, "max_doc_id", &match_doc_id); + + if (!*graph) { + + *graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT doc_count, ilist\n" + " FROM $index_table_name\n" + " WHERE word LIKE :word AND" + " first_doc_id <= :min_doc_id AND" + " last_doc_id >= :max_doc_id\n" + " ORDER BY first_doc_id;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + } + + for (;;) { + error = fts_eval_sql(trx, *graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "lock wait timeout reading FTS" + " index. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << error + << " while reading FTS index."; + + break; /* Exit the loop. 
*/ + } + } + } + + /* Value to return */ + *found = select.found; + + if (*found) { + *min_pos = select.min_pos; + } + + return(error); +} + +/******************************************************************** +Callback aggregator for int columns. */ +static +ibool +fts_query_sum( +/*==========*/ + /*!< out: always returns TRUE */ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: ulint* */ +{ + + que_node_t* exp; + sel_node_t* node = row; + ulint* total = user_arg; + + exp = node->select_list; + + while (exp) { + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + if (len != UNIV_SQL_NULL && len != 0) { + *total += mach_read_from_4(data); + } + + exp = que_node_get_next(exp); + } + + return(TRUE); +} + +/******************************************************************** +Calculate the total documents that contain a particular word (term). +@return DB_SUCCESS if all go well else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_total_docs_containing_term( +/*=================================*/ + fts_query_t* query, /*!< in: FTS query state */ + const fts_string_t* word, /*!< in: the word to check */ + ulint* total) /*!< out: documents containing word */ +{ + pars_info_t* info; + dberr_t error; + que_t* graph; + ulint selected; + trx_t* trx = query->trx; + char table_name[MAX_FULL_NAME_LEN]; + + trx->op_info = "fetching FTS index document count"; + + *total = 0; + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_query_sum, total); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + selected = fts_select_index(*word->f_str); + + query->fts_index_table.suffix = fts_get_suffix(selected); + + fts_get_table_name(&query->fts_index_table, table_name); + + pars_info_bind_id(info, "index_table_name", table_name); + + graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT doc_count\n" + " FROM $index_table_name\n" + " WHERE word = :word" + " ORDER BY first_doc_id;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "lock wait timeout reading FTS" + " index. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << error + << " while reading FTS index."; + + break; /* Exit the loop. */ + } + } + } + + que_graph_free(graph); + + return(error); +} + +/******************************************************************** +Get the total number of words in a document. +@return DB_SUCCESS if all go well else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_terms_in_document( +/*========================*/ + fts_query_t* query, /*!< in: FTS query state */ + doc_id_t doc_id, /*!< in: the word to check */ + ulint* total) /*!< out: total words in document */ +{ + pars_info_t* info; + dberr_t error; + que_t* graph; + doc_id_t read_doc_id; + trx_t* trx = query->trx; + char table_name[MAX_FULL_NAME_LEN]; + + trx->op_info = "fetching FTS document term count"; + + *total = 0; + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_query_sum, total); + + /* Convert to "storage" byte order.
*/ + fts_write_doc_id((byte*) &read_doc_id, doc_id); + fts_bind_doc_id(info, "doc_id", &read_doc_id); + + query->fts_index_table.suffix = "DOC_ID"; + + fts_get_table_name(&query->fts_index_table, table_name); + + pars_info_bind_id(info, "index_table_name", table_name); + + graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT count\n" + " FROM $index_table_name\n" + " WHERE doc_id = :doc_id" + " BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "lock wait timeout reading FTS" + " doc id table. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << error << " while reading FTS" + " doc id table."; + + break; /* Exit the loop. */ + } + } + } + + que_graph_free(graph); + + return(error); +} +#endif + +/*****************************************************************//** +Retrieve the document and match the phrase tokens. +@return DB_SUCCESS or error code */ +MY_ATTRIBUTE((nonnull(1,2,3,6), warn_unused_result)) +static +dberr_t +fts_query_match_document( +/*=====================*/ + ib_vector_t* tokens, /*!< in: phrase tokens */ + fts_get_doc_t* get_doc, /*!< in: table and prepared statements */ + fts_match_t* match, /*!< in: doc id and positions */ + ulint distance, /*!< in: proximity distance */ + st_mysql_ftparser* parser, /*!< in: fts plugin parser */ + ibool* found) /*!< out: TRUE if phrase found */ +{ + dberr_t error; + fts_phrase_t phrase(get_doc->index_cache->index->table); + + phrase.match = match; /* Positions to match */ + phrase.tokens = tokens; /* Tokens to match */ + phrase.distance = distance; + phrase.charset = get_doc->index_cache->charset; + phrase.heap = mem_heap_create(512); + phrase.parser = parser; + + *found = phrase.found = FALSE; + + error = fts_doc_fetch_by_doc_id( + get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_fetch_document, &phrase); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ib::error() << "(" << error << ") matching document."; + } else { + *found = phrase.found; + } + + mem_heap_free(phrase.heap); + + return(error); +} + +/*****************************************************************//** +This function fetches the original documents and count the +words in between matching words to see that is in specified distance +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +fts_query_is_in_proximity_range( +/*============================*/ + const fts_query_t* query, /*!< in: query instance */ + fts_match_t** match, /*!< in: query instance */ + fts_proximity_t* qualified_pos) /*!< in: position info for + qualified ranges */ +{ + fts_get_doc_t get_doc; + fts_cache_t* cache = query->index->table->fts->cache; + dberr_t err; + + memset(&get_doc, 0x0, sizeof(get_doc)); + + mysql_mutex_lock(&cache->lock); + get_doc.index_cache = fts_find_index_cache(cache, query->index); + mysql_mutex_unlock(&cache->lock); + ut_a(get_doc.index_cache != NULL); + + fts_phrase_t phrase(get_doc.index_cache->index->table); + + phrase.distance = query->distance; + phrase.charset = get_doc.index_cache->charset; + phrase.heap = mem_heap_create(512); + phrase.proximity_pos = qualified_pos; + phrase.found = FALSE; + + err = 
fts_doc_fetch_by_doc_id( + &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_fetch_document, &phrase); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + ib::error() << "(" << err << ") in verification" + " phase of proximity search"; + } + + /* Free the prepared statement. */ + if (get_doc.get_document_graph) { + que_graph_free(get_doc.get_document_graph); + get_doc.get_document_graph = NULL; + } + + mem_heap_free(phrase.heap); + + return(err == DB_SUCCESS && phrase.found); +} + +/*****************************************************************//** +Iterate over the matched document ids and search for the +actual phrase in the text. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_search_phrase( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + ib_vector_t* orig_tokens, /*!< in: tokens to search, + with any stopwords in the + original phrase */ + ib_vector_t* tokens) /*!< in: tokens that do + not include stopwords and + can be used to calculate + ranking */ +{ + ulint i; + fts_get_doc_t get_doc; + ulint n_matched; + fts_cache_t* cache = query->index->table->fts->cache; + + n_matched = ib_vector_size(query->matched); + + /* Setup the doc retrieval infrastructure. */ + memset(&get_doc, 0x0, sizeof(get_doc)); + + mysql_mutex_lock(&cache->lock); + + get_doc.index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache */ + ut_a(get_doc.index_cache != NULL); + + mysql_mutex_unlock(&cache->lock); + +#ifdef FTS_INTERNAL_DIAG_PRINT + ib::info() << "Start phrase search"; +#endif + + /* Read the document from disk and do the actual + match, matching documents will be added to the current + doc id set. */ + for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) { + fts_match_t* match; + ibool found = FALSE; + + match = static_cast<fts_match_t*>( + ib_vector_get(query->matched, i)); + + /* Skip the document ids that were filtered out by + an earlier pass. */ + if (match->doc_id != 0) { + + query->error = fts_query_match_document( + orig_tokens, &get_doc, match, + query->distance, query->parser, &found); + + if (query->error == DB_SUCCESS && found) { + ulint z; + + query->error = fts_query_process_doc_id(query, + match->doc_id, 0); + if (query->error != DB_SUCCESS) { + goto func_exit; + } + + for (z = 0; z < ib_vector_size(tokens); z++) { + fts_string_t* token; + token = static_cast<fts_string_t*>( + ib_vector_get(tokens, z)); + fts_query_add_word_to_document( + query, match->doc_id, token); + } + } + } + } + +func_exit: + /* Free the prepared statement. */ + if (get_doc.get_document_graph) { + que_graph_free(get_doc.get_document_graph); + get_doc.get_document_graph = NULL; + } + + return(query->error); +}
+ +/** Split the phrase into tokens +@param[in,out] query query instance +@param[in] node query node to search +@param[in,out] tokens token vector +@param[in,out] orig_tokens original node tokens, including stopwords +@param[in,out] heap mem heap */ +static +void +fts_query_phrase_split( + fts_query_t* query, + const fts_ast_node_t* node, + ib_vector_t* tokens, + ib_vector_t* orig_tokens, + mem_heap_t* heap) +{ + fts_string_t phrase; + ulint len = 0; + ulint cur_pos = 0; + fts_ast_node_t* term_node = NULL; + + if (node->type == FTS_AST_TEXT) { + phrase.f_str = node->text.ptr->str; + phrase.f_len = node->text.ptr->len; + len = phrase.f_len; + } else { + ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST); + phrase.f_str = NULL; + phrase.f_len = 0; + term_node = node->list.head; + } + + while (true) { + fts_cache_t* cache = query->index->table->fts->cache; + ulint cur_len; + fts_string_t result_str; + + if (node->type == FTS_AST_TEXT) { + if (cur_pos >= len) { + break; + } + + cur_len = innobase_mysql_fts_get_token( + query->fts_index_table.charset, + reinterpret_cast<const byte*>(phrase.f_str) + + cur_pos, + reinterpret_cast<const byte*>(phrase.f_str) + + len, + &result_str); + + if (cur_len == 0) { + break; + } + + cur_pos += cur_len; + } else { + ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST); + /* Term node in parser phrase list */ + if (term_node == NULL) { + break; + } + + ut_a(term_node->type == FTS_AST_TERM); + result_str.f_str = term_node->term.ptr->str; + result_str.f_len = term_node->term.ptr->len; + result_str.f_n_char = fts_get_token_size( + query->fts_index_table.charset, + reinterpret_cast<char*>(result_str.f_str), + result_str.f_len); + + term_node = term_node->next; + } + + if (result_str.f_n_char == 0) { + continue; + } + + fts_string_t* token = static_cast<fts_string_t*>( + ib_vector_push(tokens, NULL)); + fts_string_dup(token, &result_str, heap); + + if (fts_check_token( + &result_str, + cache->stopword_info.cached_stopword, + query->fts_index_table.charset)) { + /* Add the word to the RB tree so that we can + calculate its frequency within a document. */ + fts_query_add_word_freq(query, token); + } else { + ib_vector_pop(tokens); + } + + /* we will start to store all words including stopwords + in the "orig_tokens" vector, but skip any leading words + that are stopwords */ + if (!ib_vector_is_empty(tokens)) { + fts_string_t* orig_token = static_cast<fts_string_t*>( + ib_vector_push(orig_tokens, NULL)); + + orig_token->f_str = token->f_str; + orig_token->f_len = token->f_len; + } + } +}
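fts_query_phrase_split() keeps two parallel vectors: `tokens` drops stopwords and feeds ranking, while `orig_tokens` preserves the phrase as typed (stopwords included, once the first non-stopword has been seen) for exact phrase matching. A hedged sketch of that dual bookkeeping:

    #include <set>
    #include <string>
    #include <vector>

    // Illustrative only: split a phrase the way fts_query_phrase_split()
    // does; ranked tokens stay stopword-free while orig keeps the phrase
    // shape, with leading stopwords skipped.
    void split_phrase(const std::vector<std::string>& words,
                      const std::set<std::string>& stopwords,
                      std::vector<std::string>& tokens,
                      std::vector<std::string>& orig)
    {
        for (const std::string& w : words) {
            if (!stopwords.count(w)) {
                tokens.push_back(w);  // used for ranking
            }
            if (!tokens.empty()) {
                orig.push_back(w);    // used for phrase matching
            }
        }
    }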
+ +/*****************************************************************//** +Text/Phrase search. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +fts_query_phrase_search( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_ast_node_t* node) /*!< in: node to search */ +{ + ib_vector_t* tokens; + ib_vector_t* orig_tokens; + mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t)); + ib_alloc_t* heap_alloc; + ulint num_token; + + heap_alloc = ib_heap_allocator_create(heap); + + tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4); + orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4); + + if (query->distance != ULINT_UNDEFINED && query->distance > 0) { + query->flags = FTS_PROXIMITY; + } else { + query->flags = FTS_PHRASE; + } + + /* Split the phrase into tokens. */ + fts_query_phrase_split(query, node, tokens, orig_tokens, heap); + + num_token = ib_vector_size(tokens); + if (num_token > MAX_PROXIMITY_ITEM) { + query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE; + goto func_exit; + } + + ut_ad(ib_vector_size(orig_tokens) >= num_token); + + /* Ignore empty strings. */ + if (num_token > 0) { + fts_string_t* token = NULL; + fts_fetch_t fetch; + trx_t* trx = query->trx; + fts_ast_oper_t oper = query->oper; + que_t* graph = NULL; + ulint i; + dberr_t error; + + /* Create the vector for storing matching document ids + and the positions of the first token of the phrase. */ + if (!query->matched) { + ib_alloc_t* heap_alloc; + + heap_alloc = ib_heap_allocator_create(heap); + + if (!(query->flags & FTS_PROXIMITY) + && !(query->flags & FTS_PHRASE)) { + query->matched = ib_vector_create( + heap_alloc, sizeof(fts_match_t), + 64); + } else { + ut_a(num_token <= MAX_PROXIMITY_ITEM); + query->match_array = + (ib_vector_t**) mem_heap_alloc( + heap, + num_token * + sizeof(query->matched)); + + for (i = 0; i < num_token; i++) { + query->match_array[i] = + ib_vector_create( + heap_alloc, sizeof(fts_match_t), + 64); + } + + query->matched = query->match_array[0]; + } + } + + /* Setup the callback args for filtering and consolidating + the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + for (i = 0; i < num_token; i++) { + /* Search for the first word from the phrase. */ + token = static_cast<fts_string_t*>( + ib_vector_get(tokens, i)); + + if (query->flags & FTS_PROXIMITY + || query->flags & FTS_PHRASE) { + query->matched = query->match_array[i]; + } + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, + token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + graph = NULL; + + fts_query_cache(query, token); + + if (!(query->flags & FTS_PHRASE) + && !(query->flags & FTS_PROXIMITY)) { + break; + } + + /* If any of the tokens can't be found, + no need to continue the match */ + if (ib_vector_is_empty(query->match_array[i]) + || query->error != DB_SUCCESS) { + goto func_exit; + } + } + + /* Just a single word, no need to fetch the original + documents to do phrase matching */ + if (ib_vector_size(orig_tokens) == 1 + && !ib_vector_is_empty(query->match_array[0])) { + fts_match_t* match; + ulint n_matched; + + n_matched = ib_vector_size(query->match_array[0]); + + for (i = 0; i < n_matched; i++) { + match = static_cast<fts_match_t*>( + ib_vector_get( + query->match_array[0], i)); + + query->error = fts_query_process_doc_id( + query, match->doc_id, 0); + if (query->error != DB_SUCCESS) { + goto func_exit; + } + + fts_query_add_word_to_document( + query, match->doc_id, token); + } + query->oper = oper; + goto func_exit; + } + + /* If we are doing proximity search, verify the distance + between all words, and check they are in specified distance. */ + if (query->flags & FTS_PROXIMITY) { + fts_phrase_or_proximity_search(query, tokens); + } else { + ibool matched; + + /* Phrase Search case: + We filter out the doc ids that don't contain + all the tokens in the phrase. It's cheaper to + search the ilist than bringing the documents in + and then doing a search through the text. Isolated + testing shows this also helps in mitigating disruption + of the buffer cache. */ + matched = fts_phrase_or_proximity_search(query, tokens); + query->matched = query->match_array[0]; + + /* Read the actual text in and search for the phrase. */ + if (matched) { + ut_ad(query->error == DB_SUCCESS); + query->error = fts_query_search_phrase( + query, orig_tokens, tokens); + } + } + + /* Restore original operation. */ + query->oper = oper; + + if (query->error != DB_SUCCESS) { + goto func_exit; + } + } + +func_exit: + mem_heap_free(heap); + + /* Don't need it anymore. */ + query->matched = NULL; + + return(query->error); +}
+ +/*****************************************************************//** +Find the word and evaluate. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_execute( +/*==============*/ + fts_query_t* query, /*!< in: query instance */ + fts_string_t* token) /*!< in: token to search */ +{ + switch (query->oper) { + case FTS_NONE: + case FTS_NEGATE: + case FTS_INCR_RATING: + case FTS_DECR_RATING: + query->error = fts_query_union(query, token); + break; + + case FTS_EXIST: + query->error = fts_query_intersect(query, token); + break; + + case FTS_IGNORE: + query->error = fts_query_difference(query, token); + break; + + default: + ut_error; + } + + return(query->error); +} + +/*****************************************************************//** +Create a wildcard string. It's the responsibility of the caller to +free the byte* pointer. It's allocated using ut_malloc_nokey(). +@return ptr to allocated memory */ +static +byte* +fts_query_get_token( +/*================*/ + fts_ast_node_t* node, /*!< in: the current sub tree */ + fts_string_t* token) /*!< in: token to create */ +{ + ulint str_len; + byte* new_ptr = NULL; + + str_len = node->term.ptr->len; + + ut_a(node->type == FTS_AST_TERM); + + token->f_len = str_len; + token->f_str = node->term.ptr->str; + + if (node->term.wildcard) { + + token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2)); + token->f_len = str_len + 1; + + memcpy(token->f_str, node->term.ptr->str, str_len); + + token->f_str[str_len] = '%'; + token->f_str[token->f_len] = 0; + + new_ptr = token->f_str; + } + + return(new_ptr); +} + +static dberr_t fts_ast_visit_sub_exp(fts_ast_node_t*, fts_ast_callback, void*);
+ +/*****************************************************************//** +Visit every node of the AST. */ +static +dberr_t +fts_query_visitor( +/*==============*/ + fts_ast_oper_t oper, /*!< in: current operator */ + fts_ast_node_t* node, /*!< in: The root of the current subtree*/ + void* arg) /*!< in: callback arg*/ +{ + byte* ptr; + fts_string_t token; + fts_query_t* query = static_cast<fts_query_t*>(arg); + + ut_a(node); + DBUG_ENTER("fts_query_visitor"); + DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type))); + + token.f_n_char = 0; + query->oper = oper; + query->cur_node = node; + + switch (node->type) { + case FTS_AST_TEXT: + case FTS_AST_PARSER_PHRASE_LIST: + + if (query->oper == FTS_EXIST) { + ut_ad(query->intersection == NULL); + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + } + + /* Set the current proximity distance. */ + query->distance = node->text.distance; + + /* Force collection of doc ids and the positions. */ + query->collect_positions = TRUE; + + query->error = fts_query_phrase_search(query, node); + + query->collect_positions = FALSE; + + if (query->oper == FTS_EXIST) { + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + } + + break; + + case FTS_AST_TERM: + token.f_str = node->term.ptr->str; + token.f_len = node->term.ptr->len; + + /* Collect wildcard words for QUERY EXPANSION. */ + if (node->term.wildcard && query->wildcard_words != NULL) { + ib_rbt_bound_t parent; + + if (rbt_search(query->wildcard_words, &parent, &token) + != 0) { + fts_string_t word; + + fts_string_dup(&word, &token, query->heap); + rbt_add_node(query->wildcard_words, &parent, + &word); + } + } + + /* Add the word to our RB tree that will be used to + calculate this term's per-document frequency. */ + fts_query_add_word_freq(query, &token); + + ptr = fts_query_get_token(node, &token); + query->error = fts_query_execute(query, &token); + + if (ptr) { + ut_free(ptr); + } + + break; + + case FTS_AST_SUBEXP_LIST: + query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg); + break; + + default: + ut_error; + } + + if (query->oper == FTS_EXIST) { + query->multi_exist = true; + } + + DBUG_RETURN(query->error); +} + +/** Process (nested) sub-expression, create a new result set to store the +sub-expression result by processing nodes under current sub-expression +list. Merge the sub-expression result with that of parent expression list. +@param[in,out] node current root node +@param[in,out] visitor callback function +@param[in,out] arg argument for callback +@return DB_SUCCESS if all go well */ +static +dberr_t +fts_ast_visit_sub_exp( + fts_ast_node_t* node, + fts_ast_callback visitor, + void* arg) +{ + fts_ast_oper_t cur_oper; + fts_query_t* query = static_cast<fts_query_t*>(arg); + ib_rbt_t* parent_doc_ids; + ib_rbt_t* subexpr_doc_ids; + dberr_t error = DB_SUCCESS; + bool will_be_ignored = false; + bool multi_exist; + + DBUG_ENTER("fts_ast_visit_sub_exp"); + + ut_a(node->type == FTS_AST_SUBEXP_LIST); + + /* To avoid stack overflow, we limit the mutual recursion + depth between fts_ast_visit(), fts_query_visitor() and + fts_ast_visit_sub_exp(). */ + if (query->visiting_sub_exp++ > 31) { + query->error = DB_OUT_OF_MEMORY; + DBUG_RETURN(query->error); + } + + cur_oper = query->oper; + + /* Save current result set */ + parent_doc_ids = query->doc_ids; + + /* Create new result set to store the sub-expression result. We + will merge this result set with the parent after processing. */ + query->doc_ids = rbt_create(sizeof(fts_ranking_t), + fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + + multi_exist = query->multi_exist; + query->multi_exist = false; + /* Process nodes in current sub-expression and store its + result set in query->doc_ids we created above. */ + error = fts_ast_visit(FTS_NONE, node, visitor, + arg, &will_be_ignored); + + /* Reinstate parent node state */ + query->multi_exist = multi_exist; + query->oper = cur_oper; + query->visiting_sub_exp--; + + /* Merge the sub-expression result with the parent result set. */ + subexpr_doc_ids = query->doc_ids; + query->doc_ids = parent_doc_ids; + if (error == DB_SUCCESS) { + error = fts_merge_doc_ids(query, subexpr_doc_ids); + } + + /* Free current result set. Result already merged into parent. */ + fts_query_free_doc_ids(query, subexpr_doc_ids); + + DBUG_RETURN(error); +}
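A sub-expression therefore evaluates into a fresh result set that is merged back into the parent once the recursion returns, with a hard depth cap of 32 guarding the mutual recursion. A hedged sketch of that save-evaluate-merge shape (std::set and the names below are illustrative, not the InnoDB types):

    #include <set>
    #include <stdexcept>
    #include <utility>

    // Illustrative only: evaluate a nested sub-expression into its own
    // result set, then merge it into the parent, the way
    // fts_ast_visit_sub_exp() swaps query->doc_ids around the recursion.
    std::set<unsigned long> evaluate_subexpr(int depth)
    {
        if (depth > 31) {  // cap the mutual recursion depth
            throw std::runtime_error("expression nested too deeply");
        }
        std::set<unsigned long> local;  // stands in for the new rb tree
        // ... visit child nodes, filling `local` ...
        return local;
    }

    void visit_subexpr(std::set<unsigned long>& parent, int depth)
    {
        std::set<unsigned long> sub = evaluate_subexpr(depth + 1);
        parent.merge(std::move(sub));   // merge the result into the parent
    }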
+ +#if 0 +/*****************************************************************//*** +Check if the doc id exists in the ilist. +@return TRUE if doc id found */ +static +ulint +fts_query_find_doc_id( +/*==================*/ + fts_select_t* select, /*!< in/out: contains the doc id to + find, we update the word freq if + document found */ + void* data, /*!< in: doc id ilist */ + ulint len) /*!< in: doc id ilist size */ +{ + byte* ptr = data; + doc_id_t doc_id = 0; + ulint decoded = 0; + + /* Decode the ilist and search for selected doc_id. We also + calculate the frequency of the word in the document if found. */ + while (decoded < len && !select->found) { + ulint freq = 0; + ulint min_pos = 0; + ulint last_pos = 0; + ulint pos = fts_decode_vlc(&ptr); + + /* Add the delta. */ + doc_id += pos; + + while (*ptr) { + ++freq; + last_pos += fts_decode_vlc(&ptr); + + /* Only if min_pos is not set and the current + term exists in a position greater than the + min_pos of the previous term. */ + if (min_pos == 0 && last_pos > select->min_pos) { + min_pos = last_pos; + } + } + + /* Skip the end of word position marker. */ + ++ptr; + + /* Bytes decoded so far. */ + decoded = ptr - (byte*) data; + + /* A word may exist in the document but we only consider a + match if it exists in a position that is greater than the + position of the previous term. */ + if (doc_id == select->doc_id && min_pos > 0) { + fts_doc_freq_t* doc_freq; + + /* Add the doc id to the doc freq rb tree, if + the doc id doesn't exist it will be created. */ + doc_freq = fts_query_add_doc_freq( + select->word_freq->doc_freqs, doc_id); + + /* Avoid duplicating the frequency tally */ + if (doc_freq->freq == 0) { + doc_freq->freq = freq; + } + + select->found = TRUE; + select->min_pos = min_pos; + } + } + + return(select->found); +} +#endif
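fts_query_filter_doc_ids() below walks the same ilist layout the disabled code above reads: a delta-encoded doc id, that document's position list, and a 0 terminator, repeated. A hedged sketch of the walk, with single bytes standing in for the fts_decode_vlc() varints and well-formed input assumed:

    #include <cstdint>
    #include <vector>

    // Illustrative only: decode a toy ilist of the shape
    //   [doc id delta][position deltas...][0 terminator] ...
    std::vector<std::uint64_t> decode_doc_ids(const std::uint8_t* p,
                                              const std::uint8_t* end)
    {
        std::vector<std::uint64_t> doc_ids;
        std::uint64_t doc_id = 0;

        while (p < end) {
            doc_id += *p++;      // doc ids are stored as deltas
            while (*p) {         // position list, also delta-encoded
                ++p;             // (a phrase search would collect these)
            }
            ++p;                 // skip the 0 end-of-positions marker
            doc_ids.push_back(doc_id);
        }
        return doc_ids;
    }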
+ +/*****************************************************************//** +Read and filter nodes. +@return DB_SUCCESS if all go well, +or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ +static +dberr_t +fts_query_filter_doc_ids( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word, /*!< in: the current word */ + fts_word_freq_t* word_freq, /*!< in/out: word frequency */ + const fts_node_t* node, /*!< in: current FTS node */ + void* data, /*!< in: doc id ilist */ + ulint len, /*!< in: doc id ilist size */ + ibool calc_doc_count) /*!< in: whether to remember doc count */ +{ + const byte* ptr = static_cast<const byte*>(data); + doc_id_t doc_id = 0; + ulint decoded = 0; + ib_rbt_t* doc_freqs = word_freq->doc_freqs; + + /* Decode the ilist and add the doc ids to the query doc_id set. */ + while (decoded < len) { + ulint freq = 0; + fts_doc_freq_t* doc_freq; + fts_match_t* match = NULL; + doc_id_t last_pos = 0; + doc_id_t pos = fts_decode_vlc(&ptr); + + /* Some sanity checks. */ + if (doc_id == 0) { + ut_a(pos == node->first_doc_id); + } + + /* Add the delta. */ + doc_id += pos; + + if (calc_doc_count) { + word_freq->doc_count++; + } + + /* We simply collect the matching instances here. */ + if (query->collect_positions) { + ib_alloc_t* heap_alloc; + + /* Create a new fts_match_t instance. */ + match = static_cast<fts_match_t*>( + ib_vector_push(query->matched, NULL)); + + match->start = 0; + match->doc_id = doc_id; + heap_alloc = ib_vector_allocator(query->matched); + + /* Allocate from the same heap as the + parent container. */ + match->positions = ib_vector_create( + heap_alloc, sizeof(ulint), 64); + + query->total_size += sizeof(fts_match_t) + + sizeof(ib_vector_t) + + sizeof(ulint) * 64; + } + + /* Unpack the positions within the document. */ + while (*ptr) { + last_pos += fts_decode_vlc(&ptr); + + /* Collect the matching word positions, for phrase + matching later. */ + if (query->collect_positions) { + ib_vector_push(match->positions, &last_pos); + } + + ++freq; + } + + /* End of list marker. */ + last_pos = (ulint) -1; + + if (query->collect_positions) { + ut_a(match != NULL); + ib_vector_push(match->positions, &last_pos); + } + + /* Add the doc id to the doc freq rb tree, if the doc id + doesn't exist it will be created. */ + doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id); + + /* Avoid duplicating frequency tally. */ + if (doc_freq->freq == 0) { + doc_freq->freq = freq; + } + + /* Skip the end of word position marker. */ + ++ptr; + + /* Bytes decoded so far */ + decoded = ulint(ptr - (byte*) data); + + /* We simply collect the matching documents and the + positions here and match later. */ + if (!query->collect_positions) { + /* We ignore error here and will check it later */ + fts_query_process_doc_id(query, doc_id, 0); + + /* Add the word to the document's matched RB tree. */ + fts_query_add_word_to_document(query, doc_id, word); + } + } + + /* Some sanity checks. */ + ut_a(doc_id == node->last_doc_id); + + if (query->total_size > fts_result_cache_limit) { + return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + } else { + return(DB_SUCCESS); + } +}
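fts_query_read_node() below relies on the FIRST_DOC_ID and LAST_DOC_ID columns to skip index nodes whose ilists cannot overlap the current result. The pruning test it applies for FTS_EXIST amounts to:

    #include <cstdint>

    // Illustrative only: a node whose [first, last] doc id range lies
    // entirely outside the query bounds can be skipped without
    // decompressing its ilist.
    bool can_skip_node(std::uint64_t node_first, std::uint64_t node_last,
                       std::uint64_t lower_doc_id, std::uint64_t upper_doc_id)
    {
        return (upper_doc_id > 0 && node_first > upper_doc_id)
            || (lower_doc_id > 0 && node_last < lower_doc_id);
    }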
+ +/*****************************************************************//** +Read the FTS INDEX row. +@return DB_SUCCESS if all go well. */ +static +dberr_t +fts_query_read_node( +/*================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word, /*!< in: current word */ + que_node_t* exp) /*!< in: query graph node */ +{ + int i; + int ret; + fts_node_t node; + ib_rbt_bound_t parent; + fts_word_freq_t* word_freq; + ibool skip = FALSE; + fts_string_t term; + byte buf[FTS_MAX_WORD_LEN + 1]; + dberr_t error = DB_SUCCESS; + + ut_a(query->cur_node->type == FTS_AST_TERM + || query->cur_node->type == FTS_AST_TEXT + || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST); + + memset(&node, 0, sizeof(node)); + term.f_str = buf; + + /* Need to consider the wildcard search case, the word frequency + is created on the search string not the actual word. So we need + to assign the frequency to the search string instead. */ + if (query->cur_node->type == FTS_AST_TERM + && query->cur_node->term.wildcard) { + + term.f_len = query->cur_node->term.ptr->len; + ut_ad(FTS_MAX_WORD_LEN >= term.f_len); + memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len); + } else { + term.f_len = word->f_len; + ut_ad(FTS_MAX_WORD_LEN >= word->f_len); + memcpy(term.f_str, word->f_str, word->f_len); + } + + /* Lookup the word in our rb tree, it must exist. */ + ret = rbt_search(query->word_freqs, &parent, &term); + + ut_a(ret == 0); + + word_freq = rbt_value(fts_word_freq_t, parent.last); + + /* Start from 1 since the first column has been read by the caller. + Also, we rely on the order of the columns projected, to filter + out ilists that are out of range and we always want to read + the doc_count irrespective of the suitability of the row. */ + + for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) { + + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + ulint len = dfield_get_len(dfield); + + ut_a(len != UNIV_SQL_NULL); + + /* Note: The column numbers below must match the SELECT. */ + + switch (i) { + case 1: /* DOC_COUNT */ + word_freq->doc_count += mach_read_from_4(data); + break; + + case 2: /* FIRST_DOC_ID */ + node.first_doc_id = fts_read_doc_id(data); + + /* Skip nodes whose doc ids are out of range. */ + if (query->oper == FTS_EXIST + && query->upper_doc_id > 0 + && node.first_doc_id > query->upper_doc_id) { + skip = TRUE; + } + break; + + case 3: /* LAST_DOC_ID */ + node.last_doc_id = fts_read_doc_id(data); + + /* Skip nodes whose doc ids are out of range. */ + if (query->oper == FTS_EXIST + && query->lower_doc_id > 0 + && node.last_doc_id < query->lower_doc_id) { + skip = TRUE; + } + break; + + case 4: /* ILIST */ + + error = fts_query_filter_doc_ids( + query, &word_freq->word, word_freq, + &node, data, len, FALSE); + + break; + + default: + ut_error; + } + } + + if (!skip) { + /* Make sure all columns were read. */ + + ut_a(i == 5); + } + + return error; +} + +/*****************************************************************//** +Callback function to fetch the rows in an FTS INDEX record. +@return always returns TRUE */ +static +ibool +fts_query_index_fetch_nodes( +/*========================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to fts_fetch_t */ +{ + fts_string_t key; + sel_node_t* sel_node = static_cast<sel_node_t*>(row); + fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg); + fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg); + que_node_t* exp = sel_node->select_list; + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint dfield_len = dfield_get_len(dfield); + + key.f_str = static_cast<byte*>(data); + key.f_len = dfield_len; + + ut_a(dfield_len <= FTS_MAX_WORD_LEN); + + /* Note: we pass error out by 'query->error' */ + query->error = fts_query_read_node(query, &key, que_node_get_next(exp)); + + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + return(FALSE); + } else { + return(TRUE); + } +} + +/*****************************************************************//** +Calculate the inverse document frequency (IDF) for all the terms. */ +static +void +fts_query_calculate_idf( +/*====================*/ + fts_query_t* query) /*!< in: Query state */ +{ + const ib_rbt_node_t* node; + ib_uint64_t total_docs = query->total_docs; + + /* Iterate over the terms and calculate each term's IDF. */ + for (node = rbt_first(query->word_freqs); + node; + node = rbt_next(query->word_freqs, node)) { + + fts_word_freq_t* word_freq; + + word_freq = rbt_value(fts_word_freq_t, node); + + if (word_freq->doc_count > 0) { + if (total_docs == word_freq->doc_count) { + /* QP assume ranking > 0 if we find + a match. Since Log10(1) = 0, we cannot + make IDF a zero value if we do find a + word in all documents. So let's make + it an arbitrary very small number */ + word_freq->idf = log10(1.0001); + } else { + word_freq->idf = log10( + static_cast<double>(total_docs) + / static_cast<double>( + word_freq->doc_count)); + } + } + } +} + +/*****************************************************************//** +Calculate the ranking of the document.
*/ +static +void +fts_query_calculate_ranking( +/*========================*/ + const fts_query_t* query, /*!< in: query state */ + fts_ranking_t* ranking) /*!< in: Document to rank */ +{ + ulint pos = 0; + fts_string_t word; + + /* At this stage, ranking->rank should not exceed the 1.0 + bound */ + ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0); + ut_ad(rbt_size(query->word_map) == query->word_vector->size()); + + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + int ret; + ib_rbt_bound_t parent; + double weight; + fts_doc_freq_t* doc_freq; + fts_word_freq_t* word_freq; + + ret = rbt_search(query->word_freqs, &parent, &word); + + /* It must exist. */ + ut_a(ret == 0); + + word_freq = rbt_value(fts_word_freq_t, parent.last); + + ret = rbt_search( + word_freq->doc_freqs, &parent, &ranking->doc_id); + + /* It must exist. */ + ut_a(ret == 0); + + doc_freq = rbt_value(fts_doc_freq_t, parent.last); + + weight = (double) doc_freq->freq * word_freq->idf; + + ranking->rank += (fts_rank_t) (weight * word_freq->idf); + } +} + +/*****************************************************************//** +Add ranking to the result set. */ +static +void +fts_query_add_ranking( +/*==================*/ + fts_query_t* query, /*!< in: query state */ + ib_rbt_t* ranking_tree, /*!< in: ranking tree */ + const fts_ranking_t* new_ranking) /*!< in: ranking of a document */ +{ + ib_rbt_bound_t parent; + + /* Lookup the ranking in our rb tree and add if it doesn't exist. */ + if (rbt_search(ranking_tree, &parent, new_ranking) == 0) { + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, parent.last); + + ranking->rank += new_ranking->rank; + + ut_a(ranking->words == NULL); + } else { + rbt_add_node(ranking_tree, &parent, new_ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } +} + +/*****************************************************************//** +Retrieve the FTS Relevance Ranking result for doc with doc_id +@return the relevance ranking value, 0 if no ranking value +present. */ +float +fts_retrieve_ranking( +/*=================*/ + fts_result_t* result, /*!< in: FTS result structure */ + doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */ +{ + ib_rbt_bound_t parent; + fts_ranking_t new_ranking; + + DBUG_ENTER("fts_retrieve_ranking"); + + if (!result || !result->rankings_by_id) { + DBUG_RETURN(0); + } + + new_ranking.doc_id = doc_id; + + /* Lookup the ranking in our rb tree */ + if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) { + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, parent.last); + + DBUG_RETURN(ranking->rank); + } + + DBUG_RETURN(0); +} + +/*****************************************************************//** +Create the result and copy the data to it. 
*/ +static +fts_result_t* +fts_query_prepare_result( +/*=====================*/ + fts_query_t* query, /*!< in: Query state */ + fts_result_t* result) /*!< in: result this can contain + data from a previous search on + another FTS index */ +{ + const ib_rbt_node_t* node; + bool result_is_null = false; + + DBUG_ENTER("fts_query_prepare_result"); + + if (result == NULL) { + result = static_cast<fts_result_t*>( + ut_zalloc_nokey(sizeof(*result))); + + result->rankings_by_id = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE; + result_is_null = true; + } + + if (query->flags == FTS_OPT_RANKING) { + fts_word_freq_t* word_freq; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = + (doc_id_t*) query->deleted->doc_ids->data; + + node = rbt_first(query->word_freqs); + ut_ad(node); + word_freq = rbt_value(fts_word_freq_t, node); + + for (node = rbt_first(word_freq->doc_freqs); + node; + node = rbt_next(word_freq->doc_freqs, node)) { + fts_doc_freq_t* doc_freq; + fts_ranking_t ranking; + + doc_freq = rbt_value(fts_doc_freq_t, node); + + /* Don't put deleted docs into result */ + if (fts_bsearch(updates, 0, static_cast<int>(size), + doc_freq->doc_id) >= 0) { + /* one less matching doc count */ + --word_freq->doc_count; + continue; + } + + ranking.doc_id = doc_freq->doc_id; + ranking.rank = static_cast<fts_rank_t>(doc_freq->freq); + ranking.words = NULL; + + fts_query_add_ranking(query, result->rankings_by_id, + &ranking); + + if (query->total_size > fts_result_cache_limit) { + query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + fts_query_free_result(result); + DBUG_RETURN(NULL); + } + } + + /* Calculate IDF only after we exclude the deleted items */ + fts_query_calculate_idf(query); + + node = rbt_first(query->word_freqs); + word_freq = rbt_value(fts_word_freq_t, node); + + /* Calculate the ranking for each doc */ + for (node = rbt_first(result->rankings_by_id); + node != NULL; + node = rbt_next(result->rankings_by_id, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + ranking->rank = static_cast<fts_rank_t>( + ranking->rank * word_freq->idf * word_freq->idf); + } + + DBUG_RETURN(result); + } + + ut_a(rbt_size(query->doc_ids) > 0); + + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + fts_query_calculate_ranking(query, ranking); + + // FIXME: I think we may require this information to improve the + // ranking of doc ids which have more word matches from + // different FTS indexes. + + /* We don't need these anymore; free the resources. */ + ranking->words = NULL; + + if (!result_is_null) { + fts_query_add_ranking(query, result->rankings_by_id, ranking); + + if (query->total_size > fts_result_cache_limit) { + query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + fts_query_free_result(result); + DBUG_RETURN(NULL); + } + } + } + + if (result_is_null) { + /* Use doc_ids directly */ + rbt_free(result->rankings_by_id); + result->rankings_by_id = query->doc_ids; + query->doc_ids = NULL; + } + + DBUG_RETURN(result); +}
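The optimized single-term path above rescales raw frequencies by idf squared, the same weight fts_query_calculate_ranking() accumulates per word (rank += freq * idf * idf). A short numeric check of that weight:

    #include <cmath>
    #include <cstdio>

    // With 1000 documents total and a term occurring in 10 of them,
    // idf = log10(1000 / 10) = 2, so a document containing the term
    // 3 times contributes 3 * 2 * 2 = 12 before normalization.
    int main()
    {
        double total_docs = 1000.0;
        double doc_count = 10.0;  // documents containing the term
        double idf = std::log10(total_docs / doc_count);

        double freq = 3.0;        // occurrences in this document
        double rank = freq * idf * idf;

        std::printf("idf=%.2f rank=%.2f\n", idf, rank);
    }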
*/
+static
+fts_result_t*
+fts_query_get_result(
+/*=================*/
+	fts_query_t*	query,	/*!< in: query instance */
+	fts_result_t*	result)	/*!< in: result */
+{
+	DBUG_ENTER("fts_query_get_result");
+
+	if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
+		/* Copy the doc ids to the result. */
+		result = fts_query_prepare_result(query, result);
+	} else {
+		/* Create an empty result instance. */
+		result = static_cast<fts_result_t*>(
+			ut_zalloc_nokey(sizeof(*result)));
+	}
+
+	DBUG_RETURN(result);
+}
+
+/*****************************************************************//**
+FTS Query free resources and reset. */
+static
+void
+fts_query_free(
+/*===========*/
+	fts_query_t*	query)	/*!< in: query instance to free */
+{
+
+	if (query->read_nodes_graph) {
+		que_graph_free(query->read_nodes_graph);
+	}
+
+	if (query->root) {
+		fts_ast_free_node(query->root);
+	}
+
+	if (query->deleted) {
+		fts_doc_ids_free(query->deleted);
+	}
+
+	if (query->intersection) {
+		fts_query_free_doc_ids(query, query->intersection);
+	}
+
+	if (query->doc_ids) {
+		fts_query_free_doc_ids(query, query->doc_ids);
+	}
+
+	if (query->word_freqs) {
+		const ib_rbt_node_t*	node;
+
+		/* We need to free any instances of fts_doc_freq_t that we
+		may have allocated. */
+		for (node = rbt_first(query->word_freqs);
+		     node;
+		     node = rbt_next(query->word_freqs, node)) {
+
+			fts_word_freq_t*	word_freq;
+
+			word_freq = rbt_value(fts_word_freq_t, node);
+
+			/* We need to cast away the const. */
+			rbt_free(word_freq->doc_freqs);
+		}
+
+		rbt_free(query->word_freqs);
+	}
+
+	if (query->wildcard_words != NULL) {
+		rbt_free(query->wildcard_words);
+	}
+
+	ut_a(!query->intersection);
+
+	if (query->word_map) {
+		rbt_free(query->word_map);
+	}
+
+	if (query->word_vector != NULL) {
+		UT_DELETE(query->word_vector);
+	}
+
+	if (query->heap) {
+		mem_heap_free(query->heap);
+	}
+
+	memset(query, 0, sizeof(*query));
+}
+
+/*****************************************************************//**
+Parse the query using flex/bison or plugin parser.
+@return parse tree node. */
+static
+fts_ast_node_t*
+fts_query_parse(
+/*============*/
+	fts_query_t*	query,		/*!< in: query instance */
+	byte*		query_str,	/*!< in: query string */
+	ulint		query_len)	/*!< in: query string length */
+{
+	int		error;
+	fts_ast_state_t	state;
+	bool		mode = query->boolean_mode;
+	DBUG_ENTER("fts_query_parse");
+
+	memset(&state, 0x0, sizeof(state));
+
+	state.charset = query->fts_index_table.charset;
+
+	DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
+			query->parser = NULL;);
+
+	if (query->parser) {
+		state.root = state.cur_node =
+			fts_ast_create_node_list(&state, NULL);
+		error = fts_parse_by_parser(mode, query_str, query_len,
+					    query->parser, &state);
+	} else {
+		/* Setup the scanner to use, this depends on the mode flag. */
+		state.lexer = fts_lexer_create(mode, query_str, query_len);
+		state.charset = query->fts_index_table.charset;
+		error = fts_parse(&state);
+		fts_lexer_free(state.lexer);
+		state.lexer = NULL;
+	}
+
+	/* Error during parsing? */
+	if (error) {
+		/* Free the nodes that were allocated during parsing.
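fts_query_free() above has to release each per-word doc_freqs tree before releasing the word_freqs tree that owns the word entries. A minimal sketch of the same two-level ownership pattern (standalone C++, plain std::map in place of the rbt API; it assumes each inner tree was allocated with new):

#include <map>
#include <string>

struct word_freq {
	std::map<unsigned long, unsigned>* doc_freqs;	/* owned inner tree */
};

static void free_word_freqs(std::map<std::string, word_freq>& word_freqs)
{
	for (auto& w : word_freqs) {
		delete w.second.doc_freqs;	/* inner trees first */
		w.second.doc_freqs = nullptr;
	}
	word_freqs.clear();	/* then the outer tree */
}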
*/ + fts_ast_state_free(&state); + } else { + query->root = state.root; + + if (UNIV_UNLIKELY(fts_enable_diag_print) && query->root) { + fts_ast_node_print(query->root); + } + } + + DBUG_RETURN(state.root); +} + +/*******************************************************************//** +FTS Query optimization +Set FTS_OPT_RANKING if it is a simple term query */ +static +void +fts_query_can_optimize( +/*===================*/ + fts_query_t* query, /*!< in/out: query instance */ + uint flags) /*!< In: FTS search mode */ +{ + fts_ast_node_t* node = query->root; + + if (flags & FTS_EXPAND) { + return; + } + + /* Check if it has only a term without oper */ + ut_ad(node->type == FTS_AST_LIST); + node = node->list.head; + if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) { + query->flags = FTS_OPT_RANKING; + } +} + +/** FTS Query entry point. +@param[in,out] trx transaction +@param[in] index fts index to search +@param[in] flags FTS search mode +@param[in] query_str FTS query +@param[in] query_len FTS query string len in bytes +@param[in,out] result result doc ids +@return DB_SUCCESS if successful otherwise error code */ +dberr_t +fts_query( + trx_t* trx, + dict_index_t* index, + uint flags, + const byte* query_str, + ulint query_len, + fts_result_t** result) +{ + fts_query_t query; + dberr_t error = DB_SUCCESS; + byte* lc_query_str; + ulint lc_query_str_len; + ulint result_len; + bool boolean_mode; + trx_t* query_trx; /* FIXME: use provided trx */ + CHARSET_INFO* charset; + ulint start_time_ms; + bool will_be_ignored = false; + + boolean_mode = flags & FTS_BOOL; + + *result = NULL; + memset(&query, 0x0, sizeof(query)); + query_trx = trx_create(); + query_trx->op_info = "FTS query"; + + start_time_ms = ut_time_ms(); + + query.trx = query_trx; + query.index = index; + query.boolean_mode = boolean_mode; + query.deleted = fts_doc_ids_create(); + query.cur_node = NULL; + + query.fts_common_table.type = FTS_COMMON_TABLE; + query.fts_common_table.table_id = index->table->id; + query.fts_common_table.table = index->table; + + charset = fts_index_get_charset(index); + + query.fts_index_table.type = FTS_INDEX_TABLE; + query.fts_index_table.index_id = index->id; + query.fts_index_table.table_id = index->table->id; + query.fts_index_table.charset = charset; + query.fts_index_table.table = index->table; + + query.word_map = rbt_create_arg_cmp( + sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset); + query.word_vector = UT_NEW_NOKEY(word_vector_t()); + query.error = DB_SUCCESS; + + /* Setup the RB tree that will be used to collect per term + statistics. */ + query.word_freqs = rbt_create_arg_cmp( + sizeof(fts_word_freq_t), innobase_fts_text_cmp, + (void*) charset); + + if (flags & FTS_EXPAND) { + query.wildcard_words = rbt_create_arg_cmp( + sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset); + } + + query.total_size += SIZEOF_RBT_CREATE; + + query.total_docs = dict_table_get_n_rows(index->table); + + query.fts_common_table.suffix = "DELETED"; + + /* Read the deleted doc_ids, we need these for filtering. */ + error = fts_table_fetch_doc_ids( + NULL, &query.fts_common_table, query.deleted); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + query.fts_common_table.suffix = "DELETED_CACHE"; + + error = fts_table_fetch_doc_ids( + NULL, &query.fts_common_table, query.deleted); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + /* Get the deleted doc ids that are in the cache. 
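Putting the pieces together, a hedged usage sketch of the fts_query() entry point declared above (not part of the patch; it assumes the InnoDB headers are in scope, trx is a valid transaction, and index is a fulltext index obtained elsewhere; error handling is reduced to the minimum):

#include <cstring>

dberr_t run_boolean_search(trx_t* trx, dict_index_t* index)
{
	fts_result_t*	result = NULL;
	const char*	search = "+database -cache";

	dberr_t	err = fts_query(trx, index, FTS_BOOL,
				reinterpret_cast<const byte*>(search),
				strlen(search), &result);

	if (err == DB_SUCCESS && result != NULL) {
		fts_query_sort_result_on_rank(result);
		/* ... iterate result->rankings_by_rank ... */
		fts_query_free_result(result);
	}

	return(err);
}

fts_query_sort_result_on_rank() and fts_query_free_result() are the public helpers defined later in this file.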
*/
+	fts_cache_append_deleted_doc_ids(
+		index->table->fts->cache, query.deleted->doc_ids);
+	DEBUG_SYNC_C("fts_deleted_doc_ids_append");
+
+	/* Sort the vector so that we can do a binary search over the ids. */
+	ib_vector_sort(query.deleted->doc_ids, fts_doc_id_cmp);
+
+	/* Convert the query string to lower case before parsing. We own
+	the ut_malloc'ed result and so remember to free it before return. */
+
+	lc_query_str_len = query_len * charset->casedn_multiply() + 1;
+	lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));
+
+	/* For binary collations, a case sensitive search is
+	performed. Hence don't convert to lower case. */
+	if (my_binary_compare(charset)) {
+		memcpy(lc_query_str, query_str, query_len);
+		lc_query_str[query_len]= 0;
+		result_len= query_len;
+	} else {
+		result_len = innobase_fts_casedn_str(
+			charset, (char*) query_str, query_len,
+			(char*) lc_query_str, lc_query_str_len);
+	}
+
+	ut_ad(result_len < lc_query_str_len);
+
+	lc_query_str[result_len] = 0;
+
+	query.heap = mem_heap_create(128);
+
+	/* Create the rb tree for the doc id (current) set. */
+	query.doc_ids = rbt_create(
+		sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+	query.parser = index->parser;
+
+	query.total_size += SIZEOF_RBT_CREATE;
+
+	/* Parse the input query string. */
+	if (fts_query_parse(&query, lc_query_str, result_len)) {
+		fts_ast_node_t*	ast = query.root;
+		ast->trx = trx;
+
+		/* Optimize query to check if it's a single term */
+		fts_query_can_optimize(&query, flags);
+
+		DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
+				fts_result_cache_limit = 2048;
+				);
+
+		/* Traverse the Abstract Syntax Tree (AST) and execute
+		the query. */
+		query.error = fts_ast_visit(
+			FTS_NONE, ast, fts_query_visitor,
+			&query, &will_be_ignored);
+		if (query.error == DB_INTERRUPTED) {
+			error = DB_INTERRUPTED;
+			ut_free(lc_query_str);
+			goto func_exit;
+		}
+
+		/* If query expansion is requested, extend the search
+		with first search pass result */
+		if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
+			query.error = fts_expand_query(index, &query);
+		}
+
+		/* Calculate the inverse document frequency of the terms. */
+		if (query.error == DB_SUCCESS
+		    && query.flags != FTS_OPT_RANKING) {
+			fts_query_calculate_idf(&query);
+		}
+
+		/* Copy the result from the query state, so that we can
+		return it to the caller. */
+		if (query.error == DB_SUCCESS) {
+			*result = fts_query_get_result(&query, *result);
+		}
+
+		error = query.error;
+	} else {
+		/* still return an empty result set */
+		*result = static_cast<fts_result_t*>(
+			ut_zalloc_nokey(sizeof(**result)));
+	}
+
+	if (trx_is_interrupted(trx)) {
+		error = DB_INTERRUPTED;
+		ut_free(lc_query_str);
+		if (*result) {
+			fts_query_free_result(*result);
+		}
+		goto func_exit;
+	}
+
+	ut_free(lc_query_str);
+
+	if (UNIV_UNLIKELY(fts_enable_diag_print) && (*result)) {
+		ulint	diff_time = ut_time_ms() - start_time_ms;
+
+		ib::info() << "FTS Search Processing time: "
+			<< diff_time / 1000 << " secs: " << diff_time % 1000
+			<< " millisec: row(s) "
+			<< ((*result)->rankings_by_id
+			    ? lint(rbt_size((*result)->rankings_by_id))
+			    : -1);
+
+		/* Log memory consumption & result size */
+		ib::info() << "Full Search Memory: " << query.total_size
+			<< " (bytes), Row: "
+			<< ((*result)->rankings_by_id
+			    ? rbt_size((*result)->rankings_by_id)
+			    : 0)
+			<< ".";
+	}
+
+func_exit:
+	fts_query_free(&query);
+
+	query_trx->free();
+
+	return(error);
+}
+
+/*****************************************************************//**
+FTS Query free result, returned by fts_query().
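One detail worth calling out from fts_query() above: the lower-casing buffer is sized as query_len * charset->casedn_multiply() + 1, because case folding can expand a multi-byte string. A standalone sketch of the same sizing rule (not part of the patch; the memcpy stands in for the charset's real case-folding routine, and the multiplier is a made-up input):

#include <cstdlib>
#include <cstring>

char* alloc_folded_buf(const char* src, size_t len, size_t casedn_multiply)
{
	/* Worst-case growth plus one byte for the NUL terminator. */
	size_t	dst_len = len * casedn_multiply + 1;
	char*	dst = static_cast<char*>(malloc(dst_len));

	if (dst != NULL) {
		/* A real implementation calls the charset's casedn()
		here; this sketch just copies and terminates. */
		memcpy(dst, src, len);
		dst[len] = '\0';
	}

	return dst;
}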
*/ +void +fts_query_free_result( +/*==================*/ + fts_result_t* result) /*!< in: result instance to free.*/ +{ + if (result) { + if (result->rankings_by_id != NULL) { + rbt_free(result->rankings_by_id); + result->rankings_by_id = NULL; + } + if (result->rankings_by_rank != NULL) { + rbt_free(result->rankings_by_rank); + result->rankings_by_rank = NULL; + } + + ut_free(result); + result = NULL; + } +} + +/*****************************************************************//** +FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */ +void +fts_query_sort_result_on_rank( +/*==========================*/ + fts_result_t* result) /*!< out: result instance to sort.*/ +{ + const ib_rbt_node_t* node; + ib_rbt_t* ranked; + + ut_a(result->rankings_by_id != NULL); + if (result->rankings_by_rank) { + rbt_free(result->rankings_by_rank); + } + + ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank); + + /* We need to free any instances of fts_doc_freq_t that we + may have allocated. */ + for (node = rbt_first(result->rankings_by_id); + node; + node = rbt_next(result->rankings_by_id, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + ut_a(ranking->words == NULL); + + rbt_insert(ranked, ranking, ranking); + } + + /* Reset the current node too. */ + result->current = NULL; + result->rankings_by_rank = ranked; +} + +/*******************************************************************//** +A debug function to print result doc_id set. */ +static +void +fts_print_doc_id( +/*=============*/ + fts_query_t* query) /*!< in : tree that stores doc_ids.*/ +{ + const ib_rbt_node_t* node; + + /* Iterate each member of the doc_id set */ + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + fts_ranking_t* ranking; + ranking = rbt_value(fts_ranking_t, node); + + ib::info() << "doc_ids info, doc_id: " << ranking->doc_id; + + ulint pos = 0; + fts_string_t word; + + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + ib::info() << "doc_ids info, value: " << word.f_str; + } + } +} + +/*************************************************************//** +This function implements a simple "blind" query expansion search: +words in documents found in the first search pass will be used as +search arguments to search the document again, thus "expand" +the search result set. 
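The expansion scheme described above is plain blind relevance feedback: tokens harvested from the first-pass matches, minus the words already searched, become the second-pass search arguments. A compressed standalone sketch with std containers (InnoDB keeps these sets in rb-trees and also strips words covered by wildcard prefixes, which this sketch omits):

#include <set>
#include <string>
#include <vector>

std::set<std::string> expand_terms(
	const std::vector<std::vector<std::string>>& first_pass_docs,
	const std::set<std::string>& query_words)
{
	std::set<std::string> expanded;

	for (const auto& doc : first_pass_docs) {	/* tokens per match */
		for (const auto& tok : doc) {
			if (query_words.count(tok) == 0) {
				expanded.insert(tok);	/* new argument */
			}
		}
	}

	return expanded;	/* each term is then OR-ed into the query */
}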
+@return DB_SUCCESS if success, otherwise the error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_expand_query( +/*=============*/ + dict_index_t* index, /*!< in: FTS index to search */ + fts_query_t* query) /*!< in: FTS query instance */ +{ + const ib_rbt_node_t* node; + const ib_rbt_node_t* token_node; + fts_doc_t result_doc; + dberr_t error = DB_SUCCESS; + const fts_index_cache_t*index_cache; + + /* If no doc is found in first search pass, return */ + if (!rbt_size(query->doc_ids)) { + return(error); + } + + /* Init "result_doc", to hold words from the first search pass */ + fts_doc_init(&result_doc); + + mysql_mutex_lock(&index->table->fts->cache->lock); + index_cache = fts_find_index_cache(index->table->fts->cache, index); + mysql_mutex_unlock(&index->table->fts->cache->lock); + + ut_a(index_cache); + + result_doc.tokens = rbt_create_arg_cmp( + sizeof(fts_token_t), innobase_fts_text_cmp, + (void*) index_cache->charset); + + result_doc.charset = index_cache->charset; + result_doc.parser = index_cache->index->parser; + + query->total_size += SIZEOF_RBT_CREATE; + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + fts_print_doc_id(query); + } + + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + + fts_ranking_t* ranking; + ulint prev_token_size; + ulint estimate_size; + + prev_token_size = rbt_size(result_doc.tokens); + + ranking = rbt_value(fts_ranking_t, node); + + /* Fetch the documents with the doc_id from the + result of first seach pass. Since we do not + store document-to-word mapping, we need to + fetch the original document and parse them. + Future optimization could be done here if we + support some forms of document-to-word mapping */ + fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index, + FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_expansion_fetch_doc, + &result_doc); + + /* Estimate memory used, see fts_process_token and fts_token_t. + We ignore token size here. */ + estimate_size = (rbt_size(result_doc.tokens) - prev_token_size) + * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t) + + sizeof(ib_vector_t) + sizeof(ulint) * 32); + query->total_size += estimate_size; + + if (query->total_size > fts_result_cache_limit) { + error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + goto func_exit; + } + } + + /* Remove words that have already been searched in the first pass */ + for (ulint i = 0; i < query->word_vector->size(); i++) { + fts_string_t word = query->word_vector->at(i); + ib_rbt_bound_t parent; + + if (query->wildcard_words + && rbt_search(query->wildcard_words, &parent, &word) == 0) { + /* If it's a wildcard word, remove words having + it as prefix. */ + while (rbt_search_cmp(result_doc.tokens, + &parent, &word, NULL, + innobase_fts_text_cmp_prefix) + == 0) { + ut_free(rbt_remove_node(result_doc.tokens, + parent.last)); + } + } else { + /* We don't check return value, because the word may + have been deleted by a previous wildcard word as its + prefix, e.g. ('g * good'). */ + rbt_delete(result_doc.tokens, &word); + } + } + + /* Search the table the second time with expanded search list */ + for (token_node = rbt_first(result_doc.tokens); + token_node; + token_node = rbt_next(result_doc.tokens, token_node)) { + fts_token_t* mytoken; + mytoken = rbt_value(fts_token_t, token_node); + + /* '%' in the end is treated as prefix search, + it can cause assert failure, so we skip it. 
*/ + if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') { + continue; + } + + ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0); + fts_query_add_word_freq(query, &mytoken->text); + error = fts_query_union(query, &mytoken->text); + + if (error != DB_SUCCESS) { + break; + } + } + +func_exit: + fts_doc_free(&result_doc); + + return(error); +} +/*************************************************************//** +This function finds documents that contain all words in a +phrase or proximity search. And if proximity search, verify +the words are close enough to each other, as in specified distance. +This function is called for phrase and proximity search. +@return TRUE if documents are found, FALSE if otherwise */ +static +ibool +fts_phrase_or_proximity_search( +/*===========================*/ + fts_query_t* query, /*!< in/out: query instance. + query->doc_ids might be instantiated + with qualified doc IDs */ + ib_vector_t* tokens) /*!< in: Tokens contain words */ +{ + ulint n_matched; + ulint i; + ibool matched = FALSE; + ulint num_token = ib_vector_size(tokens); + fts_match_t* match[MAX_PROXIMITY_ITEM]; + ibool end_list = FALSE; + + /* Number of matched documents for the first token */ + n_matched = ib_vector_size(query->match_array[0]); + + /* We have a set of match list for each word, we shall + walk through the list and find common documents that + contain all the matching words. */ + for (i = 0; i < n_matched; i++) { + ulint j; + ulint k = 0; + fts_proximity_t qualified_pos; + + match[0] = static_cast( + ib_vector_get(query->match_array[0], i)); + + /* For remaining match list for the token(word), we + try to see if there is a document with the same + doc id */ + for (j = 1; j < num_token; j++) { + match[j] = static_cast( + ib_vector_get(query->match_array[j], k)); + + while (match[j]->doc_id < match[0]->doc_id + && k < ib_vector_size(query->match_array[j])) { + match[j] = static_cast( + ib_vector_get( + query->match_array[j], k)); + k++; + } + + if (match[j]->doc_id > match[0]->doc_id) { + /* no match */ + if (query->flags & FTS_PHRASE) { + match[0]->doc_id = 0; + } + break; + } + + if (k == ib_vector_size(query->match_array[j])) { + end_list = TRUE; + + if (query->flags & FTS_PHRASE) { + ulint s; + /* Since i is the last doc id in the + match_array[j], remove all doc ids > i + from the match_array[0]. */ + fts_match_t* match_temp; + for (s = i + 1; s < n_matched; s++) { + match_temp = static_cast< + fts_match_t*>(ib_vector_get( + query->match_array[0], s)); + match_temp->doc_id = 0; + } + + if (match[j]->doc_id != + match[0]->doc_id) { + /* no match */ + match[0]->doc_id = 0; + } + } + + if (match[j]->doc_id != match[0]->doc_id) { + goto func_exit; + } + } + + /* FIXME: A better solution will be a counter array + remember each run's last position. 
So we don't + reset it here very time */ + k = 0; + } + + if (j != num_token) { + continue; + } + + /* For this matching doc, we need to further + verify whether the words in the doc are close + to each other, and within the distance specified + in the proximity search */ + if (query->flags & FTS_PHRASE) { + matched = TRUE; + } else if (fts_proximity_get_positions( + match, num_token, ULINT_MAX, &qualified_pos)) { + + /* Fetch the original documents and count the + words in between matching words to see that is in + specified distance */ + if (fts_query_is_in_proximity_range( + query, match, &qualified_pos)) { + /* If so, mark we find a matching doc */ + query->error = fts_query_process_doc_id( + query, match[0]->doc_id, 0); + if (query->error != DB_SUCCESS) { + matched = FALSE; + goto func_exit; + } + + matched = TRUE; + for (ulint z = 0; z < num_token; z++) { + fts_string_t* token; + token = static_cast( + ib_vector_get(tokens, z)); + fts_query_add_word_to_document( + query, match[0]->doc_id, token); + } + } + } + + if (end_list) { + break; + } + } + +func_exit: + return(matched); +} + +/*************************************************************//** +This function checks whether words in result documents are close to +each other (within proximity range as specified by "distance"). +If "distance" is MAX_ULINT, then it will find all combinations of +positions of matching words and store min and max positions +in the "qualified_pos" for later verification. +@return true if words are close to each other, false if otherwise */ +static +bool +fts_proximity_get_positions( +/*========================*/ + fts_match_t** match, /*!< in: query instance */ + ulint num_match, /*!< in: number of matching + items */ + ulint distance, /*!< in: distance value + for proximity search */ + fts_proximity_t* qualified_pos) /*!< out: the position info + records ranges containing + all matching words. */ +{ + ulint i; + ulint idx[MAX_PROXIMITY_ITEM]; + ulint num_pos[MAX_PROXIMITY_ITEM]; + ulint min_idx; + + qualified_pos->n_pos = 0; + + ut_a(num_match <= MAX_PROXIMITY_ITEM); + + /* Each word could appear multiple times in a doc. So + we need to walk through each word's position list, and find + closest distance between different words to see if + they are in the proximity distance. 
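fts_proximity_get_positions(), whose walk is being described here, advances one cursor per word over sorted position lists and tests the min/max window at each step. A standalone C++ sketch of that check (not part of the patch; it assumes every word has at least one position and returns at the first window that fits, whereas the real function records every qualifying window for later verification):

#include <cstddef>
#include <vector>

bool within_distance(const std::vector<std::vector<unsigned long>>& pos,
		     unsigned long distance)
{
	std::vector<size_t> idx(pos.size(), 0);	/* one cursor per word */

	for (;;) {
		unsigned long	min_pos = ~0UL;
		unsigned long	max_pos = 0;
		size_t		min_word = 0;

		/* Current window: one position per word. */
		for (size_t w = 0; w < pos.size(); w++) {
			unsigned long p = pos[w][idx[w]];
			if (p < min_pos) { min_pos = p; min_word = w; }
			if (p > max_pos) { max_pos = p; }
		}

		if (max_pos - min_pos <= distance) {
			return true;	/* all words fit in one window */
		}

		/* Advance the list holding the smallest position. */
		if (++idx[min_word] >= pos[min_word].size()) {
			return false;	/* that list is exhausted */
		}
	}
}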
*/ + + /* Assume each word's position list is sorted, we + will just do a walk through to all words' lists + similar to a the merge phase of a merge sort */ + for (i = 0; i < num_match; i++) { + /* idx is the current position we are checking + for a particular word */ + idx[i] = 0; + + /* Number of positions for this word */ + num_pos[i] = ib_vector_size(match[i]->positions); + } + + /* Start with the first word */ + min_idx = 0; + + while (idx[min_idx] < num_pos[min_idx]) { + ulint position[MAX_PROXIMITY_ITEM]; + ulint min_pos = ULINT_MAX; + ulint max_pos = 0; + + /* Check positions in each word position list, and + record the max/min position */ + for (i = 0; i < num_match; i++) { + position[i] = *(ulint*) ib_vector_get_const( + match[i]->positions, idx[i]); + + if (position[i] == ULINT_UNDEFINED) { + break; + } + + if (position[i] < min_pos) { + min_pos = position[i]; + min_idx = i; + } + + if (position[i] > max_pos) { + max_pos = position[i]; + } + } + + /* If max and min position are within range, we + find a good match */ + if (max_pos - min_pos <= distance + && (i >= num_match || position[i] != ULINT_UNDEFINED)) { + /* The charset has variable character + length encoding, record the min_pos and + max_pos, we will need to verify the actual + number of characters */ + qualified_pos->min_pos.push_back(min_pos); + qualified_pos->max_pos.push_back(max_pos); + qualified_pos->n_pos++; + } + + /* Otherwise, move to the next position is the + list for the word with the smallest position */ + idx[min_idx]++; + } + + return(qualified_pos->n_pos != 0); +} diff --git a/storage/innobase/fts/fts0sql.cc b/storage/innobase/fts/fts0sql.cc new file mode 100644 index 00000000..1970f6f5 --- /dev/null +++ b/storage/innobase/fts/fts0sql.cc @@ -0,0 +1,208 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fts/fts0sql.cc +Full Text Search functionality. + +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#include "que0que.h" +#include "trx0roll.h" +#include "pars0pars.h" +#include "dict0dict.h" +#include "fts0types.h" +#include "fts0priv.h" + +/** SQL statements for creating the ancillary FTS tables. */ + +/** Preamble to all SQL statements. */ +static const char* fts_sql_begin= + "PROCEDURE P() IS\n"; + +/** Postamble to non-committing SQL statements. */ +static const char* fts_sql_end= + "\n" + "END;\n"; + +/******************************************************************//** +Get the table id. 
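Every statement handed to fts_parse_sql() below is bracketed by fts_sql_begin and fts_sql_end via ut_str3cat, so the InnoDB parser always receives a complete procedure. A trivial standalone sketch of that wrapping (the DELETE body in the comment is an invented example):

#include <string>

std::string wrap_fts_sql(const std::string& body)
{
	/* Mirrors ut_str3cat(fts_sql_begin, sql, fts_sql_end). */
	return std::string("PROCEDURE P() IS\n") + body + "\nEND;\n";
}

/* e.g. wrap_fts_sql("BEGIN\nDELETE FROM $table_name;\nEND;") yields the
full procedure text that pars_sql() is given. */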
+@return number of bytes written */ +int +fts_get_table_id( +/*=============*/ + const fts_table_t* + fts_table, /*!< in: FTS Auxiliary table */ + char* table_id) /*!< out: table id, must be at least + FTS_AUX_MIN_TABLE_ID_LENGTH bytes + long */ +{ + int len; + + ut_a(fts_table->table != NULL); + + switch (fts_table->type) { + case FTS_COMMON_TABLE: + len = fts_write_object_id(fts_table->table_id, table_id); + break; + + case FTS_INDEX_TABLE: + + len = fts_write_object_id(fts_table->table_id, table_id); + + table_id[len] = '_'; + ++len; + table_id += len; + + len += fts_write_object_id(fts_table->index_id, table_id); + break; + + default: + ut_error; + } + + ut_a(len >= 16); + ut_a(len < FTS_AUX_MIN_TABLE_ID_LENGTH); + + return(len); +} + +/** Construct the name of an internal FTS table for the given table. +@param[in] fts_table metadata on fulltext-indexed table +@param[out] table_name a name up to MAX_FULL_NAME_LEN +@param[in] dict_locked whether dict_sys.latch is being held */ +void fts_get_table_name(const fts_table_t* fts_table, char* table_name, + bool dict_locked) +{ + if (!dict_locked) { + dict_sys.freeze(SRW_LOCK_CALL); + } + ut_ad(dict_sys.frozen()); + /* Include the separator as well. */ + const size_t dbname_len = fts_table->table->name.dblen() + 1; + ut_ad(dbname_len > 1); + memcpy(table_name, fts_table->table->name.m_name, dbname_len); + if (!dict_locked) { + dict_sys.unfreeze(); + } + memcpy(table_name += dbname_len, "FTS_", 4); + table_name += 4; + table_name += fts_get_table_id(fts_table, table_name); + *table_name++ = '_'; + strcpy(table_name, fts_table->suffix); +} + +/******************************************************************//** +Parse an SQL string. +@return query graph */ +que_t* +fts_parse_sql( +/*==========*/ + fts_table_t* fts_table, /*!< in: FTS auxiliarry table info */ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql) /*!< in: SQL string to evaluate */ +{ + char* str; + que_t* graph; + ibool dict_locked; + + str = ut_str3cat(fts_sql_begin, sql, fts_sql_end); + + dict_locked = (fts_table && fts_table->table->fts + && fts_table->table->fts->dict_locked); + + if (!dict_locked) { + /* The InnoDB SQL parser is not re-entrant. */ + dict_sys.lock(SRW_LOCK_CALL); + } + + graph = pars_sql(info, str); + ut_a(graph); + + if (!dict_locked) { + dict_sys.unlock(); + } + + ut_free(str); + + return(graph); +} + +/******************************************************************//** +Evaluate an SQL query graph. +@return DB_SUCCESS or error code */ +dberr_t +fts_eval_sql( +/*=========*/ + trx_t* trx, /*!< in: transaction */ + que_t* graph) /*!< in: Query graph to evaluate */ +{ + que_thr_t* thr; + + graph->trx = trx; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + return(trx->error_state); +} + +/******************************************************************//** +Construct the column specification part of the SQL string for selecting the +indexed FTS columns for the given table. Adds the necessary bound +ids to the given 'info' and returns the SQL string. 
Examples: + +One indexed column named "text": + + "$sel0", + info/ids: sel0 -> "text" + +Two indexed columns named "subject" and "content": + + "$sel0, $sel1", + info/ids: sel0 -> "subject", sel1 -> "content", +@return heap-allocated WHERE string */ +const char* +fts_get_select_columns_str( +/*=======================*/ + dict_index_t* index, /*!< in: index */ + pars_info_t* info, /*!< in/out: parser info */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint i; + const char* str = ""; + + for (i = 0; i < index->n_user_defined_cols; i++) { + char* sel_str; + + dict_field_t* field = dict_index_get_nth_field(index, i); + + sel_str = mem_heap_printf(heap, "sel%lu", (ulong) i); + + /* Set copy_name to TRUE since it's dynamic. */ + pars_info_bind_id(info, sel_str, field->name); + + str = mem_heap_printf( + heap, "%s%s$%s", str, (*str) ? ", " : "", sel_str); + } + + return(str); +} diff --git a/storage/innobase/fts/fts0tlex.cc b/storage/innobase/fts/fts0tlex.cc new file mode 100644 index 00000000..29f73f23 --- /dev/null +++ b/storage/innobase/fts/fts0tlex.cc @@ -0,0 +1,2169 @@ +#include "univ.i" +#line 2 "fts0tlex.cc" + +#line 4 "fts0tlex.cc" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 6 +#define YY_FLEX_SUBMINOR_VERSION 4 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +#ifdef yy_create_buffer +#define fts0t_create_buffer_ALREADY_DEFINED +#else +#define yy_create_buffer fts0t_create_buffer +#endif + +#ifdef yy_delete_buffer +#define fts0t_delete_buffer_ALREADY_DEFINED +#else +#define yy_delete_buffer fts0t_delete_buffer +#endif + +#ifdef yy_scan_buffer +#define fts0t_scan_buffer_ALREADY_DEFINED +#else +#define yy_scan_buffer fts0t_scan_buffer +#endif + +#ifdef yy_scan_string +#define fts0t_scan_string_ALREADY_DEFINED +#else +#define yy_scan_string fts0t_scan_string +#endif + +#ifdef yy_scan_bytes +#define fts0t_scan_bytes_ALREADY_DEFINED +#else +#define yy_scan_bytes fts0t_scan_bytes +#endif + +#ifdef yy_init_buffer +#define fts0t_init_buffer_ALREADY_DEFINED +#else +#define yy_init_buffer fts0t_init_buffer +#endif + +#ifdef yy_flush_buffer +#define fts0t_flush_buffer_ALREADY_DEFINED +#else +#define yy_flush_buffer fts0t_flush_buffer +#endif + +#ifdef yy_load_buffer_state +#define fts0t_load_buffer_state_ALREADY_DEFINED +#else +#define yy_load_buffer_state fts0t_load_buffer_state +#endif + +#ifdef yy_switch_to_buffer +#define fts0t_switch_to_buffer_ALREADY_DEFINED +#else +#define yy_switch_to_buffer fts0t_switch_to_buffer +#endif + +#ifdef yypush_buffer_state +#define fts0tpush_buffer_state_ALREADY_DEFINED +#else +#define yypush_buffer_state fts0tpush_buffer_state +#endif + +#ifdef yypop_buffer_state +#define fts0tpop_buffer_state_ALREADY_DEFINED +#else +#define yypop_buffer_state fts0tpop_buffer_state +#endif + +#ifdef yyensure_buffer_stack +#define fts0tensure_buffer_stack_ALREADY_DEFINED +#else +#define yyensure_buffer_stack fts0tensure_buffer_stack +#endif + +#ifdef yylex +#define fts0tlex_ALREADY_DEFINED +#else +#define yylex fts0tlex +#endif + +#ifdef yyrestart +#define fts0trestart_ALREADY_DEFINED +#else +#define yyrestart fts0trestart +#endif + +#ifdef yylex_init +#define fts0tlex_init_ALREADY_DEFINED +#else +#define yylex_init fts0tlex_init +#endif + +#ifdef yylex_init_extra +#define fts0tlex_init_extra_ALREADY_DEFINED +#else +#define yylex_init_extra fts0tlex_init_extra +#endif + +#ifdef yylex_destroy +#define fts0tlex_destroy_ALREADY_DEFINED 
+#else +#define yylex_destroy fts0tlex_destroy +#endif + +#ifdef yyget_debug +#define fts0tget_debug_ALREADY_DEFINED +#else +#define yyget_debug fts0tget_debug +#endif + +#ifdef yyset_debug +#define fts0tset_debug_ALREADY_DEFINED +#else +#define yyset_debug fts0tset_debug +#endif + +#ifdef yyget_extra +#define fts0tget_extra_ALREADY_DEFINED +#else +#define yyget_extra fts0tget_extra +#endif + +#ifdef yyset_extra +#define fts0tset_extra_ALREADY_DEFINED +#else +#define yyset_extra fts0tset_extra +#endif + +#ifdef yyget_in +#define fts0tget_in_ALREADY_DEFINED +#else +#define yyget_in fts0tget_in +#endif + +#ifdef yyset_in +#define fts0tset_in_ALREADY_DEFINED +#else +#define yyset_in fts0tset_in +#endif + +#ifdef yyget_out +#define fts0tget_out_ALREADY_DEFINED +#else +#define yyget_out fts0tget_out +#endif + +#ifdef yyset_out +#define fts0tset_out_ALREADY_DEFINED +#else +#define yyset_out fts0tset_out +#endif + +#ifdef yyget_leng +#define fts0tget_leng_ALREADY_DEFINED +#else +#define yyget_leng fts0tget_leng +#endif + +#ifdef yyget_text +#define fts0tget_text_ALREADY_DEFINED +#else +#define yyget_text fts0tget_text +#endif + +#ifdef yyget_lineno +#define fts0tget_lineno_ALREADY_DEFINED +#else +#define yyget_lineno fts0tget_lineno +#endif + +#ifdef yyset_lineno +#define fts0tset_lineno_ALREADY_DEFINED +#else +#define yyset_lineno fts0tset_lineno +#endif + +#ifdef yyget_column +#define fts0tget_column_ALREADY_DEFINED +#else +#define yyget_column fts0tget_column +#endif + +#ifdef yyset_column +#define fts0tset_column_ALREADY_DEFINED +#else +#define yyset_column fts0tset_column +#endif + +#ifdef yywrap +#define fts0twrap_ALREADY_DEFINED +#else +#define yywrap fts0twrap +#endif + +#ifdef yyalloc +#define fts0talloc_ALREADY_DEFINED +#else +#define yyalloc fts0talloc +#endif + +#ifdef yyrealloc +#define fts0trealloc_ALREADY_DEFINED +#else +#define yyrealloc fts0trealloc +#endif + +#ifdef yyfree +#define fts0tfree_ALREADY_DEFINED +#else +#define yyfree fts0tfree +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include +#include +#include +#include + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have . Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. 
*/ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +/* begin standard C++ headers. */ + +/* TODO: this is always defined, so inline it */ +#define yyconst const + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define yynoreturn __attribute__((__noreturn__)) +#else +#define yynoreturn +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an + * integer in range [0..255] for use as an array index. + */ +#define YY_SC_TO_UI(c) ((YY_CHAR) (c)) + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN yyg->yy_start = 1 + 2 * +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START ((yyg->yy_start - 1) / 2) +#define YYSTATE YY_START +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart( yyin , yyscanner ) +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + #define YY_LINENO_REWIND_TO(ptr) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. 
*/ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = yyg->yy_hold_char; \ + YY_RESTORE_YY_MORE_OFFSET \ + yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) +#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner ) + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \ + ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \ + : NULL) +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. 
+ */ +#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] + +void yyrestart ( FILE *input_file , yyscan_t yyscanner ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner ); +void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +void yypop_buffer_state ( yyscan_t yyscanner ); + +static void yyensure_buffer_stack ( yyscan_t yyscanner ); +static void yy_load_buffer_state ( yyscan_t yyscanner ); +static void yy_init_buffer ( YY_BUFFER_STATE b, FILE *file , yyscan_t yyscanner ); +#define YY_FLUSH_BUFFER yy_flush_buffer( YY_CURRENT_BUFFER , yyscanner) + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner ); + +void *yyalloc ( yy_size_t , yyscan_t yyscanner ); +void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner ); +void yyfree ( void * , yyscan_t yyscanner ); + +#define yy_new_buffer yy_create_buffer +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } +#define yy_set_bol(at_bol) \ + { \ + if ( ! YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define fts0twrap(yyscanner) (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP +typedef flex_uint8_t YY_CHAR; + +typedef int yy_state_type; + +#define yytext_ptr yytext_r + +static yy_state_type yy_get_previous_state ( yyscan_t yyscanner ); +static yy_state_type yy_try_NUL_trans ( yy_state_type current_state , yyscan_t yyscanner); +static int yy_get_next_buffer ( yyscan_t yyscanner ); +static void yynoreturn yy_fatal_error ( const char* msg , yyscan_t yyscanner ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + yyg->yytext_ptr = yy_bp; \ + yyleng = (int) (yy_cp - yy_bp); \ + yyg->yy_hold_char = *yy_cp; \ + *yy_cp = '\0'; \ + yyg->yy_c_buf_p = yy_cp; +#define YY_NUM_RULES 7 +#define YY_END_OF_BUFFER 8 +/* This struct is not used in this scanner, + but its presence is necessary. 
*/ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static const flex_int16_t yy_accept[17] = + { 0, + 4, 4, 8, 4, 1, 6, 1, 5, 5, 2, + 4, 1, 1, 0, 3, 0 + } ; + +static const YY_CHAR yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 5, 1, 1, 6, 1, 1, 1, + 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static const YY_CHAR yy_meta[8] = + { 0, + 1, 2, 3, 4, 5, 5, 1 + } ; + +static const flex_int16_t yy_base[20] = + { 0, + 0, 0, 18, 0, 6, 21, 0, 9, 21, 0, + 0, 0, 0, 4, 21, 21, 10, 11, 15 + } ; + +static const flex_int16_t yy_def[20] = + { 0, + 16, 1, 16, 17, 17, 16, 18, 19, 16, 17, + 17, 5, 18, 19, 16, 0, 16, 16, 16 + } ; + +static const flex_int16_t yy_nxt[29] = + { 0, + 4, 5, 6, 7, 8, 9, 10, 12, 15, 13, + 11, 11, 13, 15, 13, 14, 14, 16, 14, 14, + 3, 16, 16, 16, 16, 16, 16, 16 + } ; + +static const flex_int16_t yy_chk[29] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 5, 14, 5, + 17, 17, 18, 8, 18, 19, 19, 3, 19, 19, + 16, 16, 16, 16, 16, 16, 16, 16 + } ; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +#line 1 "fts0tlex.l" +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/** + * @file fts/fts0tlex.l + * FTS parser lexical analyzer + * + * Created 2007/5/9 Sunny Bains + */ +#line 27 "fts0tlex.l" + +#include "fts0ast.h" +#include "fts0pars.h" + +/* Required for reentrant parser */ +#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner) +#define exit(A) ut_error + +#line 671 "fts0tlex.cc" +#define YY_NO_INPUT 1 +#line 673 "fts0tlex.cc" + +#define INITIAL 0 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. 
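This generated scanner is reentrant; the prologue above remaps the yy* entry points to fts0t-prefixed names and defines fts_tlexer() as the YY_DECL. A hedged sketch of how a caller can drive it (the real glue lives in fts_lexer_create() in fts0que.cc; YYSTYPE comes from fts0pars.h, and error checks are omitted):

static void scan_tokens(const char* query, int len)
{
	yyscan_t	scanner;
	YYSTYPE		val;

	fts0tlex_init(&scanner);
	fts0t_scan_bytes(query, len, scanner);

	/* fts_tlexer() is the YY_DECL entry point defined above; it
	returns 0 at end of input. */
	while (int tok = fts_tlexer(&val, scanner)) {
		(void) tok;	/* FTS_TERM, FTS_TEXT or an operator char */
	}

	fts0tlex_destroy(scanner);
}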
+ */ +#include +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +/* Holds the entire state of the reentrant scanner. */ +struct yyguts_t + { + + /* User-defined. Not touched by flex. */ + YY_EXTRA_TYPE yyextra_r; + + /* The rest are the same as the globals declared in the non-reentrant scanner. */ + FILE *yyin_r, *yyout_r; + size_t yy_buffer_stack_top; /**< index of top of stack. */ + size_t yy_buffer_stack_max; /**< capacity of stack. */ + YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */ + char yy_hold_char; + int yy_n_chars; + int yyleng_r; + char *yy_c_buf_p; + int yy_init; + int yy_start; + int yy_did_buffer_switch_on_eof; + int yy_start_stack_ptr; + int yy_start_stack_depth; + int *yy_start_stack; + yy_state_type yy_last_accepting_state; + char* yy_last_accepting_cpos; + + int yylineno_r; + int yy_flex_debug_r; + + char *yytext_r; + int yy_more_flag; + int yy_more_len; + + }; /* end struct yyguts_t */ + +static int yy_init_globals ( yyscan_t yyscanner ); + +int yylex_init (yyscan_t* scanner); + +int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int yylex_destroy ( yyscan_t yyscanner ); + +int yyget_debug ( yyscan_t yyscanner ); + +void yyset_debug ( int debug_flag , yyscan_t yyscanner ); + +YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner ); + +void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner ); + +FILE *yyget_in ( yyscan_t yyscanner ); + +void yyset_in ( FILE * _in_str , yyscan_t yyscanner ); + +FILE *yyget_out ( yyscan_t yyscanner ); + +void yyset_out ( FILE * _out_str , yyscan_t yyscanner ); + + int yyget_leng ( yyscan_t yyscanner ); + +char *yyget_text ( yyscan_t yyscanner ); + +int yyget_lineno ( yyscan_t yyscanner ); + +void yyset_lineno ( int _line_number , yyscan_t yyscanner ); + +int yyget_column ( yyscan_t yyscanner ); + +void yyset_column ( int _column_no , yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap ( yyscan_t yyscanner ); +#else +extern int yywrap ( yyscan_t yyscanner ); +#endif +#endif + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen ( const char * , yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus +static int yyinput ( yyscan_t yyscanner ); +#else +static int input ( yyscan_t yyscanner ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( yytext, (size_t) yyleng, 1, yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". 
+ */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + int n; \ + for ( n = 0; n < max_size && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = (int) fread(buf, 1, (yy_size_t) max_size, yyin)) == 0 && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (yyscan_t yyscanner); + +#define YY_DECL int yylex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK /*LINTED*/break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + yy_state_type yy_current_state; + char *yy_cp, *yy_bp; + int yy_act; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( !yyg->yy_init ) + { + yyg->yy_init = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! yyg->yy_start ) + yyg->yy_start = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); + } + + yy_load_buffer_state( yyscanner ); + } + + { +#line 45 "fts0tlex.l" + + +#line 934 "fts0tlex.cc" + + while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */ + { + yy_cp = yyg->yy_c_buf_p; + + /* Support of yytext. */ + *yy_cp = yyg->yy_hold_char; + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. 
+ */ + yy_bp = yy_cp; + + yy_current_state = yyg->yy_start; +yy_match: + do + { + YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 17 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + ++yy_cp; + } + while ( yy_current_state != 16 ); + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = yyg->yy_hold_char; + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + +case 1: +YY_RULE_SETUP +#line 47 "fts0tlex.l" +/* Ignore whitespace */ ; + YY_BREAK +case 2: +YY_RULE_SETUP +#line 49 "fts0tlex.l" +{ + val->oper = fts0tget_text(yyscanner)[0]; + + return(val->oper); +} + YY_BREAK +case 3: +YY_RULE_SETUP +#line 55 "fts0tlex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TEXT); +} + YY_BREAK +case 4: +YY_RULE_SETUP +#line 61 "fts0tlex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TERM); +} + YY_BREAK +case 5: +YY_RULE_SETUP +#line 66 "fts0tlex.l" +; + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 67 "fts0tlex.l" + + YY_BREAK +case 7: +YY_RULE_SETUP +#line 69 "fts0tlex.l" +ECHO; + YY_BREAK +#line 1035 "fts0tlex.cc" +case YY_STATE_EOF(INITIAL): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = yyg->yy_hold_char; + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + { /* This was really a NUL. 
*/ + yy_state_type yy_next_state; + + yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner); + + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++yyg->yy_c_buf_p; + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_END_OF_FILE: + { + yyg->yy_did_buffer_switch_on_eof = 0; + + if ( yywrap( yyscanner ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = + yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + yyg->yy_c_buf_p = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars]; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ + } /* end of user's declarations */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + char *source = yyg->yytext_ptr; + int number_to_move, i; + int ret_val; + + if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. 
*/ + number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr - 1); + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0; + + else + { + int num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE; + + int yy_c_buf_p_offset = + (int) (yyg->yy_c_buf_p - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc( (void *) b->yy_ch_buf, + (yy_size_t) (b->yy_buf_size + 2) , yyscanner ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = NULL; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. */ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + yyg->yy_n_chars, num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + if ( yyg->yy_n_chars == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart( yyin , yyscanner); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + int new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc( + (void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf, (yy_size_t) new_size , yyscanner ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + /* "- 2" to take care of EOB's */ + YY_CURRENT_BUFFER_LVALUE->yy_buf_size = (int) (new_size - 2); + } + + yyg->yy_n_chars += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR; + + yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (yyscan_t yyscanner) +{ + yy_state_type yy_current_state; + char *yy_cp; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_current_state = yyg->yy_start; + + for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp ) + { + YY_CHAR yy_c = (*yy_cp ? 
yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 17 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner) +{ + int yy_is_jam; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */ + char *yy_cp = yyg->yy_c_buf_p; + + YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 17 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + yy_is_jam = (yy_current_state == 16); + + (void)yyg; + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (yyscan_t yyscanner) +#else + static int input (yyscan_t yyscanner) +#endif + +{ + int c; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + *yyg->yy_c_buf_p = yyg->yy_hold_char; + + if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + /* This was really a NUL. */ + *yyg->yy_c_buf_p = '\0'; + + else + { /* need more input */ + int offset = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr); + ++yyg->yy_c_buf_p; + + switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart( yyin , yyscanner); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap( yyscanner ) ) + return 0; + + if ( ! yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(yyscanner); +#else + return input(yyscanner); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = yyg->yytext_ptr + offset; + break; + } + } + } + + c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */ + *yyg->yy_c_buf_p = '\0'; /* preserve yytext */ + yyg->yy_hold_char = *++yyg->yy_c_buf_p; + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * @param yyscanner The scanner object. + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! 
YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); + } + + yy_init_buffer( YY_CURRENT_BUFFER, input_file , yyscanner); + yy_load_buffer_state( yyscanner ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * @param yyscanner The scanner object. + */ + void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (yyscanner); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state( yyscanner ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. + */ + yyg->yy_did_buffer_switch_on_eof = 1; +} + +static void yy_load_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + yyg->yy_hold_char = *yyg->yy_c_buf_p; +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * @param yyscanner The scanner object. + * @return the allocated buffer state. + */ + YY_BUFFER_STATE yy_create_buffer (FILE * file, int size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc( (yy_size_t) (b->yy_buf_size + 2) , yyscanner ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer( b, file , yyscanner); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * @param yyscanner The scanner object. + */ + void yy_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree( (void *) b->yy_ch_buf , yyscanner ); + + yyfree( (void *) b , yyscanner ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. 
+ */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner) + +{ + int oerrno = errno; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_flush_buffer( b , yyscanner); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * @param yyscanner The scanner object. + */ + void yy_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state( yyscanner ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * @param yyscanner The scanner object. + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(yyscanner); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + yyg->yy_buffer_stack_top++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state( yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * @param yyscanner The scanner object. + */ +void yypop_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER , yyscanner); + YY_CURRENT_BUFFER_LVALUE = NULL; + if (yyg->yy_buffer_stack_top > 0) + --yyg->yy_buffer_stack_top; + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state( yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void yyensure_buffer_stack (yyscan_t yyscanner) +{ + yy_size_t num_to_alloc; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (!yyg->yy_buffer_stack) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; /* After all that talk, this was set to 1 anyways... 
*/ + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + yyg->yy_buffer_stack_max = num_to_alloc; + yyg->yy_buffer_stack_top = 0; + return; + } + + if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + yy_size_t grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = yyg->yy_buffer_stack_max + grow_size; + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyrealloc + (yyg->yy_buffer_stack, + num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*)); + yyg->yy_buffer_stack_max = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return NULL; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" ); + + b->yy_buf_size = (int) (size - 2); /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = NULL; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + yy_switch_to_buffer( b , yyscanner ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to yylex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * yy_scan_bytes() instead. + */ +YY_BUFFER_STATE yy_scan_string (const char * yystr , yyscan_t yyscanner) +{ + + return yy_scan_bytes( yystr, (int) strlen(yystr) , yyscanner); +} + +/** Setup the input buffer state to scan the given bytes. The next call to yylex() will + * scan from a @e copy of @a bytes. + * @param yybytes the byte buffer to scan + * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_bytes (const char * yybytes, int _yybytes_len , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = (yy_size_t) (_yybytes_len + 2); + buf = (char *) yyalloc( n , yyscanner ); + if ( ! 
buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = yy_scan_buffer( buf, n , yyscanner); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yynoreturn yy_fatal_error (const char* msg , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = yyg->yy_hold_char; \ + yyg->yy_c_buf_p = yytext + yyless_macro_arg; \ + yyg->yy_hold_char = *yyg->yy_c_buf_p; \ + *yyg->yy_c_buf_p = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the user-defined data for this scanner. + * @param yyscanner The scanner object. + */ +YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyextra; +} + +/** Get the current line number. + * @param yyscanner The scanner object. + */ +int yyget_lineno (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yylineno; +} + +/** Get the current column number. + * @param yyscanner The scanner object. + */ +int yyget_column (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yycolumn; +} + +/** Get the input stream. + * @param yyscanner The scanner object. + */ +FILE *yyget_in (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyin; +} + +/** Get the output stream. + * @param yyscanner The scanner object. + */ +FILE *yyget_out (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyout; +} + +/** Get the length of the current token. + * @param yyscanner The scanner object. + */ +int yyget_leng (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyleng; +} + +/** Get the current token. + * @param yyscanner The scanner object. + */ + +char *yyget_text (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yytext; +} + +/** Set the user-defined data. This data is never touched by the scanner. + * @param user_defined The data to be associated with this scanner. + * @param yyscanner The scanner object. + */ +void yyset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyextra = user_defined ; +} + +/** Set the current line number. + * @param _line_number line number + * @param yyscanner The scanner object. + */ +void yyset_lineno (int _line_number , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* lineno is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_lineno called with no buffer" ); + + yylineno = _line_number; +} + +/** Set the current column. 
+ * @param _column_no column number + * @param yyscanner The scanner object. + */ +void yyset_column (int _column_no , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* column is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_column called with no buffer" ); + + yycolumn = _column_no; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param _in_str A readable stream. + * @param yyscanner The scanner object. + * @see yy_switch_to_buffer + */ +void yyset_in (FILE * _in_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyin = _in_str ; +} + +void yyset_out (FILE * _out_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyout = _out_str ; +} + +int yyget_debug (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yy_flex_debug; +} + +void yyset_debug (int _bdebug , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yy_flex_debug = _bdebug ; +} + +/* Accessor methods for yylval and yylloc */ + +/* User-visible API */ + +/* yylex_init is special because it creates the scanner itself, so it is + * the ONLY reentrant function that doesn't take the scanner as the last argument. + * That's why we explicitly handle the declaration, instead of using our macros. + */ +int yylex_init(yyscan_t* ptr_yy_globals) +{ + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), NULL ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + return yy_init_globals ( *ptr_yy_globals ); +} + +/* yylex_init_extra has the same functionality as yylex_init, but follows the + * convention of taking the scanner as the last argument. Note however, that + * this is a *pointer* to a scanner, as it will be allocated by this call (and + * is the reason, too, why this function also must handle its own declaration). + * The user defined value in the first argument will be available to yyalloc in + * the yyextra field. + */ +int yylex_init_extra( YY_EXTRA_TYPE yy_user_defined, yyscan_t* ptr_yy_globals ) +{ + struct yyguts_t dummy_yyguts; + + yyset_extra (yy_user_defined, &dummy_yyguts); + + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), &dummy_yyguts ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in + yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + yyset_extra (yy_user_defined, *ptr_yy_globals); + + return yy_init_globals ( *ptr_yy_globals ); +} + +static int yy_init_globals (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from yylex_destroy(), so don't allocate here. 
+ */ + + yyg->yy_buffer_stack = NULL; + yyg->yy_buffer_stack_top = 0; + yyg->yy_buffer_stack_max = 0; + yyg->yy_c_buf_p = NULL; + yyg->yy_init = 0; + yyg->yy_start = 0; + + yyg->yy_start_stack_ptr = 0; + yyg->yy_start_stack_depth = 0; + yyg->yy_start_stack = NULL; + +/* Defined in main.c */ +#ifdef YY_STDINIT + yyin = stdin; + yyout = stdout; +#else + yyin = NULL; + yyout = NULL; +#endif + + /* For future reference: Set errno on error, since we are called by + * yylex_init() + */ + return 0; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +int yylex_destroy (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer( YY_CURRENT_BUFFER , yyscanner ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(yyscanner); + } + + /* Destroy the stack itself. */ + yyfree(yyg->yy_buffer_stack , yyscanner); + yyg->yy_buffer_stack = NULL; + + /* Destroy the start condition stack. */ + yyfree( yyg->yy_start_stack , yyscanner ); + yyg->yy_start_stack = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * yylex() is called, initialization will occur. */ + yy_init_globals( yyscanner); + + /* Destroy the main struct (reentrant only). */ + yyfree ( yyscanner , yyscanner ); + yyscanner = NULL; + return 0; +} + +/* + * Internal utility routines. + */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, const char * s2, int n , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + + int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (const char * s , yyscan_t yyscanner) +{ + int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *yyalloc (yy_size_t size , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + return malloc(size); +} + +void *yyrealloc (void * ptr, yy_size_t size , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return realloc(ptr, size); +} + +void yyfree (void * ptr , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#line 69 "fts0tlex.l" + + diff --git a/storage/innobase/fts/fts0tlex.l b/storage/innobase/fts/fts0tlex.l new file mode 100644 index 00000000..e19e907f --- /dev/null +++ b/storage/innobase/fts/fts0tlex.l @@ -0,0 +1,69 @@ +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/** + * @file fts/fts0tlex.l + * FTS parser lexical analyzer + * + * Created 2007/5/9 Sunny Bains + */ + +%{ + +#include "fts0ast.h" +#include "fts0pars.h" + +/* Required for reentrant parser */ +#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner) +#define exit(A) ut_error + +%} + +%option noinput +%option nounput +%option noyywrap +%option nostdinit +%option reentrant +%option never-interactive + + +%% + +[\t ]+ /* Ignore whitespace */ ; + +[*] { + val->oper = fts0tget_text(yyscanner)[0]; + + return(val->oper); +} + +\"[^\"\n]*\" { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TEXT); +} + +[^" \n\%]* { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TERM); +} +. ; +\n + +%% diff --git a/storage/innobase/fts/make_parser.sh b/storage/innobase/fts/make_parser.sh new file mode 100755 index 00000000..6b82c5ba --- /dev/null +++ b/storage/innobase/fts/make_parser.sh @@ -0,0 +1,49 @@ +#!/bin/sh +# +# Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + + +TMPF=t.$$ + +make -f Makefile.query + +echo '#include "univ.i"' > $TMPF + +# This is to avoid compiler warning about unused parameters. +# FIXME: gcc extension "MY_ATTRIBUTE" causing compilation errors on windows +# platform. Quote them out for now. 
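+# As an illustration, the first expression below rewrites the generated
+# declaration "static void yynoreturn yy_fatal_error (const char* msg ,
+# yyscan_t yyscanner)" so that its parameter list ends in
+# "yyscan_t yyscanner MY_ATTRIBUTE((unused)))", which silences
+# unused-parameter warnings without changing any function body.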
+sed -e ' +s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +' < fts0blex.cc >> $TMPF + +mv $TMPF fts0blex.cc + +echo '#include "univ.i"' > $TMPF + +sed -e ' +s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +' < fts0tlex.cc >> $TMPF + +mv $TMPF fts0tlex.cc diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc new file mode 100644 index 00000000..a52027f2 --- /dev/null +++ b/storage/innobase/fut/fut0lst.cc @@ -0,0 +1,416 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fut/fut0lst.cc +File-based list utilities + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#include "fut0lst.h" +#include "buf0buf.h" +#include "page0page.h" + + +/** Write a file address. 
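+A file address is stored as a 4-byte page number followed by a 2-byte byte
+offset within the page. To keep the redo log small, only the half that
+changes is logged: an unchanged page number results in a 2-byte write of
+the offset, an unchanged offset in a 4-byte write of the page number, and
+a fully changed address in a single 6-byte memcpy().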
+@param[in] block file page +@param[in,out] faddr file address location +@param[in] page page number +@param[in] boffset byte offset +@param[in,out] mtr mini-transaction */ +static void flst_write_addr(const buf_block_t& block, byte *faddr, + uint32_t page, uint16_t boffset, mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_page_flagged(faddr, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_a(page == FIL_NULL || boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); + + static_assert(FIL_ADDR_PAGE == 0, "compatibility"); + static_assert(FIL_ADDR_BYTE == 4, "compatibility"); + static_assert(FIL_ADDR_SIZE == 6, "compatibility"); + + const bool same_page= mach_read_from_4(faddr + FIL_ADDR_PAGE) == page; + const bool same_offset= mach_read_from_2(faddr + FIL_ADDR_BYTE) == boffset; + if (same_page) + { + if (!same_offset) + mtr->write<2>(block, faddr + FIL_ADDR_BYTE, boffset); + return; + } + if (same_offset) + mtr->write<4>(block, faddr + FIL_ADDR_PAGE, page); + else + { + alignas(4) byte fil_addr[6]; + mach_write_to_4(fil_addr + FIL_ADDR_PAGE, page); + mach_write_to_2(fil_addr + FIL_ADDR_BYTE, boffset); + mtr->memcpy(block, faddr + FIL_ADDR_PAGE, fil_addr, 6); + } +} + +/** Write 2 null file addresses. +@param[in] b file page +@param[in,out] addr file address to be zeroed out +@param[in,out] mtr mini-transaction */ +static void flst_zero_both(const buf_block_t& b, byte *addr, mtr_t *mtr) +{ + if (mach_read_from_4(addr + FIL_ADDR_PAGE) != FIL_NULL) + mtr->memset(&b, ulint(addr - b.page.frame) + FIL_ADDR_PAGE, 4, 0xff); + mtr->write<2,mtr_t::MAYBE_NOP>(b, addr + FIL_ADDR_BYTE, 0U); + /* Initialize the other address by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source) + which is 4 bytes, or less than FIL_ADDR_SIZE. */ + memcpy(addr + FIL_ADDR_SIZE, addr, FIL_ADDR_SIZE); + const uint16_t boffset= page_offset(addr); + mtr->memmove(b, boffset + FIL_ADDR_SIZE, boffset, FIL_ADDR_SIZE); +} + +/** Add a node to an empty list. */ +static void flst_add_to_empty(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) +{ + ut_ad(base != add || boffset != aoffset); + ut_ad(boffset < base->physical_size()); + ut_ad(aoffset < add->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + ut_ad(!mach_read_from_4(base->page.frame + boffset + FLST_LEN)); + mtr->write<1>(*base, base->page.frame + boffset + (FLST_LEN + 3), 1U); + /* Update first and last fields of base node */ + flst_write_addr(*base, base->page.frame + boffset + FLST_FIRST, + add->page.id().page_no(), aoffset, mtr); + memcpy(base->page.frame + boffset + FLST_LAST, + base->page.frame + boffset + FLST_FIRST, + FIL_ADDR_SIZE); + /* Initialize FLST_LAST by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source) + which is 4 bytes, or less than FIL_ADDR_SIZE. */ + mtr->memmove(*base, boffset + FLST_LAST, boffset + FLST_FIRST, + FIL_ADDR_SIZE); + + /* Set prev and next fields of node to add */ + static_assert(FLST_NEXT == FLST_PREV + FIL_ADDR_SIZE, "compatibility"); + flst_zero_both(*add, add->page.frame + aoffset + FLST_PREV, mtr); +} + +/** Insert a node after another one. 
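+The list is doubly linked through such 6-byte addresses. Inserting after
+cur rewires four links: FLST_PREV and FLST_NEXT of the added node,
+FLST_NEXT of cur, and either FLST_LAST in the base node (when cur was the
+tail) or FLST_PREV of the old successor, whose page may first have to be
+fetched with buf_page_get_gen().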
+@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] cur insert position block +@param[in] coffset byte offset of the insert position +@param[in,out] add block to be added +@param[in] aoffset byte offset of the block to be added +@param[in,out] mtr mini-transaction */ +static dberr_t flst_insert_after(buf_block_t *base, uint16_t boffset, + buf_block_t *cur, uint16_t coffset, + buf_block_t *add, uint16_t aoffset, + mtr_t *mtr) +{ + ut_ad(base != cur || boffset != coffset); + ut_ad(base != add || boffset != aoffset); + ut_ad(cur != add || coffset != aoffset); + ut_ad(boffset < base->physical_size()); + ut_ad(coffset < cur->physical_size()); + ut_ad(aoffset < add->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + fil_addr_t next_addr= flst_get_next_addr(cur->page.frame + coffset); + + flst_write_addr(*add, add->page.frame + aoffset + FLST_PREV, + cur->page.id().page_no(), coffset, mtr); + flst_write_addr(*add, add->page.frame + aoffset + FLST_NEXT, + next_addr.page, next_addr.boffset, mtr); + + dberr_t err= DB_SUCCESS; + + if (next_addr.page == FIL_NULL) + flst_write_addr(*base, base->page.frame + boffset + FLST_LAST, + add->page.id().page_no(), aoffset, mtr); + else if (buf_block_t *block= + buf_page_get_gen(page_id_t{add->page.id().space(), next_addr.page}, + add->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err)) + flst_write_addr(*block, block->page.frame + + next_addr.boffset + FLST_PREV, + add->page.id().page_no(), aoffset, mtr); + + flst_write_addr(*cur, cur->page.frame + coffset + FLST_NEXT, + add->page.id().page_no(), aoffset, mtr); + + byte *len= &base->page.frame[boffset + FLST_LEN]; + mtr->write<4>(*base, len, mach_read_from_4(len) + 1); + return err; +} + +/** Insert a node before another one. 
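+This is the mirror image of flst_insert_after(): the added node takes cur
+as its successor, and either FLST_FIRST in the base node (when cur was the
+head) or FLST_NEXT of the old predecessor is redirected to the added node.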
+@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] cur insert position block +@param[in] coffset byte offset of the insert position +@param[in,out] add block to be added +@param[in] aoffset byte offset of the block to be added +@param[in,out] mtr mini-transaction +@return error code */ +static dberr_t flst_insert_before(buf_block_t *base, uint16_t boffset, + buf_block_t *cur, uint16_t coffset, + buf_block_t *add, uint16_t aoffset, + mtr_t *mtr) +{ + ut_ad(base != cur || boffset != coffset); + ut_ad(base != add || boffset != aoffset); + ut_ad(cur != add || coffset != aoffset); + ut_ad(boffset < base->physical_size()); + ut_ad(coffset < cur->physical_size()); + ut_ad(aoffset < add->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + fil_addr_t prev_addr= flst_get_prev_addr(cur->page.frame + coffset); + + flst_write_addr(*add, add->page.frame + aoffset + FLST_PREV, + prev_addr.page, prev_addr.boffset, mtr); + flst_write_addr(*add, add->page.frame + aoffset + FLST_NEXT, + cur->page.id().page_no(), coffset, mtr); + + dberr_t err= DB_SUCCESS; + + if (prev_addr.page == FIL_NULL) + flst_write_addr(*base, base->page.frame + boffset + FLST_FIRST, + add->page.id().page_no(), aoffset, mtr); + else if (buf_block_t *block= + buf_page_get_gen(page_id_t{add->page.id().space(), prev_addr.page}, + add->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err)) + flst_write_addr(*block, block->page.frame + + prev_addr.boffset + FLST_NEXT, + add->page.id().page_no(), aoffset, mtr); + + flst_write_addr(*cur, cur->page.frame + coffset + FLST_PREV, + add->page.id().page_no(), aoffset, mtr); + + byte *len= &base->page.frame[boffset + FLST_LEN]; + mtr->write<4>(*base, len, mach_read_from_4(len) + 1); + return err; +} + +/** Initialize a list base node. +@param[in] block file page +@param[in,out] base base node +@param[in,out] mtr mini-transaction */ +void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_page_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + mtr->write<4,mtr_t::MAYBE_NOP>(block, base + FLST_LEN, 0U); + static_assert(FLST_LAST == FLST_FIRST + FIL_ADDR_SIZE, "compatibility"); + flst_zero_both(block, base + FLST_FIRST, mtr); +} + +/** Append a file list node to a list. 
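+A minimal usage sketch, with hypothetical blocks and offsets (both pages
+must already be X- or SX-latched in mtr):
+
+	if (dberr_t err= flst_add_last(base_block, base_boffset,
+				       node_block, node_boffset, &mtr))
+		return err;
+
+On an empty list this reduces to flst_add_to_empty(); otherwise the node
+is inserted after the current tail, whose page may have to be read first.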
+@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] add block to be added +@param[in] aoffset byte offset of the node to be added +@param[in,out] mtr mini-transaction */ +dberr_t flst_add_last(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) +{ + ut_ad(base != add || boffset != aoffset); + ut_ad(boffset < base->physical_size()); + ut_ad(aoffset < add->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + if (!flst_get_len(base->page.frame + boffset)) + { + flst_add_to_empty(base, boffset, add, aoffset, mtr); + return DB_SUCCESS; + } + else + { + fil_addr_t addr= flst_get_last(base->page.frame + boffset); + buf_block_t *cur= add; + dberr_t err; + if (addr.page != add->page.id().page_no() && + !(cur= buf_page_get_gen(page_id_t{add->page.id().space(), addr.page}, + add->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err))) + return err; + return flst_insert_after(base, boffset, cur, addr.boffset, + add, aoffset, mtr); + } +} + +/** Prepend a file list node to a list. +@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] add block to be added +@param[in] aoffset byte offset of the node to be added +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t flst_add_first(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) +{ + ut_ad(base != add || boffset != aoffset); + ut_ad(boffset < base->physical_size()); + ut_ad(aoffset < add->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + if (!flst_get_len(base->page.frame + boffset)) + { + flst_add_to_empty(base, boffset, add, aoffset, mtr); + return DB_SUCCESS; + } + else + { + fil_addr_t addr= flst_get_first(base->page.frame + boffset); + buf_block_t *cur= add; + dberr_t err; + if (addr.page != add->page.id().page_no() && + !(cur= buf_page_get_gen(page_id_t{add->page.id().space(), addr.page}, + add->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err))) + return err; + return flst_insert_before(base, boffset, cur, addr.boffset, + add, aoffset, mtr); + } +} + +/** Remove a file list node. 
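+Unlinking updates the predecessor's FLST_NEXT (or FLST_FIRST in the base
+node), the successor's FLST_PREV (or FLST_LAST in the base node), and
+decrements the list length. A length of zero before the removal indicates
+a corrupted list and is reported as DB_CORRUPTION.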
+@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] cur block to be removed +@param[in] coffset byte offset of the current record to be removed +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t flst_remove(buf_block_t *base, uint16_t boffset, + buf_block_t *cur, uint16_t coffset, mtr_t *mtr) +{ + ut_ad(boffset < base->physical_size()); + ut_ad(coffset < cur->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + const fil_addr_t prev_addr= flst_get_prev_addr(cur->page.frame + coffset); + const fil_addr_t next_addr= flst_get_next_addr(cur->page.frame + coffset); + dberr_t err= DB_SUCCESS; + + if (prev_addr.page == FIL_NULL) + flst_write_addr(*base, base->page.frame + boffset + FLST_FIRST, + next_addr.page, next_addr.boffset, mtr); + else + { + buf_block_t *b= cur; + if (prev_addr.page == b->page.id().page_no() || + (b= buf_page_get_gen(page_id_t(b->page.id().space(), prev_addr.page), + b->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err))) + flst_write_addr(*b, b->page.frame + prev_addr.boffset + FLST_NEXT, + next_addr.page, next_addr.boffset, mtr); + } + + if (next_addr.page == FIL_NULL) + flst_write_addr(*base, base->page.frame + boffset + FLST_LAST, + prev_addr.page, prev_addr.boffset, mtr); + else + { + dberr_t err2; + if (next_addr.page == cur->page.id().page_no() || + (cur= buf_page_get_gen(page_id_t(cur->page.id().space(), + next_addr.page), + cur->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err2))) + flst_write_addr(*cur, cur->page.frame + next_addr.boffset + FLST_PREV, + prev_addr.page, prev_addr.boffset, mtr); + else if (err == DB_SUCCESS) + err= err2; + } + + byte *len= &base->page.frame[boffset + FLST_LEN]; + if (UNIV_UNLIKELY(!mach_read_from_4(len))) + return DB_CORRUPTION; + mtr->write<4>(*base, len, mach_read_from_4(len) - 1); + return err; +} + +#ifdef UNIV_DEBUG +/** Validate a file-based list. */ +void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr) +{ + ut_ad(boffset < base->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + /* We use two mini-transaction handles: the first is used to lock + the base node, and prevent other threads from modifying the list. + The second is used to traverse the list. We cannot run the second + mtr without committing it at times, because if the list is long, + the x-locked pages could fill the buffer, resulting in a deadlock. 
*/ + mtr_t mtr2; + + const uint32_t len= flst_get_len(base->page.frame + boffset); + fil_addr_t addr= flst_get_first(base->page.frame + boffset); + + for (uint32_t i= len; i--; ) + { + mtr2.start(); + const buf_block_t *b= + buf_page_get_gen(page_id_t(base->page.id().space(), addr.page), + base->zip_size(), RW_SX_LATCH, nullptr, BUF_GET, &mtr2); + ut_ad(b); + addr= flst_get_next_addr(b->page.frame + addr.boffset); + mtr2.commit(); + } + + ut_ad(addr.page == FIL_NULL); + + addr= flst_get_last(base->page.frame + boffset); + + for (uint32_t i= len; i--; ) + { + mtr2.start(); + const buf_block_t *b= + buf_page_get_gen(page_id_t(base->page.id().space(), addr.page), + base->zip_size(), RW_SX_LATCH, nullptr, BUF_GET, &mtr2); + ut_ad(b); + addr= flst_get_prev_addr(b->page.frame + addr.boffset); + mtr2.commit(); + } + + ut_ad(addr.page == FIL_NULL); +} +#endif diff --git a/storage/innobase/gis/gis0geo.cc b/storage/innobase/gis/gis0geo.cc new file mode 100644 index 00000000..4c3ff188 --- /dev/null +++ b/storage/innobase/gis/gis0geo.cc @@ -0,0 +1,650 @@ +/***************************************************************************** + +Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file gis/gis0geo.cc +InnoDB R-tree related functions. + +Created 2013/03/27 Allen Lai and Jimmy Yang +*******************************************************/ + +#include "page0types.h" +#include "gis0geo.h" +#include "page0cur.h" +#include "ut0rnd.h" +#include "mach0data.h" + +#include <spatial.h> +#include <cmath> + +/* These definitions are for comparing 2 mbrs. */ + +/* Check if a intersects b. +Return false if a intersects b, otherwise true. */ +#define INTERSECT_CMP(amin, amax, bmin, bmax) \ +(((amin) > (bmax)) || ((bmin) > (amax))) + +/* Check if b contains a. +Return false if b contains a, otherwise true. */ +#define CONTAIN_CMP(amin, amax, bmin, bmax) \ +(((bmin) > (amin)) || ((bmax) < (amax))) + +/* Check if b is within a. +Return false if b is within a, otherwise true. */ +#define WITHIN_CMP(amin, amax, bmin, bmax) \ +(((amin) > (bmin)) || ((amax) < (bmax))) + +/* Check if a disjoints b. +Return false if a disjoints b, otherwise true. */ +#define DISJOINT_CMP(amin, amax, bmin, bmax) \ +(((amin) <= (bmax)) && ((bmin) <= (amax))) + +/* Check if a equals b. +Return false if equal, otherwise true. */ +#define EQUAL_CMP(amin, amax, bmin, bmax) \ +(((amin) != (bmin)) || ((amax) != (bmax))) + +/**************************************************************** +Functions for generating mbr +****************************************************************/ +/*************************************************************//** +Add one point stored in wkb to a given mbr. 
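+The mbr array holds a [min, max] pair per dimension. For example, with
+n_dims == 2, adding the point (2.0, 7.0) to the mbr [0.0, 1.0, 5.0, 6.0]
+widens it to [0.0, 2.0, 5.0, 7.0].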
+@return 0 if the point in wkb is valid, otherwise -1. */ +static +int +rtree_add_point_to_mbr( +/*===================*/ + const uchar** wkb, /*!< in: pointer to wkb, + where point is stored */ + const uchar* end, /*!< in: end of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr) /*!< in/out: mbr, which + must be of length n_dims * 2. */ +{ + double ord; + double* mbr_end = mbr + n_dims * 2; + + while (mbr < mbr_end) { + if ((*wkb) + sizeof(double) > end) { + return(-1); + } + + ord = mach_double_read(*wkb); + (*wkb) += sizeof(double); + + if (ord < *mbr) { + *mbr = ord; + } + mbr++; + + if (ord > *mbr) { + *mbr = ord; + } + mbr++; + } + + return(0); +} + +/*************************************************************//** +Get mbr of point stored in wkb. +@return 0 if ok, otherwise -1. */ +static +int +rtree_get_point_mbr( +/*================*/ + const uchar** wkb, /*!< in: pointer to wkb, + where point is stored. */ + const uchar* end, /*!< in: end of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr) /*!< in/out: mbr, + must be of length n_dims * 2. */ +{ + return rtree_add_point_to_mbr(wkb, end, n_dims, mbr); +} + + +/*************************************************************//** +Get mbr of linestring stored in wkb. +@return 0 if the linestring is valid, otherwise -1. */ +static +int +rtree_get_linestring_mbr( +/*=====================*/ + const uchar** wkb, /*!< in: pointer to wkb, + where point is stored. */ + const uchar* end, /*!< in: end of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr) /*!< in/out: mbr, + must be of length n_dims * 2. */ +{ + uint n_points; + + n_points = uint4korr(*wkb); + (*wkb) += 4; + + for (; n_points > 0; --n_points) { + /* Add next point to mbr */ + if (rtree_add_point_to_mbr(wkb, end, n_dims, mbr)) { + return(-1); + } + } + + return(0); +} + +/*************************************************************//** +Get mbr of polygon stored in wkb. +@return 0 if the polygon is valid, otherwise -1. */ +static +int +rtree_get_polygon_mbr( +/*==================*/ + const uchar** wkb, /*!< in: pointer to wkb, + where point is stored. */ + const uchar* end, /*!< in: end of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr) /*!< in/out: mbr, + must be of length n_dims * 2. */ +{ + uint n_linear_rings; + uint n_points; + + n_linear_rings = uint4korr((*wkb)); + (*wkb) += 4; + + for (; n_linear_rings > 0; --n_linear_rings) { + n_points = uint4korr((*wkb)); + (*wkb) += 4; + + for (; n_points > 0; --n_points) { + /* Add next point to mbr */ + if (rtree_add_point_to_mbr(wkb, end, n_dims, mbr)) { + return(-1); + } + } + } + + return(0); +} + +/*************************************************************//** +Get mbr of geometry stored in wkb. +@return 0 if the geometry is valid, otherwise -1. */ +static +int +rtree_get_geometry_mbr( +/*===================*/ + const uchar** wkb, /*!< in: pointer to wkb, + where point is stored. */ + const uchar* end, /*!< in: end of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr, /*!< in/out: mbr. */ + int top) /*!< in: if it is the top, + which means it's not called + by itself. 
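+ A nested geometry collection is therefore rejected: the wkbGeometryCollection case returns -1 when top == 0.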
*/ +{ + int res; + uint wkb_type = 0; + uint n_items; + + /* byte_order = *(*wkb); */ + ++(*wkb); + + wkb_type = uint4korr((*wkb)); + (*wkb) += 4; + + switch ((enum wkbType) wkb_type) { + case wkbPoint: + res = rtree_get_point_mbr(wkb, end, n_dims, mbr); + break; + case wkbLineString: + res = rtree_get_linestring_mbr(wkb, end, n_dims, mbr); + break; + case wkbPolygon: + res = rtree_get_polygon_mbr(wkb, end, n_dims, mbr); + break; + case wkbMultiPoint: + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) { + /* byte_order = *(*wkb); */ + ++(*wkb); + (*wkb) += 4; + if (rtree_get_point_mbr(wkb, end, n_dims, mbr)) { + return(-1); + } + } + res = 0; + break; + case wkbMultiLineString: + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) { + /* byte_order = *(*wkb); */ + ++(*wkb); + (*wkb) += 4; + if (rtree_get_linestring_mbr(wkb, end, n_dims, mbr)) { + return(-1); + } + } + res = 0; + break; + case wkbMultiPolygon: + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) { + /* byte_order = *(*wkb); */ + ++(*wkb); + (*wkb) += 4; + if (rtree_get_polygon_mbr(wkb, end, n_dims, mbr)) { + return(-1); + } + } + res = 0; + break; + case wkbGeometryCollection: + if (!top) { + return(-1); + } + + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) { + if (rtree_get_geometry_mbr(wkb, end, n_dims, + mbr, 0)) { + return(-1); + } + } + res = 0; + break; + default: + res = -1; + } + + return(res); +} + +/*************************************************************//** +Calculate Minimal Bounding Rectangle (MBR) of the spatial object +stored in "well-known binary representation" (wkb) format. +@return 0 if ok. */ +int +rtree_mbr_from_wkb( +/*===============*/ + const uchar* wkb, /*!< in: wkb */ + uint size, /*!< in: size of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr) /*!< in/out: mbr, which must + be of length n_dim2 * 2. */ +{ + for (uint i = 0; i < n_dims; ++i) { + mbr[i * 2] = DBL_MAX; + mbr[i * 2 + 1] = -DBL_MAX; + } + + return rtree_get_geometry_mbr(&wkb, wkb + size, n_dims, mbr, 1); +} + + +/**************************************************************** +Functions for Rtree split +****************************************************************/ +/*************************************************************//** +Join 2 mbrs of dimensions n_dim. */ +static +void +mbr_join( +/*=====*/ + double* a, /*!< in/out: the first mbr, + where the joined result will be. */ + const double* b, /*!< in: the second mbr. */ + int n_dim) /*!< in: dimensions. */ +{ + double* end = a + n_dim * 2; + + do { + if (a[0] > b[0]) { + a[0] = b[0]; + } + + if (a[1] < b[1]) { + a[1] = b[1]; + } + + a += 2; + b += 2; + + } while (a != end); +} + +/*************************************************************//** +Counts the square of mbr which is the join of a and b. Both a and b +are of dimensions n_dim. */ +static +double +mbr_join_square( +/*============*/ + const double* a, /*!< in: the first mbr. */ + const double* b, /*!< in: the second mbr. */ + int n_dim) /*!< in: dimensions. 
*/ +{ + const double* end = a + n_dim * 2; + double square = 1.0; + + do { + square *= std::max(a[1], b[1]) - std::min(a[0], b[0]); + + a += 2; + b += 2; + } while (a != end); + + /* Check if finite (not infinity or NaN), + so we don't get NaN in calculations */ + if (!std::isfinite(square)) { + return DBL_MAX; + } + + return square; +} + +/*************************************************************//** +Counts the square of mbr of dimension n_dim. */ +static +double +count_square( +/*=========*/ + const double* a, /*!< in: the mbr. */ + int n_dim) /*!< in: dimensions. */ +{ + const double* end = a + n_dim * 2; + double square = 1.0; + + do { + square *= a[1] - a[0]; + a += 2; + } while (a != end); + + return square; +} + +/*************************************************************//** +Copy mbr of dimension n_dim from src to dst. */ +inline +static +void +copy_coords( +/*========*/ + double* dst, /*!< in/out: destination. */ + const double* src, /*!< in: source. */ + int) +{ + memcpy(dst, src, DATA_MBR_LEN); +} + +/*************************************************************//** +Select two nodes to collect group upon */ +static +void +pick_seeds( +/*=======*/ + rtr_split_node_t* node, /*!< in: split nodes. */ + int n_entries, /*!< in: entries number. */ + rtr_split_node_t** seed_a, /*!< out: seed 1. */ + rtr_split_node_t** seed_b, /*!< out: seed 2. */ + int n_dim) /*!< in: dimensions. */ +{ + rtr_split_node_t* cur1; + rtr_split_node_t* lim1 = node + (n_entries - 1); + rtr_split_node_t* cur2; + rtr_split_node_t* lim2 = node + n_entries; + + double max_d = -DBL_MAX; + double d; + + *seed_a = node; + *seed_b = node + 1; + + for (cur1 = node; cur1 < lim1; ++cur1) { + for (cur2 = cur1 + 1; cur2 < lim2; ++cur2) { + d = mbr_join_square(cur1->coords, cur2->coords, n_dim) - + cur1->square - cur2->square; + if (d > max_d) { + max_d = d; + *seed_a = cur1; + *seed_b = cur2; + } + } + } +} + +/*************************************************************//** +Select next node and group where to add. */ +static +void +pick_next( +/*======*/ + rtr_split_node_t* node, /*!< in: split nodes. */ + int n_entries, /*!< in: entries number. */ + double* g1, /*!< in: mbr of group 1. */ + double* g2, /*!< in: mbr of group 2. */ + rtr_split_node_t** choice, /*!< out: the next node.*/ + int* n_group, /*!< out: group number.*/ + int n_dim) /*!< in: dimensions. */ +{ + rtr_split_node_t* cur = node; + rtr_split_node_t* end = node + n_entries; + double max_diff = -DBL_MAX; + + for (; cur < end; ++cur) { + double diff; + double abs_diff; + + if (cur->n_node != 0) { + continue; + } + + diff = mbr_join_square(g1, cur->coords, n_dim) - + mbr_join_square(g2, cur->coords, n_dim); + + abs_diff = fabs(diff); + if (abs_diff > max_diff) { + max_diff = abs_diff; + + /* Introduce some randomness if the record + is identical */ + if (diff == 0) { + diff = static_cast<double>(ut_rnd_gen() & 1); + } + + *n_group = 1 + (diff > 0); + *choice = cur; + } + } +} + +/*************************************************************//** +Mark not-in-group entries as n_group. */ +static +void +mark_all_entries( +/*=============*/ + rtr_split_node_t* node, /*!< in/out: split nodes. */ + int n_entries, /*!< in: entries number. */ + int n_group) /*!< in: group number. */ +{ + rtr_split_node_t* cur = node; + rtr_split_node_t* end = node + n_entries; + for (; cur < end; ++cur) { + if (cur->n_node != 0) { + continue; + } + cur->n_node = n_group; + } +} + +/*************************************************************//** +Split rtree node. 
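+This follows the quadratic split scheme: pick_seeds() picks as the initial
+groups the two entries whose joint bounding box wastes the most area, and
+pick_next() repeatedly assigns the remaining entry with the strongest
+preference for one group, subject to the minimal group size.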
+Return which group the first rec is in. */ +int +split_rtree_node( +/*=============*/ + rtr_split_node_t* node, /*!< in: split nodes. */ + int n_entries, /*!< in: entries number. */ + int all_size, /*!< in: total key's size. */ + int key_size, /*!< in: key's size. */ + int min_size, /*!< in: minimal group size. */ + int size1, /*!< in: size of group. */ + int size2, /*!< in: initial group sizes */ + double** d_buffer, /*!< in/out: buffer. */ + int n_dim, /*!< in: dimensions. */ + uchar* first_rec) /*!< in: the first rec. */ +{ + rtr_split_node_t* cur; + rtr_split_node_t* a = NULL; + rtr_split_node_t* b = NULL; + double* g1 = reserve_coords(d_buffer, n_dim); + double* g2 = reserve_coords(d_buffer, n_dim); + rtr_split_node_t* next = NULL; + int next_node = 0; + int i; + int first_rec_group = 1; + rtr_split_node_t* end = node + n_entries; + + if (all_size < min_size * 2) { + return 1; + } + + cur = node; + for (; cur < end; ++cur) { + cur->square = count_square(cur->coords, n_dim); + cur->n_node = 0; + } + + pick_seeds(node, n_entries, &a, &b, n_dim); + a->n_node = 1; + b->n_node = 2; + + copy_coords(g1, a->coords, n_dim); + size1 += key_size; + copy_coords(g2, b->coords, n_dim); + size2 += key_size; + + for (i = n_entries - 2; i > 0; --i) { + /* Can't write into group 2 */ + if (all_size - (size2 + key_size) < min_size) { + mark_all_entries(node, n_entries, 1); + break; + } + + /* Can't write into group 1 */ + if (all_size - (size1 + key_size) < min_size) { + mark_all_entries(node, n_entries, 2); + break; + } + + pick_next(node, n_entries, g1, g2, &next, &next_node, n_dim); + if (next_node == 1) { + size1 += key_size; + mbr_join(g1, next->coords, n_dim); + } else { + size2 += key_size; + mbr_join(g2, next->coords, n_dim); + } + + next->n_node = next_node; + + /* Find out where the first rec (of the page) will be at, + and inform the caller */ + if (first_rec && first_rec == next->key) { + first_rec_group = next_node; + } + } + + return(first_rec_group); +} + +/** Compare two minimum bounding rectangles. 
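+Each operator is evaluated per dimension on the serialized (min, max)
+double pairs. An illustrative example (values only; assuming the MBRs
+were serialized with mach_double_write()):
+
+  a = [0,2]x[0,2], b = [1,3]x[1,3]
+  rtree_key_cmp(PAGE_CUR_INTERSECT, b, a) == 0  (the rectangles overlap)
+  rtree_key_cmp(PAGE_CUR_WITHIN, b, a) == 1     (a does not lie within b)
+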
+@param mode comparison operator
+	MBR_INTERSECT(a,b)	a overlaps b
+	MBR_CONTAIN(a,b)	a contains b
+	MBR_DISJOINT(a,b)	a disjoint b
+	MBR_WITHIN(a,b)		a within b
+	MBR_EQUAL(a,b)		All coordinates of MBRs are equal
+	MBR_DATA(a,b)		Data reference is the same
+@param b first MBR
+@param a second MBR
+@retval 0 if the predicate holds
+@retval 1 if the predicate does not hold */
+int rtree_key_cmp(page_cur_mode_t mode, const void *b, const void *a)
+{
+  const byte *b_= static_cast<const byte*>(b);
+  const byte *a_= static_cast<const byte*>(a);
+
+  static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double), "compatibility");
+
+  for (auto i = SPDIMS; i--; )
+  {
+    double amin= mach_double_read(a_);
+    double bmin= mach_double_read(b_);
+    a_+= sizeof(double);
+    b_+= sizeof(double);
+    double amax= mach_double_read(a_);
+    double bmax= mach_double_read(b_);
+    a_+= sizeof(double);
+    b_+= sizeof(double);
+
+    switch (mode) {
+    case PAGE_CUR_INTERSECT:
+      if (INTERSECT_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_CONTAIN:
+      if (CONTAIN_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_WITHIN:
+      if (WITHIN_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_MBR_EQUAL:
+      if (EQUAL_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_DISJOINT:
+      if (!DISJOINT_CMP(amin, amax, bmin, bmax))
+        return 0;
+      if (!i)
+        return 1;
+      continue;
+    case PAGE_CUR_UNSUPP:
+    case PAGE_CUR_G:
+    case PAGE_CUR_GE:
+    case PAGE_CUR_L:
+    case PAGE_CUR_LE:
+    case PAGE_CUR_RTREE_LOCATE:
+    case PAGE_CUR_RTREE_GET_FATHER:
+    case PAGE_CUR_RTREE_INSERT:
+      break;
+    }
+    ut_ad("unknown comparison operator" == 0);
+  }
+
+  return 0;
+}
diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc
new file mode 100644
index 00000000..83afd732
--- /dev/null
+++ b/storage/innobase/gis/gis0rtree.cc
@@ -0,0 +1,1934 @@
+/*****************************************************************************
+
+Copyright (c) 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file gis/gis0rtree.cc
+InnoDB R-tree interfaces
+
+Created 2013/03/27 Allen Lai and Jimmy Yang
+***********************************************************************/
+
+#include "fsp0fsp.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "page0zip.h"
+#include "gis0rtree.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "srv0mon.h"
+#include "gis0geo.h"
+#include <cmath>
+
+/*************************************************************//**
+Initial split nodes info for R-tree split.
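+The work buffer allocated below holds (n_recs + 3) MBRs of
+SPDIMS * 2 doubles each: one per existing record, one for the entry to
+be inserted, and two scratch MBRs that split_rtree_node() reserves for
+its group MBRs g1 and g2. The rtr_split_node_t array itself follows the
+MBRs within the same allocation.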
+@return initialized split nodes array */
+static
+rtr_split_node_t*
+rtr_page_split_initialize_nodes(
+/*============================*/
+	mem_heap_t*	heap,	/*!< in: pointer to memory heap, or NULL */
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert; when the
+				function returns, the cursor is positioned
+				on the predecessor of the inserted record */
+	rec_offs**	offsets,/*!< in: offsets on inserted record */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	double**	buf_pos)/*!< in/out: current buffer position */
+{
+	rtr_split_node_t*	split_node_array;
+	double*			buf;
+	ulint			n_recs;
+	rtr_split_node_t*	task;
+	rtr_split_node_t*	stop;
+	rtr_split_node_t*	cur;
+	rec_t*			rec;
+	buf_block_t*		block;
+	page_t*			page;
+	ulint			n_uniq;
+	ulint			len;
+	const byte*		source_cur;
+
+	block = btr_cur_get_block(cursor);
+	page = buf_block_get_frame(block);
+	n_uniq = dict_index_get_n_unique_in_tree(cursor->index());
+
+	n_recs = ulint(page_get_n_recs(page)) + 1;
+
+	/* We reserve memory for two temporary MBRs used by the split
+	algorithm, plus the new MBR that is to be inserted, so we need
+	(n_recs + 3) * MBR size for storing all MBRs. */
+	buf = static_cast<double*>(mem_heap_alloc(
+			heap, DATA_MBR_LEN * (n_recs + 3)
+			+ sizeof(rtr_split_node_t) * (n_recs + 1)));
+
+	split_node_array = (rtr_split_node_t*)(buf + SPDIMS * 2 * (n_recs + 3));
+	task = split_node_array;
+	*buf_pos = buf;
+	stop = task + n_recs;
+
+	rec = page_rec_get_next(page_get_infimum_rec(page));
+	const ulint n_core = page_is_leaf(page)
+		? cursor->index()->n_core_fields : 0;
+	*offsets = rec_get_offsets(rec, cursor->index(), *offsets, n_core,
+				   n_uniq, &heap);
+
+	source_cur = rec_get_nth_field(rec, *offsets, 0, &len);
+
+	for (cur = task; cur < stop - 1; ++cur) {
+		cur->coords = reserve_coords(buf_pos, SPDIMS);
+		cur->key = rec;
+
+		memcpy(cur->coords, source_cur, DATA_MBR_LEN);
+
+		rec = page_rec_get_next(rec);
+		*offsets = rec_get_offsets(rec, cursor->index(), *offsets,
+					   n_core, n_uniq, &heap);
+		source_cur = rec_get_nth_field(rec, *offsets, 0, &len);
+	}
+
+	/* Put the key to insert into the node list */
+	source_cur = static_cast<const byte*>(dfield_get_data(
+		dtuple_get_nth_field(tuple, 0)));
+	cur->coords = reserve_coords(buf_pos, SPDIMS);
+	rec = (byte*) mem_heap_alloc(
+		heap, rec_get_converted_size(cursor->index(), tuple, 0));
+
+	rec = rec_convert_dtuple_to_rec(rec, cursor->index(), tuple, 0);
+	cur->key = rec;
+
+	memcpy(cur->coords, source_cur, DATA_MBR_LEN);
+
+	return split_node_array;
+}
+
+/**********************************************************************//**
+Builds an R-tree node pointer out of a physical record and a page number.
+Note: for an R-tree, only the MBR and the page number fields are kept on
+a non-leaf page, unlike a B-tree, whose node pointers still carry the
+primary key fields.
+@return	own: node pointer */
+dtuple_t*
+rtr_index_build_node_ptr(
+/*=====================*/
+	const dict_index_t*	index,	/*!< in: index */
+	const rtr_mbr_t*	mbr,	/*!< in: mbr of lower page */
+	const rec_t*		rec,	/*!< in: record for which to build node
+					pointer */
+	ulint			page_no,/*!< in: page number to put in node
+					pointer */
+	mem_heap_t*		heap)	/*!< in: memory heap where pointer
+					created */
+{
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	byte*		buf;
+	ulint		n_unique;
+	ulint		info_bits;
+
+	ut_ad(dict_index_is_spatial(index));
+
+	n_unique = DICT_INDEX_SPATIAL_NODEPTR_SIZE;
+
+	tuple = dtuple_create(heap, n_unique + 1);
+
+	/* For an R-tree internal node, we need to compare the page
+	number fields.
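+	Two node pointers may carry identical MBRs while pointing to
+	different child pages, so the child page number acts as the
+	tie-breaker that keeps the entries distinguishable.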
*/ + dtuple_set_n_fields_cmp(tuple, n_unique + 1); + + dict_index_copy_types(tuple, index, n_unique); + + /* Write page no field */ + buf = static_cast(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, page_no); + + field = dtuple_get_nth_field(tuple, n_unique); + dfield_set_data(field, buf, 4); + + dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4); + + /* Set info bits. */ + info_bits = rec_get_info_bits(rec, dict_table_is_comp(index->table)); + dtuple_set_info_bits(tuple, info_bits | REC_STATUS_NODE_PTR); + + /* Set mbr as index entry data */ + field = dtuple_get_nth_field(tuple, 0); + + buf = static_cast(mem_heap_alloc(heap, DATA_MBR_LEN)); + + rtr_write_mbr(buf, mbr); + + dfield_set_data(field, buf, DATA_MBR_LEN); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/**************************************************************//** +Update the mbr field of a spatial index row. */ +void +rtr_update_mbr_field( +/*=================*/ + btr_cur_t* cursor, /*!< in/out: cursor pointed to rec.*/ + rec_offs* offsets, /*!< in/out: offsets on rec. */ + btr_cur_t* cursor2, /*!< in/out: cursor pointed to rec + that should be deleted. + this cursor is for btr_compress to + delete the merged page's father rec.*/ + page_t* child_page, /*!< in: child page. */ + rtr_mbr_t* mbr, /*!< in: the new mbr. */ + rec_t* new_rec, /*!< in: rec to use */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index = cursor->index(); + mem_heap_t* heap; + page_t* page; + rec_t* rec; + constexpr ulint flags = BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG; + dberr_t err; + big_rec_t* dummy_big_rec; + buf_block_t* block; + rec_t* child_rec; + ulint up_match = 0; + ulint low_match = 0; + ulint child; + ulint rec_info; + bool ins_suc = true; + ulint cur2_pos = 0; + ulint del_page_no = 0; + rec_offs* offsets2; + + rec = btr_cur_get_rec(cursor); + page = page_align(rec); + + rec_info = rec_get_info_bits(rec, rec_offs_comp(offsets)); + + heap = mem_heap_create(100); + block = btr_cur_get_block(cursor); + ut_ad(page == buf_block_get_frame(block)); + + child = btr_node_ptr_get_child_page_no(rec, offsets); + const ulint n_core = page_is_leaf(block->page.frame) + ? index->n_core_fields : 0; + + if (new_rec) { + child_rec = new_rec; + } else { + child_rec = page_rec_get_next(page_get_infimum_rec(child_page)); + } + + dtuple_t* node_ptr = rtr_index_build_node_ptr( + index, mbr, child_rec, child, heap); + + /* We need to remember the child page no of cursor2, since page could be + reorganized or insert a new rec before it. */ + if (cursor2) { + ut_ad(cursor2->index() == index); + rec_t* del_rec = btr_cur_get_rec(cursor2); + offsets2 = rec_get_offsets(btr_cur_get_rec(cursor2), + index, NULL, 0, + ULINT_UNDEFINED, &heap); + del_page_no = btr_node_ptr_get_child_page_no(del_rec, offsets2); + cur2_pos = page_rec_get_n_recs_before(btr_cur_get_rec(cursor2)); + } + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_base(offsets)[0 + 1] == DATA_MBR_LEN); + ut_ad(node_ptr->fields[0].len == DATA_MBR_LEN); + + if (rec_info & REC_INFO_MIN_REC_FLAG) { + /* When the rec is minimal rec in this level, we do + in-place update for avoiding it move to other place. */ + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + if (UNIV_LIKELY_NULL(page_zip)) { + /* Check if there's enough space for in-place + update the zip page. 
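+			The MBR is fixed-length (DATA_MBR_LEN), so the
+			record itself does not change size; on a
+			ROW_FORMAT=COMPRESSED page the rewrite must
+			still fit into the space that
+			btr_cur_update_alloc_zip() reserves, and that
+			call may reorganize the page as a side effect.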
*/ + if (!btr_cur_update_alloc_zip( + page_zip, + btr_cur_get_page_cur(cursor), + offsets, + rec_offs_size(offsets), + false, mtr)) { + + /* If there's not enought space for + inplace update zip page, we do delete + insert. */ + ins_suc = false; + + /* Since btr_cur_update_alloc_zip could + reorganize the page, we need to repositon + cursor2. */ + if (cursor2) { + cursor2->page_cur.rec = + page_rec_get_nth(page, + cur2_pos); + } + + goto update_mbr; + } + + /* Record could be repositioned */ + rec = btr_cur_get_rec(cursor); + +#ifdef UNIV_DEBUG + /* Make sure it is still the first record */ + rec_info = rec_get_info_bits( + rec, rec_offs_comp(offsets)); + ut_ad(rec_info & REC_INFO_MIN_REC_FLAG); +#endif /* UNIV_DEBUG */ + memcpy(rec, node_ptr->fields[0].data, DATA_MBR_LEN); + page_zip_write_rec(block, rec, index, offsets, 0, mtr); + } else { + mtr->memcpy(*block, rec, + node_ptr->fields[0].data, + DATA_MBR_LEN); + } + + if (cursor2) { + rec_offs* offsets2; + + if (UNIV_LIKELY_NULL(page_zip)) { + cursor2->page_cur.rec + = page_rec_get_nth(page, cur2_pos); + } + offsets2 = rec_get_offsets(btr_cur_get_rec(cursor2), + index, NULL, 0, + ULINT_UNDEFINED, &heap); + ut_ad(del_page_no == btr_node_ptr_get_child_page_no( + cursor2->page_cur.rec, + offsets2)); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor2), + offsets2, mtr); + } + } else if (page_get_n_recs(page) == 1) { + /* When there's only one rec in the page, we do insert/delete to + avoid page merge. */ + + page_cur_t page_cur; + rec_t* insert_rec; + rec_offs* insert_offsets = NULL; + ulint old_pos; + rec_t* old_rec; + + ut_ad(cursor2 == NULL); + + /* Insert the new mbr rec. */ + old_pos = page_rec_get_n_recs_before(rec); + + err = btr_cur_optimistic_insert( + flags, + cursor, &insert_offsets, &heap, + node_ptr, &insert_rec, &dummy_big_rec, 0, NULL, mtr); + + ut_ad(err == DB_SUCCESS); + + btr_cur_position(index, insert_rec, block, cursor); + + /* Delete the old mbr rec. */ + old_rec = page_rec_get_nth(page, old_pos); + ut_ad(old_rec != insert_rec); + + page_cur_position(old_rec, block, &page_cur); + page_cur.index = index; + offsets2 = rec_get_offsets(old_rec, index, NULL, n_core, + ULINT_UNDEFINED, &heap); + page_cur_delete_rec(&page_cur, offsets2, mtr); + + } else { +update_mbr: + /* When there're not only 1 rec in the page, we do delete/insert + to avoid page split. */ + rec_t* insert_rec; + rec_offs* insert_offsets = NULL; + rec_t* next_rec; + + /* Delete the rec which cursor point to. */ + next_rec = page_rec_get_next(rec); + page_cur_delete_rec(&cursor->page_cur, offsets, mtr); + if (!ins_suc) { + ut_ad(rec_info & REC_INFO_MIN_REC_FLAG); + + btr_set_min_rec_mark(next_rec, *block, mtr); + } + + /* If there's more than 1 rec left in the page, delete + the rec which cursor2 point to. 
Otherwise, delete it later.*/
+		if (cursor2 && page_get_n_recs(page) > 1) {
+			ulint		cur2_rec_info;
+			rec_t*		cur2_rec;
+
+			cur2_rec = cursor2->page_cur.rec;
+			offsets2 = rec_get_offsets(cur2_rec, index, NULL,
+						   n_core,
+						   ULINT_UNDEFINED, &heap);
+
+			cur2_rec_info = rec_get_info_bits(cur2_rec,
+						rec_offs_comp(offsets2));
+			if (cur2_rec_info & REC_INFO_MIN_REC_FLAG) {
+				/* If we delete the leftmost node
+				pointer on a non-leaf level, we must
+				mark the new leftmost node pointer as
+				the predefined minimum record */
+				rec_t*	next_rec = page_rec_get_next(cur2_rec);
+				btr_set_min_rec_mark(next_rec, *block, mtr);
+			}
+
+			ut_ad(del_page_no
+			      == btr_node_ptr_get_child_page_no(cur2_rec,
+								offsets2));
+			page_cur_delete_rec(btr_cur_get_page_cur(cursor2),
+					    offsets2, mtr);
+			cursor2 = NULL;
+		}
+
+		/* Insert the new rec. */
+		if (page_cur_search_with_match(node_ptr, PAGE_CUR_LE,
+					       &up_match, &low_match,
+					       btr_cur_get_page_cur(cursor),
+					       NULL)) {
+			goto err_exit;
+		}
+
+		err = btr_cur_optimistic_insert(flags, cursor, &insert_offsets,
+						&heap, node_ptr, &insert_rec,
+						&dummy_big_rec, 0, NULL, mtr);
+
+		/* If the optimistic insert fails, try to reorganize the
+		page and insert again. */
+		if (err == DB_SUCCESS) {
+			ins_suc = true;
+		} else if (ins_suc) {
+			ut_ad(err == DB_FAIL);
+			err = btr_page_reorganize(btr_cur_get_page_cur(cursor),
+						  mtr);
+			if (err == DB_SUCCESS) {
+				err = btr_cur_optimistic_insert(
+					flags, cursor, &insert_offsets, &heap,
+					node_ptr, &insert_rec, &dummy_big_rec,
+					0, NULL, mtr);
+			}
+
+			/* Will do pessimistic insert */
+			if (err != DB_SUCCESS) {
+				ut_ad(err == DB_FAIL);
+				ins_suc = false;
+			}
+		}
+
+		/* If the insert succeeded, position the cursor on the
+		inserted rec. */
+		if (ins_suc) {
+			btr_cur_position(index, insert_rec, block, cursor);
+			offsets = rec_get_offsets(insert_rec,
+						  index, offsets, n_core,
+						  ULINT_UNDEFINED, &heap);
+		}
+
+		/* Delete the rec which cursor2 points to. */
+		if (cursor2) {
+			ulint		cur2_pno;
+			rec_t*		cur2_rec;
+
+			cursor2->page_cur.rec = page_rec_get_nth(page,
+								 cur2_pos);
+
+			cur2_rec = btr_cur_get_rec(cursor2);
+
+			offsets2 = rec_get_offsets(cur2_rec, index, NULL,
+						   n_core,
+						   ULINT_UNDEFINED, &heap);
+
+			/* If the cursor2 position is on a wrong rec, we
+			need to reposition it.
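+			The earlier delete/insert may have shifted
+			records around, so the node pointer is relocated
+			below by matching its child page number
+			(del_page_no) rather than by trusting the saved
+			position.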
*/ + cur2_pno = btr_node_ptr_get_child_page_no(cur2_rec, offsets2); + if ((del_page_no != cur2_pno) + || (cur2_rec == insert_rec)) { + cur2_rec = page_get_infimum_rec(page); + + while ((cur2_rec + = page_rec_get_next(cur2_rec))) { + if (page_rec_is_supremum(cur2_rec)) { + break; + } + + offsets2 = rec_get_offsets(cur2_rec, index, + NULL, + n_core, + ULINT_UNDEFINED, + &heap); + cur2_pno = btr_node_ptr_get_child_page_no( + cur2_rec, offsets2); + if (cur2_pno == del_page_no) { + if (insert_rec != cur2_rec) { + cursor2->page_cur.rec = + cur2_rec; + break; + } + } + } + } + + rec_info = rec_get_info_bits(cur2_rec, + rec_offs_comp(offsets2)); + if (rec_info & REC_INFO_MIN_REC_FLAG) { + /* If we delete the leftmost node + pointer on a non-leaf level, we must + mark the new leftmost node pointer as + the predefined minimum record */ + rec_t* next_rec = page_rec_get_next(cur2_rec); + btr_set_min_rec_mark(next_rec, *block, mtr); + } + + ut_ad(cur2_pno == del_page_no && cur2_rec != insert_rec); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor2), + offsets2, mtr); + } + + if (!ins_suc) { + mem_heap_t* new_heap = NULL; + + err = btr_cur_pessimistic_insert( + flags, + cursor, &insert_offsets, &new_heap, + node_ptr, &insert_rec, &dummy_big_rec, + 0, NULL, mtr); + + ut_ad(err == DB_SUCCESS); + + if (new_heap) { + mem_heap_free(new_heap); + } + + } + + if (cursor2) { + btr_cur_compress_if_useful(cursor, FALSE, mtr); + } + } + + ut_ad(page_has_prev(page) + || (REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page)))); +err_exit: + mem_heap_free(heap); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/**************************************************************//** +Update parent page's MBR and Predicate lock information during a split */ +static +dberr_t +rtr_adjust_upper_level( +/*===================*/ + btr_cur_t* sea_cur, /*!< in: search cursor */ + ulint flags, /*!< in: undo logging and + locking flags */ + buf_block_t* block, /*!< in/out: page to be split */ + buf_block_t* new_block, /*!< in/out: the new half page */ + rtr_mbr_t* mbr, /*!< in: MBR on the old page */ + rtr_mbr_t* new_mbr, /*!< in: MBR on the new page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_no; + ulint new_page_no; + btr_cur_t cursor; + rec_offs* offsets; + mem_heap_t* heap; + ulint level; + dtuple_t* node_ptr_upper = nullptr; + page_cur_t* page_cursor; + lock_prdt_t prdt; + lock_prdt_t new_prdt; + big_rec_t* dummy_big_rec; + rec_t* rec; + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); + + cursor.thr = sea_cur->thr; + cursor.page_cur.index = sea_cur->index(); + cursor.page_cur.block = block; + + /* Get the level of the split pages */ + level = btr_page_get_level(buf_block_get_frame(block)); + ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block))); + + page_no = block->page.id().page_no(); + + new_page_no = new_block->page.id().page_no(); + + /* Set new mbr for the old page on the upper level. 
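+	This happens in two steps: first rewrite the existing node pointer
+	of the split page with its shrunken MBR, then insert a node pointer
+	built from new_mbr for the new sibling page, falling back to a
+	pessimistic insert if the optimistic insert fails.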
*/ + /* Look up the index for the node pointer to page */ + offsets = rtr_page_get_father_block(NULL, heap, mtr, sea_cur, &cursor); + + page_cursor = btr_cur_get_page_cur(&cursor); + + rtr_update_mbr_field(&cursor, offsets, nullptr, block->page.frame, mbr, + nullptr, mtr); + + /* Already updated parent MBR, reset in our path */ + if (sea_cur->rtr_info) { + node_visit_t* node_visit = rtr_get_parent_node( + sea_cur, level + 1, true); + if (node_visit) { + node_visit->mbr_inc = 0; + } + } + + dberr_t err; + + if (const rec_t* first = page_rec_get_next_const( + page_get_infimum_rec(new_block->page.frame))) { + /* Insert the node for the new page. */ + node_ptr_upper = rtr_index_build_node_ptr( + sea_cur->index(), new_mbr, first, new_page_no, heap); + ulint up_match = 0, low_match = 0; + err = page_cur_search_with_match(node_ptr_upper, + PAGE_CUR_LE, + &up_match, &low_match, + btr_cur_get_page_cur(&cursor), + NULL) + ? DB_CORRUPTION + : btr_cur_optimistic_insert(flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &heap, + node_ptr_upper, &rec, + &dummy_big_rec, 0, NULL, + mtr); + } else { + err = DB_CORRUPTION; + } + + if (err == DB_FAIL) { + cursor.rtr_info = sea_cur->rtr_info; + cursor.tree_height = sea_cur->tree_height; + + /* Recreate a memory heap as input parameter for + btr_cur_pessimistic_insert(), because the heap may be + emptied in btr_cur_pessimistic_insert(). */ + mem_heap_t* new_heap = mem_heap_create(1024); + + err = btr_cur_pessimistic_insert(flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &new_heap, + node_ptr_upper, &rec, + &dummy_big_rec, 0, NULL, mtr); + cursor.rtr_info = NULL; + mem_heap_free(new_heap); + } + + if (err == DB_SUCCESS) { + prdt.data = static_cast(mbr); + prdt.op = 0; + new_prdt.data = static_cast(new_mbr); + new_prdt.op = 0; + + lock_prdt_update_parent(block, new_block, &prdt, &new_prdt, + page_cursor->block->page.id()); + } + + mem_heap_free(heap); + + ut_ad(block->zip_size() == sea_cur->index()->table->space->zip_size()); + + if (err != DB_SUCCESS) { + return err; + } + + const uint32_t next_page_no = btr_page_get_next(block->page.frame); + + if (next_page_no == FIL_NULL) { + } else if (buf_block_t* next_block = + btr_block_get(*sea_cur->index(), next_page_no, RW_X_LATCH, + false, mtr, &err)) { + if (UNIV_UNLIKELY(memcmp_aligned<4>(next_block->page.frame + + FIL_PAGE_PREV, + block->page.frame + + FIL_PAGE_OFFSET, 4))) { + return DB_CORRUPTION; + } + btr_page_set_prev(next_block, new_page_no, mtr); + } else { + return err; + } + + btr_page_set_next(block, new_page_no, mtr); + + btr_page_set_prev(new_block, page_no, mtr); + btr_page_set_next(new_block, next_page_no, mtr); + return DB_SUCCESS; +} + +/*************************************************************//** +Moves record list to another page for rtree splitting. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return error code +@retval DB_FAIL on ROW_FORMAT=COMPRESSED compression failure */ +static +dberr_t +rtr_split_page_move_rec_list( +/*=========================*/ + rtr_split_node_t* node_array, /*!< in: split node array. */ + int first_rec_group,/*!< in: group number of the + first rec. 
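+					(1 or 2, as returned by
+					split_rtree_node()).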
*/ + buf_block_t* new_block, /*!< in/out: index page + where to move */ + buf_block_t* block, /*!< in/out: page containing + split_rec */ + rec_t* first_rec, /*!< in: first record not to + move */ + dict_index_t* index, /*!< in: record descriptor */ + mem_heap_t* heap, /*!< in: pointer to memory + heap, or NULL */ + mtr_t* mtr) /*!< in: mtr */ +{ + rtr_split_node_t* cur_split_node; + rtr_split_node_t* end_split_node; + page_cur_t page_cursor; + page_cur_t new_page_cursor; + page_t* page; + page_t* new_page; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + page_zip_des_t* new_page_zip + = buf_block_get_page_zip(new_block); + rec_t* rec; + ulint moved = 0; + ulint max_to_move = 0; + rtr_rec_move_t* rec_move = NULL; + + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_spatial(index)); + + rec_offs_init(offsets_); + + page_cur_set_before_first(block, &page_cursor); + page_cur_set_before_first(new_block, &new_page_cursor); + page_cursor.index = new_page_cursor.index = index; + + page = buf_block_get_frame(block); + new_page = buf_block_get_frame(new_block); + + end_split_node = node_array + page_get_n_recs(page); + + mtr_log_t log_mode = MTR_LOG_NONE; + + if (new_page_zip) { + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + } + + max_to_move = page_get_n_recs(buf_block_get_frame(block)); + rec_move = static_cast(mem_heap_alloc( + heap, + sizeof (*rec_move) * max_to_move)); + const ulint n_core = page_is_leaf(page) + ? index->n_core_fields : 0; + + /* Insert the recs in group 2 to new page. */ + for (cur_split_node = node_array; + cur_split_node < end_split_node; ++cur_split_node) { + if (cur_split_node->n_node != first_rec_group) { + lock_rec_store_on_page_infimum( + block, cur_split_node->key); + + offsets = rec_get_offsets(cur_split_node->key, + index, offsets, n_core, + ULINT_UNDEFINED, &heap); + + ut_ad(!n_core || cur_split_node->key != first_rec); + + rec = page_cur_insert_rec_low( + &new_page_cursor, + cur_split_node->key, offsets, mtr); + + if (UNIV_UNLIKELY + (!rec + || !page_cur_move_to_next(&new_page_cursor))) { + return DB_CORRUPTION; + } + + lock_rec_restore_from_page_infimum( + *new_block, rec, block->page.id()); + + rec_move[moved].new_rec = rec; + rec_move[moved].old_rec = cur_split_node->key; + rec_move[moved].moved = false; + moved++; + + if (moved > max_to_move) { + ut_ad(0); + break; + } + } + } + + /* Update PAGE_MAX_TRX_ID on the uncompressed page. + Modifications will be redo logged and copied to the compressed + page in page_zip_compress() or page_zip_reorganize() below. + Multiple transactions cannot simultaneously operate on the + same temp-table in parallel. + max_trx_id is ignored for temp tables because it not required + for MVCC. */ + if (n_core && !index->table->is_temporary()) { + page_update_max_trx_id(new_block, NULL, + page_get_max_trx_id(page), + mtr); + } + + if (new_page_zip) { + mtr_set_log_mode(mtr, log_mode); + + if (!page_zip_compress(new_block, index, + page_zip_level, mtr)) { + if (dberr_t err = + page_zip_reorganize(new_block, index, + page_zip_level, mtr)) { + if (err == DB_FAIL) { + ut_a(page_zip_decompress(new_page_zip, + new_page, + FALSE)); + } + return err; + } + } + } + + /* Update the lock table */ + lock_rtr_move_rec_list(new_block, block, rec_move, moved); + + /* Delete recs in second group from the old page. 
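+	The records were copied rather than moved, so the group-2
+	originals must now be removed from the old page to complete the
+	transfer.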
*/ + for (cur_split_node = node_array; + cur_split_node < end_split_node; ++cur_split_node) { + if (cur_split_node->n_node != first_rec_group) { + page_cur_position(cur_split_node->key, + block, &page_cursor); + offsets = rec_get_offsets( + page_cur_get_rec(&page_cursor), index, + offsets, n_core, ULINT_UNDEFINED, + &heap); + page_cur_delete_rec(&page_cursor, offsets, mtr); + } + } + + return DB_SUCCESS; +} + +/*************************************************************//** +Splits an R-tree index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. +@return inserted record */ +rec_t* +rtr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in/out: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + buf_block_t* block; + page_t* page; + page_t* new_page; + buf_block_t* new_block; + page_zip_des_t* page_zip; + page_zip_des_t* new_page_zip; + page_cur_t* page_cursor; + rec_t* rec = 0; + ulint n_recs; + ulint total_data; + ulint insert_size; + rtr_split_node_t* rtr_split_node_array; + rtr_split_node_t* cur_split_node; + rtr_split_node_t* end_split_node; + double* buf_pos; + node_seq_t current_ssn; + node_seq_t next_ssn; + buf_block_t* root_block; + rtr_mbr_t mbr; + rtr_mbr_t new_mbr; + lock_prdt_t prdt; + lock_prdt_t new_prdt; + rec_t* first_rec = NULL; + int first_rec_group = 1; + IF_DBUG(bool iterated = false,); + + if (!*heap) { + *heap = mem_heap_create(1024); + } + +func_start: + mem_heap_empty(*heap); + *offsets = NULL; + + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + ut_ad(!dict_index_is_online_ddl(cursor->index())); + ut_ad(cursor->index()->lock.have_u_or_x()); + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + current_ssn = page_get_ssn_id(page); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_get_n_recs(page) >= 1); + + const page_id_t page_id(block->page.id()); + + if (!page_has_prev(page) && !page_is_leaf(page)) { + first_rec = page_rec_get_next( + page_get_infimum_rec(buf_block_get_frame(block))); + if (UNIV_UNLIKELY(!first_rec)) { +corrupted: + *err = DB_CORRUPTION; + return nullptr; + } + } + + /* Initial split nodes array. */ + rtr_split_node_array = rtr_page_split_initialize_nodes( + *heap, cursor, offsets, tuple, &buf_pos); + + /* Divide all mbrs to two groups. 
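+	split_rtree_node() assigns each entry to one of two groups; the
+	group containing the page's first record stays on the original
+	page, while the other group is moved to the newly allocated
+	sibling page.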
*/ + n_recs = ulint(page_get_n_recs(page)) + 1; + + end_split_node = rtr_split_node_array + n_recs; + +#ifdef UNIV_GIS_DEBUG + fprintf(stderr, "Before split a page:\n"); + for (cur_split_node = rtr_split_node_array; + cur_split_node < end_split_node; ++cur_split_node) { + for (int i = 0; i < SPDIMS * 2; i++) { + fprintf(stderr, "%.2lf ", + *(cur_split_node->coords + i)); + } + fprintf(stderr, "\n"); + } +#endif + + insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext); + total_data = page_get_data_size(page) + insert_size; + first_rec_group = split_rtree_node(rtr_split_node_array, + static_cast(n_recs), + static_cast(total_data), + static_cast(insert_size), + 0, 2, 2, &buf_pos, SPDIMS, + static_cast(first_rec)); + + /* Allocate a new page to the index */ + const uint16_t page_level = btr_page_get_level(page); + new_block = btr_page_alloc(cursor->index(), page_id.page_no() + 1, + FSP_UP, page_level, mtr, mtr, err); + if (UNIV_UNLIKELY(!new_block)) { + return nullptr; + } + + new_page_zip = buf_block_get_page_zip(new_block); + if (page_level && UNIV_LIKELY_NULL(new_page_zip)) { + /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected + to contain FIL_NULL in FIL_PAGE_PREV at this stage. */ + memset_aligned<4>(new_block->page.frame + FIL_PAGE_PREV, 0, 4); + } + btr_page_create(new_block, new_page_zip, cursor->index(), + page_level, mtr); + + new_page = buf_block_get_frame(new_block); + ut_ad(page_get_ssn_id(new_page) == 0); + + /* Set new ssn to the new page and page. */ + page_set_ssn_id(new_block, new_page_zip, current_ssn, mtr); + next_ssn = rtr_get_new_ssn_id(cursor->index()); + + page_set_ssn_id(block, page_zip, next_ssn, mtr); + + /* Keep recs in first group to the old page, move recs in second + groups to the new page. */ + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif + || (*err = rtr_split_page_move_rec_list(rtr_split_node_array, + first_rec_group, + new_block, block, + first_rec, cursor->index(), + *heap, mtr))) { + if (*err != DB_FAIL) { + return nullptr; + } + + *err = DB_SUCCESS; + + ulint n = 0; + rec_t* rec; + ulint moved = 0; + ulint max_to_move = 0; + rtr_rec_move_t* rec_move = NULL; + ulint pos; + + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, + page_zip, page, cursor->index(), mtr); + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Move locks on recs. */ + max_to_move = page_get_n_recs(page); + rec_move = static_cast(mem_heap_alloc( + *heap, + sizeof (*rec_move) * max_to_move)); + + /* Init the rec_move array for moving lock on recs. */ + for (cur_split_node = rtr_split_node_array; + cur_split_node < end_split_node - 1; ++cur_split_node) { + if (cur_split_node->n_node != first_rec_group) { + pos = page_rec_get_n_recs_before( + cur_split_node->key); + rec = page_rec_get_nth(new_page, pos); + ut_a(rec); + + rec_move[moved].new_rec = rec; + rec_move[moved].old_rec = cur_split_node->key; + rec_move[moved].moved = false; + moved++; + + if (moved > max_to_move) { + ut_ad(0); + break; + } + } + } + + /* Update the lock table */ + lock_rtr_move_rec_list(new_block, block, rec_move, moved); + + const ulint n_core = page_level + ? 0 : cursor->index()->n_core_fields; + + /* Delete recs in first group from the new page. 
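+	After page_zip_copy_recs() the new page is a byte-for-byte copy
+	of the old one and still contains both groups, so the first group
+	is deleted from the new page here, and the second group is
+	deleted from the old page below.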
*/ + for (cur_split_node = rtr_split_node_array; + cur_split_node < end_split_node - 1; ++cur_split_node) { + if (cur_split_node->n_node == first_rec_group) { + ulint pos; + + pos = page_rec_get_n_recs_before( + cur_split_node->key); + ut_a(pos > 0); + rec_t* new_rec = page_rec_get_nth(new_page, + pos - n); + + ut_a(new_rec && page_rec_is_user_rec(new_rec)); + page_cur_position(new_rec, new_block, + page_cursor); + + *offsets = rec_get_offsets( + page_cur_get_rec(page_cursor), + cursor->index(), *offsets, n_core, + ULINT_UNDEFINED, heap); + + page_cur_delete_rec(page_cursor, + *offsets, mtr); + n++; + } + } + + /* Delete recs in second group from the old page. */ + for (cur_split_node = rtr_split_node_array; + cur_split_node < end_split_node - 1; ++cur_split_node) { + if (cur_split_node->n_node != first_rec_group) { + page_cur_position(cur_split_node->key, + block, page_cursor); + *offsets = rec_get_offsets( + page_cur_get_rec(page_cursor), + page_cursor->index, *offsets, n_core, + ULINT_UNDEFINED, heap); + page_cur_delete_rec(page_cursor, *offsets, + mtr); + } + } + +#ifdef UNIV_GIS_DEBUG + ut_ad(page_validate(new_page, cursor->index())); + ut_ad(page_validate(page, cursor->index())); +#endif + } + + /* Insert the new rec to the proper page. */ + cur_split_node = end_split_node - 1; + + /* Reposition the cursor for insert and try insertion */ + page_cursor = btr_cur_get_page_cur(cursor); + page_cursor->block = cur_split_node->n_node != first_rec_group + ? new_block : block; + + ulint up_match = 0, low_match = 0; + + if (page_cur_search_with_match(tuple, + PAGE_CUR_LE, &up_match, &low_match, + page_cursor, nullptr)) { + goto corrupted; + } + + /* It's possible that the new record is too big to be inserted into + the page, and it'll need the second round split in this case. + We test this scenario here*/ + DBUG_EXECUTE_IF("rtr_page_need_second_split", + if (!iterated) { + rec = NULL; + goto after_insert; } + ); + + rec = page_cur_tuple_insert(page_cursor, tuple, + offsets, heap, n_ext, mtr); + + /* If insert did not fit, try page reorganization. + For compressed pages, page_cur_tuple_insert() will have + attempted this already. */ + if (rec == NULL) { + if (!is_page_cur_get_page_zip(page_cursor) + && btr_page_reorganize(page_cursor, mtr)) { + rec = page_cur_tuple_insert(page_cursor, tuple, + offsets, + heap, n_ext, mtr); + + } + /* If insert fail, we will try to split the block again. */ + } + +#ifdef UNIV_DEBUG +after_insert: +#endif + /* Calculate the mbr on the upper half-page, and the mbr on + original page. */ + rtr_page_cal_mbr(cursor->index(), block, &mbr, *heap); + rtr_page_cal_mbr(cursor->index(), new_block, &new_mbr, *heap); + prdt.data = &mbr; + new_prdt.data = &new_mbr; + + /* Check any predicate locks need to be moved/copied to the + new page */ + lock_prdt_update_split(new_block, &prdt, &new_prdt, page_id); + + /* Adjust the upper level. */ + *err = rtr_adjust_upper_level(cursor, flags, block, new_block, + &mbr, &new_mbr, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } + + /* Save the new ssn to the root page, since we need to reinit + the first ssn value from it after restart server. */ + + root_block = btr_root_block_get(cursor->index(), RW_SX_LATCH, + mtr, err); + if (UNIV_UNLIKELY(!root_block)) { + return nullptr; + } + + page_zip = buf_block_get_page_zip(root_block); + page_set_ssn_id(root_block, page_zip, next_ssn, mtr); + + /* If the new res insert fail, we need to do another split + again. 
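+	This can happen when the record is too large to fit on either
+	half after the first split. The parent path is cleaned up and the
+	search restarts from func_start, so that the target page can be
+	split once more.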
*/
+	if (!rec) {
+		/* We play safe and reset the free bits for new_page */
+		if (!dict_index_is_clust(cursor->index())
+		    && !cursor->index()->table->is_temporary()) {
+			ibuf_reset_free_bits(new_block);
+			ibuf_reset_free_bits(block);
+		}
+
+		/* We need to clean the parent path here and search for
+		the father node later; otherwise, it is possible to find
+		a wrong parent. */
+		rtr_clean_rtr_info(cursor->rtr_info, true);
+		cursor->rtr_info = NULL;
+		IF_DBUG(iterated=true,);
+
+		rec_t* i_rec = page_rec_get_next(page_get_infimum_rec(
+			buf_block_get_frame(block)));
+		if (UNIV_UNLIKELY(!i_rec)) {
+			goto corrupted;
+		}
+		btr_cur_position(cursor->index(), i_rec, block, cursor);
+
+		goto func_start;
+	}
+
+#ifdef UNIV_GIS_DEBUG
+	ut_ad(page_validate(buf_block_get_frame(block), cursor->index()));
+	ut_ad(page_validate(buf_block_get_frame(new_block), cursor->index()));
+
+	ut_ad(!rec || rec_offs_validate(rec, cursor->index(), *offsets));
+#endif
+	return(rec);
+}
+
+/****************************************************************//**
+Enlarge the MBRs of the parent pages along the R-tree search path if
+the newly inserted entry extended them.
+@return error code */
+dberr_t
+rtr_ins_enlarge_mbr(
+/*================*/
+	btr_cur_t*	btr_cur,	/*!< in: btr cursor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	dberr_t			err = DB_SUCCESS;
+	rtr_mbr_t		new_mbr;
+	buf_block_t*		block;
+	mem_heap_t*		heap;
+	page_cur_t*		page_cursor;
+	rec_offs*		offsets;
+	node_visit_t*		node_visit;
+	btr_cur_t		cursor;
+	page_t*			page;
+
+	ut_ad(btr_cur->index()->is_spatial());
+
+	/* If there is no rtr_info, or the R-tree is a one-level tree,
+	there is nothing to do. */
+	if (!btr_cur->rtr_info || btr_cur->tree_height == 1) {
+		return(err);
+	}
+
+	/* Check that the path info is not empty. */
+	ut_ad(!btr_cur->rtr_info->parent_path->empty());
+
+	/* Create a memory heap. */
+	heap = mem_heap_create(1024);
+
+	/* The leaf level page is stored in the cursor */
+	page_cursor = btr_cur_get_page_cur(btr_cur);
+	block = page_cur_get_block(page_cursor);
+
+	for (ulint i = 1; i < btr_cur->tree_height; i++) {
+		node_visit = rtr_get_parent_node(btr_cur, i, true);
+		ut_ad(node_visit != NULL);
+
+		/* If the MBR was not enlarged at this level, skip it. */
+		if (node_visit->mbr_inc == 0) {
+			block = btr_pcur_get_block(node_visit->cursor);
+			continue;
+		}
+
+		/* Calculate the mbr of the child page. */
+		rtr_page_cal_mbr(page_cursor->index, block, &new_mbr, heap);
+
+		/* Get father block. */
+		cursor.page_cur.index = page_cursor->index;
+		cursor.page_cur.block = block;
+		offsets = rtr_page_get_father_block(
+			NULL, heap, mtr, btr_cur, &cursor);
+
+		page = buf_block_get_frame(block);
+
+		/* Update the mbr field of the rec. */
+		rtr_update_mbr_field(&cursor, offsets, NULL, page,
+				     &new_mbr, NULL, mtr);
+		block = btr_cur_get_block(&cursor);
+	}
+
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/*************************************************************//**
+Copy recs from a page to new_block of rtree.
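+The copy keeps the records ordered by cmp_rec_rec(); if an identical
+leaf record already exists on the target page, it is not copied again
+but has its delete mark cleared instead (see btr_rec_set_deleted()
+below).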
+ +@return error code */ +dberr_t +rtr_page_copy_rec_list_end_no_locks( +/*================================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mem_heap_t* heap, /*!< in/out: heap memory */ + rtr_rec_move_t* rec_move, /*!< in: recording records moved */ + ulint max_move, /*!< in: num of rec to move */ + ulint* num_moved, /*!< out: num of rec to move */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_cur_t page_cur; + page_cur_t cur1; + rec_t* cur_rec; + rec_offs offsets_1[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets1 = offsets_1; + rec_offs offsets_2[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets_2; + ulint moved = 0; + const ulint n_core = page_is_leaf(new_page) + ? index->n_core_fields : 0; + + rec_offs_init(offsets_1); + rec_offs_init(offsets_2); + + page_cur_position(rec, block, &cur1); + + if (page_cur_is_before_first(&cur1) && !page_cur_move_to_next(&cur1)) { + return DB_CORRUPTION; + } + + ut_a(page_is_comp(new_page) == page_rec_is_comp(rec)); + ut_a(mach_read_from_2(new_page + srv_page_size - 10) == (ulint) + (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); + + cur_rec = page_rec_get_next( + page_get_infimum_rec(buf_block_get_frame(new_block))); + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + page_cur_position(cur_rec, new_block, &page_cur); + page_cur.index = index; + + /* Copy records from the original page to the new page */ + while (!page_cur_is_after_last(&cur1)) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + rec_t* ins_rec; + + if (page_rec_is_infimum(cur_rec)) { + cur_rec = page_rec_get_next(cur_rec); + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + } + + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, + ULINT_UNDEFINED, &heap); + while (!page_rec_is_supremum(cur_rec)) { + ulint cur_matched_fields = 0; + int cmp; + + offsets2 = rec_get_offsets(cur_rec, index, offsets2, + n_core, + ULINT_UNDEFINED, &heap); + cmp = cmp_rec_rec(cur1_rec, cur_rec, + offsets1, offsets2, index, false, + &cur_matched_fields); + if (cmp < 0) { + goto move_to_prev; + } else if (cmp > 0) { + /* Skip small recs. */ + cur_rec = page_cur_move_to_next(&page_cur); + } else if (n_core) { + if (rec_get_deleted_flag(cur1_rec, + dict_table_is_comp(index->table))) { + goto next; + } else { + /* We have two identical leaf records, + skip copying the undeleted one, and + unmark deleted on the current page */ + btr_rec_set_deleted( + new_block, cur_rec, mtr); + goto next; + } + } + } + + /* If position is on suprenum rec, need to move to + previous rec. 
*/ + if (page_rec_is_supremum(cur_rec)) { +move_to_prev: + cur_rec = page_cur_move_to_prev(&page_cur); + } else { + cur_rec = page_cur_get_rec(&page_cur); + } + + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, + ULINT_UNDEFINED, &heap); + + ins_rec = page_cur_insert_rec_low(&page_cur, + cur1_rec, offsets1, mtr); + if (UNIV_UNLIKELY(!ins_rec || moved >= max_move)) { + return DB_CORRUPTION; + } + + rec_move[moved].new_rec = ins_rec; + rec_move[moved].old_rec = cur1_rec; + rec_move[moved].moved = false; + moved++; +next: + if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) { + return DB_CORRUPTION; + } + } + + *num_moved = moved; + return DB_SUCCESS; +} + +/*************************************************************//** +Copy recs till a specified rec from a page to new_block of rtree. + +@return error code */ +dberr_t +rtr_page_copy_rec_list_start_no_locks( +/*==================================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mem_heap_t* heap, /*!< in/out: heap memory */ + rtr_rec_move_t* rec_move, /*!< in: recording records moved */ + ulint max_move, /*!< in: num of rec to move */ + ulint* num_moved, /*!< out: num of rec to move */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t cur1; + rec_t* cur_rec; + rec_offs offsets_1[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets1 = offsets_1; + rec_offs offsets_2[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets_2; + page_cur_t page_cur; + ulint moved = 0; + const ulint n_core = page_is_leaf(buf_block_get_frame(block)) + ? index->n_core_fields : 0; + + rec_offs_init(offsets_1); + rec_offs_init(offsets_2); + + page_cur_set_before_first(block, &cur1); + if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) { + return DB_CORRUPTION; + } + + cur_rec = page_rec_get_next( + page_get_infimum_rec(buf_block_get_frame(new_block))); + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + page_cur_position(cur_rec, new_block, &page_cur); + page_cur.index = index; + + while (page_cur_get_rec(&cur1) != rec) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + rec_t* ins_rec; + + if (page_rec_is_infimum(cur_rec)) { + cur_rec = page_rec_get_next(cur_rec); + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + } + + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, + ULINT_UNDEFINED, &heap); + + while (!page_rec_is_supremum(cur_rec)) { + ulint cur_matched_fields = 0; + + offsets2 = rec_get_offsets(cur_rec, index, offsets2, + n_core, + ULINT_UNDEFINED, &heap); + int cmp = cmp_rec_rec(cur1_rec, cur_rec, + offsets1, offsets2, index, false, + &cur_matched_fields); + if (cmp < 0) { + goto move_to_prev; + } else if (cmp > 0) { + /* Skip small recs. */ + cur_rec = page_cur_move_to_next(&page_cur); + } else if (n_core) { + if (rec_get_deleted_flag( + cur1_rec, + dict_table_is_comp(index->table))) { + goto next; + } else { + /* We have two identical leaf records, + skip copying the undeleted one, and + unmark deleted on the current page */ + btr_rec_set_deleted( + new_block, cur_rec, mtr); + goto next; + } + } + } + + /* If position is on suprenum rec, need to move to + previous rec. 
*/ + if (page_rec_is_supremum(cur_rec)) { +move_to_prev: + cur_rec = page_cur_move_to_prev(&page_cur); + } else { + cur_rec = page_cur_get_rec(&page_cur); + } + + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, + ULINT_UNDEFINED, &heap); + + ins_rec = page_cur_insert_rec_low(&page_cur, + cur1_rec, offsets1, mtr); + if (UNIV_UNLIKELY(!ins_rec || moved >= max_move)) { + return DB_CORRUPTION; + } + + rec_move[moved].new_rec = ins_rec; + rec_move[moved].old_rec = cur1_rec; + rec_move[moved].moved = false; + moved++; +next: + if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) { + return DB_CORRUPTION; + } + } + + *num_moved = moved; + return DB_SUCCESS; +} + +/****************************************************************//** +Check two MBRs are identical or need to be merged */ +bool +rtr_merge_mbr_changed( +/*==================*/ + btr_cur_t* cursor, /*!< in/out: cursor */ + btr_cur_t* cursor2, /*!< in: the other cursor */ + rec_offs* offsets, /*!< in: rec offsets */ + rec_offs* offsets2, /*!< in: rec offsets */ + rtr_mbr_t* new_mbr) /*!< out: MBR to update */ +{ + double* mbr; + double mbr1[SPDIMS * 2]; + double mbr2[SPDIMS * 2]; + rec_t* rec; + ulint len; + bool changed = false; + + ut_ad(cursor->index()->is_spatial()); + + rec = btr_cur_get_rec(cursor); + + rtr_read_mbr(rec_get_nth_field(rec, offsets, 0, &len), + reinterpret_cast(mbr1)); + + rec = btr_cur_get_rec(cursor2); + + rtr_read_mbr(rec_get_nth_field(rec, offsets2, 0, &len), + reinterpret_cast(mbr2)); + + mbr = reinterpret_cast(new_mbr); + + for (int i = 0; i < SPDIMS * 2; i += 2) { + changed = (changed || mbr1[i] != mbr2[i]); + *mbr = mbr1[i] < mbr2[i] ? mbr1[i] : mbr2[i]; + mbr++; + changed = (changed || mbr1[i + 1] != mbr2 [i + 1]); + *mbr = mbr1[i + 1] > mbr2[i + 1] ? mbr1[i + 1] : mbr2[i + 1]; + mbr++; + } + + return(changed); +} + +/****************************************************************//** +Merge 2 mbrs and update the the mbr that cursor is on. */ +void +rtr_merge_and_update_mbr( +/*=====================*/ + btr_cur_t* cursor, /*!< in/out: cursor */ + btr_cur_t* cursor2, /*!< in: the other cursor */ + rec_offs* offsets, /*!< in: rec offsets */ + rec_offs* offsets2, /*!< in: rec offsets */ + page_t* child_page, /*!< in: the page. */ + mtr_t* mtr) /*!< in: mtr */ +{ + rtr_mbr_t new_mbr; + + if (rtr_merge_mbr_changed(cursor, cursor2, offsets, offsets2, + &new_mbr)) { + rtr_update_mbr_field(cursor, offsets, cursor2, child_page, + &new_mbr, NULL, mtr); + } else { + rtr_node_ptr_delete(cursor2, mtr); + } +} + +/*************************************************************//** +Deletes on the upper level the node pointer to a page. 
*/
+void
+rtr_node_ptr_delete(
+/*================*/
+	btr_cur_t*	cursor, /*!< in: search cursor, contains information
+				about parent nodes in search */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ibool		compressed;
+	dberr_t		err;
+
+	compressed = btr_cur_pessimistic_delete(&err, TRUE, cursor,
+						BTR_CREATE_FLAG, false, mtr);
+	ut_a(err == DB_SUCCESS);
+
+	if (!compressed) {
+		btr_cur_compress_if_useful(cursor, FALSE, mtr);
+	}
+}
+
+/**************************************************************//**
+Check whether an R-tree page is a child of a parent page
+@return true if there is a child/parent relationship */
+bool
+rtr_check_same_block(
+/*================*/
+	dict_index_t*	index,	/*!< in: index tree */
+	btr_cur_t*	cursor,	/*!< in/out: position at the parent entry
+				pointing to the child if successful */
+	buf_block_t*	parentb,/*!< in: parent page to check */
+	mem_heap_t*	heap)	/*!< in: memory heap */
+
+{
+	const uint32_t	page_no =
+		btr_cur_get_block(cursor)->page.id().page_no();
+	rec_offs*	offsets;
+	rec_t*		rec = page_get_infimum_rec(parentb->page.frame);
+
+	while ((rec = page_rec_get_next(rec)) && !page_rec_is_supremum(rec)) {
+		offsets = rec_get_offsets(
+			rec, index, NULL, 0, ULINT_UNDEFINED, &heap);
+
+		if (btr_node_ptr_get_child_page_no(rec, offsets) == page_no) {
+			btr_cur_position(index, rec, parentb, cursor);
+			return(true);
+		}
+	}
+
+	return(false);
+}
+
+/*************************************************************//**
+Calculates MBR_AREA(a+b) - MBR_AREA(a)
+Note: when the 'a' and 'b' objects are far from each other,
+the area increase can be really big, so this function
+can return 'inf' as a result.
+Returns the area increase. */
+static double
+rtree_area_increase(
+	const uchar*	a,		/*!< in: original mbr. */
+	const uchar*	b,		/*!< in: new mbr. */
+	double*		ab_area)	/*!< out: increased area. */
+{
+	double		a_area = 1.0;
+	double		loc_ab_area = 1.0;
+	double		amin, amax, bmin, bmax;
+	double		data_round = 1.0;
+
+	static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double),
+		      "compatibility");
+
+	for (auto i = SPDIMS; i--; ) {
+		double	area;
+
+		amin = mach_double_read(a);
+		bmin = mach_double_read(b);
+		amax = mach_double_read(a + sizeof(double));
+		bmax = mach_double_read(b + sizeof(double));
+
+		a += 2 * sizeof(double);
+		b += 2 * sizeof(double);
+
+		area = amax - amin;
+		if (area == 0) {
+			a_area *= LINE_MBR_WEIGHTS;
+		} else {
+			a_area *= area;
+		}
+
+		area = (double)std::max(amax, bmax) -
+		       (double)std::min(amin, bmin);
+		if (area == 0) {
+			loc_ab_area *= LINE_MBR_WEIGHTS;
+		} else {
+			loc_ab_area *= area;
+		}
+
+		/* The value of amax or bmin can be so large that a
+		small difference is ignored. For example:
+		3.2884281489988079e+284 - 100 = 3.2884281489988079e+284.
+		As a result, some area differences are not detected. */
+		if (loc_ab_area == a_area) {
+			if (bmin < amin || bmax > amax) {
+				data_round *= ((double)std::max(amax, bmax)
+					       - amax
+					       + (amin - (double)std::min(
+							 amin, bmin)));
+			} else {
+				data_round *= area;
+			}
+		}
+	}
+
+	*ab_area = loc_ab_area;
+
+	if (loc_ab_area == a_area && data_round != 1.0) {
+		return(data_round);
+	}
+
+	return(loc_ab_area - a_area);
+}
+
+/** Calculates overlapping area
+@param[in]	a	mbr a
+@param[in]	b	mbr b
+@return overlapping area */
+static double rtree_area_overlapping(const byte *a, const byte *b)
+{
+	double	area = 1.0;
+	double	amin;
+	double	amax;
+	double	bmin;
+	double	bmax;
+
+	static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double),
+		      "compatibility");
+
+	for (auto i = SPDIMS; i--; ) {
+		amin = mach_double_read(a);
+		bmin = mach_double_read(b);
+		amax = mach_double_read(a + sizeof(double));
+		bmax = mach_double_read(b + sizeof(double));
+		a += 2 * sizeof(double);
+		b += 2 * sizeof(double);
+
+		amin = std::max(amin, bmin);
+		amax = std::min(amax, bmax);
+
+		if (amin > amax) {
+			return(0);
+		} else {
+			area *= (amax - amin);
+		}
+	}
+
+	return(area);
+}
+
+/****************************************************************//**
+Calculate the area increase caused by a new record
+@return area increased */
+double
+rtr_rec_cal_increase(
+/*=================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple to insert, which
+				causes the area increase */
+	const rec_t*	rec,	/*!< in: physical record which differs from
+				dtuple in some of the common fields, or which
+				has an equal number or more fields than
+				dtuple */
+	double*		area)	/*!< out: increased area */
+{
+	const dfield_t*	dtuple_field;
+
+	ut_ad(!page_rec_is_supremum(rec));
+	ut_ad(!page_rec_is_infimum(rec));
+
+	dtuple_field = dtuple_get_nth_field(dtuple, 0);
+	ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN);
+
+	return rtree_area_increase(rec,
+				   static_cast<const uchar*>(
+					   dfield_get_data(dtuple_field)),
+				   area);
+}
+
+/** Estimates the number of rows in a given area.
+@param[in]	index	index
+@param[in]	tuple	range tuple containing mbr, may also be empty tuple
+@param[in]	mode	search mode
+@return estimated number of rows */
+ha_rows
+rtr_estimate_n_rows_in_range(
+	dict_index_t*	index,
+	const dtuple_t*	tuple,
+	page_cur_mode_t	mode)
+{
+	ut_ad(dict_index_is_spatial(index));
+
+	/* Check tuple & mode */
+	if (tuple->n_fields == 0) {
+		return(HA_POS_ERROR);
+	}
+
+	switch (mode) {
+	case PAGE_CUR_DISJOINT:
+	case PAGE_CUR_CONTAIN:
+	case PAGE_CUR_INTERSECT:
+	case PAGE_CUR_WITHIN:
+	case PAGE_CUR_MBR_EQUAL:
+		break;
+	default:
+		return(HA_POS_ERROR);
+	}
+
+	DBUG_EXECUTE_IF("rtr_pcur_move_to_next_return",
+		return(2);
+	);
+
+	/* Read mbr from tuple. */
+	rtr_mbr_t	range_mbr;
+	double		range_area;
+
+	const dfield_t* dtuple_field = dtuple_get_nth_field(tuple, 0);
+	ut_ad(dfield_get_len(dtuple_field) >= DATA_MBR_LEN);
+	const byte* range_mbr_ptr = reinterpret_cast<const byte*>(
+		dfield_get_data(dtuple_field));
+
+	rtr_read_mbr(range_mbr_ptr, &range_mbr);
+	range_area = (range_mbr.xmax - range_mbr.xmin)
+		 * (range_mbr.ymax - range_mbr.ymin);
+
+	/* Get index root page.
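+	Only the root page is sampled: each node pointer MBR on the root
+	is compared with the search MBR, the per-entry estimates are
+	summed and divided by the number of root entries, and the result
+	is scaled by the table row count.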
*/ + mtr_t mtr; + + mtr.start(); + index->set_modified(mtr); + mtr_s_lock_index(index, &mtr); + + dberr_t err; + buf_block_t* block = btr_root_block_get(index, RW_S_LATCH, &mtr, &err); + if (!block) { +err_exit: + mtr.commit(); + return HA_POS_ERROR; + } + const page_t* page = buf_block_get_frame(block); + const unsigned n_recs = page_header_get_field(page, PAGE_N_RECS); + + if (n_recs == 0) { + goto err_exit; + } + + /* Scan records in root page and calculate area. */ + double area = 0; + for (const rec_t* rec = page_rec_get_next_const( + page_get_infimum_rec(block->page.frame)); + rec && !page_rec_is_supremum(rec); + rec = page_rec_get_next_const(rec)) { + rtr_mbr_t mbr; + double rec_area; + + rtr_read_mbr(rec, &mbr); + + rec_area = (mbr.xmax - mbr.xmin) * (mbr.ymax - mbr.ymin); + + if (rec_area == 0) { + switch (mode) { + case PAGE_CUR_CONTAIN: + case PAGE_CUR_INTERSECT: + area += 1; + break; + + case PAGE_CUR_DISJOINT: + break; + + case PAGE_CUR_WITHIN: + case PAGE_CUR_MBR_EQUAL: + if (!rtree_key_cmp( + PAGE_CUR_WITHIN, range_mbr_ptr, + rec)) { + area += 1; + } + + break; + + default: + ut_error; + } + } else { + switch (mode) { + case PAGE_CUR_CONTAIN: + case PAGE_CUR_INTERSECT: + area += rtree_area_overlapping( + range_mbr_ptr, rec) + / rec_area; + break; + + case PAGE_CUR_DISJOINT: + area += 1; + area -= rtree_area_overlapping( + range_mbr_ptr, rec) + / rec_area; + break; + + case PAGE_CUR_WITHIN: + case PAGE_CUR_MBR_EQUAL: + if (!rtree_key_cmp( + PAGE_CUR_WITHIN, range_mbr_ptr, + rec)) { + area += range_area / rec_area; + } + + break; + default: + ut_error; + } + } + } + + mtr.commit(); + + if (!std::isfinite(area)) { + return(HA_POS_ERROR); + } + + area /= n_recs; + return ha_rows(static_cast(dict_table_get_n_rows(index->table)) + * area); +} diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc new file mode 100644 index 00000000..8ca8681b --- /dev/null +++ b/storage/innobase/gis/gis0sea.cc @@ -0,0 +1,2403 @@ +/***************************************************************************** + +Copyright (c) 2016, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file gis/gis0sea.cc +InnoDB R-tree search interfaces + +Created 2014/01/16 Jimmy Yang +***********************************************************************/ + +#include "fsp0fsp.h" +#include "page0page.h" +#include "page0cur.h" +#include "page0zip.h" +#include "gis0rtree.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "ibuf0ibuf.h" +#include "trx0trx.h" +#include "srv0mon.h" +#include "que0que.h" +#include "gis0geo.h" + +/** Restore the stored position of a persistent cursor bufferfixing the page */ +static +bool +rtr_cur_restore_position( + btr_cur_t* cursor, /*!< in: detached persistent cursor */ + ulint level, /*!< in: index level */ + mtr_t* mtr); /*!< in: mtr */ + +/*************************************************************//** +Pop out used parent path entry, until we find the parent with matching +page number */ +static +void +rtr_adjust_parent_path( +/*===================*/ + rtr_info_t* rtr_info, /* R-Tree info struct */ + ulint page_no) /* page number to look for */ +{ + while (!rtr_info->parent_path->empty()) { + if (rtr_info->parent_path->back().child_no == page_no) { + break; + } else { + if (rtr_info->parent_path->back().cursor) { + btr_pcur_close( + rtr_info->parent_path->back().cursor); + ut_free(rtr_info->parent_path->back().cursor); + } + + rtr_info->parent_path->pop_back(); + } + } +} + +/** Latches the leaf page or pages requested. +@param[in] block_savepoint leaf page where the search converged +@param[in] latch_mode BTR_SEARCH_LEAF, ... 
+@param[in] cursor cursor +@param[in] mtr mini-transaction */ +static void +rtr_latch_leaves( + ulint block_savepoint, + btr_latch_mode latch_mode, + btr_cur_t* cursor, + mtr_t* mtr) +{ + compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH)); + compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH)); + compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH)); + + buf_block_t* block = mtr->at_savepoint(block_savepoint); + + ut_ad(block->page.id().space() == cursor->index()->table->space->id); + ut_ad(block->page.in_file()); + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_S_LOCK + | MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + switch (latch_mode) { + uint32_t left_page_no; + uint32_t right_page_no; + default: + ut_ad(latch_mode == BTR_CONT_MODIFY_TREE); + break; + case BTR_MODIFY_TREE: + /* It is exclusive for other operations which calls + btr_page_set_prev() */ + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + /* x-latch also siblings from left to right */ + left_page_no = btr_page_get_prev(block->page.frame); + + if (left_page_no != FIL_NULL) { + btr_block_get(*cursor->index(), left_page_no, RW_X_LATCH, + true, mtr); + } + + mtr->upgrade_buffer_fix(block_savepoint, RW_X_LATCH); + + right_page_no = btr_page_get_next(block->page.frame); + + if (right_page_no != FIL_NULL) { + btr_block_get(*cursor->index(), right_page_no, + RW_X_LATCH, true, mtr); + } + break; + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + rw_lock_type_t mode = + rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH)); + static_assert(int{RW_S_LATCH} == int{BTR_SEARCH_LEAF}, ""); + static_assert(int{RW_X_LATCH} == int{BTR_MODIFY_LEAF}, ""); + mtr->upgrade_buffer_fix(block_savepoint, mode); + } +} + +/*************************************************************//** +Find the next matching record. This function is used by search +or record locating during index delete/update. +@return true if there is suitable record found, otherwise false */ +TRANSACTIONAL_TARGET +static +bool +rtr_pcur_getnext_from_path( +/*=======================*/ + const dtuple_t* tuple, /*!< in: data tuple */ + page_cur_mode_t mode, /*!< in: cursor search mode */ + btr_cur_t* btr_cur,/*!< in: persistent cursor; NOTE that the + function may release the page latch */ + ulint target_level, + /*!< in: target level */ + ulint latch_mode, + /*!< in: latch_mode */ + bool index_locked, + /*!< in: index tree locked */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index = btr_cur->index(); + bool found = false; + page_cur_t* page_cursor; + ulint level = 0; + node_visit_t next_rec; + rtr_info_t* rtr_info = btr_cur->rtr_info; + node_seq_t page_ssn; + ulint skip_parent = false; + bool new_split = false; + bool for_delete = false; + bool for_undo_ins = false; + + /* exhausted all the pages to be searched */ + if (rtr_info->path->empty()) { + return(false); + } + + ut_ad(dtuple_get_n_fields_cmp(tuple)); + + const auto my_latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + for_delete = latch_mode & BTR_RTREE_DELETE_MARK; + for_undo_ins = latch_mode & BTR_RTREE_UNDO_INS; + + /* There should be no insert coming to this function. Only + mode with BTR_MODIFY_* should be delete */ + ut_ad(mode != PAGE_CUR_RTREE_INSERT); + ut_ad(my_latch_mode == BTR_SEARCH_LEAF + || my_latch_mode == BTR_MODIFY_LEAF + || my_latch_mode == BTR_MODIFY_TREE + || my_latch_mode == BTR_CONT_MODIFY_TREE); + + /* Whether need to track parent information. 
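The "path" and "parent_path" consumed by this loop are stacks of visited nodes. As a simplified mental model only (the real node_visit_t also carries a saved persistent cursor and an MBR-increase value, and access is serialized by rtr_path_mutex):

    #include <cstdint>
    #include <vector>

    struct node_visit_model {     // simplified stand-in, not the InnoDB type
        uint32_t page_no;         // page to (re)visit
        uint32_t seq_no;          // page SSN observed when the entry was pushed
        int      level;           // tree level of that page
    };
    using path_model = std::vector<node_visit_model>;

Each iteration pops one entry, latches that page, searches it, and may push split siblings or qualifying children back onto the stack.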
Only need so + when we do tree altering operations (such as index page merge) */ + static_assert(BTR_CONT_MODIFY_TREE == (4 | BTR_MODIFY_TREE), ""); + + const bool need_parent = mode == PAGE_CUR_RTREE_LOCATE + && (my_latch_mode | 4) == BTR_CONT_MODIFY_TREE; + + if (!index_locked) { + ut_ad(mtr->is_empty()); + mtr_s_lock_index(index, mtr); + } else { + ut_ad(mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_SX_LOCK + | MTR_MEMO_S_LOCK + | MTR_MEMO_X_LOCK)); + } + + const ulint zip_size = index->table->space->zip_size(); + + /* Pop each node/page to be searched from "path" structure + and do a search on it. Please note, any pages that are in + the "path" structure are protected by "page" lock, so tey + cannot be shrunk away */ + do { + buf_block_t* block; + node_seq_t path_ssn; + const page_t* page; + rw_lock_type_t rw_latch; + + mysql_mutex_lock(&rtr_info->rtr_path_mutex); + next_rec = rtr_info->path->back(); + rtr_info->path->pop_back(); + level = next_rec.level; + path_ssn = next_rec.seq_no; + + /* Maintain the parent path info as well, if needed */ + if (need_parent && !skip_parent && !new_split) { + ulint old_level; + ulint new_level; + + ut_ad(!rtr_info->parent_path->empty()); + + /* Cleanup unused parent info */ + if (rtr_info->parent_path->back().cursor) { + btr_pcur_close( + rtr_info->parent_path->back().cursor); + ut_free(rtr_info->parent_path->back().cursor); + } + + old_level = rtr_info->parent_path->back().level; + + rtr_info->parent_path->pop_back(); + + ut_ad(!rtr_info->parent_path->empty()); + + /* check whether there is a level change. If so, + the current parent path needs to pop enough + nodes to adjust to the new search page */ + new_level = rtr_info->parent_path->back().level; + + if (old_level < new_level) { + rtr_adjust_parent_path( + rtr_info, next_rec.page_no); + } + + ut_ad(!rtr_info->parent_path->empty()); + + ut_ad(next_rec.page_no + == rtr_info->parent_path->back().child_no); + } + + mysql_mutex_unlock(&rtr_info->rtr_path_mutex); + + skip_parent = false; + new_split = false; + + /* Once we have pages in "path", these pages are + predicate page locked, so they can't be shrunk away. + They also have SSN (split sequence number) to detect + splits, so we can directly latch single page while + getting them. They can be unlatched if not qualified. + One reason for pre-latch is that we might need to position + some parent position (requires latch) during search */ + if (level == 0) { + static_assert(ulint{BTR_SEARCH_LEAF} == + ulint{RW_S_LATCH}, ""); + static_assert(ulint{BTR_MODIFY_LEAF} == + ulint{RW_X_LATCH}, ""); + rw_latch = (my_latch_mode | 4) == BTR_CONT_MODIFY_TREE + ? RW_NO_LATCH + : rw_lock_type_t(my_latch_mode); + } else { + rw_latch = RW_X_LATCH; + } + + if (my_latch_mode == BTR_MODIFY_LEAF) { + mtr->rollback_to_savepoint(1); + } + + ut_ad((my_latch_mode | 4) == BTR_CONT_MODIFY_TREE + || !page_is_leaf(btr_cur_get_page(btr_cur)) + || !btr_cur->page_cur.block->page.lock.have_any()); + + const auto block_savepoint = mtr->get_savepoint(); + block = buf_page_get_gen( + page_id_t(index->table->space_id, + next_rec.page_no), zip_size, + rw_latch, NULL, BUF_GET, mtr); + + if (!block) { + found = false; + break; + } + + page = buf_block_get_frame(block); + page_ssn = page_get_ssn_id(page); + + /* If there are splits, push the splitted page. 
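The split check that this comment introduces rests on one invariant: a page split moves records to a new right sibling and advances the page's split sequence number (SSN). A one-line restatement of the rule, with illustrative names:

    #include <cstdint>

    // If the page's current SSN exceeds the SSN recorded when the page
    // number was pushed onto the path, the page has split in between,
    // so the right sibling must be queued as well.
    inline bool must_queue_right_sibling(uint32_t page_ssn, uint32_t path_ssn)
    {
        return page_ssn > path_ssn;
    }
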
+ Note that we have SX lock on index->lock, there + should not be any split/shrink happening here */ + if (page_ssn > path_ssn) { + uint32_t next_page_no = btr_page_get_next(page); + rtr_non_leaf_stack_push( + rtr_info->path, next_page_no, path_ssn, + level, 0, NULL, 0); + + if (!srv_read_only_mode + && mode != PAGE_CUR_RTREE_INSERT + && mode != PAGE_CUR_RTREE_LOCATE) { + ut_ad(rtr_info->thr); + lock_place_prdt_page_lock( + page_id_t(block->page.id().space(), + next_page_no), + index, + rtr_info->thr); + } + new_split = true; +#if defined(UNIV_GIS_DEBUG) + fprintf(stderr, + "GIS_DIAG: Splitted page found: %d, %ld\n", + static_cast(need_parent), next_page_no); +#endif + } + + page_cursor = btr_cur_get_page_cur(btr_cur); + page_cursor->rec = NULL; + page_cursor->block = block; + + if (mode == PAGE_CUR_RTREE_LOCATE) { + if (target_level == 0 && level == 0) { + ulint low_match = 0, up_match = 0; + + found = false; + + if (!page_cur_search_with_match( + tuple, PAGE_CUR_LE, + &up_match, &low_match, + btr_cur_get_page_cur(btr_cur), nullptr) + && low_match + == dtuple_get_n_fields_cmp(tuple)) { + rec_t* rec = btr_cur_get_rec(btr_cur); + + if (!rec_get_deleted_flag(rec, + dict_table_is_comp(index->table)) + || (!for_delete && !for_undo_ins)) { + found = true; + btr_cur->low_match = low_match; + } else { + /* mark we found deleted row */ + btr_cur->rtr_info->fd_del + = true; + } + } + } else { + page_cur_mode_t page_mode = mode; + + if (level == target_level + && target_level != 0) { + page_mode = PAGE_CUR_RTREE_GET_FATHER; + } + found = rtr_cur_search_with_match( + block, index, tuple, page_mode, + page_cursor, btr_cur->rtr_info); + + /* Save the position of parent if needed */ + if (found && need_parent) { + btr_pcur_t* r_cursor = + rtr_get_parent_cursor( + btr_cur, level, false); + + rec_t* rec = page_cur_get_rec( + page_cursor); + page_cur_position( + rec, block, + btr_pcur_get_page_cur(r_cursor)); + r_cursor->pos_state = + BTR_PCUR_IS_POSITIONED; + r_cursor->latch_mode = my_latch_mode; + btr_pcur_store_position(r_cursor, mtr); + ut_d(ulint num_stored =) + rtr_store_parent_path( + block, btr_cur, + btr_latch_mode(rw_latch), + level, mtr); + ut_ad(num_stored > 0); + } + } + } else { + found = rtr_cur_search_with_match( + block, index, tuple, mode, page_cursor, + btr_cur->rtr_info); + } + + /* Attach predicate lock if needed, no matter whether + there are matched records */ + if (mode != PAGE_CUR_RTREE_INSERT + && mode != PAGE_CUR_RTREE_LOCATE + && mode >= PAGE_CUR_CONTAIN + && btr_cur->rtr_info->need_prdt_lock) { + lock_prdt_t prdt; + + trx_t* trx = thr_get_trx( + btr_cur->rtr_info->thr); + { + TMLockTrxGuard g{TMLockTrxArgs(*trx)}; + lock_init_prdt_from_mbr( + &prdt, &btr_cur->rtr_info->mbr, + mode, trx->lock.lock_heap); + } + + if (rw_latch == RW_NO_LATCH) { + block->page.lock.s_lock(); + } + + lock_prdt_lock(block, &prdt, index, LOCK_S, + LOCK_PREDICATE, btr_cur->rtr_info->thr); + + if (rw_latch == RW_NO_LATCH) { + block->page.lock.s_unlock(); + } + } + + if (found) { + if (level == target_level) { + ut_ad(block + == mtr->at_savepoint(block_savepoint)); + + if (my_latch_mode == BTR_MODIFY_TREE + && level == 0) { + ut_ad(rw_latch == RW_NO_LATCH); + + rtr_latch_leaves( + block_savepoint, + BTR_MODIFY_TREE, + btr_cur, mtr); + } + + page_cur_position( + page_cur_get_rec(page_cursor), + page_cur_get_block(page_cursor), + btr_cur_get_page_cur(btr_cur)); + + btr_cur->low_match = level != 0 ? 
+ DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1 + : btr_cur->low_match; + break; + } + + /* Keep the parent path node, which points to + last node just located */ + skip_parent = true; + } else { + mtr->release_last_page(); + } + + } while (!rtr_info->path->empty()); + + const rec_t* rec = btr_cur_get_rec(btr_cur); + + if (!page_rec_is_user_rec(rec)) { + mtr->commit(); + mtr->start(); + } else if (!index_locked) { + mtr->release(index->lock); + } + + return(found); +} + +/*************************************************************//** +Find the next matching record. This function will first exhaust +the copied record listed in the rtr_info->matches vector before +moving to the next page +@return true if there is suitable record found, otherwise false */ +bool +rtr_pcur_move_to_next( +/*==================*/ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! */ + page_cur_mode_t mode, /*!< in: cursor search mode */ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + ulint level, /*!< in: target level */ + mtr_t* mtr) /*!< in: mtr */ +{ + rtr_info_t* rtr_info = cursor->btr_cur.rtr_info; + + ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + mysql_mutex_lock(&rtr_info->matches->rtr_match_mutex); + /* First retrieve the next record on the current page */ + if (!rtr_info->matches->matched_recs->empty()) { + rtr_rec_t rec; + rec = rtr_info->matches->matched_recs->back(); + rtr_info->matches->matched_recs->pop_back(); + mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); + + cursor->btr_cur.page_cur.rec = rec.r_rec; + cursor->btr_cur.page_cur.block = &rtr_info->matches->block; + + DEBUG_SYNC_C("rtr_pcur_move_to_next_return"); + return(true); + } + + mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); + + /* Fetch the next page */ + return(rtr_pcur_getnext_from_path(tuple, mode, &cursor->btr_cur, + level, cursor->latch_mode, + false, mtr)); +} + +#ifdef UNIV_DEBUG +/*************************************************************//** +Check if the cursor holds record pointing to the specified child page +@return true if it is (pointing to the child page) false otherwise */ +static void rtr_compare_cursor_rec(const rec_t *rec, dict_index_t *index, + ulint page_no) +{ + if (!rec) + return; + mem_heap_t *heap= nullptr; + rec_offs *offsets= rec_get_offsets(rec, index, nullptr, 0, + ULINT_UNDEFINED, &heap); + ut_ad(btr_node_ptr_get_child_page_no(rec, offsets) == page_no); + mem_heap_free(heap); +} +#endif + +TRANSACTIONAL_TARGET +dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple, + page_cur_mode_t mode, + btr_latch_mode latch_mode, + btr_cur_t *cur, mtr_t *mtr) +{ + page_cur_mode_t page_mode; + page_cur_mode_t search_mode= PAGE_CUR_UNSUPP; + + bool mbr_adj= false; + bool found= false; + dict_index_t *const index= cur->index(); + + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + rec_offs_init(offsets_); + ut_ad(level == 0 || mode == PAGE_CUR_LE || RTREE_SEARCH_MODE(mode)); + ut_ad(dict_index_check_search_tuple(index, tuple)); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(index->is_spatial()); + ut_ad(index->page != FIL_NULL); + + MEM_UNDEFINED(&cur->up_match, sizeof cur->up_match); + MEM_UNDEFINED(&cur->up_bytes, sizeof cur->up_bytes); + MEM_UNDEFINED(&cur->low_match, sizeof cur->low_match); + MEM_UNDEFINED(&cur->low_bytes, sizeof cur->low_bytes); + ut_d(cur->up_match= 
ULINT_UNDEFINED); + ut_d(cur->low_match= ULINT_UNDEFINED); + + const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; + + ut_ad(!latch_by_caller + || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK + | MTR_MEMO_SX_LOCK)); + latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + ut_ad(!latch_by_caller || latch_mode == BTR_SEARCH_LEAF || + latch_mode == BTR_MODIFY_LEAF); + + cur->flag= BTR_CUR_BINARY; + +#ifndef BTR_CUR_ADAPT + buf_block_t *guess= nullptr; +#else + btr_search_t *const info= btr_search_get_info(index); + buf_block_t *guess= info->root_guess; +#endif + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ + + const ulint savepoint= mtr->get_savepoint(); + + rw_lock_type_t upper_rw_latch, root_leaf_rw_latch= RW_NO_LATCH; + + switch (latch_mode) { + case BTR_MODIFY_TREE: + mtr_x_lock_index(index, mtr); + upper_rw_latch= root_leaf_rw_latch= RW_X_LATCH; + break; + case BTR_CONT_MODIFY_TREE: + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | + MTR_MEMO_SX_LOCK)); + upper_rw_latch= RW_X_LATCH; + break; + default: + ut_ad(latch_mode != BTR_MODIFY_PREV); + ut_ad(latch_mode != BTR_SEARCH_PREV); + if (!latch_by_caller) + mtr_s_lock_index(index, mtr); + upper_rw_latch= root_leaf_rw_latch= RW_S_LATCH; + if (latch_mode == BTR_MODIFY_LEAF) + root_leaf_rw_latch= RW_X_LATCH; + } + + auto root_savepoint= mtr->get_savepoint(); + const ulint zip_size= index->table->space->zip_size(); + + /* Start with the root page. */ + page_id_t page_id(index->table->space_id, index->page); + + ulint up_match= 0, up_bytes= 0, low_match= 0, low_bytes= 0; + ulint height= ULINT_UNDEFINED; + + /* We use these modified search modes on non-leaf levels of the + B-tree. These let us end up in the right B-tree leaf. In that leaf + we use the original search mode. */ + + switch (mode) { + case PAGE_CUR_GE: + page_mode= PAGE_CUR_L; + break; + case PAGE_CUR_G: + page_mode= PAGE_CUR_LE; + break; + default: +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode) + || mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode)); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + page_mode= mode; + break; + } + + search_loop: + auto buf_mode= BUF_GET; + ulint rw_latch= RW_NO_LATCH; + + if (height) + { + /* We are about to fetch the root or a non-leaf page. */ + if (latch_mode != BTR_MODIFY_TREE || height == level) + /* If doesn't have SX or X latch of index, + each page should be latched before reading. 
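The latch choice being made here can be summarized as a hedged helper (a restatement of the branches below, not a function in the source; it leans on the static_asserts earlier in the file that make BTR_SEARCH_LEAF and BTR_MODIFY_LEAF numerically equal to RW_S_LATCH and RW_X_LATCH):

    static rw_lock_type_t pick_latch(ulint height, ulint level,
                                     btr_latch_mode latch_mode,
                                     rw_lock_type_t upper_rw_latch)
    {
      if (height)                        /* root or other non-leaf page */
        return latch_mode != BTR_MODIFY_TREE || height == level
               ? upper_rw_latch
               : RW_NO_LATCH;            /* tree is X-locked: buffer-fix only */
      if (latch_mode <= BTR_MODIFY_LEAF)
        return rw_lock_type_t(latch_mode);
      return RW_NO_LATCH;
    }
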
*/ + rw_latch= upper_rw_latch; + } + else if (latch_mode <= BTR_MODIFY_LEAF) + rw_latch= latch_mode; + + dberr_t err; + auto block_savepoint= mtr->get_savepoint(); + buf_block_t *block= buf_page_get_gen(page_id, zip_size, rw_latch, guess, + buf_mode, mtr, &err, false); + if (!block) + { + if (err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + if (mbr_adj) + /* remember that we will need to adjust parent MBR */ + cur->rtr_info->mbr_adj= true; + + return err; + } + + const page_t *page= buf_block_get_frame(block); +#ifdef UNIV_ZIP_DEBUG + if (rw_latch != RW_NO_LATCH) { + const page_zip_des_t *page_zip= buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + ut_ad(fil_page_index_page_check(page)); + ut_ad(index->id == btr_page_get_index_id(page)); + + if (height != ULINT_UNDEFINED); + else if (page_is_leaf(page) && + rw_latch != RW_NO_LATCH && rw_latch != root_leaf_rw_latch) + { + /* The root page is also a leaf page (root_leaf). + We should reacquire the page, because the root page + is latched differently from leaf pages. */ + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH); + + ut_ad(block == mtr->at_savepoint(block_savepoint)); + mtr->rollback_to_savepoint(block_savepoint); + + upper_rw_latch= root_leaf_rw_latch; + goto search_loop; + } + else + { + /* We are in the root node */ + + height= btr_page_get_level(page); + cur->tree_height= height + 1; + + ut_ad(cur->rtr_info); + + /* If SSN in memory is not initialized, fetch it from root page */ + if (!rtr_get_current_ssn_id(index)) + /* FIXME: do this in dict_load_table_one() */ + index->set_ssn(page_get_ssn_id(page) + 1); + + /* Save the MBR */ + cur->rtr_info->thr= cur->thr; + rtr_get_mbr_from_tuple(tuple, &cur->rtr_info->mbr); + +#ifdef BTR_CUR_ADAPT + info->root_guess= block; +#endif + } + + if (height == 0) { + if (rw_latch == RW_NO_LATCH) + { + ut_ad(block == mtr->at_savepoint(block_savepoint)); + rtr_latch_leaves(block_savepoint, latch_mode, cur, mtr); + } + + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + break; + default: + if (!latch_by_caller) + { + /* Release the tree s-latch */ + mtr->rollback_to_savepoint(savepoint, + savepoint + 1); + block_savepoint--; + root_savepoint--; + } + /* release upper blocks */ + if (savepoint < block_savepoint) + mtr->rollback_to_savepoint(savepoint, block_savepoint); + } + + page_mode= mode; + } + + /* Remember the page search mode */ + search_mode= page_mode; + + /* Some adjustment on search mode, when the page search mode is + PAGE_CUR_RTREE_LOCATE or PAGE_CUR_RTREE_INSERT, as we are searching + with MBRs. When it is not the target level, we should search all + sub-trees that "CONTAIN" the search range/MBR. When it is at the + target level, the search becomes PAGE_CUR_LE */ + + if (page_mode == PAGE_CUR_RTREE_INSERT) + { + page_mode= (level == height) + ? PAGE_CUR_LE + : PAGE_CUR_RTREE_INSERT; + + ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE); + } + else if (page_mode == PAGE_CUR_RTREE_LOCATE && level == height) + page_mode= level == 0 ? 
PAGE_CUR_LE : PAGE_CUR_RTREE_GET_FATHER; + + up_match= 0; + low_match= 0; + + if (latch_mode == BTR_MODIFY_TREE || latch_mode == BTR_CONT_MODIFY_TREE) + /* Tree are locked, no need for Page Lock to protect the "path" */ + cur->rtr_info->need_page_lock= false; + + cur->page_cur.block= block; + + if (page_mode >= PAGE_CUR_CONTAIN) + { + found= rtr_cur_search_with_match(block, index, tuple, page_mode, + &cur->page_cur, cur->rtr_info); + + /* Need to use BTR_MODIFY_TREE to do the MBR adjustment */ + if (search_mode == PAGE_CUR_RTREE_INSERT && cur->rtr_info->mbr_adj) { + static_assert(BTR_MODIFY_TREE == (8 | BTR_MODIFY_LEAF), ""); + + if (!(latch_mode & 8)) + /* Parent MBR needs updated, should retry with BTR_MODIFY_TREE */ + goto func_exit; + + cur->rtr_info->mbr_adj= false; + mbr_adj= true; + } + + if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) + cur->low_match= DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1; + } + else + { + /* Search for complete index fields. */ + up_bytes= low_bytes= 0; + if (page_cur_search_with_match(tuple, page_mode, &up_match, + &low_match, &cur->page_cur, nullptr)) { + err= DB_CORRUPTION; + goto func_exit; + } + } + + /* If this is the desired level, leave the loop */ + + ut_ad(height == btr_page_get_level(btr_cur_get_page(cur))); + + /* Add Predicate lock if it is serializable isolation + and only if it is in the search case */ + if (mode >= PAGE_CUR_CONTAIN && mode != PAGE_CUR_RTREE_INSERT && + mode != PAGE_CUR_RTREE_LOCATE && cur->rtr_info->need_prdt_lock) + { + lock_prdt_t prdt; + + { + trx_t* trx= thr_get_trx(cur->thr); + TMLockTrxGuard g{TMLockTrxArgs(*trx)}; + lock_init_prdt_from_mbr(&prdt, &cur->rtr_info->mbr, mode, + trx->lock.lock_heap); + } + + if (rw_latch == RW_NO_LATCH && height != 0) + block->page.lock.s_lock(); + + lock_prdt_lock(block, &prdt, index, LOCK_S, LOCK_PREDICATE, cur->thr); + + if (rw_latch == RW_NO_LATCH && height != 0) + block->page.lock.s_unlock(); + } + + if (level != height) + { + ut_ad(height > 0); + + height--; + guess= nullptr; + + const rec_t *node_ptr= btr_cur_get_rec(cur); + + offsets= rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + if (page_rec_is_supremum(node_ptr)) + { + cur->low_match= 0; + cur->up_match= 0; + goto func_exit; + } + + /* If we are doing insertion or record locating, + remember the tree nodes we visited */ + if (page_mode == PAGE_CUR_RTREE_INSERT || + (search_mode == PAGE_CUR_RTREE_LOCATE && + latch_mode != BTR_MODIFY_LEAF)) + { + const bool add_latch= latch_mode == BTR_MODIFY_TREE && + rw_latch == RW_NO_LATCH; + + if (add_latch) + { + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | + MTR_MEMO_SX_LOCK)); + block->page.lock.s_lock(); + } + + /* Store the parent cursor location */ + ut_d(auto num_stored=) + rtr_store_parent_path(block, cur, latch_mode, height + 1, mtr); + + if (page_mode == PAGE_CUR_RTREE_INSERT) + { + btr_pcur_t *r_cursor= rtr_get_parent_cursor(cur, height + 1, true); + /* If it is insertion, there should be only one parent for + each level traverse */ + ut_ad(num_stored == 1); + node_ptr= btr_pcur_get_rec(r_cursor); + } + + if (add_latch) + block->page.lock.s_unlock(); + + ut_ad(!page_rec_is_supremum(node_ptr)); + } + + ut_ad(page_mode == search_mode || + (page_mode == PAGE_CUR_WITHIN && + search_mode == PAGE_CUR_RTREE_LOCATE)); + page_mode= search_mode; + + if (height == level && latch_mode == BTR_MODIFY_TREE) + { + ut_ad(upper_rw_latch == RW_X_LATCH); + for (auto i= root_savepoint, n= mtr->get_savepoint(); i < n; i++) + mtr->upgrade_buffer_fix(i, 
RW_X_LATCH); + } + + /* Go to the child node */ + page_id.set_page_no(btr_node_ptr_get_child_page_no(node_ptr, offsets)); + + if (page_mode >= PAGE_CUR_CONTAIN && page_mode != PAGE_CUR_RTREE_INSERT) + { + rtr_node_path_t *path= cur->rtr_info->path; + + if (found && !path->empty()) + { + ut_ad(path->back().page_no == page_id.page_no()); + path->pop_back(); +#ifdef UNIV_DEBUG + if (page_mode == PAGE_CUR_RTREE_LOCATE && + latch_mode != BTR_MODIFY_LEAF) + { + btr_pcur_t* pcur= cur->rtr_info->parent_path->back().cursor; + rec_t *my_node_ptr= btr_pcur_get_rec(pcur); + + offsets= rec_get_offsets(my_node_ptr, index, offsets, + 0, ULINT_UNDEFINED, &heap); + + ut_ad(page_id.page_no() == + btr_node_ptr_get_child_page_no(my_node_ptr, offsets)); + } +#endif + } + } + + goto search_loop; + } + + if (level) + { + if (upper_rw_latch == RW_NO_LATCH) + { + ut_ad(latch_mode == BTR_CONT_MODIFY_TREE); + btr_block_get(*index, page_id.page_no(), RW_X_LATCH, false, mtr, &err); + } + else + { + ut_ad(mtr->memo_contains_flagged(block, upper_rw_latch)); + ut_ad(!latch_by_caller); + } + + if (page_mode <= PAGE_CUR_LE) + { + cur->low_match= low_match; + cur->up_match= up_match; + } + } + else + { + cur->low_match= low_match; + cur->low_bytes= low_bytes; + cur->up_match= up_match; + cur->up_bytes= up_bytes; + + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + } + + goto func_exit; +} + +dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple, + btr_latch_mode latch_mode, + mtr_t *mtr, page_cur_mode_t mode) +{ + return rtr_search_to_nth_level(0, tuple, mode, latch_mode, cur, mtr); +} + +/** Search for a spatial index leaf page record. +@param pcur cursor +@param tuple search tuple +@param mode search mode +@param mtr mini-transaction */ +dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple, + page_cur_mode_t mode, mtr_t *mtr) +{ +#ifdef UNIV_DEBUG + switch (mode) { + case PAGE_CUR_CONTAIN: + case PAGE_CUR_INTERSECT: + case PAGE_CUR_WITHIN: + case PAGE_CUR_DISJOINT: + case PAGE_CUR_MBR_EQUAL: + break; + default: + ut_ad("invalid mode" == 0); + } +#endif + pcur->latch_mode= BTR_SEARCH_LEAF; + pcur->search_mode= mode; + pcur->pos_state= BTR_PCUR_IS_POSITIONED; + pcur->trx_if_known= nullptr; + return rtr_search_leaf(&pcur->btr_cur, tuple, BTR_SEARCH_LEAF, mtr, mode); +} + +/**************************************************************//** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +bool rtr_search( + const dtuple_t* tuple, /*!< in: tuple on which search done */ + btr_latch_mode latch_mode,/*!< in: BTR_MODIFY_LEAF, ... 
*/ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + static_assert(BTR_MODIFY_TREE == (8 | BTR_MODIFY_LEAF), ""); + ut_ad(latch_mode & BTR_MODIFY_LEAF); + ut_ad(!(latch_mode & BTR_ALREADY_S_LATCHED)); + ut_ad(mtr->is_empty()); + + /* Initialize the cursor */ + + btr_pcur_init(cursor); + + cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + cursor->search_mode = PAGE_CUR_RTREE_LOCATE; + cursor->trx_if_known = nullptr; + + if (latch_mode & 8) { + mtr_x_lock_index(cursor->index(), mtr); + } else { + latch_mode + = btr_latch_mode(latch_mode | BTR_ALREADY_S_LATCHED); + mtr_sx_lock_index(cursor->index(), mtr); + } + + /* Search with the tree cursor */ + + btr_cur_t* btr_cursor = btr_pcur_get_btr_cur(cursor); + + btr_cursor->rtr_info + = rtr_create_rtr_info(false, false, + btr_cursor, cursor->index()); + + if (btr_cursor->thr) { + btr_cursor->rtr_info->need_page_lock = true; + btr_cursor->rtr_info->thr = btr_cursor->thr; + } + + if (rtr_search_leaf(btr_cursor, tuple, latch_mode, mtr) + != DB_SUCCESS) { + return true; + } + + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + + const rec_t* rec = btr_pcur_get_rec(cursor); + + const bool d= rec_get_deleted_flag( + rec, cursor->index()->table->not_redundant()); + + if (page_rec_is_infimum(rec) + || btr_pcur_get_low_match(cursor) != dtuple_get_n_fields(tuple) + || (d && latch_mode + & (BTR_RTREE_DELETE_MARK | BTR_RTREE_UNDO_INS))) { + + if (d && latch_mode & BTR_RTREE_DELETE_MARK) { + btr_cursor->rtr_info->fd_del = true; + btr_cursor->low_match = 0; + } + + mtr->rollback_to_savepoint(1); + + if (!rtr_pcur_getnext_from_path(tuple, PAGE_CUR_RTREE_LOCATE, + btr_cursor, 0, latch_mode, + true, mtr)) { + return true; + } + + ut_ad(btr_pcur_get_low_match(cursor) + == dtuple_get_n_fields(tuple)); + } + + if (!(latch_mode & 8)) { + mtr->rollback_to_savepoint(0, 1); + } + + return false; +} + +/* Get the rtree page father. +@param[in,out] mtr mtr +@param[in] sea_cur search cursor, contains information + about parent nodes in search +@param[out] cursor cursor on node pointer record, + its page x-latched +@return whether the cursor was successfully positioned */ +bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor) +{ + mem_heap_t *heap = mem_heap_create(100); + rec_offs *offsets= rtr_page_get_father_block(nullptr, heap, + mtr, sea_cur, cursor); + mem_heap_free(heap); + return offsets != nullptr; +} + +MY_ATTRIBUTE((warn_unused_result)) +/********************************************************************//** +Returns the upper level node pointer to a R-Tree page. It is assumed +that mtr holds an x-latch on the tree. */ +static const rec_t* rtr_get_father_node( + ulint level, /*!< in: the tree level of search */ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! */ + btr_cur_t* sea_cur,/*!< in: search cursor */ + btr_cur_t* btr_cur,/*!< in/out: tree cursor; the cursor page is + s- or x-latched, but see also above! */ + ulint page_no,/*!< Current page no */ + mtr_t* mtr) /*!< in: mtr */ +{ + const rec_t* rec = nullptr; + auto had_rtr = btr_cur->rtr_info; + dict_index_t* const index = btr_cur->index(); + + /* Try to optimally locate the parent node. 
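The parent lookup below is possible because a spatial node-pointer record stores the DATA_MBR_LEN bytes of the bounding rectangle followed immediately by the 4-byte child page number; cmp_dtuple_rec_with_gis_internal() later in this file compares exactly that trailing field. An illustrative accessor (hypothetical helper, not InnoDB API):

    // Read the child page number stored right after the serialized MBR,
    // mirroring the mach_read_from_4(rec + DATA_MBR_LEN) pattern used
    // elsewhere in this file.
    inline uint32_t node_ptr_child_page(const byte* rec)
    {
        return mach_read_from_4(rec + DATA_MBR_LEN);
    }
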
Level should always + less than sea_cur->tree_height unless the root is splitting */ + if (sea_cur && sea_cur->tree_height > level) { + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + if (rtr_cur_restore_position(sea_cur, level, mtr)) { + btr_pcur_t* r_cursor = rtr_get_parent_cursor( + sea_cur, level, false); + + rec = btr_pcur_get_rec(r_cursor); + + ut_ad(r_cursor->rel_pos == BTR_PCUR_ON); + page_cur_position(rec, + btr_pcur_get_block(r_cursor), + btr_cur_get_page_cur(btr_cur)); + had_rtr = btr_cur->rtr_info = sea_cur->rtr_info; + btr_cur->tree_height = sea_cur->tree_height; + } + goto func_exit; + } + + /* We arrive here in one of two scenario + 1) check table and btr_valide + 2) index root page being raised */ + + if (btr_cur->rtr_info) { + rtr_clean_rtr_info(btr_cur->rtr_info, true); + } + + btr_cur->rtr_info = rtr_create_rtr_info(false, false, btr_cur, index); + + if (rtr_search_to_nth_level(level, tuple, PAGE_CUR_RTREE_LOCATE, + BTR_CONT_MODIFY_TREE, btr_cur, mtr) + != DB_SUCCESS) { + } else if (sea_cur && sea_cur->tree_height == level) { + rec = btr_cur_get_rec(btr_cur); + } else { + /* btr_validate */ + ut_ad(level >= 1); + ut_ad(!sea_cur); + + rec = btr_cur_get_rec(btr_cur); + const ulint n_fields = dtuple_get_n_fields_cmp(tuple); + + if (page_rec_is_infimum(rec) + || (btr_cur->low_match != n_fields)) { + if (!rtr_pcur_getnext_from_path( + tuple, PAGE_CUR_RTREE_LOCATE, btr_cur, + level, BTR_CONT_MODIFY_TREE, true, mtr)) { + rec = nullptr; + } else { + ut_ad(btr_cur->low_match == n_fields); + rec = btr_cur_get_rec(btr_cur); + } + } + } + +func_exit: + ut_d(rtr_compare_cursor_rec(rec, index, page_no)); + + if (!had_rtr && btr_cur->rtr_info) { + rtr_clean_rtr_info(btr_cur->rtr_info, true); + btr_cur->rtr_info = NULL; + } + + return rec; +} + +/** Returns the upper level node pointer to a R-Tree page. It is assumed +that mtr holds an SX-latch or X-latch on the tree. +@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +rtr_page_get_father_node_ptr( + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + btr_cur_t* sea_cur,/*!< in: search cursor */ + btr_cur_t* cursor, /*!< in: cursor pointing to user record, + out: cursor on node pointer record, + its page x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + dtuple_t* tuple; + ulint level; + ulint page_no; + dict_index_t* index; + rtr_mbr_t mbr; + + page_no = btr_cur_get_block(cursor)->page.id().page_no(); + index = btr_cur_get_index(cursor); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + ut_ad(dict_index_get_page(index) != page_no); + + level = btr_page_get_level(btr_cur_get_page(cursor)); + + const rec_t* user_rec = btr_cur_get_rec(cursor); + ut_a(page_rec_is_user_rec(user_rec)); + + offsets = rec_get_offsets(user_rec, index, offsets, + level ? 
0 : index->n_fields, + ULINT_UNDEFINED, &heap); + rtr_get_mbr_from_rec(user_rec, offsets, &mbr); + + tuple = rtr_index_build_node_ptr( + index, &mbr, user_rec, page_no, heap); + + if (sea_cur && !sea_cur->rtr_info) { + sea_cur = NULL; + } + + const rec_t* node_ptr = rtr_get_father_node(level + 1, tuple, + sea_cur, cursor, + page_no, mtr); + if (!node_ptr) { + return nullptr; + } + + ut_ad(!page_rec_is_comp(node_ptr) + || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR); + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) { + offsets = nullptr; + } + + return(offsets); +} + +/************************************************************//** +Returns the father block to a page. It is assumed that mtr holds +an X or SX latch on the tree. +@return rec_get_offsets() of the node pointer record */ +rec_offs* +rtr_page_get_father_block( +/*======================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* sea_cur,/*!< in: search cursor, contains information + about parent nodes in search */ + btr_cur_t* cursor) /*!< out: cursor on node pointer record, + its page x-latched */ +{ + rec_t *rec= + page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame)); + if (!rec) + return nullptr; + cursor->page_cur.rec= rec; + return rtr_page_get_father_node_ptr(offsets, heap, sea_cur, cursor, mtr); +} + +/*******************************************************************//** +Create a RTree search info structure */ +rtr_info_t* +rtr_create_rtr_info( +/******************/ + bool need_prdt, /*!< in: Whether predicate lock + is needed */ + bool init_matches, /*!< in: Whether to initiate the + "matches" structure for collecting + matched leaf records */ + btr_cur_t* cursor, /*!< in: tree search cursor */ + dict_index_t* index) /*!< in: index struct */ +{ + rtr_info_t* rtr_info; + + index = index ? 
index : cursor->index(); + ut_ad(index); + + rtr_info = static_cast<rtr_info_t*>(ut_zalloc_nokey(sizeof(*rtr_info))); + + rtr_info->allocated = true; + rtr_info->cursor = cursor; + rtr_info->index = index; + + if (init_matches) { + rtr_info->heap = mem_heap_create(sizeof(*(rtr_info->matches))); + rtr_info->matches = static_cast<matched_rec_t*>( + mem_heap_zalloc( + rtr_info->heap, + sizeof(*rtr_info->matches))); + + rtr_info->matches->matched_recs + = UT_NEW_NOKEY(rtr_rec_vector()); + + rtr_info->matches->bufp = page_align(rtr_info->matches->rec_buf + + UNIV_PAGE_SIZE_MAX + 1); + mysql_mutex_init(rtr_match_mutex_key, + &rtr_info->matches->rtr_match_mutex, + nullptr); + rtr_info->matches->block.page.lock.init(); + } + + rtr_info->path = UT_NEW_NOKEY(rtr_node_path_t()); + rtr_info->parent_path = UT_NEW_NOKEY(rtr_node_path_t()); + rtr_info->need_prdt_lock = need_prdt; + mysql_mutex_init(rtr_path_mutex_key, &rtr_info->rtr_path_mutex, + nullptr); + + mysql_mutex_lock(&index->rtr_track->rtr_active_mutex); + index->rtr_track->rtr_active.push_front(rtr_info); + mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex); + return(rtr_info); +} + +/*******************************************************************//** +Update a btr_cur_t with rtr_info */ +void +rtr_info_update_btr( +/******************/ + btr_cur_t* cursor, /*!< in/out: tree cursor */ + rtr_info_t* rtr_info) /*!< in: rtr_info to set to the + cursor */ +{ + ut_ad(rtr_info); + + cursor->rtr_info = rtr_info; +} + +/*******************************************************************//** +Initialize an R-Tree search structure */ +void +rtr_init_rtr_info( +/****************/ + rtr_info_t* rtr_info, /*!< in: rtr_info to set to the + cursor */ + bool need_prdt, /*!< in: Whether predicate lock is + needed */ + btr_cur_t* cursor, /*!< in: tree search cursor */ + dict_index_t* index, /*!< in: index structure */ + bool reinit) /*!< in: Whether this is a reinit */ +{ + ut_ad(rtr_info); + + if (!reinit) { + /* Reset all members.
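Both rtr_create_rtr_info() above and rtr_init_rtr_info() register the search in index->rtr_track->rtr_active under rtr_active_mutex, so that rtr_check_discard_page() can later walk every live search and purge references to a page that is about to disappear. A simplified model of that bookkeeping (std::mutex and std::list stand in for the actual types):

    #include <list>
    #include <mutex>

    struct rtr_track_model {
        std::mutex       mtx;
        std::list<void*> active;                    // live rtr_info objects
        void track(void* info)
        { std::lock_guard<std::mutex> g(mtx); active.push_front(info); }
        void untrack(void* info)
        { std::lock_guard<std::mutex> g(mtx); active.remove(info); }
    };
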
*/ + memset(rtr_info, 0, sizeof *rtr_info); + static_assert(PAGE_CUR_UNSUPP == 0, "compatibility"); + mysql_mutex_init(rtr_path_mutex_key, &rtr_info->rtr_path_mutex, + nullptr); + } + + ut_ad(!rtr_info->matches || rtr_info->matches->matched_recs->empty()); + + rtr_info->path = UT_NEW_NOKEY(rtr_node_path_t()); + rtr_info->parent_path = UT_NEW_NOKEY(rtr_node_path_t()); + rtr_info->need_prdt_lock = need_prdt; + rtr_info->cursor = cursor; + rtr_info->index = index; + + mysql_mutex_lock(&index->rtr_track->rtr_active_mutex); + index->rtr_track->rtr_active.push_front(rtr_info); + mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex); +} + +/**************************************************************//** +Clean up R-Tree search structure */ +void +rtr_clean_rtr_info( +/*===============*/ + rtr_info_t* rtr_info, /*!< in: RTree search info */ + bool free_all) /*!< in: need to free rtr_info itself */ +{ + dict_index_t* index; + bool initialized = false; + + if (!rtr_info) { + return; + } + + index = rtr_info->index; + + if (index) { + mysql_mutex_lock(&index->rtr_track->rtr_active_mutex); + } + + while (rtr_info->parent_path && !rtr_info->parent_path->empty()) { + btr_pcur_t* cur = rtr_info->parent_path->back().cursor; + rtr_info->parent_path->pop_back(); + + if (cur) { + btr_pcur_close(cur); + ut_free(cur); + } + } + + UT_DELETE(rtr_info->parent_path); + rtr_info->parent_path = NULL; + + if (rtr_info->path != NULL) { + UT_DELETE(rtr_info->path); + rtr_info->path = NULL; + initialized = true; + } + + if (rtr_info->matches) { + rtr_info->matches->used = false; + rtr_info->matches->locked = false; + rtr_info->matches->valid = false; + rtr_info->matches->matched_recs->clear(); + } + + if (index) { + index->rtr_track->rtr_active.remove(rtr_info); + mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex); + } + + if (free_all) { + if (rtr_info->matches) { + if (rtr_info->matches->matched_recs != NULL) { + UT_DELETE(rtr_info->matches->matched_recs); + } + + rtr_info->matches->block.page.lock.free(); + + mysql_mutex_destroy( + &rtr_info->matches->rtr_match_mutex); + } + + if (rtr_info->heap) { + mem_heap_free(rtr_info->heap); + } + + if (initialized) { + mysql_mutex_destroy(&rtr_info->rtr_path_mutex); + } + + if (rtr_info->allocated) { + ut_free(rtr_info); + } + } +} + +/**************************************************************//** +Rebuilt the "path" to exclude the removing page no */ +static +void +rtr_rebuild_path( +/*=============*/ + rtr_info_t* rtr_info, /*!< in: RTree search info */ + ulint page_no) /*!< in: need to free rtr_info itself */ +{ + rtr_node_path_t* new_path + = UT_NEW_NOKEY(rtr_node_path_t()); + + rtr_node_path_t::iterator rit; +#ifdef UNIV_DEBUG + ulint before_size = rtr_info->path->size(); +#endif /* UNIV_DEBUG */ + + for (rit = rtr_info->path->begin(); + rit != rtr_info->path->end(); ++rit) { + node_visit_t next_rec = *rit; + + if (next_rec.page_no == page_no) { + continue; + } + + new_path->push_back(next_rec); +#ifdef UNIV_DEBUG + node_visit_t rec = new_path->back(); + ut_ad(rec.level < rtr_info->cursor->tree_height + && rec.page_no > 0); +#endif /* UNIV_DEBUG */ + } + + UT_DELETE(rtr_info->path); + + ut_ad(new_path->size() == before_size - 1); + + rtr_info->path = new_path; + + if (!rtr_info->parent_path->empty()) { + rtr_node_path_t* new_parent_path = UT_NEW_NOKEY( + rtr_node_path_t()); + + for (rit = rtr_info->parent_path->begin(); + rit != rtr_info->parent_path->end(); ++rit) { + node_visit_t next_rec = *rit; + + if (next_rec.child_no == page_no) { + btr_pcur_t* cur = 
next_rec.cursor; + + if (cur) { + btr_pcur_close(cur); + ut_free(cur); + } + + continue; + } + + new_parent_path->push_back(next_rec); + } + UT_DELETE(rtr_info->parent_path); + rtr_info->parent_path = new_parent_path; + } + +} + +/**************************************************************//** +Check whether a discarding page is in anyone's search path */ +void +rtr_check_discard_page( +/*===================*/ + dict_index_t* index, /*!< in: index */ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + buf_block_t* block) /*!< in: block of page to be discarded */ +{ + const page_id_t id{block->page.id()}; + + mysql_mutex_lock(&index->rtr_track->rtr_active_mutex); + + for (const auto& rtr_info : index->rtr_track->rtr_active) { + if (cursor && rtr_info == cursor->rtr_info) { + continue; + } + + mysql_mutex_lock(&rtr_info->rtr_path_mutex); + for (const node_visit_t& node : *rtr_info->path) { + if (node.page_no == id.page_no()) { + rtr_rebuild_path(rtr_info, node.page_no); + break; + } + } + mysql_mutex_unlock(&rtr_info->rtr_path_mutex); + + if (auto matches = rtr_info->matches) { + mysql_mutex_lock(&matches->rtr_match_mutex); + + if (matches->block.page.id() == id) { + matches->matched_recs->clear(); + matches->valid = false; + } + + mysql_mutex_unlock(&matches->rtr_match_mutex); + } + } + + mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex); + + lock_sys.prdt_page_free_from_discard(id, true); +} + +/** Structure acts as functor to get the optimistic access of the page. +It returns true if it successfully gets the page. */ +struct optimistic_get +{ + btr_pcur_t *const r_cursor; + mtr_t *const mtr; + + optimistic_get(btr_pcur_t *r_cursor,mtr_t *mtr) + :r_cursor(r_cursor), mtr(mtr) {} + + bool operator()(buf_block_t *hint) const + { + return hint && buf_page_optimistic_get( + RW_X_LATCH, hint, r_cursor->modify_clock, mtr); + } +}; + +/** Restore the stored position of a persistent cursor bufferfixing the page */ +static +bool +rtr_cur_restore_position( + btr_cur_t* btr_cur, /*!< in: detached persistent cursor */ + ulint level, /*!< in: index level */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + mem_heap_t* heap; + btr_pcur_t* r_cursor = rtr_get_parent_cursor(btr_cur, level, false); + dtuple_t* tuple; + bool ret = false; + + ut_ad(mtr); + ut_ad(r_cursor); + ut_ad(mtr->is_active()); + + index = btr_cur_get_index(btr_cur); + ut_ad(r_cursor->index() == btr_cur->index()); + + if (r_cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE + || r_cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) { + return(false); + } + + DBUG_EXECUTE_IF( + "rtr_pessimistic_position", + r_cursor->modify_clock = 100; + ); + + if (r_cursor->block_when_stored.run_with_hint( + optimistic_get(r_cursor, mtr))) { + ut_ad(r_cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + ut_ad(r_cursor->rel_pos == BTR_PCUR_ON); +#ifdef UNIV_DEBUG + do { + const rec_t* rec; + const rec_offs* offsets1; + const rec_offs* offsets2; + ulint comp; + + rec = btr_pcur_get_rec(r_cursor); + + heap = mem_heap_create(256); + offsets1 = rec_get_offsets( + r_cursor->old_rec, index, NULL, + level ? 0 : r_cursor->old_n_fields, + r_cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets( + rec, index, NULL, + level ? 
0 : r_cursor->old_n_fields, + r_cursor->old_n_fields, &heap); + + comp = rec_offs_comp(offsets1); + + if (rec_get_info_bits(r_cursor->old_rec, comp) + & REC_INFO_MIN_REC_FLAG) { + ut_ad(rec_get_info_bits(rec, comp) + & REC_INFO_MIN_REC_FLAG); + } else { + + ut_ad(!cmp_rec_rec(r_cursor->old_rec, + rec, offsets1, offsets2, + index)); + } + + mem_heap_free(heap); + } while (0); +#endif /* UNIV_DEBUG */ + + return(true); + } + + /* Page has changed, for R-Tree, the page cannot be shrunk away, + so we search the page and its right siblings */ + node_seq_t page_ssn; + const page_t* page; + page_cur_t* page_cursor; + node_visit_t* node = rtr_get_parent_node(btr_cur, level, false); + node_seq_t path_ssn = node->seq_no; + const unsigned zip_size = index->table->space->zip_size(); + uint32_t page_no = node->page_no; + + heap = mem_heap_create(256); + + tuple = dict_index_build_data_tuple(r_cursor->old_rec, index, !level, + r_cursor->old_n_fields, heap); + + page_cursor = btr_pcur_get_page_cur(r_cursor); + ut_ad(r_cursor == node->cursor); + +search_again: + ulint up_match = 0, low_match = 0; + + page_cursor->block = buf_page_get_gen( + page_id_t(index->table->space_id, page_no), + zip_size, RW_X_LATCH, NULL, BUF_GET, mtr); + + if (!page_cursor->block) { +corrupted: + ret = false; + goto func_exit; + } + + /* Get the page SSN */ + page = buf_block_get_frame(page_cursor->block); + page_ssn = page_get_ssn_id(page); + + if (page_cur_search_with_match(tuple, PAGE_CUR_LE, + &up_match, &low_match, page_cursor, + nullptr)) { + goto corrupted; + } + + if (low_match == r_cursor->old_n_fields) { + const rec_t* rec; + const rec_offs* offsets1; + const rec_offs* offsets2; + ulint comp; + + rec = btr_pcur_get_rec(r_cursor); + + offsets1 = rec_get_offsets(r_cursor->old_rec, index, NULL, + level ? 0 : r_cursor->old_n_fields, + r_cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets(rec, index, NULL, + level ? 
0 : r_cursor->old_n_fields, + r_cursor->old_n_fields, &heap); + + comp = rec_offs_comp(offsets1); + + if ((rec_get_info_bits(r_cursor->old_rec, comp) + & REC_INFO_MIN_REC_FLAG) + && (rec_get_info_bits(rec, comp) & REC_INFO_MIN_REC_FLAG)) { + r_cursor->pos_state = BTR_PCUR_IS_POSITIONED; + ret = true; + } else if (!cmp_rec_rec(r_cursor->old_rec, rec, offsets1, offsets2, + index)) { + r_cursor->pos_state = BTR_PCUR_IS_POSITIONED; + ret = true; + } + } + + /* Check the page SSN to see if it has been splitted, if so, search + the right page */ + if (!ret && page_ssn > path_ssn) { + page_no = btr_page_get_next(page); + goto search_again; + } + +func_exit: + mem_heap_free(heap); + + return(ret); +} + +/****************************************************************//** +Copy the leaf level R-tree record, and push it to matched_rec in rtr_info */ +static +void +rtr_leaf_push_match_rec( +/*====================*/ + const rec_t* rec, /*!< in: record to copy */ + rtr_info_t* rtr_info, /*!< in/out: search stack */ + rec_offs* offsets, /*!< in: offsets */ + bool is_comp) /*!< in: is compact format */ +{ + byte* buf; + matched_rec_t* match_rec = rtr_info->matches; + rec_t* copy; + ulint data_len; + rtr_rec_t rtr_rec; + + buf = match_rec->block.page.frame + match_rec->used; + ut_ad(page_rec_is_leaf(rec)); + + copy = rec_copy(buf, rec, offsets); + + if (is_comp) { + rec_set_next_offs_new(copy, PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_old(copy, PAGE_OLD_SUPREMUM); + } + + rtr_rec.r_rec = copy; + rtr_rec.locked = false; + + match_rec->matched_recs->push_back(rtr_rec); + match_rec->valid = true; + + data_len = rec_offs_data_size(offsets) + rec_offs_extra_size(offsets); + match_rec->used += data_len; + + ut_ad(match_rec->used < srv_page_size); +} + +/**************************************************************//** +Store the parent path cursor +@return number of cursor stored */ +ulint +rtr_store_parent_path( +/*==================*/ + const buf_block_t* block, /*!< in: block of the page */ + btr_cur_t* btr_cur,/*!< in/out: persistent cursor */ + btr_latch_mode latch_mode, + /*!< in: latch_mode */ + ulint level, /*!< in: index level */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint num = btr_cur->rtr_info->parent_path->size(); + ulint num_stored = 0; + + while (num >= 1) { + node_visit_t* node = &(*btr_cur->rtr_info->parent_path)[ + num - 1]; + btr_pcur_t* r_cursor = node->cursor; + buf_block_t* cur_block; + + if (node->level > level) { + break; + } + + r_cursor->pos_state = BTR_PCUR_IS_POSITIONED; + r_cursor->latch_mode = latch_mode; + + cur_block = btr_pcur_get_block(r_cursor); + + if (cur_block == block) { + btr_pcur_store_position(r_cursor, mtr); + num_stored++; + } else { + break; + } + + num--; + } + + return(num_stored); +} +/**************************************************************//** +push a nonleaf index node to the search path for insertion */ +static +void +rtr_non_leaf_insert_stack_push( +/*===========================*/ + dict_index_t* index, /*!< in: index descriptor */ + rtr_node_path_t* path, /*!< in/out: search path */ + ulint level, /*!< in: index page level */ + uint32_t child_no,/*!< in: child page no */ + const buf_block_t* block, /*!< in: block of the page */ + const rec_t* rec, /*!< in: positioned record */ + double mbr_inc)/*!< in: MBR needs to be enlarged */ +{ + node_seq_t new_seq; + btr_pcur_t* my_cursor; + + my_cursor = static_cast( + ut_malloc_nokey(sizeof(*my_cursor))); + + btr_pcur_init(my_cursor); + + page_cur_position(rec, block, btr_pcur_get_page_cur(my_cursor)); 
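Stepping back to rtr_cur_restore_position() above: it is the usual optimistic-then-pessimistic restore. A generic restatement of the pattern (hypothetical helper, not the actual function):

    // Try the cheap path first: revisit the remembered block and trust it
    // if its modify_clock is unchanged. Otherwise fall back to a full
    // re-search from the stored key, following right siblings whenever
    // the page SSN shows a split since the position was stored.
    template <typename Optimistic, typename Pessimistic>
    bool restore_position(Optimistic&& fast, Pessimistic&& slow)
    {
        return fast() || slow();
    }
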
+ + btr_pcur_get_page_cur(my_cursor)->index = index; + + new_seq = rtr_get_current_ssn_id(index); + rtr_non_leaf_stack_push(path, block->page.id().page_no(), + new_seq, level, child_no, my_cursor, mbr_inc); +} + +/** Copy a buf_block_t, except "block->page.lock". +@param[in,out] matches copy to match->block +@param[in] block block to copy */ +static +void +rtr_copy_buf( + matched_rec_t* matches, + const buf_block_t* block) +{ + /* Copy all members of "block" to "matches->block" except "lock". + We skip "lock" because it is not used + from the dummy buf_block_t we create here and because memcpy()ing + it generates (valid) compiler warnings that the vtable pointer + will be copied. */ + matches->block.page.lock.free(); + new (&matches->block.page) buf_page_t(block->page); + matches->block.page.frame = block->page.frame; + matches->block.unzip_LRU = block->unzip_LRU; + + ut_d(matches->block.in_unzip_LRU_list = block->in_unzip_LRU_list); + ut_d(matches->block.in_withdraw_list = block->in_withdraw_list); + + /* Skip buf_block_t::lock */ + matches->block.modify_clock = block->modify_clock; +#ifdef BTR_CUR_HASH_ADAPT + matches->block.n_hash_helps = block->n_hash_helps; + matches->block.n_fields = block->n_fields; + matches->block.left_side = block->left_side; +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + matches->block.n_pointers = 0; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + matches->block.curr_n_fields = block->curr_n_fields; + matches->block.curr_left_side = block->curr_left_side; + matches->block.index = block->index; +#endif /* BTR_CUR_HASH_ADAPT */ +} + +/****************************************************************//** +Generate a shadow copy of the page block header to save the +matched records */ +static +void +rtr_init_match( +/*===========*/ + matched_rec_t* matches,/*!< in/out: match to initialize */ + const buf_block_t* block, /*!< in: buffer block */ + const page_t* page) /*!< in: buffer page */ +{ + ut_ad(matches->matched_recs->empty()); + matches->locked = false; + rtr_copy_buf(matches, block); + matches->block.page.frame = matches->bufp; + matches->valid = false; + /* We have to copy PAGE_*_SUPREMUM_END bytes so that we can + use infimum/supremum of this page as normal btr page for search. */ + memcpy(matches->block.page.frame, page, page_is_comp(page) + ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END); + matches->used = page_is_comp(page) + ? 
PAGE_NEW_SUPREMUM_END + : PAGE_OLD_SUPREMUM_END; +#ifdef RTR_SEARCH_DIAGNOSTIC + ulint pageno = page_get_page_no(page); + fprintf(stderr, "INNODB_RTR: Searching leaf page %d\n", + static_cast<int>(pageno)); +#endif /* RTR_SEARCH_DIAGNOSTIC */ +} + +/****************************************************************//** +Get the bounding box content from an index record */ +void +rtr_get_mbr_from_rec( +/*=================*/ + const rec_t* rec, /*!< in: index record */ + const rec_offs* offsets,/*!< in: offsets array */ + rtr_mbr_t* mbr) /*!< out: MBR */ +{ + ulint rec_f_len; + const byte* data; + + data = rec_get_nth_field(rec, offsets, 0, &rec_f_len); + + rtr_read_mbr(data, mbr); +} + +/****************************************************************//** +Get the bounding box content from an MBR data record */ +void +rtr_get_mbr_from_tuple( +/*===================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + rtr_mbr* mbr) /*!< out: mbr to fill */ +{ + const dfield_t* dtuple_field; + ulint dtuple_f_len; + + dtuple_field = dtuple_get_nth_field(dtuple, 0); + dtuple_f_len = dfield_get_len(dtuple_field); + ut_a(dtuple_f_len >= 4 * sizeof(double)); + + rtr_read_mbr(static_cast<const byte*>(dfield_get_data(dtuple_field)), + mbr); +} + +/** Compare minimum bounding rectangles. +@return 1, 0 or -1 if mode == PAGE_CUR_MBR_EQUAL; otherwise 1 or 0, +depending on whether a and b qualify for the relationship +(CONTAINS, WITHIN etc.) */ +static int cmp_gis_field(page_cur_mode_t mode, const void *a, const void *b) +{ + return mode == PAGE_CUR_MBR_EQUAL + ? cmp_geometry_field(a, b) + : rtree_key_cmp(mode, a, b); +} + +/** Compare a GIS data tuple to a physical record in rtree non-leaf node. +We need to check the page number field, since we don't store pk field in +rtree non-leaf node. +@param[in] dtuple data tuple +@param[in] rec R-tree record +@return whether dtuple is less than rec */ +static bool +cmp_dtuple_rec_with_gis_internal(const dtuple_t* dtuple, const rec_t* rec) +{ + const dfield_t *dtuple_field= dtuple_get_nth_field(dtuple, 0); + ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN); + + if (cmp_gis_field(PAGE_CUR_WITHIN, dfield_get_data(dtuple_field), rec)) + return true; + + dtuple_field= dtuple_get_nth_field(dtuple, 1); + ut_ad(dfield_get_len(dtuple_field) == 4); /* child page number */ + ut_ad(dtuple_field->type.mtype == DATA_SYS_CHILD); + ut_ad(!(dtuple_field->type.prtype & ~DATA_NOT_NULL)); + + return memcmp(dtuple_field->data, rec + DATA_MBR_LEN, 4) != 0; +} + +#ifndef UNIV_DEBUG +static +#endif +/** Compare a GIS data tuple to a physical record. +@param[in] dtuple data tuple +@param[in] rec R-tree record +@param[in] mode compare mode +@retval negative if dtuple is less than rec */ +int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec, + page_cur_mode_t mode) +{ + const dfield_t *dtuple_field= dtuple_get_nth_field(dtuple, 0); + /* FIXME: TABLE_SHARE::init_from_binary_frm_image() is adding + field->key_part_length_bytes() to the key length */ + ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN || + dfield_get_len(dtuple_field) == DATA_MBR_LEN + 2); + + return cmp_gis_field(mode, dfield_get_data(dtuple_field), rec); +} + +/****************************************************************//** +Searches the right position in rtree for a page cursor.
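Before the implementation, a hedged summary of how non-leaf levels translate the caller's search mode (this restates the switch in the function body below and is not a function in the source; cmp_dtuple_rec_with_gis() returns 0 on a match):

    static bool subtree_may_match(page_cur_mode_t mode,
                                  const dtuple_t* tuple, const rec_t* rec)
    {
        switch (mode) {
        case PAGE_CUR_CONTAIN:
        case PAGE_CUR_INTERSECT:
        case PAGE_CUR_MBR_EQUAL:
            return !cmp_dtuple_rec_with_gis(tuple, rec, PAGE_CUR_CONTAIN)
                || !cmp_dtuple_rec_with_gis(tuple, rec, PAGE_CUR_INTERSECT);
        case PAGE_CUR_DISJOINT:
            /* even a disjoint query must descend into subtrees whose
            MBR intersects the search MBR */
            return !cmp_dtuple_rec_with_gis(tuple, rec, mode)
                || !cmp_dtuple_rec_with_gis(tuple, rec, PAGE_CUR_INTERSECT);
        default:        /* WITHIN etc. */
            return !cmp_dtuple_rec_with_gis(tuple, rec, mode);
        }
    }
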
*/ +bool +rtr_cur_search_with_match( +/*======================*/ + const buf_block_t* block, /*!< in: buffer block */ + dict_index_t* index, /*!< in: index descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_RTREE_INSERT, + PAGE_CUR_RTREE_LOCATE etc. */ + page_cur_t* cursor, /*!< in/out: page cursor */ + rtr_info_t* rtr_info)/*!< in/out: search stack */ +{ + bool found = false; + const page_t* page; + const rec_t* rec; + const rec_t* last_rec; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + mem_heap_t* heap = NULL; + int cmp = 1; + double least_inc = DBL_MAX; + const rec_t* best_rec; + const rec_t* last_match_rec = NULL; + bool match_init = false; + page_cur_mode_t orig_mode = mode; + const rec_t* first_rec = NULL; + + rec_offs_init(offsets_); + + ut_ad(RTREE_SEARCH_MODE(mode)); + + ut_ad(dict_index_is_spatial(index)); + + page = buf_block_get_frame(block); + + const ulint level = btr_page_get_level(page); + const ulint n_core = level ? 0 : index->n_fields; + + if (mode == PAGE_CUR_RTREE_LOCATE) { + ut_ad(level != 0); + mode = PAGE_CUR_WITHIN; + } + + rec = page_dir_slot_get_rec_validate(page_dir_get_nth_slot(page, 0)); + + if (UNIV_UNLIKELY(!rec)) { + return false; + } + + last_rec = rec; + best_rec = rec; + + if (page_rec_is_infimum(rec)) { + rec = page_rec_get_next_const(rec); + if (UNIV_UNLIKELY(!rec)) { + return false; + } + } + + /* Check insert tuple size is larger than first rec, and try to + avoid it if possible */ + if (mode == PAGE_CUR_RTREE_INSERT && !page_rec_is_supremum(rec)) { + + ulint new_rec_size = rec_get_converted_size(index, tuple, 0); + + offsets = rec_get_offsets(rec, index, offsets, n_core, + dtuple_get_n_fields_cmp(tuple), + &heap); + + if (rec_offs_size(offsets) < new_rec_size) { + first_rec = rec; + } + + /* If this is the left-most page of this index level + and the table is a compressed table, try to avoid + first page as much as possible, as there will be problem + when update MIN_REC rec in compress table */ + if (is_buf_block_get_page_zip(block) + && !page_has_prev(page) + && page_get_n_recs(page) >= 2) { + + rec = page_rec_get_next_const(rec); + } + } + + while (!page_rec_is_supremum(rec)) { + if (!n_core) { + switch (mode) { + case PAGE_CUR_CONTAIN: + case PAGE_CUR_INTERSECT: + case PAGE_CUR_MBR_EQUAL: + /* At non-leaf level, we will need to check + both CONTAIN and INTERSECT for either of + the search mode */ + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, PAGE_CUR_CONTAIN); + + if (cmp != 0) { + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, + PAGE_CUR_INTERSECT); + } + break; + case PAGE_CUR_DISJOINT: + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, mode); + + if (cmp != 0) { + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, + PAGE_CUR_INTERSECT); + } + break; + case PAGE_CUR_RTREE_INSERT: + double increase; + double area; + + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, PAGE_CUR_WITHIN); + + if (cmp != 0) { + increase = rtr_rec_cal_increase( + tuple, rec, &area); + /* Once it goes beyond DBL_MAX, + it would not make sense to record + such value, just make it + DBL_MAX / 2 */ + if (increase >= DBL_MAX) { + increase = DBL_MAX / 2; + } + + if (increase < least_inc) { + least_inc = increase; + best_rec = rec; + } else if (best_rec + && best_rec == first_rec) { + /* if first_rec is set, + we will try to avoid it */ + least_inc = increase; + best_rec = rec; + } + } + break; + case PAGE_CUR_RTREE_GET_FATHER: + cmp = cmp_dtuple_rec_with_gis_internal( + tuple, rec); + break; + default: + 
/* WITHIN etc. */ + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, mode); + } + } else { + /* At leaf level, INSERT should translate to LE */ + ut_ad(mode != PAGE_CUR_RTREE_INSERT); + + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, mode); + } + + if (cmp == 0) { + found = true; + + /* If located, the matching node/rec will be pushed + to rtr_info->path for non-leaf nodes, or + rtr_info->matches for leaf nodes */ + if (rtr_info && mode != PAGE_CUR_RTREE_INSERT) { + if (!n_core) { + uint32_t page_no; + node_seq_t new_seq; + bool is_loc; + + is_loc = (orig_mode + == PAGE_CUR_RTREE_LOCATE + || orig_mode + == PAGE_CUR_RTREE_GET_FATHER); + + offsets = rec_get_offsets( + rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + page_no = btr_node_ptr_get_child_page_no( + rec, offsets); + + ut_ad(level >= 1); + + /* Get current SSN, before we insert + it into the path stack */ + new_seq = rtr_get_current_ssn_id(index); + + rtr_non_leaf_stack_push( + rtr_info->path, + page_no, + new_seq, level - 1, 0, + NULL, 0); + + if (is_loc) { + rtr_non_leaf_insert_stack_push( + index, + rtr_info->parent_path, + level, page_no, block, + rec, 0); + } + + if (!srv_read_only_mode + && (rtr_info->need_page_lock + || !is_loc)) { + + /* Lock the page, preventing it + from being shrunk */ + lock_place_prdt_page_lock( + page_id_t(block->page + .id() + .space(), + page_no), + index, + rtr_info->thr); + } + } else { + ut_ad(orig_mode + != PAGE_CUR_RTREE_LOCATE); + + if (!match_init) { + rtr_init_match( + rtr_info->matches, + block, page); + match_init = true; + } + + /* Collect matched records on page */ + offsets = rec_get_offsets( + rec, index, offsets, + index->n_fields, + ULINT_UNDEFINED, &heap); + rtr_leaf_push_match_rec( + rec, rtr_info, offsets, + page_is_comp(page)); + } + + last_match_rec = rec; + } else { + /* This is the insertion case, it will break + once it finds the first MBR that can accomodate + the inserting rec */ + break; + } + } + + last_rec = rec; + + rec = page_rec_get_next_const(rec); + } + + /* All records on page are searched */ + if (rec && page_rec_is_supremum(rec)) { + if (!n_core) { + if (!found) { + /* No match case, if it is for insertion, + then we select the record that result in + least increased area */ + if (mode == PAGE_CUR_RTREE_INSERT) { + ut_ad(least_inc < DBL_MAX); + offsets = rec_get_offsets( + best_rec, index, offsets, + 0, ULINT_UNDEFINED, &heap); + uint32_t child_no = + btr_node_ptr_get_child_page_no( + best_rec, offsets); + + rtr_non_leaf_insert_stack_push( + index, rtr_info->parent_path, + level, child_no, block, + best_rec, least_inc); + + page_cur_position(best_rec, block, + cursor); + rtr_info->mbr_adj = true; + } else { + /* Position at the last rec of the + page, if it is not the leaf page */ + page_cur_position(last_rec, block, + cursor); + } + } else { + /* There are matching records, position + in the last matching records */ + if (rtr_info) { + rec = last_match_rec; + page_cur_position( + rec, block, cursor); + } + } + } else if (rtr_info) { + /* Leaf level, no match, position at the + last (supremum) rec */ + if (!last_match_rec) { + page_cur_position(rec, block, cursor); + goto func_exit; + } + + /* There are matched records */ + matched_rec_t* match_rec = rtr_info->matches; + + rtr_rec_t test_rec; + + test_rec = match_rec->matched_recs->back(); +#ifdef UNIV_DEBUG + rec_offs offsets_2[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets_2; + rec_offs_init(offsets_2); + + ut_ad(found); + + /* Verify the record to be positioned is the same + as the last record in 
matched_rec vector */ + offsets2 = rec_get_offsets(test_rec.r_rec, index, + offsets2, index->n_fields, + ULINT_UNDEFINED, &heap); + + offsets = rec_get_offsets(last_match_rec, index, + offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + + ut_ad(cmp_rec_rec(test_rec.r_rec, last_match_rec, + offsets2, offsets, index) == 0); +#endif /* UNIV_DEBUG */ + /* Pop the last match record and position on it */ + match_rec->matched_recs->pop_back(); + page_cur_position(test_rec.r_rec, &match_rec->block, + cursor); + } + } else { + + if (mode == PAGE_CUR_RTREE_INSERT) { + ut_ad(!last_match_rec); + rtr_non_leaf_insert_stack_push( + index, rtr_info->parent_path, level, + mach_read_from_4(rec + DATA_MBR_LEN), + block, rec, 0); + + } else if (rtr_info && found && !n_core) { + rec = last_match_rec; + } + + page_cur_position(rec, block, cursor); + } + +#ifdef UNIV_DEBUG + /* Verify that we are positioned at the same child page as pushed in + the path stack */ + if (!n_core && (!page_rec_is_supremum(rec) || found) + && mode != PAGE_CUR_RTREE_INSERT) { + ulint page_no; + + offsets = rec_get_offsets(rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + page_no = btr_node_ptr_get_child_page_no(rec, offsets); + + if (rtr_info && found) { + rtr_node_path_t* path = rtr_info->path; + node_visit_t last_visit = path->back(); + + ut_ad(last_visit.page_no == page_no); + } + } +#endif /* UNIV_DEBUG */ + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(found); +} diff --git a/storage/innobase/ha/ha0storage.cc b/storage/innobase/ha/ha0storage.cc new file mode 100644 index 00000000..acde71b0 --- /dev/null +++ b/storage/innobase/ha/ha0storage.cc @@ -0,0 +1,178 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file ha/ha0storage.cc +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. + +Created September 22, 2007 Vasil Dimov +*******************************************************/ + +#include "ha0storage.h" +#include "hash0hash.h" +#include "mem0mem.h" +#include "ut0rnd.h" + +/*******************************************************************//** +Retrieves a data from a storage. If it is present, a pointer to the +stored copy of data is returned, otherwise NULL is returned. 
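
Before the implementation, here is the idea of ha0storage in miniature: keep one canonical copy of every distinct byte chunk, hand out stable pointers to those copies, and refuse new allocations past a byte budget (the "memlim" of ha_storage_put_memlim() below, where 0 means no limit). This sketch uses std::unordered_set in place of InnoDB's fold-based hash table and memory heap; DedupStorage is a stand-in name.

```cpp
#include <string>
#include <unordered_set>

class DedupStorage {
public:
  explicit DedupStorage(size_t memlim = 0) : memlim_(memlim) {}

  // Return a stable pointer to the stored copy of [data, data+len),
  // or nullptr if storing a new chunk would exceed the memory limit.
  const void* put(const void* data, size_t len)
  {
    std::string chunk(static_cast<const char*>(data), len);
    auto it = pool_.find(chunk);
    if (it != pool_.end())
      return it->data();            // duplicate: no new memory needed
    if (memlim_ && size_ + len > memlim_)
      return nullptr;               // over budget, like memlim
    size_ += len;
    return pool_.insert(std::move(chunk)).first->data();
  }

private:
  std::unordered_set<std::string> pool_;  // node-based: pointers stay valid
  size_t size_ = 0;                       // payload bytes stored so far
  size_t memlim_;                         // 0 = unlimited
};
```

The ha_storage_put() used in the test function further down is presumably the no-limit convenience form of ha_storage_put_memlim().
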
*/
+static
+const void*
+ha_storage_get(
+/*===========*/
+	ha_storage_t*	storage,	/*!< in: hash storage */
+	const void*	data,		/*!< in: data to check for */
+	ulint		data_len)	/*!< in: data length */
+{
+	ha_storage_node_t*	node;
+	ulint			fold;
+
+	/* avoid repetitive calls to ut_fold_binary() in the HASH_SEARCH
+	macro */
+	fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
+
+#define IS_FOUND	\
+	node->data_len == data_len && memcmp(node->data, data, data_len) == 0
+
+	HASH_SEARCH(
+		next,			/* node->"next" */
+		&storage->hash,		/* the hash table */
+		fold,			/* key */
+		ha_storage_node_t*,	/* type of node->next */
+		node,			/* auxiliary variable */
+		,			/* assertion */
+		IS_FOUND);		/* search criteria */
+
+	if (node == NULL) {
+
+		return(NULL);
+	}
+	/* else */
+
+	return(node->data);
+}
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then a pointer to it is returned.
+Data chunks are considered to be equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of the storage would
+become more than "memlim", then "data" is not added and NULL is returned.
+To disable this behavior "memlim" can be set to 0, which stands for
+"no limit". */
+const void*
+ha_storage_put_memlim(
+/*==================*/
+	ha_storage_t*	storage,	/*!< in/out: hash storage */
+	const void*	data,		/*!< in: data to store */
+	ulint		data_len,	/*!< in: data length */
+	ulint		memlim)		/*!< in: memory limit to obey */
+{
+	void*			raw;
+	ha_storage_node_t*	node;
+	const void*		data_copy;
+	ulint			fold;
+
+	/* check if data chunk is already present */
+	data_copy = ha_storage_get(storage, data, data_len);
+	if (data_copy != NULL) {
+
+		return(data_copy);
+	}
+
+	/* not present */
+
+	/* check if we are allowed to allocate data_len bytes */
+	if (memlim > 0
+	    && ha_storage_get_size(storage) + data_len > memlim) {
+
+		return(NULL);
+	}
+
+	/* we put the auxiliary node struct and the data itself in one
+	contiguous block */
+	raw = mem_heap_alloc(storage->heap,
+			     sizeof(ha_storage_node_t) + data_len);
+
+	node = (ha_storage_node_t*) raw;
+	data_copy = (byte*) raw + sizeof(*node);
+
+	memcpy((byte*) raw + sizeof(*node), data, data_len);
+
+	node->data_len = data_len;
+	node->data = data_copy;
+
+	/* avoid repetitive calls to ut_fold_binary() in the HASH_INSERT
+	macro */
+	fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
+
+	HASH_INSERT(
+		ha_storage_node_t,	/* type used in the hash chain */
+		next,			/* node->"next" */
+		&storage->hash,		/* the hash table */
+		fold,			/* key */
+		node);			/* add this data to the hash */
+
+	/* the returned chunk must not be modified, because that would
+	corrupt the hash table */
+	return(data_copy);
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+void
+test_ha_storage()
+{
+	ha_storage_t*	storage;
+	char		buf[1024];
+	int		i;
+	const void*	stored[256];
+	const void*	p;
+
+	storage = ha_storage_create(0, 0);
+
+	for (i = 0; i < 256; i++) {
+
+		memset(buf, i, sizeof(buf));
+		stored[i] = ha_storage_put(storage, buf, sizeof(buf));
+	}
+
+	//ha_storage_empty(&storage);
+
+	for (i = 255; i >= 0; i--) {
+
+		memset(buf, i, sizeof(buf));
+		p = ha_storage_put(storage, buf, sizeof(buf));
+
+		if (p != stored[i]) {
+			ib::warn() << "ha_storage_put() returned " << p
+				<< " instead of " << stored[i] << ", i=" << i;
+			return;
+		}
+	}
+
+	ib::info() << "all ok";
+
+	ha_storage_free(storage);
+}
+
+#endif /*
UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc new file mode 100644 index 00000000..21bf10a1 --- /dev/null +++ b/storage/innobase/handler/ha_innodb.cc @@ -0,0 +1,21217 @@ +/***************************************************************************** + +Copyright (c) 2000, 2020, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Percona Inc. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2023, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/** @file ha_innodb.cc */ + +#include "univ.i" + +/* Include necessary SQL headers */ +#include "ha_prototypes.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sql_type_geom.h" +#include "scope.h" +#include "srv0srv.h" + +// MYSQL_PLUGIN_IMPORT extern my_bool lower_case_file_system; +// MYSQL_PLUGIN_IMPORT extern char mysql_unpacked_real_data_home[]; + +#include +#include +#include + +/* Include necessary InnoDB headers */ +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0bulk.h" +#include "btr0sea.h" +#include "buf0dblwr.h" +#include "buf0dump.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0lru.h" +#include "dict0boot.h" +#include "dict0load.h" +#include "btr0defragment.h" +#include "dict0crea.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "fil0fil.h" +#include "fsp0fsp.h" +#include "fts0fts.h" +#include "fts0plugin.h" +#include "fts0priv.h" +#include "fts0types.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "log0crypt.h" +#include "mtr0mtr.h" +#include "os0file.h" +#include "page0zip.h" +#include "row0import.h" +#include "row0ins.h" +#include "row0log.h" +#include "row0merge.h" +#include "row0mysql.h" +#include "row0quiesce.h" +#include "row0sel.h" +#include "row0upd.h" +#include "fil0crypt.h" +#include "srv0mon.h" +#include "srv0start.h" +#include "rem0rec.h" +#include "trx0purge.h" +#include "trx0roll.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "fil0pagecompress.h" +#include "ut0mem.h" +#include "row0ext.h" +#include "mariadb_stats.h" +thread_local ha_handler_stats mariadb_dummy_stats; +thread_local ha_handler_stats *mariadb_stats= &mariadb_dummy_stats; + +#include "lz4.h" +#include "lzo/lzo1x.h" +#include "lzma.h" +#include "bzlib.h" +#include "snappy-c.h" + +#include + +#define thd_get_trx_isolation(X) 
((enum_tx_isolation)thd_tx_isolation(X)) + +extern "C" void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all); +unsigned long long thd_get_query_id(const MYSQL_THD thd); +void thd_clear_error(MYSQL_THD thd); + +TABLE *find_fk_open_table(THD *thd, const char *db, size_t db_len, + const char *table, size_t table_len); +MYSQL_THD create_background_thd(); +void reset_thd(MYSQL_THD thd); +TABLE *get_purge_table(THD *thd); +TABLE *open_purge_table(THD *thd, const char *db, size_t dblen, + const char *tb, size_t tblen); +void close_thread_tables(THD* thd); + +#ifdef MYSQL_DYNAMIC_PLUGIN +#define tc_size 400 +#endif + +#include +#include + +#include "ha_innodb.h" +#include "i_s.h" + +#include +#include + +#ifdef WITH_WSREP +#include +#include "wsrep_sst.h" +#endif /* WITH_WSREP */ + +#ifdef HAVE_URING +/** The Linux kernel version if io_uring() is considered unsafe */ +const char *io_uring_may_be_unsafe; +#endif + +#define INSIDE_HA_INNOBASE_CC + +#define EQ_CURRENT_THD(thd) ((thd) == current_thd) + +struct handlerton* innodb_hton_ptr; + +static const long AUTOINC_OLD_STYLE_LOCKING = 0; +static const long AUTOINC_NEW_STYLE_LOCKING = 1; +static const long AUTOINC_NO_LOCKING = 2; + +static constexpr size_t buf_pool_chunk_min_size= 1U << 20; + +static ulong innobase_open_files; +static long innobase_autoinc_lock_mode; + +ulonglong innobase_buffer_pool_size; + +/** Percentage of the buffer pool to reserve for 'old' blocks. +Connected to buf_LRU_old_ratio. */ +static uint innobase_old_blocks_pct; + +static char* innobase_data_file_path; +static char* innobase_temp_data_file_path; + +/* The default values for the following char* start-up parameters +are determined in innodb_init_params(). */ + +static char* innobase_data_home_dir; +static char* innobase_enable_monitor_counter; +static char* innobase_disable_monitor_counter; +static char* innobase_reset_monitor_counter; +static char* innobase_reset_all_monitor_counter; + +/* This variable can be set in the server configure file, specifying +stopword table to be used */ +static char* innobase_server_stopword_table; + +my_bool innobase_rollback_on_timeout; +static my_bool innobase_create_status_file; +my_bool innobase_stats_on_metadata; +static my_bool innodb_optimize_fulltext_only; + +extern uint srv_fil_crypt_rotate_key_age; +extern uint srv_n_fil_crypt_iops; + +#ifdef UNIV_DEBUG +my_bool innodb_evict_tables_on_commit_debug; +#endif + +/** File format constraint for ALTER TABLE */ +ulong innodb_instant_alter_column_allowed; + +/** Note we cannot use rec_format_enum because we do not allow +COMPRESSED row format for innodb_default_row_format option. */ +enum default_row_format_enum { + DEFAULT_ROW_FORMAT_REDUNDANT = 0, + DEFAULT_ROW_FORMAT_COMPACT = 1, + DEFAULT_ROW_FORMAT_DYNAMIC = 2, +}; + +/** Whether ROW_FORMAT=COMPRESSED tables are read-only */ +static my_bool innodb_read_only_compressed; + +/** A dummy variable */ +static uint innodb_max_purge_lag_wait; + +/** Wait for trx_sys.history_size() to be below a limit. 
*/ +static void innodb_max_purge_lag_wait_update(THD *thd, st_mysql_sys_var *, + void *, const void *limit) +{ + if (high_level_read_only) + return; + const uint l= *static_cast(limit); + if (!trx_sys.history_exceeds(l)) + return; + mysql_mutex_unlock(&LOCK_global_system_variables); + while (trx_sys.history_exceeds(l)) + { + if (thd_kill_level(thd)) + break; + /* Adjust for purge_coordinator_state::refresh() */ + log_sys.latch.rd_lock(SRW_LOCK_CALL); + const lsn_t last= log_sys.last_checkpoint_lsn, + max_age= log_sys.max_checkpoint_age; + log_sys.latch.rd_unlock(); + const lsn_t lsn= log_sys.get_lsn(); + if ((lsn - last) / 4 >= max_age / 5) + buf_flush_ahead(last + max_age / 5, false); + purge_sys.wake_if_not_active(); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + mysql_mutex_lock(&LOCK_global_system_variables); +} + +static +void set_my_errno(int err) +{ + errno = err; +} + +/** Checks whether the file name belongs to a partition of a table. +@param[in] file_name file name +@return pointer to the end of the table name part of the file name, or NULL */ +static +char* +is_partition( +/*=========*/ + char* file_name) +{ + /* We look for pattern #P# to see if the table is partitioned + MariaDB table. */ + return strstr(file_name, table_name_t::part_suffix); +} + + + +/** Return the InnoDB ROW_FORMAT enum value +@param[in] row_format row_format from "innodb_default_row_format" +@return InnoDB ROW_FORMAT value from rec_format_t enum. */ +static +rec_format_t +get_row_format( + ulong row_format) +{ + switch(row_format) { + case DEFAULT_ROW_FORMAT_REDUNDANT: + return(REC_FORMAT_REDUNDANT); + case DEFAULT_ROW_FORMAT_COMPACT: + return(REC_FORMAT_COMPACT); + case DEFAULT_ROW_FORMAT_DYNAMIC: + return(REC_FORMAT_DYNAMIC); + default: + ut_ad(0); + return(REC_FORMAT_DYNAMIC); + } +} + +static ulong innodb_default_row_format = DEFAULT_ROW_FORMAT_DYNAMIC; + +/** Possible values for system variable "innodb_stats_method". The values +are defined the same as its corresponding MyISAM system variable +"myisam_stats_method"(see "myisam_stats_method_names"), for better usability */ +static const char* innodb_stats_method_names[] = { + "nulls_equal", + "nulls_unequal", + "nulls_ignored", + NullS +}; + +/** Used to define an enumerate type of the system variable innodb_stats_method. +This is the same as "myisam_stats_method_typelib" */ +static TYPELIB innodb_stats_method_typelib = { + array_elements(innodb_stats_method_names) - 1, + "innodb_stats_method_typelib", + innodb_stats_method_names, + NULL +}; + +/** Possible values of the parameter innodb_checksum_algorithm */ +const char* innodb_checksum_algorithm_names[] = { + "crc32", + "strict_crc32", + "full_crc32", + "strict_full_crc32", + NullS +}; + +/** Used to define an enumerate type of the system variable +innodb_checksum_algorithm. */ +TYPELIB innodb_checksum_algorithm_typelib = { + array_elements(innodb_checksum_algorithm_names) - 1, + "innodb_checksum_algorithm_typelib", + innodb_checksum_algorithm_names, + NULL +}; + +/** Possible values for system variable "innodb_default_row_format". */ +static const char* innodb_default_row_format_names[] = { + "redundant", + "compact", + "dynamic", + NullS +}; + +/** Used to define an enumerate type of the system variable +innodb_default_row_format. 
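
The update callback above follows a pattern worth noting: it must release LOCK_global_system_variables before sleeping and re-acquire it before returning, and it must honor the session kill flag while polling. A minimal sketch of that shape, with std::atomic stand-ins for trx_sys.history_size() and thd_kill_level() (the flush-ahead heuristic is omitted):

```cpp
#include <atomic>
#include <chrono>
#include <mutex>
#include <thread>

// 'global_lock' plays the role of LOCK_global_system_variables and is
// assumed to be held by the caller on entry and on return.
inline void wait_below_limit(std::atomic<size_t>& history_size,
                             std::atomic<bool>& killed,
                             size_t limit, std::mutex& global_lock)
{
  if (history_size.load() <= limit)
    return;
  global_lock.unlock();                 // never sleep under the hot lock
  while (history_size.load() > limit && !killed.load())
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  global_lock.lock();                   // restore the caller's invariant
}
```
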
*/ +static TYPELIB innodb_default_row_format_typelib = { + array_elements(innodb_default_row_format_names) - 1, + "innodb_default_row_format_typelib", + innodb_default_row_format_names, + NULL +}; + +/** Names of allowed values of innodb_flush_method */ +const char* innodb_flush_method_names[] = { + "fsync", + "O_DSYNC", + "littlesync", + "nosync", + "O_DIRECT", + "O_DIRECT_NO_FSYNC", +#ifdef _WIN32 + "unbuffered", + "async_unbuffered" /* alias for "unbuffered" */, + "normal" /* alias for "fsync" */, +#endif + NullS +}; + +/** Enumeration of innodb_flush_method */ +TYPELIB innodb_flush_method_typelib = { + array_elements(innodb_flush_method_names) - 1, + "innodb_flush_method_typelib", + innodb_flush_method_names, + NULL +}; + +/** Names of allowed values of innodb_deadlock_report */ +static const char *innodb_deadlock_report_names[]= { + "off", /* Do not report any details of deadlocks */ + "basic", /* Report waiting transactions and lock requests */ + "full", /* Also report blocking locks */ + NullS +}; + +static_assert(Deadlock::REPORT_OFF == 0, "compatibility"); +static_assert(Deadlock::REPORT_BASIC == 1, "compatibility"); +static_assert(Deadlock::REPORT_FULL == 2, "compatibility"); + +/** Enumeration of innodb_deadlock_report */ +static TYPELIB innodb_deadlock_report_typelib = { + array_elements(innodb_deadlock_report_names) - 1, + "innodb_deadlock_report_typelib", + innodb_deadlock_report_names, + NULL +}; + +/** Allowed values of innodb_change_buffering */ +static const char* innodb_change_buffering_names[] = { + "none", /* IBUF_USE_NONE */ + "inserts", /* IBUF_USE_INSERT */ + "deletes", /* IBUF_USE_DELETE_MARK */ + "changes", /* IBUF_USE_INSERT_DELETE_MARK */ + "purges", /* IBUF_USE_DELETE */ + "all", /* IBUF_USE_ALL */ + NullS +}; + +/** Enumeration of innodb_change_buffering */ +static TYPELIB innodb_change_buffering_typelib = { + array_elements(innodb_change_buffering_names) - 1, + "innodb_change_buffering_typelib", + innodb_change_buffering_names, + NULL +}; + +/** Allowed values of innodb_instant_alter_column_allowed */ +const char* innodb_instant_alter_column_allowed_names[] = { + "never", /* compatible with MariaDB 5.5 to 10.2 */ + "add_last",/* allow instant ADD COLUMN ... LAST */ + "add_drop_reorder", /* allow instant ADD anywhere & DROP & reorder */ + NullS +}; + +/** Enumeration of innodb_instant_alter_column_allowed */ +static TYPELIB innodb_instant_alter_column_allowed_typelib = { + array_elements(innodb_instant_alter_column_allowed_names) - 1, + "innodb_instant_alter_column_allowed_typelib", + innodb_instant_alter_column_allowed_names, + NULL +}; + +/** Retrieve the FTS Relevance Ranking result for doc with doc_id +of m_prebuilt->fts_doc_id +@param[in,out] fts_hdl FTS handler +@return the relevance ranking value */ +static +float +innobase_fts_retrieve_ranking( + FT_INFO* fts_hdl); +/** Free the memory for the FTS handler +@param[in,out] fts_hdl FTS handler */ +static +void +innobase_fts_close_ranking( + FT_INFO* fts_hdl); +/** Find and Retrieve the FTS Relevance Ranking result for doc with doc_id +of m_prebuilt->fts_doc_id +@param[in,out] fts_hdl FTS handler +@return the relevance ranking value */ +static +float +innobase_fts_find_ranking( + FT_INFO* fts_hdl, + uchar*, + uint); + +/* Call back function array defined by MySQL and used to +retrieve FTS results. 
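
All of these typelibs follow the same convention: the variable's numeric value is an index into a NullS-terminated name array, and array_elements(names) - 1 is the option count. A self-contained sketch of that mapping, reusing the innodb_change_buffering option names (the enum tags here are illustrative, not the real IBUF_USE_* constants):

```cpp
#include <cstring>

enum class ChangeBuffering { NONE, INSERTS, DELETES, CHANGES, PURGES, ALL };

static const char* change_buffering_names[] = {
  "none", "inserts", "deletes", "changes", "purges", "all", nullptr };

inline const char* change_buffering_name(ChangeBuffering v)
{ return change_buffering_names[static_cast<size_t>(v)]; }

// Parse a name back to its enum value; returns false on an unknown name.
inline bool change_buffering_parse(const char* s, ChangeBuffering& out)
{
  for (size_t i = 0; change_buffering_names[i]; i++)
    if (!std::strcmp(s, change_buffering_names[i])) {
      out = static_cast<ChangeBuffering>(i);
      return true;
    }
  return false;
}
```
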
*/ +const struct _ft_vft ft_vft_result = {NULL, + innobase_fts_find_ranking, + innobase_fts_close_ranking, + innobase_fts_retrieve_ranking, + NULL}; + +/** @return version of the extended FTS API */ +static +uint +innobase_fts_get_version() +{ + /* Currently this doesn't make much sense as returning + HA_CAN_FULLTEXT_EXT automatically mean this version is supported. + This supposed to ease future extensions. */ + return(2); +} + +/** @return Which part of the extended FTS API is supported */ +static +ulonglong +innobase_fts_flags() +{ + return(FTS_ORDERED_RESULT | FTS_DOCID_IN_RESULT); +} + +/** Find and Retrieve the FTS doc_id for the current result row +@param[in,out] fts_hdl FTS handler +@return the document ID */ +static +ulonglong +innobase_fts_retrieve_docid( + FT_INFO_EXT* fts_hdl); + +/** Find and retrieve the size of the current result +@param[in,out] fts_hdl FTS handler +@return number of matching rows */ +static +ulonglong +innobase_fts_count_matches( + FT_INFO_EXT* fts_hdl) /*!< in: FTS handler */ +{ + NEW_FT_INFO* handle = reinterpret_cast(fts_hdl); + + if (handle->ft_result->rankings_by_id != NULL) { + return(rbt_size(handle->ft_result->rankings_by_id)); + } else { + return(0); + } +} + +const struct _ft_vft_ext ft_vft_ext_result = {innobase_fts_get_version, + innobase_fts_flags, + innobase_fts_retrieve_docid, + innobase_fts_count_matches}; + +#ifdef HAVE_PSI_INTERFACE +# define PSI_KEY(n) {&n##_key, #n, 0} +/* Keys to register pthread mutexes in the current file with +performance schema */ +static mysql_pfs_key_t pending_checkpoint_mutex_key; + +# ifdef UNIV_PFS_MUTEX +mysql_pfs_key_t buf_pool_mutex_key; +mysql_pfs_key_t dict_foreign_err_mutex_key; +mysql_pfs_key_t fil_system_mutex_key; +mysql_pfs_key_t flush_list_mutex_key; +mysql_pfs_key_t fts_cache_mutex_key; +mysql_pfs_key_t fts_cache_init_mutex_key; +mysql_pfs_key_t fts_delete_mutex_key; +mysql_pfs_key_t fts_doc_id_mutex_key; +mysql_pfs_key_t ibuf_bitmap_mutex_key; +mysql_pfs_key_t ibuf_mutex_key; +mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; +mysql_pfs_key_t recalc_pool_mutex_key; +mysql_pfs_key_t purge_sys_pq_mutex_key; +mysql_pfs_key_t recv_sys_mutex_key; +mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +mysql_pfs_key_t rtr_active_mutex_key; +mysql_pfs_key_t rtr_match_mutex_key; +mysql_pfs_key_t rtr_path_mutex_key; +mysql_pfs_key_t srv_innodb_monitor_mutex_key; +mysql_pfs_key_t srv_misc_tmpfile_mutex_key; +mysql_pfs_key_t srv_monitor_file_mutex_key; +mysql_pfs_key_t buf_dblwr_mutex_key; +mysql_pfs_key_t trx_pool_mutex_key; +mysql_pfs_key_t trx_pool_manager_mutex_key; +mysql_pfs_key_t lock_wait_mutex_key; +mysql_pfs_key_t trx_sys_mutex_key; +mysql_pfs_key_t srv_threads_mutex_key; +mysql_pfs_key_t tpool_cache_mutex_key; + +/* all_innodb_mutexes array contains mutexes that are +performance schema instrumented if "UNIV_PFS_MUTEX" +is defined */ +static PSI_mutex_info all_innodb_mutexes[] = { + PSI_KEY(pending_checkpoint_mutex), + PSI_KEY(buf_pool_mutex), + PSI_KEY(dict_foreign_err_mutex), + PSI_KEY(recalc_pool_mutex), + PSI_KEY(fil_system_mutex), + PSI_KEY(flush_list_mutex), + PSI_KEY(fts_cache_mutex), + PSI_KEY(fts_cache_init_mutex), + PSI_KEY(fts_delete_mutex), + PSI_KEY(fts_doc_id_mutex), + PSI_KEY(ibuf_mutex), + PSI_KEY(ibuf_pessimistic_insert_mutex), + PSI_KEY(index_online_log), + PSI_KEY(page_zip_stat_per_index_mutex), + PSI_KEY(purge_sys_pq_mutex), + PSI_KEY(recv_sys_mutex), + PSI_KEY(srv_innodb_monitor_mutex), + PSI_KEY(srv_misc_tmpfile_mutex), + PSI_KEY(srv_monitor_file_mutex), + 
PSI_KEY(buf_dblwr_mutex), + PSI_KEY(trx_pool_mutex), + PSI_KEY(trx_pool_manager_mutex), + PSI_KEY(lock_wait_mutex), + PSI_KEY(srv_threads_mutex), + PSI_KEY(rtr_active_mutex), + PSI_KEY(rtr_match_mutex), + PSI_KEY(rtr_path_mutex), + PSI_KEY(trx_sys_mutex), + PSI_KEY(tpool_cache_mutex), +}; +# endif /* UNIV_PFS_MUTEX */ + +# ifdef UNIV_PFS_RWLOCK +mysql_pfs_key_t dict_operation_lock_key; +mysql_pfs_key_t index_tree_rw_lock_key; +mysql_pfs_key_t index_online_log_key; +mysql_pfs_key_t fil_space_latch_key; +mysql_pfs_key_t trx_i_s_cache_lock_key; +mysql_pfs_key_t trx_purge_latch_key; +mysql_pfs_key_t trx_rseg_latch_key; +mysql_pfs_key_t lock_latch_key; +mysql_pfs_key_t log_latch_key; + +/* all_innodb_rwlocks array contains rwlocks that are +performance schema instrumented if "UNIV_PFS_RWLOCK" +is defined */ +static PSI_rwlock_info all_innodb_rwlocks[] = +{ +# ifdef BTR_CUR_HASH_ADAPT + { &btr_search_latch_key, "btr_search_latch", 0 }, +# endif + { &dict_operation_lock_key, "dict_operation_lock", 0 }, + { &fil_space_latch_key, "fil_space_latch", 0 }, + { &trx_i_s_cache_lock_key, "trx_i_s_cache_lock", 0 }, + { &trx_purge_latch_key, "trx_purge_latch", 0 }, + { &trx_rseg_latch_key, "trx_rseg_latch", 0 }, + { &lock_latch_key, "lock_latch", 0 }, + { &log_latch_key, "log_latch", 0 }, + { &index_tree_rw_lock_key, "index_tree_rw_lock", PSI_RWLOCK_FLAG_SX } +}; +# endif /* UNIV_PFS_RWLOCK */ + +# ifdef UNIV_PFS_THREAD +/* all_innodb_threads array contains threads that are +performance schema instrumented if "UNIV_PFS_THREAD" +is defined */ +static PSI_thread_info all_innodb_threads[] = { + PSI_KEY(page_cleaner_thread), + PSI_KEY(trx_rollback_clean_thread), + PSI_KEY(thread_pool_thread) +}; +# endif /* UNIV_PFS_THREAD */ + +# ifdef UNIV_PFS_IO +/* all_innodb_files array contains the type of files that are +performance schema instrumented if "UNIV_PFS_IO" is defined */ +static PSI_file_info all_innodb_files[] = { + PSI_KEY(innodb_data_file), + PSI_KEY(innodb_temp_file) +}; +# endif /* UNIV_PFS_IO */ +#endif /* HAVE_PSI_INTERFACE */ + +static void innodb_remember_check_sysvar_funcs(); +mysql_var_check_func check_sysvar_enum; +mysql_var_check_func check_sysvar_int; + +// should page compression be used by default for new tables +static MYSQL_THDVAR_BOOL(compression_default, PLUGIN_VAR_OPCMDARG, + "Is compression the default for new tables", + NULL, NULL, FALSE); + +/** Update callback for SET [SESSION] innodb_default_encryption_key_id */ +static void +innodb_default_encryption_key_id_update(THD* thd, st_mysql_sys_var* var, + void* var_ptr, const void *save) +{ + uint key_id = *static_cast(save); + if (key_id != FIL_DEFAULT_ENCRYPTION_KEY + && !encryption_key_id_exists(key_id)) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_default_encryption_key=%u" + " is not available", key_id); + } + *static_cast(var_ptr) = key_id; +} + +static MYSQL_THDVAR_UINT(default_encryption_key_id, PLUGIN_VAR_RQCMDARG, + "Default encryption key id used for table encryption.", + NULL, innodb_default_encryption_key_id_update, + FIL_DEFAULT_ENCRYPTION_KEY, 1, UINT_MAX32, 0); + +/** + Structure for CREATE TABLE options (table options). + It needs to be called ha_table_option_struct. + + The option values can be specified in the CREATE TABLE at the end: + CREATE TABLE ( ... 
) *here* +*/ + +ha_create_table_option innodb_table_option_list[]= +{ + /* With this option user can enable page compression feature for the + table */ + HA_TOPTION_SYSVAR("PAGE_COMPRESSED", page_compressed, compression_default), + /* With this option user can set zip compression level for page + compression for this table*/ + HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, 0, 1, 9, 1), + /* With this option the user can enable encryption for the table */ + HA_TOPTION_ENUM("ENCRYPTED", encryption, "DEFAULT,YES,NO", 0), + /* With this option the user defines the key identifier using for the encryption */ + HA_TOPTION_SYSVAR("ENCRYPTION_KEY_ID", encryption_key_id, default_encryption_key_id), + + HA_TOPTION_END +}; + +/*************************************************************//** +Check whether valid argument given to innodb_ft_*_stopword_table. +This function is registered as a callback with MySQL. +@return 0 for valid stopword table */ +static +int +innodb_stopword_table_validate( +/*===========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value); /*!< in: incoming string */ + +static +void innodb_ft_cache_size_update(THD*, st_mysql_sys_var*, void*, const void* save) +{ + fts_max_cache_size= *static_cast(save); +} + +static +void innodb_ft_total_cache_size_update(THD*, st_mysql_sys_var*, void*, const void* save) +{ + fts_max_total_cache_size= *static_cast(save); +} + +static bool is_mysql_datadir_path(const char *path); + +/** Validate passed-in "value" is a valid directory name. +This function is registered as a callback with MySQL. +@param[in,out] thd thread handle +@param[in] var pointer to system variable +@param[out] save immediate result for update +@param[in] value incoming string +@return 0 for valid name */ +static +int +innodb_tmpdir_validate( + THD* thd, + struct st_mysql_sys_var*, + void* save, + struct st_mysql_value* value) +{ + + char* alter_tmp_dir; + char* innodb_tmp_dir; + char buff[OS_FILE_MAX_PATH]; + int len = sizeof(buff); + char tmp_abs_path[FN_REFLEN + 2]; + + ut_ad(save != NULL); + ut_ad(value != NULL); + + if (check_global_access(thd, FILE_ACL)) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "InnoDB: FILE Permissions required"); + *static_cast(save) = NULL; + return(1); + } + + alter_tmp_dir = (char*) value->val_str(value, buff, &len); + + if (!alter_tmp_dir) { + *static_cast(save) = alter_tmp_dir; + return(0); + } + + if (strlen(alter_tmp_dir) > FN_REFLEN) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Path length should not exceed %d bytes", FN_REFLEN); + *static_cast(save) = NULL; + return(1); + } + + my_realpath(tmp_abs_path, alter_tmp_dir, 0); + size_t tmp_abs_len = strlen(tmp_abs_path); + + if (my_access(tmp_abs_path, F_OK)) { + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "InnoDB: Path doesn't exist."); + *static_cast(save) = NULL; + return(1); + } else if (my_access(tmp_abs_path, R_OK | W_OK)) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "InnoDB: Server doesn't have permission in " + "the given location."); + *static_cast(save) = NULL; + return(1); + } + + MY_STAT stat_info_dir; + + if (my_stat(tmp_abs_path, &stat_info_dir, MYF(0))) { + if ((stat_info_dir.st_mode & S_IFDIR) != S_IFDIR) { + + 
push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Given path is not a directory. "); + *static_cast(save) = NULL; + return(1); + } + } + + if (!is_mysql_datadir_path(tmp_abs_path)) { + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "InnoDB: Path Location should not be same as " + "mysql data directory location."); + *static_cast(save) = NULL; + return(1); + } + + innodb_tmp_dir = static_cast( + thd_memdup(thd, tmp_abs_path, tmp_abs_len + 1)); + *static_cast(save) = innodb_tmp_dir; + return(0); +} + +/******************************************************************//** +Maps a MySQL trx isolation level code to the InnoDB isolation level code +@return InnoDB isolation level */ +static inline +uint +innobase_map_isolation_level( +/*=========================*/ + enum_tx_isolation iso); /*!< in: MySQL isolation level code */ + +/** Gets field offset for a field in a table. +@param[in] table MySQL table object +@param[in] field MySQL field object (from table->field array) +@return offset */ +static inline +uint +get_field_offset( + const TABLE* table, + const Field* field) +{ + return field->offset(table->record[0]); +} + + +/*************************************************************//** +Check for a valid value of innobase_compression_algorithm. +@return 0 for valid innodb_compression_algorithm. */ +static +int +innodb_compression_algorithm_validate( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value); /*!< in: incoming string */ + +static ibool innodb_have_punch_hole=IF_PUNCH_HOLE(1, 0); + +static +int +innodb_encrypt_tables_validate( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value); /*!< in: incoming string */ + +static const char innobase_hton_name[]= "InnoDB"; + +static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG, + "Enable InnoDB locking in LOCK TABLES", + /* check_func */ NULL, /* update_func */ NULL, + /* default */ TRUE); + +static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG, + "Use strict mode when evaluating create options.", + NULL, NULL, TRUE); + +static MYSQL_THDVAR_BOOL(ft_enable_stopword, PLUGIN_VAR_OPCMDARG, + "Create FTS index with stopword.", + NULL, NULL, + /* default */ TRUE); + +static MYSQL_THDVAR_UINT(lock_wait_timeout, PLUGIN_VAR_RQCMDARG, + "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. 
The value 100000000 is infinite timeout.", + NULL, NULL, 50, 0, 100000000, 0); + +static MYSQL_THDVAR_STR(ft_user_stopword_table, + PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC, + "User supplied stopword table name, effective in the session level.", + innodb_stopword_table_validate, NULL, NULL); + +static MYSQL_THDVAR_STR(tmpdir, + PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC, + "Directory for temporary non-tablespace files.", + innodb_tmpdir_validate, NULL, NULL); + +static size_t truncated_status_writes; + +static SHOW_VAR innodb_status_variables[]= { +#ifdef BTR_CUR_HASH_ADAPT + {"adaptive_hash_hash_searches", &export_vars.innodb_ahi_hit, SHOW_SIZE_T}, + {"adaptive_hash_non_hash_searches", + &export_vars.innodb_ahi_miss, SHOW_SIZE_T}, +#endif + {"background_log_sync", &srv_log_writes_and_flush, SHOW_SIZE_T}, + {"buffer_pool_dump_status", + (char*) &export_vars.innodb_buffer_pool_dump_status, SHOW_CHAR}, + {"buffer_pool_load_status", + (char*) &export_vars.innodb_buffer_pool_load_status, SHOW_CHAR}, + {"buffer_pool_resize_status", + (char*) &export_vars.innodb_buffer_pool_resize_status, SHOW_CHAR}, + {"buffer_pool_load_incomplete", + &export_vars.innodb_buffer_pool_load_incomplete, SHOW_BOOL}, + {"buffer_pool_pages_data", &UT_LIST_GET_LEN(buf_pool.LRU), SHOW_SIZE_T}, + {"buffer_pool_bytes_data", + &export_vars.innodb_buffer_pool_bytes_data, SHOW_SIZE_T}, + {"buffer_pool_pages_dirty", + &UT_LIST_GET_LEN(buf_pool.flush_list), SHOW_SIZE_T}, + {"buffer_pool_bytes_dirty", &buf_pool.flush_list_bytes, SHOW_SIZE_T}, + {"buffer_pool_pages_flushed", &buf_pool.stat.n_pages_written, SHOW_SIZE_T}, + {"buffer_pool_pages_free", &UT_LIST_GET_LEN(buf_pool.free), SHOW_SIZE_T}, +#ifdef UNIV_DEBUG + {"buffer_pool_pages_latched", + &export_vars.innodb_buffer_pool_pages_latched, SHOW_SIZE_T}, +#endif /* UNIV_DEBUG */ + {"buffer_pool_pages_made_not_young", + &buf_pool.stat.n_pages_not_made_young, SHOW_SIZE_T}, + {"buffer_pool_pages_made_young", + &buf_pool.stat.n_pages_made_young, SHOW_SIZE_T}, + {"buffer_pool_pages_misc", + &export_vars.innodb_buffer_pool_pages_misc, SHOW_SIZE_T}, + {"buffer_pool_pages_old", &buf_pool.LRU_old_len, SHOW_SIZE_T}, + {"buffer_pool_pages_total", + &export_vars.innodb_buffer_pool_pages_total, SHOW_SIZE_T}, + {"buffer_pool_pages_LRU_flushed", &buf_lru_flush_page_count, SHOW_SIZE_T}, + {"buffer_pool_pages_LRU_freed", &buf_lru_freed_page_count, SHOW_SIZE_T}, + {"buffer_pool_pages_split", &buf_pool.pages_split, SHOW_SIZE_T}, + {"buffer_pool_read_ahead_rnd", + &buf_pool.stat.n_ra_pages_read_rnd, SHOW_SIZE_T}, + {"buffer_pool_read_ahead", &buf_pool.stat.n_ra_pages_read, SHOW_SIZE_T}, + {"buffer_pool_read_ahead_evicted", + &buf_pool.stat.n_ra_pages_evicted, SHOW_SIZE_T}, + {"buffer_pool_read_requests", + &export_vars.innodb_buffer_pool_read_requests, SHOW_SIZE_T}, + {"buffer_pool_reads", &buf_pool.stat.n_pages_read, SHOW_SIZE_T}, + {"buffer_pool_wait_free", &buf_pool.stat.LRU_waits, SHOW_SIZE_T}, + {"buffer_pool_write_requests", &buf_pool.flush_list_requests, SHOW_SIZE_T}, + {"checkpoint_age", &export_vars.innodb_checkpoint_age, SHOW_SIZE_T}, + {"checkpoint_max_age", &export_vars.innodb_checkpoint_max_age, SHOW_SIZE_T}, + {"data_fsyncs", (size_t*) &os_n_fsyncs, SHOW_SIZE_T}, + {"data_pending_fsyncs", + (size_t*) &fil_n_pending_tablespace_flushes, SHOW_SIZE_T}, + {"data_pending_reads", &export_vars.innodb_data_pending_reads, SHOW_SIZE_T}, + {"data_pending_writes", &export_vars.innodb_data_pending_writes,SHOW_SIZE_T}, + {"data_read", &export_vars.innodb_data_read, SHOW_SIZE_T}, + {"data_reads", 
&export_vars.innodb_data_reads, SHOW_SIZE_T}, + {"data_writes", &export_vars.innodb_data_writes, SHOW_SIZE_T}, + {"data_written", &export_vars.innodb_data_written, SHOW_SIZE_T}, + {"dblwr_pages_written", &export_vars.innodb_dblwr_pages_written,SHOW_SIZE_T}, + {"dblwr_writes", &export_vars.innodb_dblwr_writes, SHOW_SIZE_T}, + {"deadlocks", &lock_sys.deadlocks, SHOW_SIZE_T}, + {"history_list_length", &export_vars.innodb_history_list_length,SHOW_SIZE_T}, + {"ibuf_discarded_delete_marks", &ibuf.n_discarded_ops[IBUF_OP_DELETE_MARK], + SHOW_SIZE_T}, + {"ibuf_discarded_deletes", &ibuf.n_discarded_ops[IBUF_OP_DELETE], + SHOW_SIZE_T}, + {"ibuf_discarded_inserts", &ibuf.n_discarded_ops[IBUF_OP_INSERT], + SHOW_SIZE_T}, + {"ibuf_free_list", &ibuf.free_list_len, SHOW_SIZE_T}, + {"ibuf_merged_delete_marks", &ibuf.n_merged_ops[IBUF_OP_DELETE_MARK], + SHOW_SIZE_T}, + {"ibuf_merged_deletes", &ibuf.n_merged_ops[IBUF_OP_DELETE], SHOW_SIZE_T}, + {"ibuf_merged_inserts", &ibuf.n_merged_ops[IBUF_OP_INSERT], SHOW_SIZE_T}, + {"ibuf_merges", &ibuf.n_merges, SHOW_SIZE_T}, + {"ibuf_segment_size", &ibuf.seg_size, SHOW_SIZE_T}, + {"ibuf_size", &ibuf.size, SHOW_SIZE_T}, + {"log_waits", &log_sys.waits, SHOW_SIZE_T}, + {"log_write_requests", &log_sys.write_to_buf, SHOW_SIZE_T}, + {"log_writes", &log_sys.write_to_log, SHOW_SIZE_T}, + {"lsn_current", &export_vars.innodb_lsn_current, SHOW_ULONGLONG}, + {"lsn_flushed", &export_vars.innodb_lsn_flushed, SHOW_ULONGLONG}, + {"lsn_last_checkpoint", &export_vars.innodb_lsn_last_checkpoint, + SHOW_ULONGLONG}, + {"master_thread_active_loops", &srv_main_active_loops, SHOW_SIZE_T}, + {"master_thread_idle_loops", &srv_main_idle_loops, SHOW_SIZE_T}, + {"max_trx_id", &export_vars.innodb_max_trx_id, SHOW_ULONGLONG}, +#ifdef BTR_CUR_HASH_ADAPT + {"mem_adaptive_hash", &export_vars.innodb_mem_adaptive_hash, SHOW_SIZE_T}, +#endif + {"mem_dictionary", &export_vars.innodb_mem_dictionary, SHOW_SIZE_T}, + {"os_log_written", &export_vars.innodb_os_log_written, SHOW_SIZE_T}, + {"page_size", &srv_page_size, SHOW_ULONG}, + {"pages_created", &buf_pool.stat.n_pages_created, SHOW_SIZE_T}, + {"pages_read", &buf_pool.stat.n_pages_read, SHOW_SIZE_T}, + {"pages_written", &buf_pool.stat.n_pages_written, SHOW_SIZE_T}, + {"row_lock_current_waits", &export_vars.innodb_row_lock_current_waits, + SHOW_SIZE_T}, + {"row_lock_time", &export_vars.innodb_row_lock_time, SHOW_LONGLONG}, + {"row_lock_time_avg", &export_vars.innodb_row_lock_time_avg, SHOW_ULONGLONG}, + {"row_lock_time_max", &export_vars.innodb_row_lock_time_max, SHOW_ULONGLONG}, + {"row_lock_waits", &export_vars.innodb_row_lock_waits, SHOW_SIZE_T}, + {"num_open_files", &fil_system.n_open, SHOW_SIZE_T}, + {"truncated_status_writes", &truncated_status_writes, SHOW_SIZE_T}, + {"available_undo_logs", &srv_available_undo_logs, SHOW_ULONG}, + {"undo_truncations", &export_vars.innodb_undo_truncations, SHOW_ULONG}, + + /* Status variables for page compression */ + {"page_compression_saved", + &export_vars.innodb_page_compression_saved, SHOW_LONGLONG}, + {"num_pages_page_compressed", + &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG}, + {"num_page_compressed_trim_op", + &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG}, + {"num_pages_page_decompressed", + &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG}, + {"num_pages_page_compression_error", + &export_vars.innodb_pages_page_compression_error, SHOW_LONGLONG}, + {"num_pages_encrypted", + &export_vars.innodb_pages_encrypted, SHOW_LONGLONG}, + {"num_pages_decrypted", + 
&export_vars.innodb_pages_decrypted, SHOW_LONGLONG}, + {"have_lz4", &(provider_service_lz4->is_loaded), SHOW_BOOL}, + {"have_lzo", &(provider_service_lzo->is_loaded), SHOW_BOOL}, + {"have_lzma", &(provider_service_lzma->is_loaded), SHOW_BOOL}, + {"have_bzip2", &(provider_service_bzip2->is_loaded), SHOW_BOOL}, + {"have_snappy", &(provider_service_snappy->is_loaded), SHOW_BOOL}, + {"have_punch_hole", &innodb_have_punch_hole, SHOW_BOOL}, + + /* Defragmentation */ + {"defragment_compression_failures", + &export_vars.innodb_defragment_compression_failures, SHOW_SIZE_T}, + {"defragment_failures", &export_vars.innodb_defragment_failures,SHOW_SIZE_T}, + {"defragment_count", &export_vars.innodb_defragment_count, SHOW_SIZE_T}, + + {"instant_alter_column", + &export_vars.innodb_instant_alter_column, SHOW_ULONG}, + + /* Online alter table status variables */ + {"onlineddl_rowlog_rows", + &export_vars.innodb_onlineddl_rowlog_rows, SHOW_SIZE_T}, + {"onlineddl_rowlog_pct_used", + &export_vars.innodb_onlineddl_rowlog_pct_used, SHOW_SIZE_T}, + {"onlineddl_pct_progress", + &export_vars.innodb_onlineddl_pct_progress, SHOW_SIZE_T}, + + /* Encryption */ + {"encryption_rotation_pages_read_from_cache", + &export_vars.innodb_encryption_rotation_pages_read_from_cache, SHOW_SIZE_T}, + {"encryption_rotation_pages_read_from_disk", + &export_vars.innodb_encryption_rotation_pages_read_from_disk, SHOW_SIZE_T}, + {"encryption_rotation_pages_modified", + &export_vars.innodb_encryption_rotation_pages_modified, SHOW_SIZE_T}, + {"encryption_rotation_pages_flushed", + &export_vars.innodb_encryption_rotation_pages_flushed, SHOW_SIZE_T}, + {"encryption_rotation_estimated_iops", + &export_vars.innodb_encryption_rotation_estimated_iops, SHOW_SIZE_T}, + {"encryption_n_merge_blocks_encrypted", + &export_vars.innodb_n_merge_blocks_encrypted, SHOW_LONGLONG}, + {"encryption_n_merge_blocks_decrypted", + &export_vars.innodb_n_merge_blocks_decrypted, SHOW_LONGLONG}, + {"encryption_n_rowlog_blocks_encrypted", + &export_vars.innodb_n_rowlog_blocks_encrypted, SHOW_LONGLONG}, + {"encryption_n_rowlog_blocks_decrypted", + &export_vars.innodb_n_rowlog_blocks_decrypted, SHOW_LONGLONG}, + {"encryption_n_temp_blocks_encrypted", + &export_vars.innodb_n_temp_blocks_encrypted, SHOW_LONGLONG}, + {"encryption_n_temp_blocks_decrypted", + &export_vars.innodb_n_temp_blocks_decrypted, SHOW_LONGLONG}, + {"encryption_num_key_requests", &export_vars.innodb_encryption_key_requests, + SHOW_LONGLONG}, + + {NullS, NullS, SHOW_LONG} +}; + +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. +@return 0 or error number */ +static +int +innobase_close_connection( +/*======================*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd); /*!< in: MySQL thread handle for + which to close the connection */ + +/** Cancel any pending lock request associated with the current THD. +@sa THD::awake() @sa ha_kill_query() */ +static void innobase_kill_query(handlerton*, THD* thd, enum thd_kill_levels); +static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all); + +/*****************************************************************//** +Commits a transaction in an InnoDB database or marks an SQL statement +ended. 
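
The commit callback documented just below is invoked both at the end of each SQL statement and at real transaction commit; per its parameter comment, the commit_trx flag (together with the session's autocommit state) tells the two cases apart. A rough sketch of that dispatch, with a stand-in Trx type and simplified autocommit test, not InnoDB's actual logic:

```cpp
struct Trx { bool active = false; };

inline void engine_commit(Trx& trx, bool commit_trx, bool autocommit_active)
{
  if (commit_trx || autocommit_active) {
    // real commit: make the whole transaction durable and end it
    trx.active = false;
  } else {
    // statement boundary only: mark the statement as ended,
    // keep the transaction open
  }
}
```
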
+@return 0 */ +static +int +innobase_commit( +/*============*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ + bool commit_trx); /*!< in: true - commit transaction + false - the current SQL statement + ended */ + +/*****************************************************************//** +Rolls back a transaction to a savepoint. +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_rollback( +/*==============*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction should + be rolled back */ + bool rollback_trx); /*!< in: TRUE - rollback entire + transaction FALSE - rollback the current + statement only */ + +/*****************************************************************//** +Rolls back a transaction to a savepoint. +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_rollback_to_savepoint( +/*===========================*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be rolled back to savepoint */ + void* savepoint); /*!< in: savepoint data */ + +/*****************************************************************//** +Check whether innodb state allows to safely release MDL locks after +rollback to savepoint. +@return true if it is safe, false if its not safe. */ +static +bool +innobase_rollback_to_savepoint_can_release_mdl( +/*===========================================*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd); /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be rolled back to savepoint */ + +/*****************************************************************//** +Sets a transaction savepoint. +@return always 0, that is, always succeeds */ +static +int +innobase_savepoint( +/*===============*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user's XA transaction for which + we need to take a savepoint */ + void* savepoint); /*!< in: savepoint data */ + +/*****************************************************************//** +Release transaction savepoint name. +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_release_savepoint( +/*=======================*/ + handlerton* hton, /*!< in/out: handlerton for InnoDB */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction's + savepoint should be released */ + void* savepoint); /*!< in: savepoint data */ + +/** Request notification of log writes */ +static void innodb_log_flush_request(void *cookie); + +/** Requests for log flushes */ +struct log_flush_request +{ + /** earlier request (for a smaller LSN) */ + log_flush_request *next; + /** parameter provided to innodb_log_flush_request() */ + void *cookie; + /** log sequence number that is being waited for */ + lsn_t lsn; +}; + +/** Buffer of pending innodb_log_flush_request() */ +alignas(CPU_LEVEL1_DCACHE_LINESIZE) static +struct +{ + /** first request */ + std::atomic start; + /** last request */ + log_flush_request *end; + /** mutex protecting this object */ + mysql_mutex_t mutex; +} +log_requests; + +/** @brief Adjust some InnoDB startup parameters based on file contents +or innodb_page_size. 
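
log_requests above is a mutex-protected list of (cookie, lsn) waiters that gets drained once the redo log becomes durable up to a given LSN. A simplified, self-contained version of that bookkeeping (the real structure additionally keeps requests ordered by LSN with an atomic head pointer; FlushRequest and FlushQueue are stand-in names):

```cpp
#include <cstdint>
#include <mutex>

using lsn_t = uint64_t;

struct FlushRequest { FlushRequest* next; void* cookie; lsn_t lsn; };

struct FlushQueue {
  FlushRequest* head = nullptr;
  std::mutex mutex;

  void add(FlushRequest* req) {
    std::lock_guard<std::mutex> g(mutex);
    req->next = head;                   // newest first in this sketch
    head = req;
  }

  // Complete and unlink every request whose LSN is now durable.
  template <typename F>
  void complete_up_to(lsn_t flushed, F&& notify) {
    std::lock_guard<std::mutex> g(mutex);
    FlushRequest** pp = &head;
    while (*pp) {
      if ((*pp)->lsn <= flushed) {
        FlushRequest* done = *pp;
        *pp = done->next;               // unlink while iterating
        notify(done->cookie);
      } else {
        pp = &(*pp)->next;
      }
    }
  }
};
```
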
*/ +static +void +innodb_params_adjust(); + +/*******************************************************************//** +This function is used to prepare an X/Open XA distributed transaction. +@return 0 or error number */ +static +int +innobase_xa_prepare( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be prepared */ + bool all); /*!< in: true - prepare transaction + false - the current SQL statement + ended */ +/*******************************************************************//** +This function is used to recover X/Open XA distributed transactions. +@return number of prepared transactions stored in xid_list */ +static +int +innobase_xa_recover( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid_list, /*!< in/out: prepared transactions */ + uint len); /*!< in: number of slots in xid_list */ +/*******************************************************************//** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state +@return 0 or error number */ +static +int +innobase_commit_by_xid( +/*===================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid); /*!< in: X/Open XA transaction + identification */ + +/** Ignore FOREIGN KEY constraints that would be violated by DROP DATABASE */ +static ibool innodb_drop_database_ignore_fk(void*,void*) { return false; } + +/** FOREIGN KEY error reporting context for DROP DATABASE */ +struct innodb_drop_database_fk_report +{ + /** database name, with trailing '/' */ + const span name; + /** whether errors were found */ + bool violated; +}; + +/** Report FOREIGN KEY constraints that would be violated by DROP DATABASE +@return whether processing should continue */ +static ibool innodb_drop_database_fk(void *node, void *report) +{ + auto s= static_cast(node); + auto r= static_cast(report); + const dfield_t *name= que_node_get_val(s->select_list); + ut_ad(name->type.mtype == DATA_VARCHAR); + + if (name->len == UNIV_SQL_NULL || name->len <= r->name.size() || + memcmp(static_cast(name->data), r->name.data(), + r->name.size())) + return false; /* End of matches */ + + node= que_node_get_next(s->select_list); + const dfield_t *id= que_node_get_val(node); + ut_ad(id->type.mtype == DATA_VARCHAR); + ut_ad(!que_node_get_next(node)); + + if (id->len != UNIV_SQL_NULL) + sql_print_error("DROP DATABASE: table %.*s is referenced" + " by FOREIGN KEY %.*s", + static_cast(name->len), + static_cast(name->data), + static_cast(id->len), + static_cast(id->data)); + else + ut_ad("corrupted SYS_FOREIGN record" == 0); + + return true; +} + +/** After DROP DATABASE executed ha_innobase::delete_table() on all +tables that it was aware of, drop any leftover tables inside InnoDB. 
+@param path database path */ +static void innodb_drop_database(handlerton*, char *path) +{ + if (high_level_read_only) + return; + + ulint len= 0; + char *ptr; + + for (ptr= strend(path) - 2; ptr >= path && +#ifdef _WIN32 + *ptr != '\\' && +#endif + *ptr != '/'; ptr--) + len++; + + ptr++; + char *namebuf= static_cast + (my_malloc(PSI_INSTRUMENT_ME, len + 2, MYF(0))); + if (!namebuf) + return; + memcpy(namebuf, ptr, len); + namebuf[len] = '/'; + namebuf[len + 1] = '\0'; + +#ifdef _WIN32 + innobase_casedn_str(namebuf); +#endif /* _WIN32 */ + + THD * const thd= current_thd; + trx_t *trx= innobase_trx_allocate(thd); + dberr_t err= DB_SUCCESS; + + dict_sys.lock(SRW_LOCK_CALL); + + for (auto i= dict_sys.table_id_hash.n_cells; i--; ) + { + for (dict_table_t *next, *table= static_cast + (dict_sys.table_id_hash.array[i].node); table; table= next) + { + ut_ad(table->cached); + next= table->id_hash; + if (strncmp(table->name.m_name, namebuf, len + 1)) + continue; + const auto n_handles= table->get_ref_count(); + const bool locks= !n_handles && lock_table_has_locks(table); + if (n_handles || locks) + { + err= DB_ERROR; + ib::error errmsg; + errmsg << "DROP DATABASE: cannot DROP TABLE " << table->name; + if (n_handles) + errmsg << " due to " << n_handles << " open handles"; + else + errmsg << " due to locks"; + continue; + } + dict_sys.remove(table); + } + } + + dict_sys.unlock(); + + dict_table_t *table_stats, *index_stats; + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared(table_stats, + thd, &mdl_table); + dict_sys.unfreeze(); + } + index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared(index_stats, + thd, &mdl_index); + dict_sys.unfreeze(); + } + + trx_start_for_ddl(trx); + + uint errors= 0; + char db[NAME_LEN + 1]; + strconvert(&my_charset_filename, namebuf, len, system_charset_info, db, + sizeof db, &errors); + if (!errors && table_stats && index_stats && + !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && + !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && + lock_table_for_trx(table_stats, trx, LOCK_X) == DB_SUCCESS && + lock_table_for_trx(index_stats, trx, LOCK_X) == DB_SUCCESS) + { + row_mysql_lock_data_dictionary(trx); + if (dict_stats_delete(db, trx)) + { + /* Ignore this error. Leaving garbage statistics behind is a + lesser evil. Carry on to try to remove any garbage tables. 
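
innodb_drop_database() above begins by reducing the filesystem path to its last component plus a trailing '/', so that InnoDB table names such as db/t1 can be matched by prefix. The same computation with std::string (the original scans backwards from strend(path) - 2 into a my_malloc buffer; this sketch also tolerates a missing trailing separator):

```cpp
#include <string>

// Turn ".../datadir/db/" (or ".../datadir/db") into "db/".
inline std::string db_prefix_from_path(const std::string& path)
{
  const size_t end = path.find_last_not_of("/\\");
  size_t start = path.find_last_of("/\\", end);
  start = (start == std::string::npos) ? 0 : start + 1;
  return path.substr(start, end - start + 1) + '/';
}
```
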
*/ + trx->rollback(); + trx_start_for_ddl(trx); + } + row_mysql_unlock_data_dictionary(trx); + } + + if (err == DB_SUCCESS) + err= lock_sys_tables(trx); + row_mysql_lock_data_dictionary(trx); + + static const char drop_database[] = + "PROCEDURE DROP_DATABASE_PROC () IS\n" + "fk CHAR;\n" + "name CHAR;\n" + "tid CHAR;\n" + "iid CHAR;\n" + + "DECLARE FUNCTION fk_report;\n" + + "DECLARE CURSOR fkf IS\n" + "SELECT ID FROM SYS_FOREIGN WHERE ID >= :db FOR UPDATE;\n" + + "DECLARE CURSOR fkr IS\n" + "SELECT REF_NAME,ID FROM SYS_FOREIGN WHERE REF_NAME >= :db FOR UPDATE\n" + "ORDER BY REF_NAME;\n" + + "DECLARE CURSOR tab IS\n" + "SELECT ID,NAME FROM SYS_TABLES WHERE NAME >= :db FOR UPDATE;\n" + + "DECLARE CURSOR idx IS\n" + "SELECT ID FROM SYS_INDEXES WHERE TABLE_ID = tid FOR UPDATE;\n" + + "BEGIN\n" + + "OPEN fkf;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH fkf INTO fk;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " IF TO_BINARY(SUBSTR(fk, 0, LENGTH(:db)))<>TO_BINARY(:db)" + " THEN EXIT; END IF;\n" + " DELETE FROM SYS_FOREIGN_COLS WHERE TO_BINARY(ID)=TO_BINARY(fk);\n" + " DELETE FROM SYS_FOREIGN WHERE CURRENT OF fkf;\n" + "END LOOP;\n" + "CLOSE fkf;\n" + + "OPEN fkr;\n" + "FETCH fkr INTO fk_report();\n" + "CLOSE fkr;\n" + + "OPEN tab;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH tab INTO tid,name;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " IF TO_BINARY(SUBSTR(name, 0, LENGTH(:db))) <> TO_BINARY(:db)" + " THEN EXIT; END IF;\n" + " DELETE FROM SYS_COLUMNS WHERE TABLE_ID=tid;\n" + " DELETE FROM SYS_TABLES WHERE ID=tid;\n" + " OPEN idx;\n" + " WHILE 1 = 1 LOOP\n" + " FETCH idx INTO iid;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=iid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF idx;\n" + " END LOOP;\n" + " CLOSE idx;\n" + "END LOOP;\n" + "CLOSE tab;\n" + + "END;\n"; + + innodb_drop_database_fk_report report{{namebuf, len + 1}, false}; + + if (err == DB_SUCCESS) + { + pars_info_t* pinfo = pars_info_create(); + pars_info_bind_function(pinfo, "fk_report", trx->check_foreigns + ? innodb_drop_database_fk + : innodb_drop_database_ignore_fk, &report); + pars_info_add_str_literal(pinfo, "db", namebuf); + err= que_eval_sql(pinfo, drop_database, trx); + if (err == DB_SUCCESS && report.violated) + err= DB_CANNOT_DROP_CONSTRAINT; + } + + const trx_id_t trx_id= trx->id; + + if (err != DB_SUCCESS) + { + trx->rollback(); + namebuf[len] = '\0'; + ib::error() << "DROP DATABASE " << namebuf << ": " << err; + } + else + trx->commit(); + + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + row_mysql_unlock_data_dictionary(trx); + + trx->free(); + + if (err == DB_SUCCESS) + { + /* Eventually after the DELETE FROM SYS_INDEXES was committed, + purge would invoke dict_drop_index_tree() to delete the associated + tablespaces. Because the SQL layer expects the directory to be empty, + we will "manually" purge the tablespaces that belong to the + records that we delete-marked. 
*/ + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + dict_index_t* sys_index= UT_LIST_GET_FIRST(dict_sys.sys_tables->indexes); + btr_pcur_t pcur; + namebuf[len++]= '/'; + dfield_set_data(&dfield, namebuf, len); + dict_index_copy_types(&tuple, sys_index, 1); + std::vector to_close; + std::vector space_ids; + mtr_t mtr; + mtr.start(); + pcur.btr_cur.page_cur.index = sys_index; + err= btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) + goto err_exit; + + for (; btr_pcur_is_on_user_rec(&pcur); + btr_pcur_move_to_next_user_rec(&pcur, &mtr)) + { + const rec_t *rec= btr_pcur_get_rec(&pcur); + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES) + { + ut_ad("corrupted SYS_TABLES record" == 0); + break; + } + if (!rec_get_deleted_flag(rec, false)) + continue; + ulint flen; + static_assert(DICT_FLD__SYS_TABLES__NAME == 0, "compatibility"); + rec_get_nth_field_offs_old(rec, 0, &flen); + if (flen == UNIV_SQL_NULL || flen <= len || memcmp(rec, namebuf, len)) + /* We ran out of tables that had existed in the database. */ + break; + const byte *db_trx_id= + rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &flen); + if (flen != 6) + { + ut_ad("corrupted SYS_TABLES.SPACE" == 0); + break; + } + if (mach_read_from_6(db_trx_id) != trx_id) + /* This entry was modified by some other transaction than us. + Unfortunately, because SYS_TABLES.NAME is the PRIMARY KEY, + we cannot distinguish RENAME and DROP here. It is possible + that the table had been renamed to some other database. */ + continue; + const byte *s= + rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__SPACE, &flen); + if (flen != 4) + ut_ad("corrupted SYS_TABLES.SPACE" == 0); + else if (uint32_t space_id= mach_read_from_4(s)) + { + space_ids.emplace_back(space_id); + pfs_os_file_t detached= fil_delete_tablespace(space_id); + if (detached != OS_FILE_CLOSED) + to_close.emplace_back(detached); + } + } + err_exit: + mtr.commit(); + for (pfs_os_file_t detached : to_close) + os_file_close(detached); + for (const auto id : space_ids) + ibuf_delete_for_discarded_space(id); + + /* Any changes must be persisted before we return. */ + log_write_up_to(mtr.commit_lsn(), true); + } + + my_free(namebuf); +} + +/** Shut down the InnoDB storage engine. +@return 0 */ +static +int +innobase_end(handlerton*, ha_panic_function); + +/*****************************************************************//** +Creates an InnoDB transaction struct for the thd if it does not yet have one. +Starts a new InnoDB transaction if a transaction is not yet started. And +assigns a new snapshot for a consistent read if the transaction does not yet +have one. +@return 0 */ +static +int +innobase_start_trx_and_assign_read_view( +/*====================================*/ + handlerton* hton, /* in: InnoDB handlerton */ + THD* thd); /* in: MySQL thread handle of the + user for whom the transaction should + be committed */ + +/** Flush InnoDB redo logs to the file system. +@return false */ +static bool innobase_flush_logs(handlerton*) +{ + if (!srv_read_only_mode && srv_flush_log_at_trx_commit) + /* Write any outstanding redo log. Durably if + innodb_flush_log_at_trx_commit=1. */ + log_buffer_flush_to_disk(srv_flush_log_at_trx_commit == 1); + return false; +} + +/************************************************************************//** +Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the +InnoDB Monitor to the client. 
+@return 0 on success */ +static +int +innodb_show_status( +/*===============*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of + the caller */ + stat_print_fn* stat_print); +/************************************************************************//** +Return 0 on success and non-zero on failure. Note: the bool return type +seems to be abused here, should be an int. */ +static +bool +innobase_show_status( +/*=================*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of + the caller */ + stat_print_fn* stat_print, + enum ha_stat_type stat_type); + +/** After ALTER TABLE, recompute statistics. */ +inline void ha_innobase::reload_statistics() +{ + if (dict_table_t *table= m_prebuilt ? m_prebuilt->table : nullptr) + { + if (table->is_readable()) + dict_stats_init(table); + else + table->stat_initialized= 1; + } +} + +/** After ALTER TABLE, recompute statistics. */ +static int innodb_notify_tabledef_changed(handlerton *, + LEX_CSTRING *, LEX_CSTRING *, + LEX_CUSTRING *, LEX_CUSTRING *, + handler *handler) +{ + DBUG_ENTER("innodb_notify_tabledef_changed"); + if (handler) + static_cast(handler)->reload_statistics(); + DBUG_RETURN(0); +} + +/****************************************************************//** +Parse and enable InnoDB monitor counters during server startup. +User can enable monitor counters/groups by specifying +"loose-innodb_monitor_enable = monitor_name1;monitor_name2..." +in server configuration file or at the command line. */ +static +void +innodb_enable_monitor_at_startup( +/*=============================*/ + char* str); /*!< in: monitor counter enable list */ + +#ifdef MYSQL_STORE_FTS_DOC_ID +/** Store doc_id value into FTS_DOC_ID field +@param[in,out] tbl table containing FULLTEXT index +@param[in] doc_id FTS_DOC_ID value */ +static +void +innobase_fts_store_docid( + TABLE* tbl, + ulonglong doc_id) +{ + my_bitmap_map* old_map + = dbug_tmp_use_all_columns(tbl, tbl->write_set); + + tbl->fts_doc_id_field->store(static_cast(doc_id), true); + + dbug_tmp_restore_column_map(tbl->write_set, old_map); +} +#endif + +/*******************************************************************//** +Function for constructing an InnoDB table handler instance. */ +static +handler* +innobase_create_handler( +/*====================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + TABLE_SHARE* table, + MEM_ROOT* mem_root) +{ + return(new (mem_root) ha_innobase(hton, table)); +} + +/* General functions */ + +/** Check that a page_size is correct for InnoDB. +If correct, set the associated page_size_shift which is the power of 2 +for this page size. +@param[in] page_size Page Size to evaluate +@return an associated page_size_shift if valid, 0 if invalid. */ +inline uint32_t innodb_page_size_validate(ulong page_size) +{ + DBUG_ENTER("innodb_page_size_validate"); + + for (uint32_t n = UNIV_PAGE_SIZE_SHIFT_MIN; + n <= UNIV_PAGE_SIZE_SHIFT_MAX; + n++) { + if (page_size == static_cast(1 << n)) { + DBUG_RETURN(n); + } + } + + DBUG_RETURN(0); +} + +/******************************************************************//** +Returns true if transaction should be flagged as read-only. 
+@return true if the thd is marked as read-only */ +bool +thd_trx_is_read_only( +/*=================*/ + THD* thd) /*!< in: thread handle */ +{ + return(thd != 0 && thd_tx_is_read_only(thd)); +} + +static MYSQL_THDVAR_BOOL(background_thread, + PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_NOSYSVAR, + "Internal (not user visible) flag to mark " + "background purge threads", NULL, NULL, 0); + +/** Create a MYSQL_THD for a background thread and mark it as such. +@param name thread info for SHOW PROCESSLIST +@return new MYSQL_THD */ +MYSQL_THD innobase_create_background_thd(const char* name) +{ + MYSQL_THD thd= create_background_thd(); + thd_proc_info(thd, name); + THDVAR(thd, background_thread) = true; + return thd; +} + + +/** Close opened tables, free memory, delete items for a MYSQL_THD. +@param[in] thd MYSQL_THD to reset */ +void +innobase_reset_background_thd(MYSQL_THD thd) +{ + if (!thd) { + thd = current_thd; + } + + ut_ad(thd); + ut_ad(THDVAR(thd, background_thread)); + + /* background purge thread */ + const char *proc_info= thd_proc_info(thd, "reset"); + reset_thd(thd); + thd_proc_info(thd, proc_info); +} + + +/******************************************************************//** +Check if the transaction is an auto-commit transaction. TRUE also +implies that it is a SELECT (read-only) transaction. +@return true if the transaction is an auto commit read-only transaction. */ +ibool +thd_trx_is_auto_commit( +/*===================*/ + THD* thd) /*!< in: thread handle, can be NULL */ +{ + return(thd != NULL + && !thd_test_options( + thd, + OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) + && thd_sql_command(thd) == SQLCOM_SELECT); +} + +/******************************************************************//** +Returns the NUL terminated value of glob_hostname. +@return pointer to glob_hostname. */ +const char* +server_get_hostname() +/*=================*/ +{ + return(glob_hostname); +} + +/******************************************************************//** +Returns true if the transaction this thread is processing has edited +non-transactional tables. Used by the deadlock detector when deciding +which transaction to rollback in case of a deadlock - we try to avoid +rolling back transactions that have edited non-transactional tables. +@return true if non-transactional tables have been edited */ +ibool +thd_has_edited_nontrans_tables( +/*===========================*/ + THD* thd) /*!< in: thread handle */ +{ + return((ibool) thd_non_transactional_update(thd)); +} + +/******************************************************************//** +Returns the lock wait timeout for the current connection. +@return the lock wait timeout, in seconds */ +uint& +thd_lock_wait_timeout( +/*==================*/ + THD* thd) /*!< in: thread handle, or NULL to query + the global innodb_lock_wait_timeout */ +{ + /* According to , passing thd == NULL + returns the global value of the session variable. */ + return(THDVAR(thd, lock_wait_timeout)); +} + +/** Get the value of innodb_tmpdir. +@param[in] thd thread handle, or NULL to query + the global innodb_tmpdir. +@retval NULL if innodb_tmpdir="" */ +const char *thd_innodb_tmpdir(THD *thd) +{ + const char* tmp_dir = THDVAR(thd, tmpdir); + + if (tmp_dir != NULL && *tmp_dir == '\0') { + tmp_dir = NULL; + } + + return(tmp_dir); +} + +/** Obtain the InnoDB transaction of a MySQL thread. 
+@param[in,out] thd thread handle +@return reference to transaction pointer */ +static trx_t* thd_to_trx(THD* thd) +{ + return reinterpret_cast(thd_get_ha_data(thd, innodb_hton_ptr)); +} + +#ifdef WITH_WSREP +/********************************************************************//** +Obtain the InnoDB transaction id of a MySQL thread. +@return transaction id */ +__attribute__((warn_unused_result, nonnull)) +ulonglong +thd_to_trx_id( + THD* thd) /*!< in: MySQL thread */ +{ + return(thd_to_trx(thd)->id); +} + +Atomic_relaxed wsrep_sst_disable_writes; + +static void sst_disable_innodb_writes() +{ + const uint old_count= srv_n_fil_crypt_threads; + fil_crypt_set_thread_cnt(0); + srv_n_fil_crypt_threads= old_count; + + wsrep_sst_disable_writes= true; + dict_stats_shutdown(); + purge_sys.stop(); + /* We are holding a global MDL thanks to FLUSH TABLES WITH READ LOCK. + + That will prevent any writes from arriving into InnoDB, but it will + not prevent writes of modified pages from the buffer pool, or log + checkpoints. + + Let us perform a log checkpoint to ensure that the entire buffer + pool is clean, so that no writes to persistent files will be + possible during the snapshot, and to guarantee that no crash + recovery will be necessary when starting up on the snapshot. */ + log_make_checkpoint(); + /* If any FILE_MODIFY records were written by the checkpoint, an + extra write of a FILE_CHECKPOINT record could still be invoked by + buf_flush_page_cleaner(). Let us prevent that by invoking another + checkpoint (which will write the FILE_CHECKPOINT record). */ + log_make_checkpoint(); + ut_d(recv_no_log_write= true); + /* If this were not a no-op, an assertion would fail due to + recv_no_log_write. */ + ut_d(log_make_checkpoint()); +} + +static void sst_enable_innodb_writes() +{ + ut_ad(recv_no_log_write); + ut_d(recv_no_log_write= false); + dict_stats_start(); + purge_sys.resume(); + wsrep_sst_disable_writes= false; + const uint old_count= srv_n_fil_crypt_threads; + srv_n_fil_crypt_threads= 0; + fil_crypt_set_thread_cnt(old_count); +} + +static void innodb_disable_internal_writes(bool disable) +{ + if (disable) + sst_disable_innodb_writes(); + else + sst_enable_innodb_writes(); +} + +static void wsrep_abort_transaction(handlerton *, THD *, THD *, my_bool) + __attribute__((nonnull)); +static int innobase_wsrep_set_checkpoint(handlerton *hton, const XID *xid); +static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid); +#endif /* WITH_WSREP */ + +#define normalize_table_name(a,b) \ + normalize_table_name_c_low(a,b,IF_WIN(true,false)) + +ulonglong ha_innobase::table_version() const +{ + /* This is either "garbage" or something that was assigned + on a successful ha_innobase::prepare_inplace_alter_table(). 
*/ + return m_prebuilt->trx_id; +} + +#ifdef UNIV_DEBUG +/** whether the DDL log recovery has been completed */ +static bool ddl_recovery_done; +#endif + +static int innodb_check_version(handlerton *hton, const char *path, + const LEX_CUSTRING *version, + ulonglong create_id) +{ + DBUG_ENTER("innodb_check_version"); + DBUG_ASSERT(hton == innodb_hton_ptr); + ut_ad(!ddl_recovery_done); + + if (!create_id) + DBUG_RETURN(0); + + char norm_path[FN_REFLEN]; + normalize_table_name(norm_path, path); + + if (dict_table_t *table= dict_table_open_on_name(norm_path, false, + DICT_ERR_IGNORE_NONE)) + { + const trx_id_t trx_id= table->def_trx_id; + DBUG_ASSERT(trx_id <= create_id); + dict_table_close(table); + DBUG_PRINT("info", ("create_id: %llu trx_id: %llu", create_id, trx_id)); + DBUG_RETURN(create_id != trx_id); + } + else + DBUG_RETURN(2); +} + +/** Drop any garbage intermediate tables that existed in the system +after a backup was restored. + +In a final phase of Mariabackup, the commit of DDL operations is blocked, +and those DDL operations will have to be rolled back. Because the +normal DDL recovery will not run due to the lack of the log file, +at least some #sql-alter- garbage tables may remain in the InnoDB +data dictionary (while the data files themselves are missing). +We will attempt to drop the tables here. */ +static void drop_garbage_tables_after_restore() +{ + btr_pcur_t pcur; + mtr_t mtr; + trx_t *trx= trx_create(); + + ut_ad(!purge_sys.enabled()); + ut_d(purge_sys.stop_FTS()); + + mtr.start(); + if (pcur.open_leaf(true, dict_sys.sys_tables->indexes.start, BTR_SEARCH_LEAF, + &mtr) != DB_SUCCESS) + goto all_fail; + for (;;) + { + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + if (!btr_pcur_is_on_user_rec(&pcur)) + break; + + const rec_t *rec= btr_pcur_get_rec(&pcur); + if (rec_get_deleted_flag(rec, 0)) + continue; + + static_assert(DICT_FLD__SYS_TABLES__NAME == 0, "compatibility"); + size_t len; + if (rec_get_1byte_offs_flag(rec)) + { + len= rec_1_get_field_end_info(rec, 0); + if (len & REC_1BYTE_SQL_NULL_MASK) + continue; /* corrupted SYS_TABLES.NAME */ + } + else + { + len= rec_2_get_field_end_info(rec, 0); + static_assert(REC_2BYTE_EXTERN_MASK == 16384, "compatibility"); + if (len >= REC_2BYTE_EXTERN_MASK) + continue; /* corrupted SYS_TABLES.NAME */ + } + + if (len < tmp_file_prefix_length) + continue; + if (const char *f= static_cast + (memchr(rec, '/', len - tmp_file_prefix_length))) + { + if (memcmp(f + 1, tmp_file_prefix, tmp_file_prefix_length)) + continue; + } + else + continue; + + btr_pcur_store_position(&pcur, &mtr); + btr_pcur_commit_specify_mtr(&pcur, &mtr); + + trx_start_for_ddl(trx); + std::vector deleted; + dberr_t err= DB_TABLE_NOT_FOUND; + row_mysql_lock_data_dictionary(trx); + + if (dict_table_t *table= dict_sys.load_table + ({reinterpret_cast(pcur.old_rec), len}, + DICT_ERR_IGNORE_DROP)) + { + table->acquire(); + row_mysql_unlock_data_dictionary(trx); + err= lock_table_for_trx(table, trx, LOCK_X); + if (err == DB_SUCCESS && + (table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS))) + { + fts_optimize_remove_table(table); + err= fts_lock_tables(trx, *table); + } + if (err == DB_SUCCESS) + err= lock_sys_tables(trx); + row_mysql_lock_data_dictionary(trx); + table->release(); + + if (err == DB_SUCCESS) + err= trx->drop_table(*table); + if (err != DB_SUCCESS) + goto fail; + trx->commit(deleted); + } + else + { +fail: + trx->rollback(); + sql_print_error("InnoDB: cannot drop %.*s: %s", + static_cast(len), pcur.old_rec, ut_strerr(err)); + } + + 
row_mysql_unlock_data_dictionary(trx); + for (pfs_os_file_t d : deleted) + os_file_close(d); + + mtr.start(); + if (pcur.restore_position(BTR_SEARCH_LEAF, &mtr) == btr_pcur_t::CORRUPTED) + break; + } + +all_fail: + mtr.commit(); + trx->free(); + ut_free(pcur.old_rec_buf); + ut_d(purge_sys.resume_FTS()); +} + +static void innodb_ddl_recovery_done(handlerton*) +{ + ut_ad(!ddl_recovery_done); + ut_d(ddl_recovery_done= true); + if (!srv_read_only_mode && srv_operation <= SRV_OPERATION_EXPORT_RESTORED && + srv_force_recovery < SRV_FORCE_NO_BACKGROUND) + { + if (srv_start_after_restore && !high_level_read_only) + drop_garbage_tables_after_restore(); + srv_init_purge_tasks(); + } +} + +/********************************************************************//** +Converts an InnoDB error code to a MySQL error code and also tells to MySQL +about a possible transaction rollback inside InnoDB caused by a lock wait +timeout or a deadlock. +@return MySQL error code */ +static int +convert_error_code_to_mysql( +/*========================*/ + dberr_t error, /*!< in: InnoDB error code */ + ulint flags, /*!< in: InnoDB table flags, or 0 */ + THD* thd) /*!< in: user thread handle or NULL */ +{ + switch (error) { + case DB_SUCCESS: + return(0); + + case DB_INTERRUPTED: + return(HA_ERR_ABORTED_BY_USER); + + case DB_FOREIGN_EXCEED_MAX_CASCADE: + ut_ad(thd); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_ROW_IS_REFERENCED, + "InnoDB: Cannot delete/update " + "rows with cascading foreign key " + "constraints that exceed max " + "depth of %d. Please " + "drop extra constraints and try " + "again", FK_MAX_CASCADE_DEL); + return(HA_ERR_FK_DEPTH_EXCEEDED); + + case DB_CANT_CREATE_GEOMETRY_OBJECT: + my_error(ER_CANT_CREATE_GEOMETRY_OBJECT, MYF(0)); + return(HA_ERR_NULL_IN_SPATIAL); + + case DB_ERROR: + default: + return(HA_ERR_GENERIC); /* unspecified error */ + + case DB_DUPLICATE_KEY: + /* Be cautious with returning this error, since + mysql could re-enter the storage layer to get + duplicated key info, the operation requires a + valid table handle and/or transaction information, + which might not always be available in the error + handling stage. */ + return(HA_ERR_FOUND_DUPP_KEY); + + case DB_READ_ONLY: + return(HA_ERR_TABLE_READONLY); + + case DB_FOREIGN_DUPLICATE_KEY: + return(HA_ERR_FOREIGN_DUPLICATE_KEY); + + case DB_MISSING_HISTORY: + return(HA_ERR_TABLE_DEF_CHANGED); + + case DB_RECORD_NOT_FOUND: + return(HA_ERR_NO_ACTIVE_RECORD); + + case DB_DEADLOCK: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd != NULL) { + thd_mark_transaction_to_rollback(thd, 1); + } + + return(HA_ERR_LOCK_DEADLOCK); + + case DB_LOCK_WAIT_TIMEOUT: + /* Starting from 5.0.13, we let MySQL just roll back the + latest SQL statement in a lock wait timeout. Previously, we + rolled back the whole transaction. 
*/ + + if (thd) { + thd_mark_transaction_to_rollback( + thd, innobase_rollback_on_timeout); + } + + return(HA_ERR_LOCK_WAIT_TIMEOUT); + + case DB_NO_REFERENCED_ROW: + return(HA_ERR_NO_REFERENCED_ROW); + + case DB_ROW_IS_REFERENCED: + return(HA_ERR_ROW_IS_REFERENCED); + + case DB_NO_FK_ON_S_BASE_COL: + case DB_CANNOT_ADD_CONSTRAINT: + case DB_CHILD_NO_INDEX: + case DB_PARENT_NO_INDEX: + return(HA_ERR_CANNOT_ADD_FOREIGN); + + case DB_CANNOT_DROP_CONSTRAINT: + + return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit + misleading, a new MySQL error + code should be introduced */ + + case DB_CORRUPTION: + case DB_PAGE_CORRUPTED: + return(HA_ERR_CRASHED); + + case DB_OUT_OF_FILE_SPACE: + return(HA_ERR_RECORD_FILE_FULL); + + case DB_TEMP_FILE_WRITE_FAIL: + my_error(ER_GET_ERRMSG, MYF(0), + DB_TEMP_FILE_WRITE_FAIL, + ut_strerr(DB_TEMP_FILE_WRITE_FAIL), + "InnoDB"); + return(HA_ERR_INTERNAL_ERROR); + + case DB_TABLE_NOT_FOUND: + return(HA_ERR_NO_SUCH_TABLE); + + case DB_DECRYPTION_FAILED: + return(HA_ERR_DECRYPTION_FAILED); + + case DB_TABLESPACE_NOT_FOUND: + return(HA_ERR_TABLESPACE_MISSING); + + case DB_TOO_BIG_RECORD: { + /* If prefix is true then a 768-byte prefix is stored + locally for BLOB fields. Refer to dict_table_get_format(). + We limit max record size to 16k for 64k page size. */ + bool prefix = !DICT_TF_HAS_ATOMIC_BLOBS(flags); + bool comp = !!(flags & DICT_TF_COMPACT); + ulint free_space = page_get_free_space_of_empty(comp) / 2; + + if (free_space >= ulint(comp ? COMPRESSED_REC_MAX_DATA_SIZE : + REDUNDANT_REC_MAX_DATA_SIZE)) { + free_space = (comp ? COMPRESSED_REC_MAX_DATA_SIZE : + REDUNDANT_REC_MAX_DATA_SIZE) - 1; + } + + my_printf_error(ER_TOO_BIG_ROWSIZE, + "Row size too large (> " ULINTPF "). Changing some columns " + "to TEXT or BLOB %smay help. In current row " + "format, BLOB prefix of %d bytes is stored inline.", + MYF(0), + free_space, + prefix + ? "or using ROW_FORMAT=DYNAMIC or" + " ROW_FORMAT=COMPRESSED " + : "", + prefix + ? DICT_MAX_FIXED_COL_LEN + : 0); + return(HA_ERR_TO_BIG_ROW); + } + + case DB_TOO_BIG_INDEX_COL: + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + (ulong) DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags)); + return(HA_ERR_INDEX_COL_TOO_LONG); + + case DB_NO_SAVEPOINT: + return(HA_ERR_NO_SAVEPOINT); + + case DB_LOCK_TABLE_FULL: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + thd_mark_transaction_to_rollback(thd, 1); + } + + return(HA_ERR_LOCK_TABLE_FULL); + + case DB_FTS_INVALID_DOCID: + return(HA_FTS_INVALID_DOCID); + case DB_FTS_EXCEED_RESULT_CACHE_LIMIT: + return(HA_ERR_OUT_OF_MEM); + case DB_TOO_MANY_CONCURRENT_TRXS: + return(HA_ERR_TOO_MANY_CONCURRENT_TRXS); + case DB_UNSUPPORTED: + return(HA_ERR_UNSUPPORTED); + case DB_INDEX_CORRUPT: + return(HA_ERR_INDEX_CORRUPT); + case DB_UNDO_RECORD_TOO_BIG: + return(HA_ERR_UNDO_REC_TOO_BIG); + case DB_OUT_OF_MEMORY: + return(HA_ERR_OUT_OF_MEM); + case DB_TABLESPACE_EXISTS: + return(HA_ERR_TABLESPACE_EXISTS); + case DB_TABLESPACE_DELETED: + return(HA_ERR_TABLESPACE_MISSING); + case DB_IDENTIFIER_TOO_LONG: + return(HA_ERR_INTERNAL_ERROR); + case DB_TABLE_CORRUPT: + return(HA_ERR_TABLE_CORRUPT); + case DB_FTS_TOO_MANY_WORDS_IN_PHRASE: + return(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE); + case DB_COMPUTE_VALUE_FAILED: + return(HA_ERR_GENERIC); // impossible + } +} + +/*************************************************************//** +Prints info of a THD object (== user session thread) to the given file. 
*/ +void +innobase_mysql_print_thd( +/*=====================*/ + FILE* f, /*!< in: output stream */ + THD* thd, /*!< in: MySQL THD object */ + uint max_query_len) /*!< in: max query length to print, or 0 to + use the default max length */ +{ + char buffer[1024]; + + fputs(thd_get_error_context_description(thd, buffer, sizeof buffer, + max_query_len), f); + putc('\n', f); +} + +/******************************************************************//** +Get the variable length bounds of the given character set. */ +static void +innobase_get_cset_width( +/*====================*/ + ulint cset, /*!< in: MySQL charset-collation code */ + unsigned*mbminlen, /*!< out: minimum length of a char (in bytes) */ + unsigned*mbmaxlen) /*!< out: maximum length of a char (in bytes) */ +{ + CHARSET_INFO* cs; + ut_ad(cset <= MAX_CHAR_COLL_NUM); + ut_ad(mbminlen); + ut_ad(mbmaxlen); + + cs = cset ? get_charset((uint)cset, MYF(MY_WME)) : NULL; + if (cs) { + *mbminlen = cs->mbminlen; + *mbmaxlen = cs->mbmaxlen; + ut_ad(*mbminlen < DATA_MBMAX); + ut_ad(*mbmaxlen < DATA_MBMAX); + } else { + THD* thd = current_thd; + + if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) { + + /* Fix bug#46256: allow tables to be dropped if the + collation is not found, but issue a warning. */ + if (cset != 0) { + + sql_print_warning( + "Unknown collation #" ULINTPF ".", + cset); + } + } else { + + ut_a(cset == 0); + } + + *mbminlen = *mbmaxlen = 0; + } +} + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +void +dtype_get_mblen( +/*============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type (and collation) */ + unsigned*mbminlen, /*!< out: minimum length of a + multi-byte character */ + unsigned*mbmaxlen) /*!< out: maximum length of a + multi-byte character */ +{ + if (dtype_is_string_type(mtype)) { + innobase_get_cset_width(dtype_get_charset_coll(prtype), + mbminlen, mbmaxlen); + ut_ad(*mbminlen <= *mbmaxlen); + ut_ad(*mbminlen < DATA_MBMAX); + ut_ad(*mbmaxlen < DATA_MBMAX); + } else { + *mbminlen = *mbmaxlen = 0; + } +} + +/******************************************************************//** +Converts an identifier to a table name. */ +void +innobase_convert_from_table_id( +/*===========================*/ + CHARSET_INFO* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len) /*!< in: length of 'to', in bytes */ +{ + uint errors; + + strconvert(cs, from, FN_REFLEN, &my_charset_filename, to, (uint) len, &errors); +} + +/********************************************************************** +Check if the length of the identifier exceeds the maximum allowed. +return true when length of identifier is too long. */ +my_bool +innobase_check_identifier_length( +/*=============================*/ + const char* id) /* in: FK identifier to check excluding the + database portion. */ +{ + int well_formed_error = 0; + CHARSET_INFO *cs = system_charset_info; + DBUG_ENTER("innobase_check_identifier_length"); + + size_t len = my_well_formed_length( + cs, id, id + strlen(id), + NAME_CHAR_LEN, &well_formed_error); + + if (well_formed_error || len == NAME_CHAR_LEN) { + my_error(ER_TOO_LONG_IDENT, MYF(0), id); + DBUG_RETURN(true); + } + DBUG_RETURN(false); +} + +/******************************************************************//** +Converts an identifier to UTF-8. 
*/ +void +innobase_convert_from_id( +/*=====================*/ + CHARSET_INFO* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len) /*!< in: length of 'to', in bytes */ +{ + uint errors; + + strconvert(cs, from, FN_REFLEN, system_charset_info, to, (uint) len, &errors); +} + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. +@return 0 if a=b, <0 if a1 if a>b */ +int +innobase_strcasecmp( +/*================*/ + const char* a, /*!< in: first string to compare */ + const char* b) /*!< in: second string to compare */ +{ + if (!a) { + if (!b) { + return(0); + } else { + return(-1); + } + } else if (!b) { + return(1); + } + + return(my_strcasecmp(system_charset_info, a, b)); +} + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. The +second string contains wildcards. +@return 0 if a match is found, 1 if not */ +static +int +innobase_wildcasecmp( +/*=================*/ + const char* a, /*!< in: string to compare */ + const char* b) /*!< in: wildcard string to compare */ +{ + return(wild_case_compare(system_charset_info, a, b)); +} + +/** Strip dir name from a full path name and return only the file name +@param[in] path_name full path name +@return file name or "null" if no file name */ +const char* +innobase_basename( + const char* path_name) +{ + const char* name = base_name(path_name); + + return((name) ? name : "null"); +} + +/******************************************************************//** +Makes all characters in a NUL-terminated UTF-8 string lower case. */ +void +innobase_casedn_str( +/*================*/ + char* a) /*!< in/out: string to put in lower case */ +{ + my_casedn_str(system_charset_info, a); +} + +/** Determines the current SQL statement. +Thread unsafe, can only be called from the thread owning the THD. +@param[in] thd MySQL thread handle +@param[out] length Length of the SQL statement +@return SQL statement string */ +const char* +innobase_get_stmt_unsafe( + THD* thd, + size_t* length) +{ + if (const LEX_STRING *stmt = thd_query_string(thd)) { + *length = stmt->length; + return stmt->str; + } + + *length = 0; + return NULL; +} + +/** + Test a file path whether it is same as mysql data directory path. + + @param path null terminated character string + + @return + @retval TRUE The path is different from mysql data directory. + @retval FALSE The path is same as mysql data directory. +*/ +static bool is_mysql_datadir_path(const char *path) +{ + if (path == NULL) + return false; + + char mysql_data_dir[FN_REFLEN], path_dir[FN_REFLEN]; + convert_dirname(path_dir, path, NullS); + convert_dirname(mysql_data_dir, mysql_unpacked_real_data_home, NullS); + size_t mysql_data_home_len= dirname_length(mysql_data_dir); + size_t path_len = dirname_length(path_dir); + + if (path_len < mysql_data_home_len) + return true; + + if (!lower_case_file_system) + return(memcmp(mysql_data_dir, path_dir, mysql_data_home_len)); + + return(files_charset_info->strnncoll((uchar *) path_dir, path_len, + (uchar *) mysql_data_dir, + mysql_data_home_len, + TRUE)); +} + +/*********************************************************************//** +Wrapper around MySQL's copy_and_convert function. 
+@return number of bytes copied to 'to' */ +static +ulint +innobase_convert_string( +/*====================*/ + void* to, /*!< out: converted string */ + ulint to_length, /*!< in: number of bytes reserved + for the converted string */ + CHARSET_INFO* to_cs, /*!< in: character set to convert to */ + const void* from, /*!< in: string to convert */ + ulint from_length, /*!< in: number of bytes to convert */ + CHARSET_INFO* from_cs, /*!< in: character set to convert + from */ + uint* errors) /*!< out: number of errors encountered + during the conversion */ +{ + return(copy_and_convert( + (char*) to, (uint32) to_length, to_cs, + (const char*) from, (uint32) from_length, from_cs, + errors)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes +the result to "buf". The result is converted to "system_charset_info". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +ulint +innobase_raw_format( +/*================*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint charset_coll, /*!< in: charset collation */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + /* XXX we use a hard limit instead of allocating + but_size bytes from the heap */ + CHARSET_INFO* data_cs; + char buf_tmp[8192]; + ulint buf_tmp_used; + uint num_errors; + + data_cs = all_charsets[charset_coll]; + + buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp), + system_charset_info, + data, data_len, data_cs, + &num_errors); + + return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size)); +} + +/* +The helper function nlz(x) calculates the number of leading zeros +in the binary representation of the number "x", either using a +built-in compiler function or a substitute trick based on the use +of the multiplication operation and a table indexed by the prefix +of the multiplication result: +*/ +#ifdef __GNUC__ +#define nlz(x) __builtin_clzll(x) +#elif defined(_MSC_VER) && !defined(_M_CEE_PURE) && \ + (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64)) +#ifndef __INTRIN_H_ +#pragma warning(push, 4) +#pragma warning(disable: 4255 4668) +#include +#pragma warning(pop) +#endif +__forceinline unsigned int nlz (ulonglong x) +{ +#if defined(_M_IX86) || defined(_M_X64) + unsigned long n; +#ifdef _M_X64 + _BitScanReverse64(&n, x); + return (unsigned int) n ^ 63; +#else + unsigned long y = (unsigned long) (x >> 32); + unsigned int m = 31; + if (y == 0) + { + y = (unsigned long) x; + m = 63; + } + _BitScanReverse(&n, y); + return (unsigned int) n ^ m; +#endif +#elif defined(_M_ARM64) + return _CountLeadingZeros64(x); +#endif +} +#else +inline unsigned int nlz (ulonglong x) +{ + static unsigned char table [48] = { + 32, 6, 5, 0, 4, 12, 0, 20, + 15, 3, 11, 0, 0, 18, 25, 31, + 8, 14, 2, 0, 10, 0, 0, 0, + 0, 0, 0, 21, 0, 0, 19, 26, + 7, 0, 13, 0, 16, 1, 22, 27, + 9, 0, 17, 23, 28, 24, 29, 30 + }; + unsigned int y= (unsigned int) (x >> 32); + unsigned int n= 0; + if (y == 0) { + y= (unsigned int) x; + n= 32; + } + y = y | (y >> 1); // Propagate leftmost 1-bit to the right. 
+ y = y | (y >> 2); + y = y | (y >> 4); + y = y | (y >> 8); + y = y & ~(y >> 16); + y = y * 0x3EF5D037; + return n + table[y >> 26]; +} +#endif + +/*********************************************************************//** +Compute the next autoinc value. + +For MySQL replication the autoincrement values can be partitioned among +the nodes. The offset is the start or origin of the autoincrement value +for a particular node. For n nodes the increment will be n and the offset +will be in the interval [1, n]. The formula tries to allocate the next +value for a particular node. + +Note: This function is also called with increment set to the number of +values we want to reserve for multi-value inserts e.g., + + INSERT INTO T VALUES(), (), (); + +innobase_next_autoinc() will be called with increment set to 3 where +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for +the multi-value INSERT above. +@return the next value */ +ulonglong +innobase_next_autoinc( +/*==================*/ + ulonglong current, /*!< in: Current value */ + ulonglong need, /*!< in: count of values needed */ + ulonglong step, /*!< in: AUTOINC increment step */ + ulonglong offset, /*!< in: AUTOINC offset */ + ulonglong max_value) /*!< in: max value for type */ +{ + ulonglong next_value; + ulonglong block; + + /* Should never be 0. */ + ut_a(need > 0); + ut_a(step > 0); + ut_a(max_value > 0); + + /* + We need to calculate the "block" value equal to the product + "step * need". However, when calculating this product, an integer + overflow can occur, so we cannot simply use the usual multiplication + operation. The snippet below calculates the product of two numbers + and detects an unsigned integer overflow: + */ + unsigned int m= nlz(need); + unsigned int n= nlz(step); + if (m + n <= 8 * sizeof(ulonglong) - 2) { + // The bit width of the original values is too large, + // therefore we are guaranteed to get an overflow. + goto overflow; + } + block = need * (step >> 1); + if ((longlong) block < 0) { + goto overflow; + } + block += block; + if (step & 1) { + block += need; + if (block < need) { + goto overflow; + } + } + + /* Check for overflow. Current can be > max_value if the value + is in reality a negative value. Also, the visual studio compiler + converts large double values (which hypothetically can then be + passed here as the values of the "current" parameter) automatically + into unsigned long long datatype maximum value: */ + if (current > max_value) { + goto overflow; + } + + /* According to MySQL documentation, if the offset is greater than + the step then the offset is ignored. */ + if (offset > step) { + offset = 0; + } + + /* + Let's round the current value to within a step-size block: + */ + if (current > offset) { + next_value = current - offset; + } else { + next_value = offset - current; + } + next_value -= next_value % step; + + /* + Add an offset to the next value and check that the addition + does not cause an integer overflow: + */ + next_value += offset; + if (next_value < offset) { + goto overflow; + } + + /* + Add a block to the next value and check that the addition + does not cause an integer overflow: + */ + next_value += block; + if (next_value < block) { + goto overflow; + } + + return(next_value); + +overflow: + /* + Allow auto_increment to go over max_value up to max ulonglong. + This allows us to detect that all values are exhausted. 
+ If we don't do this, we will return max_value several times + and get duplicate key errors instead of auto increment value + out of range: + */ + return(~(ulonglong) 0); +} + +/*********************************************************************//** +Initializes some fields in an InnoDB transaction object. */ +static +void +innobase_trx_init( +/*==============*/ + THD* thd, /*!< in: user thread handle */ + trx_t* trx) /*!< in/out: InnoDB transaction handle */ +{ + DBUG_ENTER("innobase_trx_init"); + DBUG_ASSERT(thd == trx->mysql_thd); + + /* Ensure that thd_lock_wait_timeout(), which may be called + while holding lock_sys.latch, by lock_rec_enqueue_waiting(), + will not end up acquiring LOCK_global_system_variables in + intern_sys_var_ptr(). */ + (void) THDVAR(thd, lock_wait_timeout); + + trx->check_foreigns = !thd_test_options( + thd, OPTION_NO_FOREIGN_KEY_CHECKS); + + trx->check_unique_secondary = !thd_test_options( + thd, OPTION_RELAXED_UNIQUE_CHECKS); +#ifdef WITH_WSREP + trx->wsrep = wsrep_on(thd); +#endif + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Allocates an InnoDB transaction for a MySQL handler object for DML. +@return InnoDB transaction handle */ +trx_t* +innobase_trx_allocate( +/*==================*/ + THD* thd) /*!< in: user thread handle */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_trx_allocate"); + DBUG_ASSERT(thd != NULL); + DBUG_ASSERT(EQ_CURRENT_THD(thd)); + + trx = trx_create(); + + trx->mysql_thd = thd; + + innobase_trx_init(thd, trx); + + DBUG_RETURN(trx); +} + +/*********************************************************************//** +Gets the InnoDB transaction handle for a MySQL handler object, creates +an InnoDB transaction struct if the corresponding MySQL thread struct still +lacks one. +@return InnoDB transaction handle */ +static inline +trx_t* +check_trx_exists( +/*=============*/ + THD* thd) /*!< in: user thread handle */ +{ + if (trx_t* trx = thd_to_trx(thd)) { + ut_a(trx->magic_n == TRX_MAGIC_N); + innobase_trx_init(thd, trx); + return trx; + } else { + trx = innobase_trx_allocate(thd); + thd_set_ha_data(thd, innodb_hton_ptr, trx); + return trx; + } +} + +/** + Gets current trx. + + This function may be called during InnoDB initialisation, when + innodb_hton_ptr->slot is not yet set to meaningful value. +*/ + +trx_t *current_trx() +{ + THD *thd=current_thd; + if (likely(thd != 0) && innodb_hton_ptr->slot != HA_SLOT_UNDEF) { + return thd_to_trx(thd); + } else { + return(NULL); + } +} + +/*********************************************************************//** +Note that a transaction has been registered with MySQL. +@return true if transaction is registered with MySQL 2PC coordinator */ +static inline +bool +trx_is_registered_for_2pc( +/*======================*/ + const trx_t* trx) /* in: transaction */ +{ + return(trx->is_registered == 1); +} + +/*********************************************************************//** +Note that a transaction has been deregistered. */ +static inline +void +trx_deregister_from_2pc( +/*====================*/ + trx_t* trx) /* in: transaction */ +{ + trx->is_registered= false; + trx->active_commit_ordered= false; +} + +/*********************************************************************//** +Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. 
*/ +static +void +innobase_copy_frm_flags_from_create_info( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const HA_CREATE_INFO* create_info) /*!< in: create info */ +{ + ibool ps_on; + ibool ps_off; + + if (innodb_table->is_temporary() + || innodb_table->no_rollback()) { + /* Temp tables do not use persistent stats. */ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = create_info->table_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = create_info->table_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = create_info->stats_sample_pages; +} + +/*********************************************************************//** +Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. */ +void +innobase_copy_frm_flags_from_table_share( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const TABLE_SHARE* table_share) /*!< in: table share */ +{ + ibool ps_on; + ibool ps_off; + + if (innodb_table->is_temporary()) { + /* Temp tables do not use persistent stats */ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = table_share->db_create_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = table_share->db_create_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = table_share->stats_sample_pages; +} + +/*********************************************************************//** +Construct ha_innobase handler. */ + +ha_innobase::ha_innobase( +/*=====================*/ + handlerton* hton, + TABLE_SHARE* table_arg) + :handler(hton, table_arg), + m_prebuilt(), + m_user_thd(), + m_int_table_flags(HA_REC_NOT_IN_SEQ + | HA_NULL_IN_KEY + | HA_CAN_VIRTUAL_COLUMNS + | HA_CAN_INDEX_BLOBS + | HA_CAN_SQL_HANDLER + | HA_REQUIRES_KEY_COLUMNS_FOR_DELETE + | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION + | HA_PRIMARY_KEY_IN_READ_INDEX + | HA_BINLOG_ROW_CAPABLE + | HA_CAN_GEOMETRY + | HA_PARTIAL_COLUMN_READ + | HA_TABLE_SCAN_ON_INDEX + | HA_CAN_FULLTEXT + | HA_CAN_FULLTEXT_EXT + /* JAN: TODO: MySQL 5.7 + | HA_CAN_FULLTEXT_HINTS + */ + | HA_CAN_EXPORT + | HA_ONLINE_ANALYZE + | HA_CAN_RTREEKEYS + | HA_CAN_TABLES_WITHOUT_ROLLBACK + | HA_CAN_ONLINE_BACKUPS + | HA_CONCURRENT_OPTIMIZE + | HA_CAN_SKIP_LOCKED + | (srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0) + ), + m_start_of_scan(), + m_mysql_has_locked() +{} + +/*********************************************************************//** +Destruct ha_innobase handler. */ + +ha_innobase::~ha_innobase() = default; +/*======================*/ + +/*********************************************************************//** +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +m_prebuilt struct. 
*/ +void +ha_innobase::update_thd( +/*====================*/ + THD* thd) /*!< in: thd to use the handle */ +{ + DBUG_ENTER("ha_innobase::update_thd"); + DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p", + m_user_thd, thd)); + + /* The table should have been opened in ha_innobase::open(). */ + DBUG_ASSERT(m_prebuilt->table->get_ref_count() > 0); + + trx_t* trx = check_trx_exists(thd); + + ut_ad(!trx->dict_operation_lock_mode); + ut_ad(!trx->dict_operation); + + if (m_prebuilt->trx != trx) { + + row_update_prebuilt_trx(m_prebuilt, trx); + } + + m_user_thd = thd; + + DBUG_ASSERT(m_prebuilt->trx->magic_n == TRX_MAGIC_N); + DBUG_ASSERT(m_prebuilt->trx == thd_to_trx(m_user_thd)); + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +m_prebuilt struct. */ + +void +ha_innobase::update_thd() +/*=====================*/ +{ + THD* thd = ha_thd(); + + ut_ad(EQ_CURRENT_THD(thd)); + update_thd(thd); +} + +/*********************************************************************//** +Registers an InnoDB transaction with the MySQL 2PC coordinator, so that +the MySQL XA code knows to call the InnoDB prepare and commit, or rollback +for the transaction. This MUST be called for every transaction for which +the user may call commit or rollback. Calling this several times to register +the same transaction is allowed, too. This function also registers the +current SQL statement. */ +static inline +void +innobase_register_trx( +/*==================*/ + handlerton* hton, /* in: Innobase handlerton */ + THD* thd, /* in: MySQL thd (connection) object */ + trx_t* trx) /* in: transaction to register */ +{ + ut_ad(!trx->active_commit_ordered); + const trx_id_t trx_id= trx->id; + + trans_register_ha(thd, false, hton, trx_id); + + if (!trx->is_registered) + { + trx->is_registered= true; + if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) + trans_register_ha(thd, true, hton, trx_id); + } +} + +/* BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB + ------------------------------------------------------------ + +1) The use of the query cache for TBL is disabled when there is an +uncommitted change to TBL. + +2) When a change to TBL commits, InnoDB stores the current value of +its global trx id counter, let us denote it by INV_TRX_ID, to the table object +in the InnoDB data dictionary, and does only allow such transactions whose +id <= INV_TRX_ID to use the query cache. + +3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit +modification because an ON DELETE CASCADE, we invalidate the MySQL query cache +of TBL immediately. + +How this is implemented inside InnoDB: + +1) Since every modification always sets an IX type table lock on the InnoDB +table, it is easy to check if there can be uncommitted modifications for a +table: just check if there are locks in the lock list of the table. + +2) When a transaction inside InnoDB commits, it reads the global trx id +counter and stores the value INV_TRX_ID to the tables on which it had a lock. + +3) If there is an implicit table change from ON DELETE CASCADE or SET NULL, +InnoDB calls an invalidate method for the MySQL query cache for that table. + +How this is implemented inside sql_cache.cc: + +1) The query cache for an InnoDB table TBL is invalidated immediately at an +INSERT/UPDATE/DELETE, just like in the case of MyISAM. 
No need to delay +invalidation to the transaction commit. + +2) To store or retrieve a value from the query cache of an InnoDB table TBL, +any query must first ask InnoDB's permission. We must pass the thd as a +parameter because InnoDB will look at the trx id, if any, associated with +that thd. Also the full_name which is used as key to search for the table +object. The full_name is a string containing the normalized path to the +table in the canonical format. + +3) Use of the query cache for InnoDB tables is now allowed also when +AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer +put restrictions on the use of the query cache. +*/ + +/** Check if mysql can allow the transaction to read from/store to +the query cache. +@param[in] table table object +@param[in] trx transaction object +@return whether the storing or retrieving from the query cache is permitted */ +TRANSACTIONAL_TARGET +static bool innobase_query_caching_table_check_low( + dict_table_t* table, trx_t* trx) +{ + /* The following conditions will decide the query cache + retrieval or storing into: + + (1) There should not be any locks on the table. + (2) Someother trx shouldn't invalidate the cache before this + transaction started. + (3) Read view shouldn't exist. If exists then the view + low_limit_id should be greater than or equal to the transaction that + invalidates the cache for the particular table. + + For read-only transaction: should satisfy (1) and (3) + For read-write transaction: should satisfy (1), (2), (3) */ + + const trx_id_t inv = table->query_cache_inv_trx_id; + + if (trx->id && trx->id < inv) { + return false; + } + + if (trx->read_view.is_open() && trx->read_view.low_limit_id() < inv) { + return false; + } + +#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC + if (xbegin()) { + if (table->lock_mutex_is_locked()) + xabort(); + auto len = UT_LIST_GET_LEN(table->locks); + xend(); + return len == 0; + } +#endif + + table->lock_mutex_lock(); + auto len= UT_LIST_GET_LEN(table->locks); + table->lock_mutex_unlock(); + return len == 0; +} + +/** Checks if MySQL at the moment is allowed for this table to retrieve a +consistent read result, or store it to the query cache. +@param[in,out] trx transaction +@param[in] norm_name concatenation of database name, + '/' char, table name +@return whether storing or retrieving from the query cache is permitted */ +static bool innobase_query_caching_table_check( + trx_t* trx, + const char* norm_name) +{ + dict_table_t* table = dict_table_open_on_name( + norm_name, false, DICT_ERR_IGNORE_FK_NOKEY); + + if (table == NULL) { + return false; + } + + /* Start the transaction if it is not started yet */ + trx_start_if_not_started(trx, false); + + bool allow = innobase_query_caching_table_check_low(table, trx); + + dict_table_close(table); + + if (allow) { + /* If the isolation level is high, assign a read view for the + transaction if it does not yet have one */ + + if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ + && !srv_read_only_mode + && !trx->read_view.is_open()) { + + /* Start the transaction if it is not started yet */ + trx_start_if_not_started(trx, false); + + trx->read_view.open(trx); + } + } + + return allow; +} + +/******************************************************************//** +The MySQL query cache uses this to check from InnoDB if the query cache at +the moment is allowed to operate on an InnoDB table. The SQL query must +be a non-locking SELECT. 
+ +The query cache is allowed to operate on certain query only if this function +returns TRUE for all tables in the query. + +If thd is not in the autocommit state, this function also starts a new +transaction for thd if there is no active trx yet, and assigns a consistent +read view to it if there is no read view yet. + +Why a deadlock of threads is not possible: the query cache calls this function +at the start of a SELECT processing. Then the calling thread cannot be +holding any InnoDB semaphores. The calling thread is holding the +query cache mutex, and this function will reserve the trx_sys.mutex. +@return TRUE if permitted, FALSE if not; note that the value FALSE +does not mean we should invalidate the query cache: invalidation is +called explicitly */ +static +my_bool +innobase_query_caching_of_table_permitted( +/*======================================*/ + THD* thd, /*!< in: thd of the user who is trying to + store a result to the query cache or + retrieve it */ + const char* full_name, /*!< in: normalized path to the table */ + uint full_name_len, /*!< in: length of the normalized path + to the table */ + ulonglong *) +{ + char norm_name[1000]; + trx_t* trx = check_trx_exists(thd); + + ut_a(full_name_len < 999); + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE) { + /* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every + plain SELECT if AUTOCOMMIT is not on. */ + + return(false); + } + + if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) + && trx->n_mysql_tables_in_use == 0) { + /* We are going to retrieve the query result from the query + cache. This cannot be a store operation to the query cache + because then MySQL would have locks on tables already. + + TODO: if the user has used LOCK TABLES to lock the table, + then we open a transaction in the call of row_.. below. + That trx can stay open until UNLOCK TABLES. The same problem + exists even if we do not use the query cache. MySQL should be + modified so that it ALWAYS calls some cleanup function when + the processing of a query ends! + + We can imagine we instantaneously serialize this consistent + read trx to the current trx id counter. If trx2 would have + changed the tables of a query result stored in the cache, and + trx2 would have already committed, making the result obsolete, + then trx2 would have already invalidated the cache. Thus we + can trust the result in the cache is ok for this query. */ + + return(true); + } + + /* Normalize the table name to InnoDB format */ + normalize_table_name(norm_name, full_name); + + innobase_register_trx(innodb_hton_ptr, thd, trx); + + return innobase_query_caching_table_check(trx, norm_name); +} + +/*****************************************************************//** +Invalidates the MySQL query cache for the table. */ +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /*!< in: transaction which + modifies the table */ + const char* full_name) /*!< in: concatenation of + database name, path separator, + table name, null char NUL; + NOTE that in Windows this is + always in LOWER CASE! */ +{ + /* Note that the query cache mutex is just above the trx_sys.mutex. + The caller of this function must not have latches of a lower rank. */ + +#ifdef HAVE_QUERY_CACHE + char qcache_key_name[2 * (NAME_LEN + 1)]; + char db_name[NAME_CHAR_LEN * MY_CS_MBMAXLEN + 1]; + const char *key_ptr; + size_t tabname_len; + + // Extract the database name. 
+ key_ptr= strchr(full_name, '/'); + DBUG_ASSERT(key_ptr != NULL); // Database name should be present + size_t dbname_len= size_t(key_ptr - full_name); + memcpy(db_name, full_name, dbname_len); + db_name[dbname_len]= '\0'; + + /* Construct the key("db-name\0table$name\0") for the query cache using + the path name("db@002dname\0table@0024name\0") of the table in its + canonical form. */ + dbname_len = filename_to_tablename(db_name, qcache_key_name, + sizeof(qcache_key_name)); + tabname_len = filename_to_tablename(++key_ptr, + (qcache_key_name + dbname_len + 1), + sizeof(qcache_key_name) - + dbname_len - 1); + + /* Argument TRUE below means we are using transactions */ + mysql_query_cache_invalidate4(trx->mysql_thd, + qcache_key_name, + uint(dbname_len + tabname_len + 2), + TRUE); +#endif +} + +/** Quote a standard SQL identifier like index or column name. +@param[in] file output stream +@param[in] trx InnoDB transaction, or NULL +@param[in] id identifier to quote */ +void +innobase_quote_identifier( + FILE* file, + trx_t* trx, + const char* id) +{ + const int q = trx != NULL && trx->mysql_thd != NULL + ? get_quote_char_for_identifier(trx->mysql_thd, id, strlen(id)) + : '`'; + + if (q == EOF) { + fputs(id, file); + } else { + putc(q, file); + + while (int c = *id++) { + if (c == q) { + putc(c, file); + } + putc(c, file); + } + + putc(q, file); + } +} + +/** Quote a standard SQL identifier like tablespace, index or column name. +@param[in] trx InnoDB transaction, or NULL +@param[in] id identifier to quote +@return quoted identifier */ +std::string +innobase_quote_identifier( +/*======================*/ + trx_t* trx, + const char* id) +{ + std::string quoted_identifier; + const int q = trx != NULL && trx->mysql_thd != NULL + ? get_quote_char_for_identifier(trx->mysql_thd, id, strlen(id)) + : '`'; + + if (q == EOF) { + quoted_identifier.append(id); + } else { + quoted_identifier += char(q); + quoted_identifier.append(id); + quoted_identifier += char(q); + } + + return (quoted_identifier); +} + +/** Convert a table name to the MySQL system_charset_info (UTF-8) +and quote it. +@param[out] buf buffer for converted identifier +@param[in] buflen length of buf, in bytes +@param[in] id identifier to convert +@param[in] idlen length of id, in bytes +@param[in] thd MySQL connection thread, or NULL +@return pointer to the end of buf */ +static +char* +innobase_convert_identifier( + char* buf, + ulint buflen, + const char* id, + ulint idlen, + THD* thd) +{ + const char* s = id; + + char nz[MAX_TABLE_NAME_LEN + 1]; + char nz2[MAX_TABLE_NAME_LEN + 1]; + + /* Decode the table name. The MySQL function expects + a NUL-terminated string. The input and output strings + buffers must not be shared. */ + ut_a(idlen <= MAX_TABLE_NAME_LEN); + memcpy(nz, id, idlen); + nz[idlen] = 0; + + s = nz2; + idlen = explain_filename(thd, nz, nz2, sizeof nz2, + EXPLAIN_PARTITIONS_AS_COMMENT); + if (idlen > buflen) { + idlen = buflen; + } + memcpy(buf, s, idlen); + return(buf + idlen); +} + +/*****************************************************************//** +Convert a table name to the MySQL system_charset_info (UTF-8). 
+@return pointer to the end of buf */
+char*
+innobase_convert_name(
+/*==================*/
+    char*       buf,    /*!< out: buffer for converted identifier */
+    ulint       buflen, /*!< in: length of buf, in bytes */
+    const char* id,     /*!< in: table name to convert */
+    ulint       idlen,  /*!< in: length of id, in bytes */
+    THD*        thd)    /*!< in: MySQL connection thread, or NULL */
+{
+    char*       s = buf;
+    const char* bufend = buf + buflen;
+
+    const char* slash = (const char*) memchr(id, '/', idlen);
+
+    if (slash == NULL) {
+        return(innobase_convert_identifier(
+            buf, buflen, id, idlen, thd));
+    }
+
+    /* Print the database name and table name separately. */
+    s = innobase_convert_identifier(s, ulint(bufend - s),
+                                    id, ulint(slash - id), thd);
+    if (s < bufend) {
+        *s++ = '.';
+        s = innobase_convert_identifier(s, ulint(bufend - s),
+                                        slash + 1, idlen
+                                        - ulint(slash - id) - 1,
+                                        thd);
+    }
+
+    return(s);
+}
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(), converting a table name
+to the MySQL system_charset_info (UTF-8) and quoting it if needed. */
+void
+innobase_format_name(
+/*==================*/
+    char*       buf,    /*!< out: buffer for converted identifier */
+    ulint       buflen, /*!< in: length of buf, in bytes */
+    const char* name)   /*!< in: table name to format */
+{
+    const char* bufend;
+
+    bufend = innobase_convert_name(buf, buflen, name, strlen(name), NULL);
+
+    ut_ad((ulint) (bufend - buf) < buflen);
+
+    buf[bufend - buf] = '\0';
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return true if interrupted */
+bool
+trx_is_interrupted(
+/*===============*/
+    const trx_t* trx)   /*!< in: transaction */
+{
+    return(trx && trx->mysql_thd && thd_kill_level(trx->mysql_thd));
+}
+
+/**************************************************************//**
+Resets some fields of the m_prebuilt struct. The template is used in fast
+retrieval of just those column values MySQL needs in its processing. */
+void
+ha_innobase::reset_template(void)
+/*=============================*/
+{
+    ut_ad(m_prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+    ut_ad(m_prebuilt->magic_n2 == m_prebuilt->magic_n);
+
+    /* Force table to be freed in close_thread_table(). */
+    DBUG_EXECUTE_IF("free_table_in_fts_query",
+        if (m_prebuilt->in_fts_query) {
+            table->mark_table_for_reopen();
+        }
+    );
+
+    m_prebuilt->keep_other_fields_on_keyread = false;
+    m_prebuilt->read_just_key = 0;
+    m_prebuilt->in_fts_query = 0;
+
+    /* Reset index condition pushdown state. */
+    if (m_prebuilt->idx_cond) {
+        m_prebuilt->idx_cond = NULL;
+        m_prebuilt->idx_cond_n_cols = 0;
+        /* Invalidate m_prebuilt->mysql_template
+        in ha_innobase::write_row(). */
+        m_prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
+    }
+    if (m_prebuilt->pk_filter) {
+        m_prebuilt->pk_filter = NULL;
+        m_prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
+    }
+}
+
+/*****************************************************************//**
+Call this when you have opened a new table handle in HANDLER, before you
+call index_read_map() etc. Actually, we can let the cursor stay open even
+over a transaction commit! Then you should call this before every operation,
+fetch next etc. This function initializes the necessary things even after a
+transaction commit. */
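+/* For example (illustration): the SQL HANDLER interface, as in
+HANDLER t1 OPEN; HANDLER t1 READ FIRST; reaches this path, and the
+handle can remain open across an intervening COMMIT. */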
+
+void
+ha_innobase::init_table_handle_for_HANDLER(void)
+/*============================================*/
+{
+    /* If the current thd does not yet have a trx struct, create one.
+    If the current handle does not yet have a m_prebuilt struct, create
+    one. Update the trx pointers in the m_prebuilt struct. Normally
+    this operation is done in external_lock. */
+
+    update_thd(ha_thd());
+
+    /* Initialize the m_prebuilt struct much like it would be inited in
+    external_lock */
+
+    /* If the transaction is not started yet, start it */
+
+    trx_start_if_not_started_xa(m_prebuilt->trx, false);
+
+    /* Assign a read view if the transaction does not have it yet */
+
+    m_prebuilt->trx->read_view.open(m_prebuilt->trx);
+
+    innobase_register_trx(ht, m_user_thd, m_prebuilt->trx);
+
+    /* We did the necessary inits in this function, no need to repeat them
+    in row_search_mvcc() */
+
+    m_prebuilt->sql_stat_start = FALSE;
+
+    /* We always let HANDLER do the reads as consistent reads, even
+    if the trx isolation level would have been specified as SERIALIZABLE */
+
+    m_prebuilt->select_lock_type = LOCK_NONE;
+    m_prebuilt->stored_select_lock_type = LOCK_NONE;
+
+    /* Always fetch all columns in the index record */
+
+    m_prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS;
+
+    /* Do we always want to fetch all columns in the whole row? Or do
+    we???? */
+
+    m_prebuilt->used_in_HANDLER = TRUE;
+
+    reset_template();
+    m_prebuilt->trx->bulk_insert = false;
+}
+
+/*********************************************************************//**
+Free any resources that were allocated and return failure.
+@return always 1 */
+static int innodb_init_abort()
+{
+    DBUG_ENTER("innodb_init_abort");
+
+    if (fil_system.temp_space) {
+        fil_system.temp_space->close();
+    }
+
+    srv_sys_space.shutdown();
+    if (srv_tmp_space.get_sanity_check_status()) {
+        srv_tmp_space.delete_files();
+    }
+    srv_tmp_space.shutdown();
+
+    DBUG_RETURN(1);
+}
+
+/** Return the minimum buffer pool size based on page size */
+static inline ulint min_buffer_pool_size()
+{
+  ulint s= (BUF_LRU_MIN_LEN + BUF_LRU_MIN_LEN / 4) * srv_page_size;
+  /* buf_pool_chunk_size minimum is 1M, so round up to a multiple */
+  ulint alignment= 1U << 20;
+  return UT_CALC_ALIGN(s, alignment);
+}
+
+/** Validate the requested buffer pool size. Also, reserve the necessary
+memory needed for buffer pool resize.
+@param[in]	thd	thread handle
+@param[in]	var	pointer to system variable
+@param[out]	save	immediate result for update function
+@param[in]	value	incoming string
+@return 0 on success, 1 on failure.
+*/
+static
+int
+innodb_buffer_pool_size_validate(
+    THD*                        thd,
+    struct st_mysql_sys_var*    var,
+    void*                       save,
+    struct st_mysql_value*      value);
+
+/** Update the system variable innodb_buffer_pool_size using the "saved"
+value. This function is registered as a callback with MySQL.
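+It is invoked, for example, on SET GLOBAL innodb_buffer_pool_size=...,
+after innodb_buffer_pool_size_validate() has accepted the new value.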
+@param[in]	thd	thread handle
+@param[in]	var	pointer to system variable
+@param[out]	var_ptr	where the formal string goes
+@param[in]	save	immediate result from check function */
+static
+void
+innodb_buffer_pool_size_update(
+    THD*                        thd,
+    struct st_mysql_sys_var*    var,
+    void*                       var_ptr,
+    const void*                 save);
+
+static MYSQL_SYSVAR_ULONGLONG(buffer_pool_size, innobase_buffer_pool_size,
+  PLUGIN_VAR_RQCMDARG,
+  "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
+  innodb_buffer_pool_size_validate,
+  innodb_buffer_pool_size_update,
+  128ULL << 20,
+  2ULL << 20,
+  LLONG_MAX, 1024*1024L);
+
+/****************************************************************//**
+Gives the file extension of an InnoDB single-table tablespace. */
+static const char* ha_innobase_exts[] = {
+    dot_ext[IBD],
+    dot_ext[ISL],
+    NullS
+};
+
+/** Determine if system-versioned data was modified by the transaction.
+@param[in,out]	thd	current session
+@param[out]	trx_id	transaction start ID
+@return transaction commit ID
+@retval	0	if no system-versioned data was affected by the transaction */
+static ulonglong innodb_prepare_commit_versioned(THD* thd, ulonglong *trx_id)
+{
+  if (trx_t *trx= thd_to_trx(thd))
+  {
+    *trx_id= trx->id;
+    bool versioned= false;
+
+    for (auto &t : trx->mod_tables)
+    {
+      if (t.second.is_versioned())
+      {
+        DBUG_ASSERT(t.first->versioned_by_id());
+        DBUG_ASSERT(trx->rsegs.m_redo.rseg);
+        versioned= true;
+        if (!trx->bulk_insert)
+          break;
+      }
+      if (t.second.is_bulk_insert())
+      {
+        ut_ad(trx->bulk_insert);
+        if (t.second.write_bulk(t.first, trx))
+          return ULONGLONG_MAX;
+      }
+    }
+
+    return versioned ? trx_sys.get_new_trx_id() : 0;
+  }
+
+  *trx_id= 0;
+  return 0;
+}
+
+/** Initialize and normalize innodb_buffer_pool_{chunk_,}size. */
+static void innodb_buffer_pool_size_init()
+{
+  if (srv_buf_pool_chunk_unit > srv_buf_pool_size)
+  {
+    /* The size unit of the buffer pool is larger than srv_buf_pool_size.
+    Adjust srv_buf_pool_chunk_unit for srv_buf_pool_size. */
+    srv_buf_pool_chunk_unit = srv_buf_pool_size;
+  }
+  else if (srv_buf_pool_chunk_unit == 0)
+  {
+    srv_buf_pool_chunk_unit = srv_buf_pool_size / 64;
+    my_large_page_truncate(&srv_buf_pool_chunk_unit);
+  }
+
+  if (srv_buf_pool_chunk_unit < buf_pool_chunk_min_size)
+    srv_buf_pool_chunk_unit = buf_pool_chunk_min_size;
+
+  srv_buf_pool_size = buf_pool_size_align(srv_buf_pool_size);
+  innobase_buffer_pool_size = srv_buf_pool_size;
+}
+
+static bool
+compression_algorithm_is_not_loaded(ulong compression_algorithm, myf flags)
+{
+  bool is_loaded[PAGE_ALGORITHM_LAST+1]= { 1, 1, provider_service_lz4->is_loaded,
+    provider_service_lzo->is_loaded, provider_service_lzma->is_loaded,
+    provider_service_bzip2->is_loaded, provider_service_snappy->is_loaded };
+
+  DBUG_ASSERT(compression_algorithm <= PAGE_ALGORITHM_LAST);
+
+  if (is_loaded[compression_algorithm])
+    return 0;
+
+  my_printf_error(HA_ERR_UNSUPPORTED, "InnoDB: compression algorithm %s (%u)"
+    " is not available. Please load the corresponding provider plugin.", flags,
+    page_compression_algorithms[compression_algorithm], compression_algorithm);
+  return 1;
+}
+
+/** Initialize, validate and normalize the InnoDB startup parameters.
+@return failure code
+@retval 0 on success
+@retval HA_ERR_OUT_OF_MEM when out of memory
+@retval HA_ERR_INITIALIZATION when some parameters are out of range */
+static int innodb_init_params()
+{
+    DBUG_ENTER("innodb_init_params");
+
+    ulong   num_pll_degree;
+
+    /* Check that values don't overflow on 32-bit systems. */
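+    /* For example, if ulint is a 32-bit type, any innodb_buffer_pool_size
+    above UINT_MAX32 bytes (4 GiB - 1) would not be representable in
+    srv_buf_pool_size. */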
+    if (sizeof(ulint) == 4) {
+        if (innobase_buffer_pool_size > UINT_MAX32) {
+            sql_print_error(
+                "innodb_buffer_pool_size can't be over 4GB"
+                " on 32-bit systems");
+            DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        }
+    }
+
+    /* The buffer pool needs to be able to accommodate enough
+    pages, even for larger page sizes */
+    MYSQL_SYSVAR_NAME(buffer_pool_size).min_val= min_buffer_pool_size();
+
+    if (innobase_buffer_pool_size < MYSQL_SYSVAR_NAME(buffer_pool_size).min_val) {
+        ib::error() << "innodb_page_size="
+            << srv_page_size << " requires "
+            << "innodb_buffer_pool_size >= "
+            << (MYSQL_SYSVAR_NAME(buffer_pool_size).min_val >> 20)
+            << "MiB, current " << (innobase_buffer_pool_size >> 20)
+            << "MiB";
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    if (compression_algorithm_is_not_loaded(innodb_compression_algorithm, ME_ERROR_LOG))
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+
+    if ((srv_encrypt_tables || srv_encrypt_log
+         || innodb_encrypt_temporary_tables)
+        && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) {
+        sql_print_error("InnoDB: cannot enable encryption, "
+                        "encryption plugin is not available");
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+#ifdef _WIN32
+    if (!is_filename_allowed(srv_buf_dump_filename,
+                             strlen(srv_buf_dump_filename), FALSE)) {
+        sql_print_error("InnoDB: innodb_buffer_pool_filename"
+                        " cannot have colon (:) in the file name.");
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+#endif
+
+    /* First calculate the default path for innodb_data_home_dir etc.,
+    in case the user has not given any value.
+
+    Note that when using the embedded server, the datadirectory is not
+    necessarily the current directory of this program. */
+
+    fil_path_to_mysql_datadir =
+#ifndef HAVE_REPLICATION
+        mysqld_embedded ? mysql_real_data_home :
+#endif
+        "./";
+
+    /* Set InnoDB initialization parameters according to the values
+    read from MySQL .cnf file */
+
+    /* The default dir for data files is the datadir of MySQL */
+
+    srv_data_home = innobase_data_home_dir
+        ? innobase_data_home_dir
+        : const_cast<char*>(fil_path_to_mysql_datadir);
+#ifdef WITH_WSREP
+    /* If we use the wsrep API, then we need to tell the server
+    the path to the data files (for passing it to the SST scripts): */
+    wsrep_set_data_home_dir(srv_data_home);
+#endif /* WITH_WSREP */
+
+    /*--------------- Shared tablespaces -------------------------*/
+
+    /* Check that the value of system variable innodb_page_size was
+    set correctly. Its value was put into srv_page_size. If valid,
+    return the associated srv_page_size_shift. */
+    srv_page_size_shift = innodb_page_size_validate(srv_page_size);
+    if (!srv_page_size_shift) {
+        sql_print_error("InnoDB: Invalid page size=%lu.\n",
+                        srv_page_size);
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    srv_sys_space.set_space_id(TRX_SYS_SPACE);
+
+    switch (srv_checksum_algorithm) {
+    case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+    case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+        srv_sys_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER
+                                | FSP_FLAGS_FCRC32_PAGE_SSIZE());
+        break;
+    default:
+        srv_sys_space.set_flags(FSP_FLAGS_PAGE_SSIZE());
+    }
+
+    srv_sys_space.set_path(srv_data_home);
+
+    /* Supports raw devices */
+    if (!srv_sys_space.parse_params(innobase_data_file_path, true)) {
+        ib::error() << "Unable to parse innodb_data_file_path="
+                    << innobase_data_file_path;
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    srv_tmp_space.set_path(srv_data_home);
+
+    /* Temporary tablespace is in full crc32 format. */
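+    /* That is, srv_tmp_space always gets FSP_FLAGS_FCRC32_MASK_MARKER and
+    the page-size shift from FSP_FLAGS_FCRC32_PAGE_SSIZE(), independently
+    of the chosen innodb_checksum_algorithm. */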
+    srv_tmp_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER
+                            | FSP_FLAGS_FCRC32_PAGE_SSIZE());
+
+    if (!srv_tmp_space.parse_params(innobase_temp_data_file_path, false)) {
+        ib::error() << "Unable to parse innodb_temp_data_file_path="
+                    << innobase_temp_data_file_path;
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    /* Perform all sanity checks before we take the action of deleting files. */
+    if (srv_sys_space.intersection(&srv_tmp_space)) {
+        sql_print_error("innodb_temporary and innodb_system"
+                        " file names seem to be the same.");
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    srv_sys_space.normalize_size();
+    srv_tmp_space.normalize_size();
+
+    /* ------------ UNDO tablespaces files ---------------------*/
+    if (!srv_undo_dir) {
+        srv_undo_dir = const_cast<char*>(fil_path_to_mysql_datadir);
+    }
+
+    if (strchr(srv_undo_dir, ';')) {
+        sql_print_error("syntax error in innodb_undo_directory");
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    /* -------------- All log files ---------------------------*/
+
+    /* The default dir for log files is the datadir of MySQL */
+
+    if (!srv_log_group_home_dir) {
+        srv_log_group_home_dir
+            = const_cast<char*>(fil_path_to_mysql_datadir);
+    }
+
+    if (strchr(srv_log_group_home_dir, ';')) {
+        sql_print_error("syntax error in innodb_log_group_home_dir");
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    DBUG_ASSERT(innodb_change_buffering <= IBUF_USE_ALL);
+
+    /* Check that interdependent parameters have sane values. */
+    if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) {
+        sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm"
+                          " cannot be set higher than"
+                          " innodb_max_dirty_pages_pct.\n"
+                          "InnoDB: Setting"
+                          " innodb_max_dirty_pages_pct_lwm to %lf\n",
+                          srv_max_buf_pool_modified_pct);
+
+        srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct;
+    }
+
+    if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) {
+
+        if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) {
+            /* Avoid overflow. */
+            srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT;
+        } else {
+            /* The user has not set the value. We should
+            set it based on innodb_io_capacity. */
+            srv_max_io_capacity =
+                ut_max(2 * srv_io_capacity, 2000UL);
+        }
+
+    } else if (srv_max_io_capacity < srv_io_capacity) {
+        sql_print_warning("InnoDB: innodb_io_capacity"
+                          " cannot be set higher than"
+                          " innodb_io_capacity_max."
+                          " Setting innodb_io_capacity=%lu",
+                          srv_max_io_capacity);
+
+        srv_io_capacity = srv_max_io_capacity;
+    }
+
+    if (UNIV_PAGE_SIZE_DEF != srv_page_size) {
+        ib::info() << "innodb_page_size=" << srv_page_size;
+
+        srv_max_undo_log_size = std::max(
+            srv_max_undo_log_size,
+            ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES)
+            << srv_page_size_shift);
+    }
+
+    srv_buf_pool_size = ulint(innobase_buffer_pool_size);
+
+    if (innobase_open_files < 10) {
+        innobase_open_files = 300;
+        if (srv_file_per_table && tc_size > 300 && tc_size < open_files_limit) {
+            innobase_open_files = tc_size;
+        }
+    }
+
+    if (innobase_open_files > open_files_limit) {
+        ib::warn() << "innodb_open_files " << innobase_open_files
+                   << " should not be greater"
+                   << " than the open_files_limit " << open_files_limit;
+        if (innobase_open_files > tc_size) {
+            innobase_open_files = tc_size;
+        }
+    }
+
+    srv_max_n_open_files = innobase_open_files;
+    srv_innodb_status = (ibool) innobase_create_status_file;
+
+    srv_print_verbose_log = mysqld_embedded ? 0 : 1;
+
+    /* Round up fts_sort_pll_degree to the nearest power of 2 */
+    for (num_pll_degree = 1;
+         num_pll_degree < fts_sort_pll_degree;
+         num_pll_degree <<= 1) {
+
+        /* No op */
+    }
+
+    fts_sort_pll_degree = num_pll_degree;
+
+    /* Store the default charset-collation number of this MySQL
+    installation */
+
+    data_mysql_default_charset_coll = (ulint) default_charset_info->number;
+
+#ifndef _WIN32
+    if (srv_use_atomic_writes && my_may_have_atomic_write) {
+        /*
+          Force O_DIRECT on Unixes (on Windows writes are always
+          unbuffered)
+        */
+        switch (srv_file_flush_method) {
+        case SRV_O_DIRECT:
+        case SRV_O_DIRECT_NO_FSYNC:
+            break;
+        default:
+            srv_file_flush_method = SRV_O_DIRECT;
+            fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n");
+        }
+    }
+#endif
+
+#if defined __linux__ || defined _WIN32
+    if (srv_flush_log_at_trx_commit == 2) {
+        /* Do not disable the file system cache if
+        innodb_flush_log_at_trx_commit=2. */
+        log_sys.log_buffered = true;
+    }
+#endif
+
+    if (srv_read_only_mode) {
+        ib::info() << "Started in read only mode";
+        srv_use_doublewrite_buf = FALSE;
+    }
+
+#if !defined LINUX_NATIVE_AIO && !defined HAVE_URING && !defined _WIN32
+    /* Currently native AIO is supported only on Windows and Linux,
+    and only when the support is compiled in. In all other
+    cases, we ignore the setting of innodb_use_native_aio. */
+    srv_use_native_aio = FALSE;
+#endif
+#ifdef HAVE_URING
+    if (srv_use_native_aio && io_uring_may_be_unsafe) {
+        sql_print_warning("innodb_use_native_aio may cause "
+                          "hangs with this kernel %s; see "
+                          "https://jira.mariadb.org/browse/MDEV-26674",
+                          io_uring_may_be_unsafe);
+    }
+#endif
+
+#ifndef _WIN32
+    ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC);
+#else
+    switch (srv_file_flush_method) {
+    case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */:
+        srv_file_flush_method = SRV_ALL_O_DIRECT_FSYNC;
+        break;
+    case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */:
+        srv_file_flush_method = SRV_FSYNC;
+        break;
+    default:
+        ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC);
+    }
+#endif
+    innodb_buffer_pool_size_init();
+
+    srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift);
+    DBUG_RETURN(0);
+}
+
+/** Initialize the InnoDB storage engine plugin.
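+Called once when the server loads the plugin: it fills in the handlerton
+callbacks and then starts InnoDB via srv_start().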
+@param[in,out]	p	InnoDB handlerton
+@return error code
+@retval 0 on success */
+static int innodb_init(void* p)
+{
+    DBUG_ENTER("innodb_init");
+    handlerton* innobase_hton= static_cast<handlerton*>(p);
+    innodb_hton_ptr = innobase_hton;
+
+    innobase_hton->db_type = DB_TYPE_INNODB;
+    innobase_hton->savepoint_offset = sizeof(trx_named_savept_t);
+    innobase_hton->close_connection = innobase_close_connection;
+    innobase_hton->kill_query = innobase_kill_query;
+    innobase_hton->savepoint_set = innobase_savepoint;
+    innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint;
+
+    innobase_hton->savepoint_rollback_can_release_mdl =
+        innobase_rollback_to_savepoint_can_release_mdl;
+
+    innobase_hton->savepoint_release = innobase_release_savepoint;
+    innobase_hton->prepare_ordered= NULL;
+    innobase_hton->commit_ordered= innobase_commit_ordered;
+    innobase_hton->commit = innobase_commit;
+    innobase_hton->rollback = innobase_rollback;
+    innobase_hton->prepare = innobase_xa_prepare;
+    innobase_hton->recover = innobase_xa_recover;
+    innobase_hton->commit_by_xid = innobase_commit_by_xid;
+    innobase_hton->rollback_by_xid = innobase_rollback_by_xid;
+    innobase_hton->commit_checkpoint_request = innodb_log_flush_request;
+    innobase_hton->create = innobase_create_handler;
+
+    innobase_hton->drop_database = innodb_drop_database;
+    innobase_hton->panic = innobase_end;
+    innobase_hton->pre_shutdown = innodb_preshutdown;
+
+    innobase_hton->start_consistent_snapshot =
+        innobase_start_trx_and_assign_read_view;
+
+    innobase_hton->flush_logs = innobase_flush_logs;
+    innobase_hton->show_status = innobase_show_status;
+    innobase_hton->notify_tabledef_changed= innodb_notify_tabledef_changed;
+    innobase_hton->flags =
+        HTON_SUPPORTS_EXTENDED_KEYS | HTON_SUPPORTS_FOREIGN_KEYS |
+        HTON_NATIVE_SYS_VERSIONING |
+        HTON_WSREP_REPLICATION |
+        HTON_REQUIRES_CLOSE_AFTER_TRUNCATE |
+        HTON_TRUNCATE_REQUIRES_EXCLUSIVE_USE |
+        HTON_REQUIRES_NOTIFY_TABLEDEF_CHANGED_AFTER_COMMIT;
+
+#ifdef WITH_WSREP
+    innobase_hton->abort_transaction=wsrep_abort_transaction;
+    innobase_hton->set_checkpoint=innobase_wsrep_set_checkpoint;
+    innobase_hton->get_checkpoint=innobase_wsrep_get_checkpoint;
+    innobase_hton->disable_internal_writes=innodb_disable_internal_writes;
+#endif /* WITH_WSREP */
+
+    innobase_hton->check_version = innodb_check_version;
+    innobase_hton->signal_ddl_recovery_done = innodb_ddl_recovery_done;
+
+    innobase_hton->tablefile_extensions = ha_innobase_exts;
+    innobase_hton->table_options = innodb_table_option_list;
+
+    /* System Versioning */
+    innobase_hton->prepare_commit_versioned
+        = innodb_prepare_commit_versioned;
+
+    innodb_remember_check_sysvar_funcs();
+
+    compile_time_assert(DATA_MYSQL_TRUE_VARCHAR == MYSQL_TYPE_VARCHAR);
+
+#ifndef DBUG_OFF
+    static const char test_filename[] = "-@";
+    char test_tablename[sizeof test_filename
+                        + sizeof(srv_mysql50_table_name_prefix) - 1];
+    DBUG_ASSERT(sizeof test_tablename - 1
+                == filename_to_tablename(test_filename,
+                                         test_tablename,
+                                         sizeof test_tablename, true));
+    DBUG_ASSERT(!strncmp(test_tablename,
+                         srv_mysql50_table_name_prefix,
+                         sizeof srv_mysql50_table_name_prefix - 1));
+    DBUG_ASSERT(!strcmp(test_tablename
+                        + sizeof srv_mysql50_table_name_prefix - 1,
+                        test_filename));
+#endif /* DBUG_OFF */
+
+    os_file_set_umask(my_umask);
+
+    /* Set up the memory alloc/free tracing mechanisms before calling
+    any functions that could possibly allocate memory. */
+    ut_new_boot();
+
+    if (int error = innodb_init_params()) {
+        DBUG_RETURN(error);
+    }
+
+    /* After this point, error handling has to use
+    innodb_init_abort(). */
+
+#ifdef HAVE_PSI_INTERFACE
+    /* Register keys with MySQL performance schema */
+    int count;
+
+# ifdef UNIV_PFS_MUTEX
+    count = array_elements(all_innodb_mutexes);
+    mysql_mutex_register("innodb", all_innodb_mutexes, count);
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+    count = array_elements(all_innodb_rwlocks);
+    mysql_rwlock_register("innodb", all_innodb_rwlocks, count);
+# endif /* UNIV_PFS_RWLOCK */
+
+# ifdef UNIV_PFS_THREAD
+    count = array_elements(all_innodb_threads);
+    mysql_thread_register("innodb", all_innodb_threads, count);
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_IO
+    count = array_elements(all_innodb_files);
+    mysql_file_register("innodb", all_innodb_files, count);
+# endif /* UNIV_PFS_IO */
+#endif /* HAVE_PSI_INTERFACE */
+
+    bool create_new_db = false;
+
+    /* Check whether the data files exist. */
+    dberr_t err = srv_sys_space.check_file_spec(&create_new_db, 5U << 20);
+
+    if (err != DB_SUCCESS) {
+        DBUG_RETURN(innodb_init_abort());
+    }
+
+    err = srv_start(create_new_db);
+
+    if (err != DB_SUCCESS) {
+        innodb_shutdown();
+        DBUG_RETURN(innodb_init_abort());
+    }
+
+    srv_was_started = true;
+    innodb_params_adjust();
+
+    innobase_old_blocks_pct = buf_LRU_old_ratio_update(
+        innobase_old_blocks_pct, true);
+
+    ibuf_max_size_update(srv_change_buffer_max_size);
+
+    mysql_mutex_init(pending_checkpoint_mutex_key,
+                     &log_requests.mutex,
+                     MY_MUTEX_INIT_FAST);
+#ifdef MYSQL_DYNAMIC_PLUGIN
+    if (innobase_hton != p) {
+        innobase_hton = reinterpret_cast<handlerton*>(p);
+        *innobase_hton = *innodb_hton_ptr;
+    }
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+    memset(innodb_counter_value, 0, sizeof innodb_counter_value);
+
+    /* Do this as late as possible so that the server has fully started
+    up, since we might get some initial stats if the user chooses to
+    turn on some counters from startup */
+    if (innobase_enable_monitor_counter) {
+        innodb_enable_monitor_at_startup(
+            innobase_enable_monitor_counter);
+    }
+
+    /* Turn on monitor counters that are default on */
+    srv_mon_default_on();
+
+    /* Unit Tests */
+#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+    unit_test_os_file_get_parent_dir();
+#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
+
+#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+    test_make_filepath();
+#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */
+
+#ifdef UNIV_ENABLE_DICT_STATS_TEST
+    test_dict_stats_all();
+#endif /* UNIV_ENABLE_DICT_STATS_TEST */
+
+#ifdef UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT
+# ifdef HAVE_UT_CHRONO_T
+    test_row_raw_format_int();
+# endif /* HAVE_UT_CHRONO_T */
+#endif /* UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT */
+
+    DBUG_RETURN(0);
+}
+
+/** Shut down the InnoDB storage engine.
+@return	0 */
+static
+int
+innobase_end(handlerton*, ha_panic_function)
+{
+    DBUG_ENTER("innobase_end");
+
+    if (srv_was_started) {
+        THD *thd= current_thd;
+        if (thd) { // may be UNINSTALL PLUGIN statement
+            if (trx_t* trx = thd_to_trx(thd)) {
+                trx->free();
+            }
+        }
+
+        innodb_shutdown();
+        mysql_mutex_destroy(&log_requests.mutex);
+    }
+
+    DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
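+/* The actual work happens in trx_commit_for_mysql(); if the transaction
+was never started, only its state flags are reset. */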
+void
+innobase_commit_low(
+/*================*/
+    trx_t*  trx)    /*!< in: transaction handle */
+{
+#ifdef WITH_WSREP
+    const char* tmp = 0;
+    const bool is_wsrep = trx->is_wsrep();
+    if (is_wsrep) {
+        tmp = thd_proc_info(trx->mysql_thd, "innobase_commit_low()");
+    }
+#endif /* WITH_WSREP */
+    if (trx_is_started(trx)) {
+        trx_commit_for_mysql(trx);
+    } else {
+        trx->will_lock = false;
+#ifdef WITH_WSREP
+        trx->wsrep = false;
+#endif /* WITH_WSREP */
+    }
+
+#ifdef WITH_WSREP
+    if (is_wsrep) {
+        thd_proc_info(trx->mysql_thd, tmp);
+    }
+#endif /* WITH_WSREP */
+}
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started, and
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
+@return	0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+    handlerton* hton,   /*!< in: InnoDB handlerton */
+    THD*        thd)    /*!< in: MySQL thread handle of the user for
+                        whom the transaction is started */
+{
+    DBUG_ENTER("innobase_start_trx_and_assign_read_view");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    /* Create a new trx struct for thd, if it does not yet have one */
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    /* The transaction should not be active yet, start it */
+
+    ut_ad(!trx_is_started(trx));
+
+    trx_start_if_not_started_xa(trx, false);
+
+    /* Assign a read view if the transaction does not have it yet.
+    Do this only if the transaction is using the REPEATABLE READ
+    isolation level. */
+    trx->isolation_level = innobase_map_isolation_level(
+        thd_get_trx_isolation(thd));
+
+    if (trx->isolation_level == TRX_ISO_REPEATABLE_READ) {
+        trx->read_view.open(trx);
+    } else {
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            HA_ERR_UNSUPPORTED,
+                            "InnoDB: WITH CONSISTENT SNAPSHOT"
+                            " was ignored because this phrase"
+                            " can only be used with"
+                            " REPEATABLE READ isolation level.");
+    }
+
+    /* Set the MySQL flag to mark that there is an active transaction */
+
+    innobase_register_trx(hton, current_thd, trx);
+
+    DBUG_RETURN(0);
+}
+
+static
+void
+innobase_commit_ordered_2(
+/*======================*/
+    trx_t*  trx,    /*!< in: InnoDB transaction */
+    THD*    thd)    /*!< in: MySQL thread handle */
+{
+    DBUG_ENTER("innobase_commit_ordered_2");
+
+    if (trx->id) {
+        /* The following call reads the binary log position of
+        the transaction being committed.
+
+        Binary logging of other engines is not relevant to
+        InnoDB as all InnoDB requires is that committing
+        InnoDB transactions appear in the same order in the
+        MySQL binary log as they appear in InnoDB logs, which
+        is guaranteed by the server.
+
+        If the binary log is not enabled, or the transaction
+        is not written to the binary log, the file name will
+        be a NULL pointer. */
+        thd_binlog_pos(thd, &trx->mysql_log_file_name,
+                       &trx->mysql_log_offset);
+
+        /* Don't do write + flush right now. For group commit
+        to work we want to do the flush later. */
+        trx->flush_log_later = true;
+    }
+
+#ifdef WITH_WSREP
+    /* If the transaction is not run in 2pc, we must assign the wsrep
+    XID here in order to get it written to the rollback segment. */
+    if (trx->is_wsrep()) {
+        thd_get_xid(thd, &reinterpret_cast<MYSQL_XID&>(trx->xid));
+    }
+#endif /* WITH_WSREP */
+
+    innobase_commit_low(trx);
+    trx->mysql_log_file_name = NULL;
+    trx->flush_log_later = false;
+
+    DBUG_VOID_RETURN;
+}
+
+/*****************************************************************//**
+Perform the first, fast part of InnoDB commit.
+
+Doing it in this call ensures that we get the same commit order here
+as in binlog and any other participating transactional storage engines.
+
+Note that we want to do as little as really needed here, as we run
+under a global mutex. The expensive fsync() is done later, in
+innobase_commit(), without a lock so group commit can take place.
+
+Note also that this method can be called from a different thread than
+the one handling the rest of the transaction. */
+static
+void
+innobase_commit_ordered(
+/*====================*/
+    handlerton  *hton,  /*!< in: InnoDB handlerton */
+    THD*        thd,    /*!< in: MySQL thread handle of the user for whom
+                        the transaction should be committed */
+    bool        all)    /*!< in: TRUE - commit transaction
+                        FALSE - the current SQL statement ended */
+{
+    trx_t*  trx;
+    DBUG_ENTER("innobase_commit_ordered");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    trx = check_trx_exists(thd);
+
+    if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+        /* We cannot throw error here; instead we will catch this error
+        again in innobase_commit() and report it from there. */
+        DBUG_VOID_RETURN;
+    }
+
+    /* commit_ordered is only called when committing the whole transaction
+    (or an SQL statement when autocommit is on). */
+    DBUG_ASSERT(all ||
+        (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
+
+    innobase_commit_ordered_2(trx, thd);
+    trx->active_commit_ordered = true;
+
+    DBUG_VOID_RETURN;
+}
+
+/** Mark the end of a statement.
+@param trx	transaction
+@return whether an error occurred */
+static bool end_of_statement(trx_t *trx)
+{
+  trx_mark_sql_stat_end(trx);
+  if (UNIV_LIKELY(trx->error_state == DB_SUCCESS))
+    return false;
+
+  trx_savept_t savept;
+  savept.least_undo_no= 0;
+  trx->rollback(&savept);
+  /* MariaDB will roll back the entire transaction. */
+  trx->bulk_insert= false;
+  trx->last_sql_stat_start.least_undo_no= 0;
+  trx->savepoints_discard();
+  return true;
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended.
+@return 0 or deadlock error if the transaction was aborted by another
+	higher priority transaction. */
+static
+int
+innobase_commit(
+/*============*/
+    handlerton* hton,   /*!< in: InnoDB handlerton */
+    THD*        thd,    /*!< in: MySQL thread handle of the
+                        user for whom the transaction should
+                        be committed */
+    bool        commit_trx) /*!< in: true - commit transaction
+                        false - the current SQL statement
+                        ended */
+{
+    DBUG_ENTER("innobase_commit");
+    DBUG_PRINT("enter", ("commit_trx: %d", commit_trx));
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+    DBUG_PRINT("trans", ("ending transaction"));
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    ut_ad(!trx->dict_operation_lock_mode);
+    ut_ad(!trx->dict_operation);
+
+    /* The transaction is deregistered only in a commit or a rollback. If
+    it is deregistered we know there cannot be resources to be freed
+    and we could return immediately. For the time being, we play safe
+    and do the cleanup though there should be nothing to clean up. */
+
+    if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+
+        sql_print_error("Transaction not registered for MariaDB 2PC,"
+                        " but transaction is active");
+    }
+
+    bool read_only = trx->read_only || trx->id == 0;
+    DBUG_PRINT("info", ("readonly: %d", read_only));
+
+    if (commit_trx
+        || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+        /* Run the fast part of commit if we did not already. */
+        if (!trx->active_commit_ordered) {
+            innobase_commit_ordered_2(trx, thd);
+        }
+
+        /* We were instructed to commit the whole transaction, or
+        this is an SQL statement end and autocommit is on */
+
+        /* At this point the commit order is fixed and the transaction is
+        visible to others. So we can wake up other commits waiting for
+        this one, to allow them to group commit with us. */
+        thd_wakeup_subsequent_commits(thd, 0);
+
+        /* Now do a write + flush of logs. */
+        trx_commit_complete_for_mysql(trx);
+
+        trx_deregister_from_2pc(trx);
+    } else {
+        /* We just mark the SQL statement ended and do not do a
+        transaction commit */
+
+        /* If we had reserved the auto-inc lock for some
+        table in this SQL statement we release it now */
+
+        if (!read_only) {
+            lock_unlock_table_autoinc(trx);
+        }
+
+        /* Store the current undo_no of the transaction so that we
+        know where to roll back if we have to roll back the next
+        SQL statement */
+        if (UNIV_UNLIKELY(end_of_statement(trx))) {
+            DBUG_RETURN(1);
+        }
+    }
+
+    /* Reset the number of AUTO-INC rows required */
+    trx->n_autoinc_rows = 0;
+
+    /* This is a statement level variable. */
+    trx->fts_next_doc_id = 0;
+
+    DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Rolls back a transaction or the latest SQL statement.
+@return 0 or error number */
+static
+int
+innobase_rollback(
+/*==============*/
+    handlerton* hton,   /*!< in: InnoDB handlerton */
+    THD*        thd,    /*!< in: handle to the MySQL thread
+                        of the user whose transaction should
+                        be rolled back */
+    bool        rollback_trx)   /*!< in: TRUE - rollback entire
+                        transaction FALSE - rollback the current
+                        statement only */
+{
+    DBUG_ENTER("innobase_rollback");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+    DBUG_PRINT("trans", ("aborting transaction"));
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    ut_ad(!trx->dict_operation_lock_mode);
+    ut_ad(!trx->dict_operation);
+
+    /* Reset the number of AUTO-INC rows required */
+
+    trx->n_autoinc_rows = 0;
+
+    /* If we had reserved the auto-inc lock for some table (if
+    we come here to roll back the latest SQL statement) we
+    release it now before a possibly lengthy rollback */
+    lock_unlock_table_autoinc(trx);
+
+    /* This is a statement level variable. */
+
+    trx->fts_next_doc_id = 0;
+
+    dberr_t error;
+
+#ifdef WITH_WSREP
+    /* If the trx was assigned a wsrep XID in the prepare phase and the
+    trx is being rolled back due to a BF abort, clear the XID in order
+    to avoid writing it to the rollback segment out of order. The XID
+    will be reassigned when the transaction is replayed. */
+    if (trx->state != TRX_STATE_NOT_STARTED
+        && wsrep_is_wsrep_xid(&trx->xid)) {
+        trx->xid.null();
+    }
+#endif /* WITH_WSREP */
+    if (rollback_trx
+        || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+        error = trx_rollback_for_mysql(trx);
+
+        trx_deregister_from_2pc(trx);
+    } else {
+
+        error = trx_rollback_last_sql_stat_for_mysql(trx);
+    }
+
+    DBUG_RETURN(convert_error_code_to_mysql(error, 0, trx->mysql_thd));
+}
+
+/*****************************************************************//**
+Rolls back a transaction
+@return 0 or error number */
+static
+int
+innobase_rollback_trx(
+/*==================*/
+    trx_t*  trx)    /*!< in: transaction */
+{
+    DBUG_ENTER("innobase_rollback_trx");
+    DBUG_PRINT("trans", ("aborting transaction"));
+
+    /* If we had reserved the auto-inc lock for some table (if
+    we come here to roll back the latest SQL statement) we
+    release it now before a possibly lengthy rollback */
+    lock_unlock_table_autoinc(trx);
+    trx_deregister_from_2pc(trx);
+
+    DBUG_RETURN(convert_error_code_to_mysql(trx_rollback_for_mysql(trx),
+                                            0, trx->mysql_thd));
+}
+
+/** Invoke commit_checkpoint_notify_ha() on completed log flush requests.
+@param pending	log_requests.start
+@param lsn	log_sys.get_flushed_lsn() */
+static void log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
+{
+  mysql_mutex_assert_owner(&log_requests.mutex);
+  ut_ad(pending == log_requests.start.load(std::memory_order_relaxed));
+  log_flush_request *entry= pending, *last= nullptr;
+  /* Process the first requests that have been completed. Since
+  the list is not necessarily in ascending order of LSN, we may
+  fail to notify some requests that have already been completed.
+  But there is no harm in delaying notifications for those a bit.
+  And in practice, the list is unlikely to have more than one
+  element anyway, because the redo log would be flushed every
+  srv_flush_log_at_timeout seconds (1 by default). */
+  for (; entry && entry->lsn <= lsn; last= entry, entry= entry->next);
+
+  if (!last)
+  {
+    mysql_mutex_unlock(&log_requests.mutex);
+    return;
+  }
+
+  /* Detach the head of the list that corresponds to persisted log writes. */
+  if (!entry)
+    log_requests.end= entry;
+  log_requests.start.store(entry, std::memory_order_relaxed);
+  mysql_mutex_unlock(&log_requests.mutex);
+
+  /* Now that we have released the mutex, notify the submitters
+  and free the head of the list. */
+  do
+  {
+    entry= pending;
+    pending= pending->next;
+    commit_checkpoint_notify_ha(entry->cookie);
+    my_free(entry);
+  }
+  while (entry != last);
+}
+
+/** Invoke commit_checkpoint_notify_ha() to notify that outstanding
+log writes have been completed. */
+void log_flush_notify(lsn_t flush_lsn)
+{
+  if (auto pending= log_requests.start.load(std::memory_order_acquire))
+  {
+    mysql_mutex_lock(&log_requests.mutex);
+    pending= log_requests.start.load(std::memory_order_relaxed);
+    log_flush_notify_and_unlock(pending, flush_lsn);
+  }
+}
+
+/** Handle a commit checkpoint request from the server layer.
+We put the request in a queue, so that we can notify the upper layer about
+checkpoint completion when we have flushed the redo log.
+If we have already flushed all relevant redo log, we notify immediately. */
+static void innodb_log_flush_request(void *cookie)
+{
+  lsn_t flush_lsn= log_sys.get_flushed_lsn();
+  /* Load lsn relaxed after flush_lsn was loaded from the same cache line */
+  const lsn_t lsn= log_sys.get_lsn();
+
+  if (flush_lsn >= lsn)
+    /* All log is already persistent. */;
+  else if (UNIV_UNLIKELY(srv_force_recovery >= SRV_FORCE_NO_BACKGROUND))
+    /* Normally, srv_master_callback() should periodically invoke
+    srv_sync_log_buffer_in_background(), which should initiate a log
+    flush about once every srv_flush_log_at_timeout seconds. But,
+    starting with the innodb_force_recovery=2 level, that background
+    task will not run. */
+    log_write_up_to(flush_lsn= lsn, true);
+  else if (log_flush_request *req= static_cast<log_flush_request*>
+           (my_malloc(PSI_INSTRUMENT_ME, sizeof *req, MYF(MY_WME))))
+  {
+    req->next= nullptr;
+    req->cookie= cookie;
+    req->lsn= lsn;
+
+    log_flush_request *start= nullptr;
+
+    mysql_mutex_lock(&log_requests.mutex);
+    /* In order to prevent a race condition where log_flush_notify()
+    would miss a notification, we must update log_requests.start from
+    nullptr (empty) to the first req using std::memory_order_release. */
+    if (log_requests.start.compare_exchange_strong(start, req,
+                                                   std::memory_order_release,
+                                                   std::memory_order_relaxed))
+    {
+      ut_ad(!log_requests.end);
+      start= req;
+      /* In case log_flush_notify() executed
+      log_requests.start.load(std::memory_order_acquire) right before
+      our successful compare_exchange, we must re-read flush_lsn to
+      ensure that our request will be notified immediately if applicable. */
+      flush_lsn= log_sys.get_flushed_lsn();
+    }
+    else
+    {
+      /* Append the entry to the list. Because we determined req->lsn before
+      acquiring the mutex, this list may not be ordered by req->lsn,
+      even though log_flush_notify_and_unlock() assumes so. */
+      log_requests.end->next= req;
+    }
+
+    log_requests.end= req;
+
+    /* This hopefully addresses the hang that was reported in MDEV-24302.
+    Upon receiving a new request, we will notify old requests of
+    completion. */
+    log_flush_notify_and_unlock(start, flush_lsn);
+    return;
+  }
+  else
+    sql_print_error("Failed to allocate %zu bytes."
+                    " Commit checkpoint will be skipped.", sizeof *req);
+
+  /* This hopefully addresses the hang that was reported in MDEV-24302.
+  Upon receiving a new request to notify of log writes becoming
+  persistent, we will notify old requests of completion. Note:
+  log_flush_notify() may skip some notifications because it is
+  basically assuming that the list is in ascending order of LSN. */
+  log_flush_notify(flush_lsn);
+  commit_checkpoint_notify_ha(cookie);
+}
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
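+The InnoDB savepoint name is derived from the savepoint data pointer:
+longlong2str() below renders the pointer value in base 36, so, as an
+illustration, a pointer value of 0x2a would yield the name "16".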
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+    handlerton* hton,   /*!< in: InnoDB handlerton */
+    THD*        thd,    /*!< in: handle to the MySQL thread
+                        of the user whose transaction should
+                        be rolled back to savepoint */
+    void*       savepoint)  /*!< in: savepoint data */
+{
+    DBUG_ENTER("innobase_rollback_to_savepoint");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    /* TODO: use provided savepoint data area to store savepoint data */
+
+    char    name[64];
+
+    longlong2str(longlong(savepoint), name, 36);
+
+    int64_t mysql_binlog_cache_pos;
+
+    dberr_t error = trx_rollback_to_savepoint_for_mysql(
+        trx, name, &mysql_binlog_cache_pos);
+
+    if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+        fts_savepoint_rollback(trx, name);
+    }
+
+    DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Check whether InnoDB state allows safely releasing MDL locks after
+rollback to savepoint.
+When binlog is on, MDL locks acquired after the savepoint are not
+released if there are any locks held in InnoDB.
+@return true if it is safe, false if it is not safe. */
+static
+bool
+innobase_rollback_to_savepoint_can_release_mdl(
+/*===========================================*/
+    handlerton* hton,   /*!< in: InnoDB handlerton */
+    THD*        thd)    /*!< in: handle to the MySQL thread
+                        of the user whose transaction should
+                        be rolled back to savepoint */
+{
+    DBUG_ENTER("innobase_rollback_to_savepoint_can_release_mdl");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    /* If the transaction has not acquired any locks then it is safe
+    to release MDL after rollback to savepoint */
+    if (UT_LIST_GET_LEN(trx->lock.trx_locks) == 0) {
+
+        DBUG_RETURN(true);
+    }
+
+    DBUG_RETURN(false);
+}
+
+/*****************************************************************//**
+Release transaction savepoint name.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_release_savepoint(
+/*=======================*/
+    handlerton* hton,   /*!< in: handlerton for InnoDB */
+    THD*        thd,    /*!< in: handle to the MySQL thread
+                        of the user whose transaction's
+                        savepoint should be released */
+    void*       savepoint)  /*!< in: savepoint data */
+{
+    dberr_t error;
+    trx_t*  trx;
+    char    name[64];
+
+    DBUG_ENTER("innobase_release_savepoint");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    trx = check_trx_exists(thd);
+
+    /* TODO: use provided savepoint data area to store savepoint data */
+
+    longlong2str(longlong(savepoint), name, 36);
+
+    error = trx_release_savepoint_for_mysql(trx, name);
+
+    if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+        fts_savepoint_release(trx, name);
+    }
+
+    DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Sets a transaction savepoint.
+@return always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+    handlerton* hton,   /*!< in: handle to the InnoDB handlerton */
+    THD*        thd,    /*!< in: handle to the MySQL thread */
+    void*       savepoint)/*!< in: savepoint data */
+{
+    DBUG_ENTER("innobase_savepoint");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    /* In autocommit mode there is no sense in setting a savepoint
+    (unless we are in a sub-statement), so the SQL layer ensures that
+    this method is never called in such a situation. */
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    /* Cannot happen outside of transaction */
+    DBUG_ASSERT(trx_is_registered_for_2pc(trx));
+
+    /* TODO: use provided savepoint data area to store savepoint data */
+    char    name[64];
+
+    longlong2str(longlong(savepoint), name, 36);
+
+    dberr_t error = trx_savepoint_for_mysql(trx, name, 0);
+
+    if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+        fts_savepoint_take(trx->fts_trx, name);
+    }
+
+    DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/**
+  Frees a possible InnoDB trx object associated with the current THD.
+
+  @param hton	innobase handlerton
+  @param thd	server thread descriptor, whose resources should be freed
+
+  @return 0 always
+*/
+static int innobase_close_connection(handlerton *hton, THD *thd)
+{
+  DBUG_ASSERT(hton == innodb_hton_ptr);
+  if (auto trx= thd_to_trx(thd))
+  {
+    thd_set_ha_data(thd, innodb_hton_ptr, NULL);
+    if (trx->state == TRX_STATE_PREPARED && trx->has_logged_persistent())
+    {
+      trx_disconnect_prepared(trx);
+      return 0;
+    }
+    innobase_rollback_trx(trx);
+    trx->free();
+    DEBUG_SYNC(thd, "innobase_connection_closed");
+  }
+  return 0;
+}
+
+/** Cancel any pending lock request associated with the current THD.
+@sa THD::awake() @sa ha_kill_query() */
+static void innobase_kill_query(handlerton*, THD *thd, enum thd_kill_levels)
+{
+  DBUG_ENTER("innobase_kill_query");
+
+  if (trx_t* trx= thd_to_trx(thd))
+  {
+    ut_ad(trx->mysql_thd == thd);
+    mysql_mutex_lock(&lock_sys.wait_mutex);
+    lock_t *lock= trx->lock.wait_lock;
+
+    if (!lock)
+      /* The transaction is not waiting for any lock. */;
+#ifdef WITH_WSREP
+    else if (trx->is_wsrep() && wsrep_thd_is_aborting(thd))
+      /* If the victim has been signaled by a BF thread and/or the abort
+      is already in progress, further query aborting is unnecessary.
+      Also, the BF thread should own the trx mutex for the victim. */;
+#endif /* WITH_WSREP */
+    else
+    {
+      if (!trx->dict_operation)
+      {
+        /* Dictionary transactions must be immune to KILL, because they
+        may be executed as part of a multi-transaction DDL operation, such
+        as rollback_inplace_alter_table() or ha_innobase::delete_table(). */
+        trx->error_state= DB_INTERRUPTED;
+        lock_sys_t::cancel<false>(trx, lock);
+      }
+      lock_sys.deadlock_check();
+    }
+    mysql_mutex_unlock(&lock_sys.wait_mutex);
+  }
+
+  DBUG_VOID_RETURN;
+}
+
+/*************************************************************************//**
+** InnoDB database tables
+*****************************************************************************/
+
+/** Get the record format from the data dictionary.
+@return one of ROW_TYPE_REDUNDANT, ROW_TYPE_COMPACT,
+ROW_TYPE_COMPRESSED, ROW_TYPE_DYNAMIC */
+
+enum row_type
+ha_innobase::get_row_type() const
+{
+    if (m_prebuilt && m_prebuilt->table) {
+        const ulint flags = m_prebuilt->table->flags;
+
+        switch (dict_tf_get_rec_format(flags)) {
+        case REC_FORMAT_REDUNDANT:
+            return(ROW_TYPE_REDUNDANT);
+        case REC_FORMAT_COMPACT:
+            return(ROW_TYPE_COMPACT);
+        case REC_FORMAT_COMPRESSED:
+            return(ROW_TYPE_COMPRESSED);
+        case REC_FORMAT_DYNAMIC:
+            return(ROW_TYPE_DYNAMIC);
+        }
+    }
+    ut_ad(0);
+    return(ROW_TYPE_NOT_USED);
+}
+
+/****************************************************************//**
+Get the table flags to use for the statement.
+@return table flags */
+
+handler::Table_flags
+ha_innobase::table_flags() const
+/*============================*/
+{
+    THD*                    thd = ha_thd();
+    handler::Table_flags    flags = m_int_table_flags;
+
+    /* Need to use tx_isolation here since table_flags() is (also)
+    called before m_prebuilt is initialized. */
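+    /* For example, at tx_isolation=READ-COMMITTED or READ-UNCOMMITTED,
+    HA_BINLOG_STMT_CAPABLE is not advertised below, so statement-based
+    binary logging of InnoDB changes is refused at those levels. */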
+
+    if (thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
+        return(flags);
+    }
+
+    return(flags | HA_BINLOG_STMT_CAPABLE);
+}
+
+/****************************************************************//**
+Returns the table type (storage engine name).
+@return table type */
+
+const char*
+ha_innobase::table_type() const
+/*===========================*/
+{
+    return(innobase_hton_name);
+}
+
+/****************************************************************//**
+Returns the index type.
+@return index type */
+
+const char*
+ha_innobase::index_type(
+/*====================*/
+    uint    keynr)  /*!< in: index number */
+{
+    dict_index_t*   index = innobase_get_index(keynr);
+
+    if (!index) {
+        return "Corrupted";
+    }
+
+    if (index->type & DICT_FTS) {
+        return("FULLTEXT");
+    }
+
+    if (dict_index_is_spatial(index)) {
+        return("SPATIAL");
+    }
+
+    return("BTREE");
+}
+
+/****************************************************************//**
+Returns the operations supported for indexes.
+@return flags of supported operations */
+
+ulong
+ha_innobase::index_flags(
+/*=====================*/
+    uint    key,
+    uint,
+    bool) const
+{
+    if (table_share->key_info[key].algorithm == HA_KEY_ALG_FULLTEXT) {
+        return(0);
+    }
+
+    /* For spatial index, we don't support descending scan
+    and ICP so far. */
+    if (table_share->key_info[key].flags & HA_SPATIAL) {
+        return HA_READ_NEXT | HA_READ_ORDER | HA_READ_RANGE
+            | HA_KEYREAD_ONLY | HA_KEY_SCAN_NOT_ROR;
+    }
+
+    ulong flags= key == table_share->primary_key
+        ? HA_CLUSTERED_INDEX : 0;
+
+    flags |= HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER
+        | HA_READ_RANGE | HA_KEYREAD_ONLY
+        | HA_DO_INDEX_COND_PUSHDOWN
+        | HA_DO_RANGE_FILTER_PUSHDOWN;
+
+    return(flags);
+}
+
+/****************************************************************//**
+Returns the maximum number of keys.
+@return MAX_KEY */
+
+uint
+ha_innobase::max_supported_keys() const
+/*===================================*/
+{
+    return(MAX_KEY);
+}
+
+/****************************************************************//**
+Returns the maximum key length.
+@return maximum supported key length, in bytes */
+
+uint
+ha_innobase::max_supported_key_length() const
+/*=========================================*/
+{
+    /* An InnoDB page must store >= 2 keys; a secondary key record
+    must also contain the primary key value. Therefore, if both
+    the primary key and the secondary key are at this maximum length,
+    it must be less than 1/4th of the free space on a page including
+    record overhead.
+
+    MySQL imposes its own limit to this number; MAX_KEY_LENGTH = 3072.
+
+    For page sizes >= 16k, InnoDB historically reported 3500 bytes here,
+    but the MySQL limit of 3072 was always used through the handler
+    interface.
+
+    Note: Handle 16k and 32k pages the same here since the limits
+    are higher than imposed by MySQL. */
+
+    switch (srv_page_size) {
+    case 4096:
+        /* Hack: allow mysql.innodb_index_stats to be created. */
+        /* FIXME: rewrite this API, and in sql_table.cc consider
+        that in index-organized tables (such as InnoDB), secondary
+        index records will be padded with the PRIMARY KEY, instead
+        of some short ROWID or record heap address. */
+        return(1173);
+    case 8192:
+        return(1536);
+    default:
+        return(3500);
+    }
+}
+
+/****************************************************************//**
+Returns the key map of keys that are usable for scanning.
+@return key_map_full */
+
+const key_map*
+ha_innobase::keys_to_use_for_scanning()
+/*===================================*/
+{
+    return(&key_map_full);
+}
+
+/** Ensure that indexed virtual columns will be computed. */
+void ha_innobase::column_bitmaps_signal()
+{
+  if (!table->vfield || table->current_lock != F_WRLCK)
+    return;
+
+  dict_index_t* clust_index= dict_table_get_first_index(m_prebuilt->table);
+  uint num_v= 0;
+  for (uint j = 0; j < table->s->virtual_fields; j++)
+  {
+    if (table->vfield[j]->stored_in_db())
+      continue;
+
+    dict_col_t *col= &m_prebuilt->table->v_cols[num_v].m_col;
+    if (col->ord_part ||
+        (dict_index_is_online_ddl(clust_index) &&
+         row_log_col_is_indexed(clust_index, num_v)))
+      table->mark_virtual_column_with_deps(table->vfield[j]);
+    num_v++;
+  }
+}
+
+/****************************************************************//**
+Determines if table caching is supported.
+@return HA_CACHE_TBL_ASKTRANSACT */
+
+uint8
+ha_innobase::table_cache_type()
+/*===========================*/
+{
+    return(HA_CACHE_TBL_ASKTRANSACT);
+}
+
+/** Normalizes a table name string.
+A normalized name consists of the database name concatenated with '/'
+and the table name. For example: test/mytable.
+On Windows, normalization always folds both the database name and the
+table name to lower case if "set_lower_case" is set to TRUE.
+@param[out]	norm_name	Normalized name, null-terminated.
+@param[in]	name		Name to normalize.
+@param[in]	set_lower_case	True if we also should fold to lower case. */
+void
+normalize_table_name_c_low(
+/*=======================*/
+    char*       norm_name,      /* out: normalized name as a
+                                null-terminated string */
+    const char* name,           /* in: table name string */
+    bool        set_lower_case) /* in: TRUE if we want to set
+                                name to lower case */
+{
+    char*   name_ptr;
+    ulint   name_len;
+    char*   db_ptr;
+    ulint   db_len;
+    char*   ptr;
+    ulint   norm_len;
+
+    /* Scan name from the end */
+
+    ptr = strend(name) - 1;
+
+    /* seek to the last path separator */
+    while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+        ptr--;
+    }
+
+    name_ptr = ptr + 1;
+    name_len = strlen(name_ptr);
+
+    /* skip any number of path separators */
+    while (ptr >= name && (*ptr == '\\' || *ptr == '/')) {
+        ptr--;
+    }
+
+    DBUG_ASSERT(ptr >= name);
+
+    /* seek to the last but one path separator or one char before
+    the beginning of name */
+    db_len = 0;
+    while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+        ptr--;
+        db_len++;
+    }
+
+    db_ptr = ptr + 1;
+
+    norm_len = db_len + name_len + sizeof "/";
+    ut_a(norm_len < FN_REFLEN - 1);
+
+    memcpy(norm_name, db_ptr, db_len);
+
+    norm_name[db_len] = '/';
+
+    /* Copy the name and null-byte. */
+    memcpy(norm_name + db_len + 1, name_ptr, name_len + 1);
+
+    if (set_lower_case) {
+        innobase_casedn_str(norm_name);
+    }
+}
+
+create_table_info_t::create_table_info_t(
+    THD*            thd,
+    const TABLE*    form,
+    HA_CREATE_INFO* create_info,
+    char*           table_name,
+    char*           remote_path,
+    bool            file_per_table,
+    trx_t*          trx)
+    : m_thd(thd),
+    m_trx(trx),
+    m_form(form),
+    m_default_row_format(innodb_default_row_format),
+    m_create_info(create_info),
+    m_table_name(table_name), m_table(NULL),
+    m_remote_path(remote_path),
+    m_innodb_file_per_table(file_per_table)
+{
+}
+
+#if !defined(DBUG_OFF)
+/*********************************************************************
+Test normalize_table_name_c_low(). */
+static
+void
+test_normalize_table_name_low()
+/*===========================*/
+{
+    char        norm_name[FN_REFLEN];
+    const char* test_data[][2] = {
+        /* input, expected result */
+        {"./mysqltest/t1", "mysqltest/t1"},
+        {"./test/#sql-842b_2", "test/#sql-842b_2"},
+        {"./test/#sql-85a3_10", "test/#sql-85a3_10"},
+        {"./test/#sql2-842b-2", "test/#sql2-842b-2"},
+        {"./test/bug29807", "test/bug29807"},
+        {"./test/foo", "test/foo"},
+        {"./test/innodb_bug52663", "test/innodb_bug52663"},
+        {"./test/t", "test/t"},
+        {"./test/t1", "test/t1"},
+        {"./test/t10", "test/t10"},
+        {"/a/b/db/table", "db/table"},
+        {"/a/b/db///////table", "db/table"},
+        {"/a/b////db///////table", "db/table"},
+        {"/var/tmp/mysqld.1/#sql842b_2_10", "mysqld.1/#sql842b_2_10"},
+        {"db/table", "db/table"},
+        {"ddd/t", "ddd/t"},
+        {"d/ttt", "d/ttt"},
+        {"d/t", "d/t"},
+        {".\\mysqltest\\t1", "mysqltest/t1"},
+        {".\\test\\#sql-842b_2", "test/#sql-842b_2"},
+        {".\\test\\#sql-85a3_10", "test/#sql-85a3_10"},
+        {".\\test\\#sql2-842b-2", "test/#sql2-842b-2"},
+        {".\\test\\bug29807", "test/bug29807"},
+        {".\\test\\foo", "test/foo"},
+        {".\\test\\innodb_bug52663", "test/innodb_bug52663"},
+        {".\\test\\t", "test/t"},
+        {".\\test\\t1", "test/t1"},
+        {".\\test\\t10", "test/t10"},
+        {"C:\\a\\b\\db\\table", "db/table"},
+        {"C:\\a\\b\\db\\\\\\\\\\\\\\table", "db/table"},
+        {"C:\\a\\b\\\\\\\\db\\\\\\\\\\\\\\table", "db/table"},
+        {"C:\\var\\tmp\\mysqld.1\\#sql842b_2_10", "mysqld.1/#sql842b_2_10"},
+        {"db\\table", "db/table"},
+        {"ddd\\t", "ddd/t"},
+        {"d\\ttt", "d/ttt"},
+        {"d\\t", "d/t"},
+    };
+
+    for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) {
+        printf("test_normalize_table_name_low():"
+               " testing \"%s\", expected \"%s\"... ",
+               test_data[i][0], test_data[i][1]);
+
+        normalize_table_name_c_low(
+            norm_name, test_data[i][0], FALSE);
+
+        if (strcmp(norm_name, test_data[i][1]) == 0) {
+            printf("ok\n");
+        } else {
+            printf("got \"%s\"\n", norm_name);
+            ut_error;
+        }
+    }
+}
+
+/*********************************************************************
+Test ut_format_name(). */
+static
+void
+test_ut_format_name()
+/*=================*/
+{
+    char    buf[NAME_LEN * 3];
+
+    struct {
+        const char* name;
+        ulint       buf_size;
+        const char* expected;
+    } test_data[] = {
+        {"test/t1", sizeof(buf), "`test`.`t1`"},
+        {"test/t1", 12, "`test`.`t1`"},
+        {"test/t1", 11, "`test`.`t1"},
+        {"test/t1", 10, "`test`.`t"},
+        {"test/t1", 9, "`test`.`"},
+        {"test/t1", 8, "`test`."},
+        {"test/t1", 7, "`test`"},
+        {"test/t1", 6, "`test"},
+        {"test/t1", 5, "`tes"},
+        {"test/t1", 4, "`te"},
+        {"test/t1", 3, "`t"},
+        {"test/t1", 2, "`"},
+        {"test/t1", 1, ""},
+        {"test/t1", 0, "BUF_NOT_CHANGED"},
+        {"table", sizeof(buf), "`table`"},
+        {"ta'le", sizeof(buf), "`ta'le`"},
+        {"ta\"le", sizeof(buf), "`ta\"le`"},
+        {"ta`le", sizeof(buf), "`ta``le`"},
+    };
+
+    for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) {
+
+        memcpy(buf, "BUF_NOT_CHANGED", strlen("BUF_NOT_CHANGED") + 1);
+
+        char*   ret;
+
+        ret = ut_format_name(test_data[i].name,
+                             buf,
+                             test_data[i].buf_size);
+
+        ut_a(ret == buf);
+
+        if (strcmp(buf, test_data[i].expected) == 0) {
+            ib::info() << "ut_format_name(" << test_data[i].name
+                << ", buf, " << test_data[i].buf_size << "),"
+                " expected " << test_data[i].expected
+                << ", OK";
+        } else {
+            ib::error() << "ut_format_name(" << test_data[i].name
+                << ", buf, " << test_data[i].buf_size << "),"
+                " expected " << test_data[i].expected
+                << ", ERROR: got " << buf;
+            ut_error;
+        }
+    }
+}
+#endif /* !DBUG_OFF */
+
+/** Match index columns between MySQL and InnoDB.
+This function checks whether the index column information
+is consistent between KEY info from mysql and that from innodb index.
+@param[in]	key_info	Index info from mysql
+@param[in]	index_info	Index info from InnoDB
+@return true if all column types match. */
+static
+bool
+innobase_match_index_columns(
+	const KEY*		key_info,
+	const dict_index_t*	index_info)
+{
+	const KEY_PART_INFO*	key_part;
+	const KEY_PART_INFO*	key_end;
+	const dict_field_t*	innodb_idx_fld;
+	const dict_field_t*	innodb_idx_fld_end;
+
+	DBUG_ENTER("innobase_match_index_columns");
+
+	/* Check whether the user defined index column count matches */
+	if (key_info->user_defined_key_parts !=
+	    index_info->n_user_defined_cols) {
+		DBUG_RETURN(FALSE);
+	}
+
+	key_part = key_info->key_part;
+	key_end = key_part + key_info->user_defined_key_parts;
+	innodb_idx_fld = index_info->fields;
+	innodb_idx_fld_end = index_info->fields + index_info->n_fields;
+
+	/* Check each index column's datatype. We do not check the
+	column name, because an index column may have been renamed in
+	MySQL without the change being propagated to InnoDB.
+	One hidden assumption here is that the index column sequences
+	match between MySQL and InnoDB. */
+	for (; key_part != key_end; ++key_part) {
+		unsigned	is_unsigned;
+		auto		mtype = innodb_idx_fld->col->mtype;
+
+		/* Need to translate to the InnoDB column type before
+		the comparison. */
+		auto		col_type = get_innobase_type_from_mysql_type(
+			&is_unsigned, key_part->field);
+
+		/* Ignore InnoDB specific system columns. */
+		while (mtype == DATA_SYS) {
+			innodb_idx_fld++;
+
+			if (innodb_idx_fld >= innodb_idx_fld_end) {
+				DBUG_RETURN(FALSE);
+			}
+
+			/* Re-read the type of the field that we
+			advanced to; without this the loop could
+			never terminate. */
+			mtype = innodb_idx_fld->col->mtype;
+		}
+
+		/* MariaDB-5.5 compatibility */
+		if ((key_part->field->real_type() == MYSQL_TYPE_ENUM ||
+		     key_part->field->real_type() == MYSQL_TYPE_SET) &&
+		    mtype == DATA_FIXBINARY) {
+			col_type= DATA_FIXBINARY;
+		}
+
+		if (innodb_idx_fld->descending
+		    != !!(key_part->key_part_flag & HA_REVERSE_SORT)) {
+			DBUG_RETURN(FALSE);
+		}
+
+		if (col_type != mtype) {
+			/* If the col_type we get from the MySQL type is a
+			geometry data type, we should check whether mtype is
+			a legacy type from 5.6, either upgraded to
+			DATA_GEOMETRY or not. This is indeed not an accurate
+			check, but it should be safe, since DATA_BLOB would be
+			upgraded once we create a spatial index on it, and we
+			intend to use DATA_GEOMETRY for legacy GIS data types,
+			which are of variable length.
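+
+			A hedged example of the acceptance rule in the
+			switch below: a POINT column created before such
+			an upgrade may still carry mtype DATA_BLOB while
+			get_innobase_type_from_mysql_type() now reports
+			DATA_GEOMETRY; that single combination is treated
+			as a match, and any other col_type != mtype pair
+			is reported as a mismatch.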
*/
+			switch (col_type) {
+			case DATA_GEOMETRY:
+				if (mtype == DATA_BLOB) {
+					break;
+				}
+				/* Fall through */
+			default:
+				/* Column type mismatches */
+				DBUG_RETURN(false);
+			}
+		}
+
+		innodb_idx_fld++;
+	}
+
+	DBUG_RETURN(TRUE);
+}
+
+/** Build a template for a base column for a virtual column
+@param[in]	table		MySQL TABLE
+@param[in]	clust_index	InnoDB clustered index
+@param[in]	field		field in MySQL table
+@param[in]	col		InnoDB column
+@param[in,out]	templ		template to fill
+@param[in]	col_no		field index for virtual col
+*/
+static
+void
+innobase_vcol_build_templ(
+	const TABLE*		table,
+	dict_index_t*		clust_index,
+	Field*			field,
+	const dict_col_t*	col,
+	mysql_row_templ_t*	templ,
+	ulint			col_no)
+{
+	templ->col_no = col_no;
+	templ->is_virtual = col->is_virtual();
+
+	if (templ->is_virtual) {
+		templ->clust_rec_field_no = ULINT_UNDEFINED;
+		templ->rec_field_no = col->ind;
+	} else {
+		templ->clust_rec_field_no = dict_col_get_clust_pos(
+			col, clust_index);
+		ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+
+		templ->rec_field_no = templ->clust_rec_field_no;
+	}
+
+	if (field->real_maybe_null()) {
+		templ->mysql_null_byte_offset =
+			field->null_offset();
+
+		templ->mysql_null_bit_mask = (ulint) field->null_bit;
+	} else {
+		templ->mysql_null_bit_mask = 0;
+	}
+
+	templ->mysql_col_offset = static_cast<ulint>(
+		get_field_offset(table, field));
+	templ->mysql_col_len = static_cast<ulint>(field->pack_length());
+	templ->type = col->mtype;
+	templ->mysql_type = static_cast<ulint>(field->type());
+
+	if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+		templ->mysql_length_bytes = static_cast<ulint>(
+			((Field_varstring*) field)->length_bytes);
+	}
+
+	templ->charset = dtype_get_charset_coll(col->prtype);
+	templ->mbminlen = dict_col_get_mbminlen(col);
+	templ->mbmaxlen = dict_col_get_mbmaxlen(col);
+	templ->is_unsigned = col->prtype & DATA_UNSIGNED;
+}
+
+/** Build a template for the virtual columns and their base columns.
+This is done when the table is first opened.
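+
+(Hedged sketch: for each virtual column, the template entry that
+innobase_vcol_build_templ() above fills in ends up with
+  templ->is_virtual == true,
+  templ->clust_rec_field_no == ULINT_UNDEFINED, because the value is
+  not materialized in the clustered index, and
+  templ->rec_field_no == col->ind.)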
+@param[in] table MySQL TABLE +@param[in] ib_table InnoDB dict_table_t +@param[in,out] s_templ InnoDB template structure +@param[in] add_v new virtual columns added along with + add index call +@param[in] locked true if dict_sys.latch is held */ +void +innobase_build_v_templ( + const TABLE* table, + const dict_table_t* ib_table, + dict_vcol_templ_t* s_templ, + const dict_add_v_col_t* add_v, + bool locked) +{ + ulint ncol = unsigned(ib_table->n_cols) - DATA_N_SYS_COLS; + ulint n_v_col = ib_table->n_v_cols; + bool marker[REC_MAX_N_FIELDS]; + + DBUG_ENTER("innobase_build_v_templ"); + ut_ad(ncol < REC_MAX_N_FIELDS); + + if (add_v != NULL) { + n_v_col += add_v->n_v_col; + } + + ut_ad(n_v_col > 0); + + if (!locked) { + dict_sys.lock(SRW_LOCK_CALL); + } + +#if 0 + /* This does not (need to) hold for ctx->new_table in + alter_rebuild_apply_log() */ + ut_ad(dict_sys.locked()); +#endif + + if (s_templ->vtempl) { + if (!locked) { + dict_sys.unlock(); + } + DBUG_VOID_RETURN; + } + + memset(marker, 0, sizeof(bool) * ncol); + + s_templ->vtempl = static_cast( + ut_zalloc_nokey((ncol + n_v_col) + * sizeof *s_templ->vtempl)); + s_templ->n_col = ncol; + s_templ->n_v_col = n_v_col; + s_templ->rec_len = table->s->reclength; + s_templ->default_rec = UT_NEW_ARRAY_NOKEY(uchar, s_templ->rec_len); + memcpy(s_templ->default_rec, table->s->default_values, s_templ->rec_len); + + /* Mark those columns could be base columns */ + for (ulint i = 0; i < ib_table->n_v_cols; i++) { + const dict_v_col_t* vcol = dict_table_get_nth_v_col( + ib_table, i); + + for (ulint j = vcol->num_base; j--; ) { + marker[vcol->base_col[j]->ind] = true; + } + } + + if (add_v) { + for (ulint i = 0; i < add_v->n_v_col; i++) { + const dict_v_col_t* vcol = &add_v->v_col[i]; + + for (ulint j = vcol->num_base; j--; ) { + marker[vcol->base_col[j]->ind] = true; + } + } + } + + ulint j = 0; + ulint z = 0; + + dict_index_t* clust_index = dict_table_get_first_index(ib_table); + + for (ulint i = 0; i < table->s->fields; i++) { + Field* field = table->field[i]; + + /* Build template for virtual columns */ + if (!field->stored_in_db()) { +#ifdef UNIV_DEBUG + const char* name; + + if (z >= ib_table->n_v_def) { + name = add_v->v_col_name[z - ib_table->n_v_def]; + } else { + name = dict_table_get_v_col_name(ib_table, z); + } + + ut_ad(!my_strcasecmp(system_charset_info, name, + field->field_name.str)); +#endif + const dict_v_col_t* vcol; + + if (z >= ib_table->n_v_def) { + vcol = &add_v->v_col[z - ib_table->n_v_def]; + } else { + vcol = dict_table_get_nth_v_col(ib_table, z); + } + + s_templ->vtempl[z + s_templ->n_col] + = static_cast( + ut_malloc_nokey( + sizeof *s_templ->vtempl[j])); + + innobase_vcol_build_templ( + table, clust_index, field, + &vcol->m_col, + s_templ->vtempl[z + s_templ->n_col], + z); + z++; + continue; + } + + ut_ad(j < ncol); + + /* Build template for base columns */ + if (marker[j]) { + dict_col_t* col = dict_table_get_nth_col( + ib_table, j); + + ut_ad(!my_strcasecmp(system_charset_info, + dict_table_get_col_name( + ib_table, j), + field->field_name.str)); + + s_templ->vtempl[j] = static_cast< + mysql_row_templ_t*>( + ut_malloc_nokey( + sizeof *s_templ->vtempl[j])); + + innobase_vcol_build_templ( + table, clust_index, field, col, + s_templ->vtempl[j], j); + } + + j++; + } + + if (!locked) { + dict_sys.unlock(); + } + + s_templ->db_name = table->s->db.str; + s_templ->tb_name = table->s->table_name.str; + DBUG_VOID_RETURN; +} + +/** Check consistency between .frm indexes and InnoDB indexes. 
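+
+A hedged outline of the checks below: the function returns false when
+InnoDB knows fewer indexes than the .frm does (UT_LIST_GET_LEN()
+versus table->s->keys), when a key name from the .frm cannot be found
+by dict_table_get_index_on_name(), or when
+innobase_match_index_columns() reports a column type mismatch for an
+index of the same name.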
+@param[in] table table object formed from .frm +@param[in] ib_table InnoDB table definition +@retval true if not errors were found */ +static bool +check_index_consistency(const TABLE* table, const dict_table_t* ib_table) +{ + ulint mysql_num_index = table->s->keys; + ulint ib_num_index = UT_LIST_GET_LEN(ib_table->indexes); + bool ret = true; + + /* If there exists inconsistency between MySQL and InnoDB dictionary + (metadata) information, the number of index defined in MySQL + could exceed that in InnoDB, return error */ + if (ib_num_index < mysql_num_index) { + ret = false; + goto func_exit; + } + + /* For each index in the mysql key_info array, fetch its + corresponding InnoDB index pointer into index_mapping + array. */ + for (ulint count = 0; count < mysql_num_index; count++) { + const dict_index_t* index = dict_table_get_index_on_name( + ib_table, table->key_info[count].name.str); + + if (index == NULL) { + sql_print_error("Cannot find index %s in InnoDB" + " index dictionary.", + table->key_info[count].name.str); + ret = false; + goto func_exit; + } + + /* Double check fetched index has the same + column info as those in mysql key_info. */ + if (!innobase_match_index_columns(&table->key_info[count], + index)) { + sql_print_error("Found index %s whose column info" + " does not match that of MariaDB.", + table->key_info[count].name.str); + ret = false; + goto func_exit; + } + } + +func_exit: + return ret; +} + +/********************************************************************//** +Get the upper limit of the MySQL integral and floating-point type. +@return maximum allowed value for the field */ +ulonglong innobase_get_int_col_max_value(const Field *field) +{ + ulonglong max_value = 0; + + switch (field->key_type()) { + /* TINY */ + case HA_KEYTYPE_BINARY: + max_value = 0xFFULL; + break; + case HA_KEYTYPE_INT8: + max_value = 0x7FULL; + break; + /* SHORT */ + case HA_KEYTYPE_USHORT_INT: + max_value = 0xFFFFULL; + break; + case HA_KEYTYPE_SHORT_INT: + max_value = 0x7FFFULL; + break; + /* MEDIUM */ + case HA_KEYTYPE_UINT24: + max_value = 0xFFFFFFULL; + break; + case HA_KEYTYPE_INT24: + max_value = 0x7FFFFFULL; + break; + /* LONG */ + case HA_KEYTYPE_ULONG_INT: + max_value = 0xFFFFFFFFULL; + break; + case HA_KEYTYPE_LONG_INT: + max_value = 0x7FFFFFFFULL; + break; + /* BIG */ + case HA_KEYTYPE_ULONGLONG: + max_value = 0xFFFFFFFFFFFFFFFFULL; + break; + case HA_KEYTYPE_LONGLONG: + max_value = 0x7FFFFFFFFFFFFFFFULL; + break; + case HA_KEYTYPE_FLOAT: + /* We use the maximum as per IEEE754-2008 standard, 2^24 */ + max_value = 0x1000000ULL; + break; + case HA_KEYTYPE_DOUBLE: + /* We use the maximum as per IEEE754-2008 standard, 2^53 */ + max_value = 0x20000000000000ULL; + break; + default: + ut_error; + } + + return(max_value); +} + +/** Initialize the AUTO_INCREMENT column metadata. + +Since a partial table definition for a persistent table can already be +present in the InnoDB dict_sys cache before it is accessed from SQL, +we have to initialize the AUTO_INCREMENT counter on the first +ha_innobase::open(). + +@param[in,out] table persistent table +@param[in] field the AUTO_INCREMENT column */ +static +void +initialize_auto_increment(dict_table_t* table, const Field* field) +{ + ut_ad(!table->is_temporary()); + + const unsigned col_no = innodb_col_no(field); + + table->autoinc_mutex.wr_lock(); + + table->persistent_autoinc = static_cast( + dict_table_get_nth_col_pos(table, col_no, NULL) + 1) + & dict_index_t::MAX_N_FIELDS; + + if (table->autoinc) { + /* Already initialized. 
Our caller checked + table->persistent_autoinc without + autoinc_mutex protection, and there might be multiple + ha_innobase::open() executing concurrently. */ + } else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { + /* If the recovery level is set so high that writes + are disabled we force the AUTOINC counter to 0 + value effectively disabling writes to the table. + Secondly, we avoid reading the table in case the read + results in failure due to a corrupted table/index. + + We will not return an error to the client, so that the + tables can be dumped with minimal hassle. If an error + were returned in this case, the first attempt to read + the table would fail and subsequent SELECTs would succeed. */ + } else if (table->persistent_autoinc) { + table->autoinc = innobase_next_autoinc( + btr_read_autoinc_with_fallback(table, col_no), + 1 /* need */, + 1 /* auto_increment_increment */, + 0 /* auto_increment_offset */, + innobase_get_int_col_max_value(field)); + } + + table->autoinc_mutex.wr_unlock(); +} + +/** Open an InnoDB table +@param[in] name table name +@return error code +@retval 0 on success */ +int +ha_innobase::open(const char* name, int, uint) +{ + char norm_name[FN_REFLEN]; + + DBUG_ENTER("ha_innobase::open"); + + normalize_table_name(norm_name, name); + + m_user_thd = NULL; + + /* Will be allocated if it is needed in ::update_row() */ + m_upd_buf = NULL; + m_upd_buf_size = 0; + + char* is_part = is_partition(norm_name); + THD* thd = ha_thd(); + dict_table_t* ib_table = open_dict_table(name, norm_name, is_part, + DICT_ERR_IGNORE_FK_NOKEY); + + DEBUG_SYNC(thd, "ib_open_after_dict_open"); + + if (NULL == ib_table) { + + if (is_part) { + sql_print_error("Failed to open table %s.\n", + norm_name); + } + set_my_errno(ENOENT); + + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + + size_t n_fields = omits_virtual_cols(*table_share) + ? table_share->stored_fields : table_share->fields; + size_t n_cols = dict_table_get_n_user_cols(ib_table) + + dict_table_get_n_v_cols(ib_table) + - !!DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID); + + if (UNIV_UNLIKELY(n_cols != n_fields)) { + ib::warn() << "Table " << norm_name << " contains " + << n_cols << " user" + " defined columns in InnoDB, but " << n_fields + << " columns in MariaDB. Please check" + " INFORMATION_SCHEMA.INNODB_SYS_COLUMNS and" + " https://mariadb.com/kb/en/innodb-data-dictionary-troubleshooting/" + " for how to resolve the issue."; + + /* Mark this table as corrupted, so the drop table + or force recovery can still use it, but not others. */ + ib_table->file_unreadable = true; + ib_table->corrupted = true; + ib_table->release(); + set_my_errno(ENOENT); + DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE); + } + + innobase_copy_frm_flags_from_table_share(ib_table, table->s); + + MONITOR_INC(MONITOR_TABLE_OPEN); + + if ((ib_table->flags2 & DICT_TF2_DISCARDED)) { + /* Allow an open because a proper DISCARD should have set + all the flags and index root page numbers to FIL_NULL that + should prevent any DML from running but it should allow DDL + operations. 
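+
+		(Hedged illustration: after ALTER TABLE ... DISCARD
+		TABLESPACE the .ibd file is gone, so DML on the table
+		fails with ER_TABLESPACE_DISCARDED, while ALTER TABLE
+		... IMPORT TABLESPACE and DROP TABLE must still be able
+		to open the handle; that is why the open is allowed
+		here.)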
*/ + } else if (!ib_table->is_readable()) { + const fil_space_t* space = ib_table->space; + if (!space) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, + ER_TABLESPACE_MISSING, norm_name); + } + + if (!thd_tablespace_op(thd)) { + set_my_errno(ENOENT); + int ret_err = HA_ERR_TABLESPACE_MISSING; + + if (space && space->crypt_data + && space->crypt_data->is_encrypted()) { + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_DECRYPTION_FAILED, + "Table %s in file %s is encrypted" + " but encryption service or" + " used key_id %u is not available. " + " Can't continue reading table.", + table_share->table_name.str, + space->chain.start->name, + space->crypt_data->key_id); + ret_err = HA_ERR_DECRYPTION_FAILED; + } + + ib_table->release(); + DBUG_RETURN(ret_err); + } + } + + m_prebuilt = row_create_prebuilt(ib_table, table->s->reclength); + + m_prebuilt->default_rec = table->s->default_values; + ut_ad(m_prebuilt->default_rec); + + m_prebuilt->m_mysql_table = table; + + /* Looks like MySQL-3.23 sometimes has primary key number != 0 */ + m_primary_key = table->s->primary_key; + + key_used_on_scan = m_primary_key; + + if (ib_table->n_v_cols) { + dict_sys.lock(SRW_LOCK_CALL); + if (ib_table->vc_templ == NULL) { + ib_table->vc_templ = UT_NEW_NOKEY(dict_vcol_templ_t()); + innobase_build_v_templ( + table, ib_table, ib_table->vc_templ, NULL, + true); + } + + dict_sys.unlock(); + } + + if (!check_index_consistency(table, ib_table)) { + sql_print_error("InnoDB indexes are inconsistent with what " + "defined in .frm for table %s", + name); + } + + /* Allocate a buffer for a 'row reference'. A row reference is + a string of bytes of length ref_length which uniquely specifies + a row in our table. Note that MySQL may also compare two row + references for equality by doing a simple memcmp on the strings + of length ref_length! */ + if (!(m_prebuilt->clust_index_was_generated + = dict_index_is_auto_gen_clust(ib_table->indexes.start))) { + if (m_primary_key >= MAX_KEY) { + ib_table->dict_frm_mismatch = DICT_FRM_NO_PK; + + /* This mismatch could cause further problems + if not attended, bring this to the user's attention + by printing a warning in addition to log a message + in the errorlog */ + + ib_push_frm_error(thd, ib_table, table, 0, true); + + /* If m_primary_key >= MAX_KEY, its (m_primary_key) + value could be out of bound if continue to index + into key_info[] array. Find InnoDB primary index, + and assign its key_length to ref_length. + In addition, since MySQL indexes are sorted starting + with primary index, unique index etc., initialize + ref_length to the first index key length in + case we fail to find InnoDB cluster index. + + Please note, this will not resolve the primary + index mismatch problem, other side effects are + possible if users continue to use the table. + However, we allow this table to be opened so + that user can adopt necessary measures for the + mismatch while still being accessible to the table + date. */ + if (!table->key_info) { + ut_ad(!table->s->keys); + ref_length = 0; + } else { + ref_length = table->key_info[0].key_length; + } + + /* Find corresponding cluster index + key length in MySQL's key_info[] array */ + for (uint i = 0; i < table->s->keys; i++) { + dict_index_t* index; + index = innobase_get_index(i); + if (dict_index_is_clust(index)) { + ref_length = + table->key_info[i].key_length; + } + } + } else { + /* MySQL allocates the buffer for ref. + key_info->key_length includes space for all key + columns + one byte for each column that may be + NULL. 
ref_length must be as exact as possible to + save space, because all row reference buffers are + allocated based on ref_length. */ + + ref_length = table->key_info[m_primary_key].key_length; + } + } else { + if (m_primary_key != MAX_KEY) { + + ib_table->dict_frm_mismatch = DICT_NO_PK_FRM_HAS; + + /* This mismatch could cause further problems + if not attended, bring this to the user attention + by printing a warning in addition to log a message + in the errorlog */ + ib_push_frm_error(thd, ib_table, table, 0, true); + } + + ref_length = DATA_ROW_ID_LEN; + + /* If we automatically created the clustered index, then + MySQL does not know about it, and MySQL must NOT be aware + of the index used on scan, to make it avoid checking if we + update the column of the index. That is why we assert below + that key_used_on_scan is the undefined value MAX_KEY. + The column is the row id in the automatical generation case, + and it will never be updated anyway. */ + + if (key_used_on_scan != MAX_KEY) { + sql_print_warning( + "Table %s key_used_on_scan is %u even " + "though there is no primary key inside " + "InnoDB.", name, key_used_on_scan); + } + } + + /* Index block size in InnoDB: used by MySQL in query optimization */ + stats.block_size = static_cast(srv_page_size); + + const my_bool for_vc_purge = THDVAR(thd, background_thread); + + if (for_vc_purge || !m_prebuilt->table + || m_prebuilt->table->is_temporary() + || m_prebuilt->table->persistent_autoinc + || !m_prebuilt->table->is_readable()) { + } else if (const Field* ai = table->found_next_number_field) { + initialize_auto_increment(m_prebuilt->table, ai); + } + + /* Set plugin parser for fulltext index */ + for (uint i = 0; i < table->s->keys; i++) { + if (table->key_info[i].flags & HA_USES_PARSER) { + dict_index_t* index = innobase_get_index(i); + plugin_ref parser = table->key_info[i].parser; + + ut_ad(index->type & DICT_FTS); + index->parser = + static_cast( + plugin_decl(parser)->info); + + DBUG_EXECUTE_IF("fts_instrument_use_default_parser", + index->parser = &fts_default_parser;); + } + } + + ut_ad(!m_prebuilt->table + || table->versioned() == m_prebuilt->table->versioned()); + + if (!for_vc_purge) { + info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST + | HA_STATUS_OPEN); + } + + DBUG_RETURN(0); +} + +/** Convert MySQL column number to dict_table_t::cols[] offset. +@param[in] field non-virtual column +@return column number relative to dict_table_t::cols[] */ +unsigned +innodb_col_no(const Field* field) +{ + ut_ad(!innobase_is_s_fld(field)); + const TABLE* table = field->table; + unsigned col_no = 0; + ut_ad(field == table->field[field->field_index]); + for (unsigned i = 0; i < field->field_index; i++) { + if (table->field[i]->stored_in_db()) { + col_no++; + } + } + return(col_no); +} + +/** Opens dictionary table object using table name. For partition, we need to +try alternative lower/upper case names to support moving data files across +platforms. 
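+
+A hedged example of the mismatch handled below: a partition created on
+Windows is recorded in the data dictionary as "test/t1#p#p0" (lower
+case), while the name derived from the .frm on a case-preserving
+system is "test/t1#P#p0"; with lower_case_table_names=1 the lookup is
+retried using the lower-cased name.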
+@param[in]	table_name	name of the table/partition
+@param[in]	norm_name	normalized name of the table/partition
+@param[in]	is_partition	if this is a partition of a table
+@param[in]	ignore_err	error to ignore for loading dictionary object
+@return dictionary table object or NULL if not found */
+dict_table_t*
+ha_innobase::open_dict_table(
+	const char*
+#ifdef _WIN32
+			table_name
+#endif
+	,
+	const char*		norm_name,
+	bool			is_partition,
+	dict_err_ignore_t	ignore_err)
+{
+	DBUG_ENTER("ha_innobase::open_dict_table");
+	/* FIXME: try_drop_aborted */
+	dict_table_t*	ib_table = dict_table_open_on_name(norm_name, false,
+							   ignore_err);
+
+	if (NULL == ib_table && is_partition) {
+		/* The MySQL partition engine hard codes the file name
+		separator as "#P#". The text case is fixed even if
+		lower_case_table_names is set to 1 or 2. This is true
+		for sub-partition names as well. InnoDB always
+		normalises file names to lower case on Windows, which
+		can potentially cause problems when copying/moving
+		tables between platforms.
+
+		1) If we boot against an installation from a Windows
+		platform, the partition table names could be in lower
+		case in the system tables. So we will need to check
+		the lower case name when loading the table.
+
+		2) If we boot on Windows against an installation from
+		a case-sensitive platform, we might need to check for
+		a table name that has not been normalized to lower
+		case in the system tables. */
+		if (lower_case_table_names == 1) {
+			char	par_case_name[FN_REFLEN];
+
+#ifndef _WIN32
+			/* Check for the table using the lower
+			case name, including the partition
+			separator "P" */
+			strcpy(par_case_name, norm_name);
+			innobase_casedn_str(par_case_name);
+#else
+			/* On the Windows platform, check
+			whether the system tables contain a
+			table name that has not been
+			normalized to lower case */
+			normalize_table_name_c_low(
+				par_case_name, table_name, false);
+#endif
+			/* FIXME: try_drop_aborted */
+			ib_table = dict_table_open_on_name(
+				par_case_name, false, ignore_err);
+		}
+
+		if (ib_table != NULL) {
+#ifndef _WIN32
+			sql_print_warning("Partition table %s opened"
+					  " after converting to lower"
+					  " case. The table may have"
+					  " been moved from a"
+					  " case-insensitive file system."
+					  " Please recreate the table in"
+					  " the current file system\n",
+					  norm_name);
+#else
+			sql_print_warning("Partition table %s opened"
+					  " after skipping the step to"
+					  " lower case the table name."
+					  " The table may have been"
+					  " moved from a case-sensitive"
+					  " file system. Please"
+					  " recreate the table in the"
+					  " current file system\n",
+					  norm_name);
+#endif
+		}
+	}
+
+	DBUG_RETURN(ib_table);
+}
+
+handler*
+ha_innobase::clone(
+/*===============*/
+	const char*	name,		/*!< in: table name */
+	MEM_ROOT*	mem_root)	/*!< in: memory context */
+{
+	DBUG_ENTER("ha_innobase::clone");
+
+	ha_innobase*	new_handler = static_cast<ha_innobase*>(
+		handler::clone(m_prebuilt->table->name.m_name, mem_root));
+
+	if (new_handler != NULL) {
+		DBUG_ASSERT(new_handler->m_prebuilt != NULL);
+
+		new_handler->m_prebuilt->select_lock_type
+			= m_prebuilt->select_lock_type;
+	}
+
+	DBUG_RETURN(new_handler);
+}
+
+
+uint
+ha_innobase::max_supported_key_part_length() const
+/*==============================================*/
+{
+	/* A table format specific index column length check will be performed
+	at ha_innobase::add_index() and row_create_index_for_mysql() */
+	return(REC_VERSION_56_MAX_INDEX_COL_LEN);
+}
+
+/******************************************************************//**
+Closes a handle to an InnoDB table.
+@return 0 */
+
+int
+ha_innobase::close()
+/*================*/
+{
+	DBUG_ENTER("ha_innobase::close");
+
+	row_prebuilt_free(m_prebuilt);
+
+	if (m_upd_buf != NULL) {
+		ut_ad(m_upd_buf_size != 0);
+		my_free(m_upd_buf);
+		m_upd_buf = NULL;
+		m_upd_buf_size = 0;
+	}
+
+	DBUG_RETURN(0);
+}
+
+/* The following accessor functions should really be inside MySQL code! */
+
+#ifdef WITH_WSREP
+ulint
+wsrep_innobase_mysql_sort(
+	/* out: str contains sort string */
+	int		mysql_type,	/* in: MySQL type */
+	uint		charset_number,	/* in: number of the charset */
+	unsigned char*	str,		/* in: data field */
+	ulint		str_length,	/* in: data field length,
+					not UNIV_SQL_NULL */
+	ulint		buf_length)	/* in: total str buffer length */
+
+{
+	CHARSET_INFO*		charset;
+	enum_field_types	mysql_tp;
+	ulint			ret_length =	str_length;
+
+	DBUG_ASSERT(str_length != UNIV_SQL_NULL);
+
+	mysql_tp = (enum_field_types) mysql_type;
+
+	switch (mysql_tp) {
+
+	case MYSQL_TYPE_BIT:
+	case MYSQL_TYPE_STRING:
+	case MYSQL_TYPE_VAR_STRING:
+	case MYSQL_TYPE_TINY_BLOB:
+	case MYSQL_TYPE_MEDIUM_BLOB:
+	case MYSQL_TYPE_BLOB:
+	case MYSQL_TYPE_LONG_BLOB:
+	case MYSQL_TYPE_VARCHAR:
+	{
+		uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'};
+		ulint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN;
+
+		/* Use the charset number to pick the right charset struct for
+		the comparison. Since the MySQL function get_charset may be
+		slow before Bar removes the mutex operation there, we first
+		look at 2 common charsets directly. */
+
+		if (charset_number == default_charset_info->number) {
+			charset = default_charset_info;
+		} else if (charset_number == my_charset_latin1.number) {
+			charset = &my_charset_latin1;
+		} else {
+			charset = get_charset(charset_number, MYF(MY_WME));
+
+			if (charset == NULL) {
+				sql_print_error("InnoDB needs charset %lu for doing "
+						"a comparison, but MariaDB cannot "
+						"find that charset.",
+						(ulong) charset_number);
+				ut_a(0);
+			}
+		}
+
+		ut_a(str_length <= tmp_length);
+		memcpy(tmp_str, str, str_length);
+
+		if (wsrep_protocol_version < 3) {
+			tmp_length = charset->strnxfrm(
+				str, str_length,
+				uint(str_length), tmp_str, tmp_length, 0);
+			DBUG_ASSERT(tmp_length <= str_length);
+		} else {
+			/* strnxfrm will expand the destination string;
+			   protocols < 3 truncated the sorted string,
+			   protocols >= 3 get the full sorted string
+			*/
+			tmp_length = charset->strnxfrm(
+				str, buf_length,
+				uint(str_length), tmp_str, str_length, 0);
+			DBUG_ASSERT(tmp_length <= buf_length);
+			ret_length = tmp_length;
+		}
+
+		break;
+	}
+	case MYSQL_TYPE_DECIMAL :
+	case MYSQL_TYPE_TINY :
+	case MYSQL_TYPE_SHORT :
+	case MYSQL_TYPE_LONG :
+	case MYSQL_TYPE_FLOAT :
+	case MYSQL_TYPE_DOUBLE :
+	case MYSQL_TYPE_NULL :
+	case MYSQL_TYPE_TIMESTAMP :
+	case MYSQL_TYPE_LONGLONG :
+	case MYSQL_TYPE_INT24 :
+	case MYSQL_TYPE_DATE :
+	case MYSQL_TYPE_TIME :
+	case MYSQL_TYPE_DATETIME :
+	case MYSQL_TYPE_YEAR :
+	case MYSQL_TYPE_NEWDATE :
+	case MYSQL_TYPE_NEWDECIMAL :
+	case MYSQL_TYPE_ENUM :
+	case MYSQL_TYPE_SET :
+	case MYSQL_TYPE_GEOMETRY :
+		break;
+	default:
+		break;
+	}
+
+	return ret_length;
+}
+#endif /* WITH_WSREP */
+
+/******************************************************************//**
+Compare two character strings according to their charset.
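+
+A minimal usage sketch (hedged; the literals, the field order shown
+and the latin1 collation are illustrative assumptions):
+
+  fts_string_t a = {(byte*) "apple", 5, 5};  // f_str, f_len, f_n_char
+  fts_string_t b = {(byte*) "apply", 5, 5};
+  int cmp = innobase_fts_text_cmp(cs, &a, &b);  // < 0: "apple" sorts first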
*/
+int
+innobase_fts_text_cmp(
+/*==================*/
+	const void*	cs,	/*!< in: Character set */
+	const void*	p1,	/*!< in: key */
+	const void*	p2)	/*!< in: node */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+
+	return(ha_compare_word(charset,
+			       s1->f_str, static_cast<size_t>(s1->f_len),
+			       s2->f_str, static_cast<size_t>(s2->f_len)));
+}
+
+/******************************************************************//**
+Compare two character strings case-insensitively according to their
+charset. */
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+	const void*	cs,	/*!< in: Character set */
+	const void*	p1,	/*!< in: key */
+	const void*	p2)	/*!< in: node */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+	ulint			newlen;
+
+	my_casedn_str(charset, (char*) s2->f_str);
+
+	newlen = strlen((const char*) s2->f_str);
+
+	return(ha_compare_word(charset,
+			       s1->f_str, static_cast<size_t>(s1->f_len),
+			       s2->f_str, static_cast<size_t>(newlen)));
+}
+
+/******************************************************************//**
+Get the first character's code position for FTS index partition. */
+ulint
+innobase_strnxfrm(
+/*==============*/
+	const CHARSET_INFO*
+			cs,	/*!< in: Character set */
+	const uchar*	str,	/*!< in: string */
+	const ulint	len)	/*!< in: string length */
+{
+	uchar	mystr[2];
+	ulint	value;
+
+	if (!str || len == 0) {
+		return(0);
+	}
+
+	cs->strnxfrm((uchar*) mystr, 2, str, len);
+
+	value = mach_read_from_2(mystr);
+
+	if (value > 255) {
+		value = value / 256;
+	}
+
+	return(value);
+}
+
+/******************************************************************//**
+Compare the prefix of two character strings according to their
+charset. */
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+	const void*	cs,	/*!< in: Character set */
+	const void*	p1,	/*!< in: prefix key */
+	const void*	p2)	/*!< in: value to compare */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+	int			result;
+
+	result = ha_compare_word_prefix(charset,
+					s2->f_str, static_cast<size_t>(s2->f_len),
+					s1->f_str, static_cast<size_t>(s1->f_len));
+
+	/* We switched the s1, s2 positions in the above call, so we need
+	to negate the result. */
+	return(-result);
+}
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+	CHARSET_INFO*	cs,	/*!< in: Character set */
+	char*		src,	/*!< in: string to put in lower case */
+	size_t		src_len,/*!< in: input string length */
+	char*		dst,	/*!< in: buffer for result string */
+	size_t		dst_len)/*!< in: buffer size */
+{
+	if (cs->casedn_multiply() == 1) {
+		memcpy(dst, src, src_len);
+		dst[src_len] = 0;
+		my_casedn_str(cs, dst);
+
+		return(strlen(dst));
+	} else {
+		return(cs->casedn(src, src_len, dst, dst_len));
+	}
+}
+
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+#define misc_word_char(X)	0
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token.
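+
+A hedged illustration of the loop below with a latin1-style charset:
+given "o'clock", the apostrophe after the single leading character
+restarts the token just past it (the reset_token_str path), so the
+call returns the token "clock"; given "it's", the apostrophe ends the
+token instead, returning "it" (a subsequent call then picks up "s").
+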
+It is mostly copied from MyISAM's doc parsing function ft_simple_get_word() +@return length of string processed */ +ulint +innobase_mysql_fts_get_token( +/*=========================*/ + CHARSET_INFO* cs, /*!< in: Character set */ + const byte* start, /*!< in: start of text */ + const byte* end, /*!< in: one character past end of + text */ + fts_string_t* token) /*!< out: token's text */ +{ + int mbl; + const uchar* doc = start; + + ut_a(cs); + + token->f_n_char = token->f_len = 0; + token->f_str = NULL; + + for (;;) { + + if (doc >= end) { + return ulint(doc - start); + } + + int ctype; + + mbl = cs->ctype(&ctype, doc, (const uchar*) end); + + if (true_word_char(ctype, *doc)) { + break; + } + + doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1); + } + + ulint mwc = 0; + ulint length = 0; + bool reset_token_str = false; +reset: + token->f_str = const_cast(doc); + + while (doc < end) { + + int ctype; + + mbl = cs->ctype(&ctype, (uchar*) doc, (uchar*) end); + if (true_word_char(ctype, *doc)) { + mwc = 0; + } else if (*doc == '\'' && length == 1) { + /* Could be apostrophe */ + reset_token_str = true; + } else if (!misc_word_char(*doc) || mwc) { + break; + } else { + ++mwc; + } + + ++length; + + doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1); + if (reset_token_str) { + /* Reset the token if the single character + followed by apostrophe */ + mwc = 0; + length = 0; + reset_token_str = false; + goto reset; + } + } + + token->f_len = (uint) (doc - token->f_str) - mwc; + token->f_n_char = length; + + return ulint(doc - start); +} + +/** Converts a MySQL type to an InnoDB type. Note that this function returns +the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 +VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'. +@param[out] unsigned_flag DATA_UNSIGNED if an 'unsigned type'; at least +ENUM and SET, and unsigned integer types are 'unsigned types' +@param[in] f MySQL Field +@return DATA_BINARY, DATA_VARCHAR, ... */ +uint8_t +get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field) +{ + /* The following asserts try to check that the MySQL type code fits in + 8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to + the type */ + + static_assert(MYSQL_TYPE_STRING < 256, "compatibility"); + static_assert(MYSQL_TYPE_VAR_STRING < 256, "compatibility"); + static_assert(MYSQL_TYPE_DOUBLE < 256, "compatibility"); + static_assert(MYSQL_TYPE_FLOAT < 256, "compatibility"); + static_assert(MYSQL_TYPE_DECIMAL < 256, "compatibility"); + + if (field->flags & UNSIGNED_FLAG) { + + *unsigned_flag = DATA_UNSIGNED; + } else { + *unsigned_flag = 0; + } + + if (field->real_type() == MYSQL_TYPE_ENUM + || field->real_type() == MYSQL_TYPE_SET) { + + /* MySQL has field->type() a string type for these, but the + data is actually internally stored as an unsigned integer + code! 
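+
+	(Hedged example: a column declared ENUM('red','green','blue')
+	stores the unsigned codes 1, 2 and 3 rather than the strings,
+	so the function returns DATA_INT with DATA_UNSIGNED set for
+	it.)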
*/ + + *unsigned_flag = DATA_UNSIGNED; /* MySQL has its own unsigned + flag set to zero, even though + internally this is an unsigned + integer type */ + return(DATA_INT); + } + + switch (field->type()) { + /* NOTE that we only allow string types in DATA_MYSQL and + DATA_VARMYSQL */ + case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */ + case MYSQL_TYPE_VARCHAR: /* new >= 5.0.3 true VARCHAR */ + if (field->binary()) { + return(DATA_BINARY); + } else if (field->charset() == &my_charset_latin1) { + return(DATA_VARCHAR); + } else { + return(DATA_VARMYSQL); + } + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: + if (field->binary() || field->key_type() == HA_KEYTYPE_BINARY) { + return(DATA_FIXBINARY); + } else if (field->charset() == &my_charset_latin1) { + return(DATA_CHAR); + } else { + return(DATA_MYSQL); + } + case MYSQL_TYPE_NEWDECIMAL: + return(DATA_FIXBINARY); + case MYSQL_TYPE_LONG: + case MYSQL_TYPE_LONGLONG: + case MYSQL_TYPE_TINY: + case MYSQL_TYPE_SHORT: + case MYSQL_TYPE_INT24: + case MYSQL_TYPE_DATE: + case MYSQL_TYPE_YEAR: + case MYSQL_TYPE_NEWDATE: + return(DATA_INT); + case MYSQL_TYPE_TIME: + case MYSQL_TYPE_DATETIME: + case MYSQL_TYPE_TIMESTAMP: + if (field->key_type() == HA_KEYTYPE_BINARY) { + return(DATA_FIXBINARY); + } else { + return(DATA_INT); + } + case MYSQL_TYPE_FLOAT: + return(DATA_FLOAT); + case MYSQL_TYPE_DOUBLE: + return(DATA_DOUBLE); + case MYSQL_TYPE_DECIMAL: + return(DATA_DECIMAL); + case MYSQL_TYPE_GEOMETRY: + return(DATA_GEOMETRY); + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + return(DATA_BLOB); + case MYSQL_TYPE_NULL: + /* MySQL currently accepts "NULL" datatype, but will + reject such datatype in the next release. We will cope + with it and not trigger assertion failure in 5.1 */ + break; + default: + ut_error; + } + + return(0); +} + +/*******************************************************************//** +Reads an unsigned integer value < 64k from 2 bytes, in the little-endian +storage format. +@return value */ +static inline +uint +innobase_read_from_2_little_endian( +/*===============================*/ + const uchar* buf) /*!< in: from where to read */ +{ + return((uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1])))); +} + +#ifdef WITH_WSREP +/*******************************************************************//** +Stores a key value for a row to a buffer. 
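+
+A hedged sketch of the buffer layout produced below for a two-part key
+(nullable VARCHAR, INT NOT NULL): one 0/1 null-indicator byte for the
+nullable part, then the VARCHAR prefix transformed by
+wsrep_innobase_mysql_sort() so that a plain memcmp() respects the
+collation order, then the raw INT key bytes.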
+@return key value length as stored in buff */ +static +uint16_t +wsrep_store_key_val_for_row( +/*=========================*/ + THD* thd, + TABLE* table, + uint keynr, /*!< in: key number */ + char* buff, /*!< in/out: buffer for the key value (in MySQL + format) */ + uint buff_len,/*!< in: buffer length */ + const uchar* record, + bool* key_is_null)/*!< out: full key was null */ +{ + KEY* key_info = table->key_info + keynr; + KEY_PART_INFO* key_part = key_info->key_part; + KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts; + char* buff_start = buff; + enum_field_types mysql_type; + Field* field; + ulint buff_space = buff_len; + + DBUG_ENTER("wsrep_store_key_val_for_row"); + + memset(buff, 0, buff_len); + *key_is_null = true; + + for (; key_part != end; key_part++) { + uchar sorted[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'}; + bool part_is_null = false; + + if (key_part->null_bit) { + if (buff_space > 0) { + if (record[key_part->null_offset] + & key_part->null_bit) { + *buff = 1; + part_is_null = true; + } else { + *buff = 0; + } + buff++; + buff_space--; + } else { + fprintf (stderr, "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + } + } + if (!part_is_null) *key_is_null = false; + + field = key_part->field; + mysql_type = field->type(); + + if (mysql_type == MYSQL_TYPE_VARCHAR) { + /* >= 5.0.3 true VARCHAR */ + ulint lenlen; + ulint len; + const byte* data; + ulint key_len; + ulint true_len; + const CHARSET_INFO* cs; + int error=0; + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len + 2; + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + continue; + } + cs = field->charset(); + + lenlen = (ulint) + (((Field_varstring*)field)->length_bytes); + + data = row_mysql_read_true_varchar(&len, + (byte*) (record + + (ulint)get_field_offset(table, field)), + lenlen); + + true_len = len; + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) my_well_formed_length(cs, + (const char *) data, + (const char *) data + len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + + /* In a column prefix index, we may need to truncate + the stored value: */ + if (true_len > key_len) { + true_len = key_len; + } + /* cannot exceed max column lenght either, we may need to truncate + the stored value: */ + if (true_len > sizeof(sorted)) { + true_len = sizeof(sorted); + } + + memcpy(sorted, data, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + if (wsrep_protocol_version > 1) { + /* Note that we always reserve the maximum possible + length of the true VARCHAR in the key value, though + only len first bytes after the 2 length bytes contain + actual data. The rest of the space was reset to zero + in the bzero() call above. */ + if (true_len > buff_space) { + WSREP_DEBUG ( + "write set key truncated for: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + memcpy(buff, sorted, true_len); + buff += true_len; + buff_space -= true_len; + } else { + buff += key_len; + } + } else if (mysql_type == MYSQL_TYPE_TINY_BLOB + || mysql_type == MYSQL_TYPE_MEDIUM_BLOB + || mysql_type == MYSQL_TYPE_BLOB + || mysql_type == MYSQL_TYPE_LONG_BLOB + /* MYSQL_TYPE_GEOMETRY data is treated + as BLOB data in innodb. 
*/ + || mysql_type == MYSQL_TYPE_GEOMETRY) { + + const CHARSET_INFO* cs; + ulint key_len; + ulint true_len; + int error=0; + ulint blob_len; + const byte* blob_data; + + ut_a(key_part->key_part_flag & HA_PART_KEY_SEG); + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len + 2; + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + + continue; + } + + cs = field->charset(); + + blob_data = row_mysql_read_blob_ref(&blob_len, + (byte*) (record + + (ulint)get_field_offset(table, field)), + (ulint) field->pack_length()); + + true_len = blob_len; + + ut_a(get_field_offset(table, field) + == key_part->offset); + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (blob_len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) my_well_formed_length(cs, + (const char *) blob_data, + (const char *) blob_data + + blob_len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + + /* All indexes on BLOB and TEXT are column prefix + indexes, and we may need to truncate the data to be + stored in the key value: */ + + if (true_len > key_len) { + true_len = key_len; + } + + memcpy(sorted, blob_data, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + + + /* Note that we always reserve the maximum possible + length of the BLOB prefix in the key value. */ + if (wsrep_protocol_version > 1) { + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + } else { + buff += key_len; + } + memcpy(buff, sorted, true_len); + } else { + /* Here we handle all other data types except the + true VARCHAR, BLOB and TEXT. Note that the column + value we store may be also in a column prefix + index. */ + + const CHARSET_INFO* cs = NULL; + ulint true_len; + ulint key_len; + const uchar* src_start; + int error=0; + enum_field_types real_type; + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len; + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + + continue; + } + + src_start = record + key_part->offset; + real_type = field->real_type(); + true_len = key_len; + + /* Character set for the field is defined only + to fields whose type is string and real field + type is not enum or set. For these fields check + if character set is multi byte. 
*/ + + if (real_type != MYSQL_TYPE_ENUM + && real_type != MYSQL_TYPE_SET + && ( mysql_type == MYSQL_TYPE_VAR_STRING + || mysql_type == MYSQL_TYPE_STRING)) { + + cs = field->charset(); + + /* For multi byte character sets we need to + calculate the true length of the key */ + + if (key_len > 0 && cs->mbmaxlen > 1) { + + true_len = (ulint) + my_well_formed_length(cs, + (const char *)src_start, + (const char *)src_start + + key_len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + memcpy(sorted, src_start, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + memcpy(buff, sorted, true_len); + } else { + memcpy(buff, src_start, true_len); + } + buff += true_len; + buff_space -= true_len; + } + } + + ut_a(buff <= buff_start + buff_len); + + DBUG_RETURN(static_cast(buff - buff_start)); +} +#endif /* WITH_WSREP */ +/**************************************************************//** +Determines if a field is needed in a m_prebuilt struct 'template'. +@return field to use, or NULL if the field is not needed */ +static +const Field* +build_template_needs_field( +/*=======================*/ + bool index_contains, /*!< in: + dict_index_t::contains_col_or_prefix( + i) */ + bool read_just_key, /*!< in: TRUE when MySQL calls + ha_innobase::extra with the + argument HA_EXTRA_KEYREAD; it is enough + to read just columns defined in + the index (i.e., no read of the + clustered index record necessary) */ + bool fetch_all_in_key, + /*!< in: true=fetch all fields in + the index */ + bool fetch_primary_key_cols, + /*!< in: true=fetch the + primary key columns */ + dict_index_t* index, /*!< in: InnoDB index to use */ + const TABLE* table, /*!< in: MySQL table object */ + ulint i, /*!< in: field index in InnoDB table */ + ulint num_v) /*!< in: num virtual column so far */ +{ + const Field* field = table->field[i]; + + if (!field->stored_in_db() + && ha_innobase::omits_virtual_cols(*table->s)) { + return NULL; + } + + if (!index_contains) { + if (read_just_key) { + /* If this is a 'key read', we do not need + columns that are not in the key */ + + return(NULL); + } + } else if (fetch_all_in_key) { + /* This field is needed in the query */ + + return(field); + } + + if (bitmap_is_set(table->read_set, static_cast(i)) + || bitmap_is_set(table->write_set, static_cast(i))) { + /* This field is needed in the query */ + + return(field); + } + + ut_ad(i >= num_v); + if (fetch_primary_key_cols + && dict_table_col_in_clustered_key(index->table, i - num_v)) { + /* This field is needed in the query */ + return(field); + } + + /* This field is not needed in the query, skip it */ + + return(NULL); +} + +/**************************************************************//** +Determines if a field is needed in a m_prebuilt struct 'template'. +@return whether the field is needed for index condition pushdown */ +inline +bool +build_template_needs_field_in_icp( +/*==============================*/ + const dict_index_t* index, /*!< in: InnoDB index */ + const row_prebuilt_t* prebuilt,/*!< in: row fetch template */ + bool contains,/*!< in: whether the index contains + column i */ + ulint i, /*!< in: column number */ + bool is_virtual) + /*!< in: a virtual column or not */ +{ + ut_ad(contains == index->contains_col_or_prefix(i, is_virtual)); + + return(index == prebuilt->index + ? 
contains + : prebuilt->index->contains_col_or_prefix(i, is_virtual)); +} + +/**************************************************************//** +Adds a field to a m_prebuilt struct 'template'. +@return the field template */ +static +mysql_row_templ_t* +build_template_field( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in/out: template */ + dict_index_t* clust_index, /*!< in: InnoDB clustered index */ + dict_index_t* index, /*!< in: InnoDB index to use */ + TABLE* table, /*!< in: MySQL table object */ + const Field* field, /*!< in: field in MySQL table */ + ulint i, /*!< in: field index in InnoDB table */ + ulint v_no) /*!< in: field index for virtual col */ +{ + mysql_row_templ_t* templ; + const dict_col_t* col; + + ut_ad(clust_index->table == index->table); + + templ = prebuilt->mysql_template + prebuilt->n_template++; + MEM_UNDEFINED(templ, sizeof *templ); + templ->rec_field_is_prefix = FALSE; + templ->rec_prefix_field_no = ULINT_UNDEFINED; + templ->is_virtual = !field->stored_in_db(); + + if (!templ->is_virtual) { + templ->col_no = i; + col = dict_table_get_nth_col(index->table, i); + templ->clust_rec_field_no = dict_col_get_clust_pos( + col, clust_index); + /* If clustered index record field is not found, lets print out + field names and all the rest to understand why field is not found. */ + if (templ->clust_rec_field_no == ULINT_UNDEFINED) { + const char* tb_col_name = dict_table_get_col_name(clust_index->table, i); + dict_field_t* field=NULL; + size_t size = 0; + + for(ulint j=0; j < clust_index->n_user_defined_cols; j++) { + dict_field_t* ifield = &(clust_index->fields[j]); + if (ifield && !memcmp(tb_col_name, ifield->name, + strlen(tb_col_name))) { + field = ifield; + break; + } + } + + ib::info() << "Looking for field " << i << " name " + << (tb_col_name ? tb_col_name : "NULL") + << " from table " << clust_index->table->name; + + + for(ulint j=0; j < clust_index->n_user_defined_cols; j++) { + dict_field_t* ifield = &(clust_index->fields[j]); + ib::info() << "InnoDB Table " + << clust_index->table->name + << "field " << j << " name " + << (ifield ? ifield->name() : "NULL"); + } + + for(ulint j=0; j < table->s->stored_fields; j++) { + ib::info() << "MySQL table " + << table->s->table_name.str + << " field " << j << " name " + << table->field[j]->field_name.str; + } + + ib::fatal() << "Clustered record field for column " << i + << " not found table n_user_defined " + << clust_index->n_user_defined_cols + << " index n_user_defined " + << clust_index->table->n_cols - DATA_N_SYS_COLS + << " InnoDB table " + << clust_index->table->name + << " field name " + << (field ? field->name() : "NULL") + << " MySQL table " + << table->s->table_name.str + << " field name " + << (tb_col_name ? 
tb_col_name : "NULL") + << " n_fields " + << table->s->stored_fields + << " query " + << innobase_get_stmt_unsafe(current_thd, &size); + } + + if (dict_index_is_clust(index)) { + templ->rec_field_no = templ->clust_rec_field_no; + } else { + /* If we're in a secondary index, keep track + * of the original index position even if this + * is just a prefix index; we will use this + * later to avoid a cluster index lookup in + * some cases.*/ + + templ->rec_field_no = dict_index_get_nth_col_pos(index, i, + &templ->rec_prefix_field_no); + } + } else { + DBUG_ASSERT(!ha_innobase::omits_virtual_cols(*table->s)); + col = &dict_table_get_nth_v_col(index->table, v_no)->m_col; + templ->clust_rec_field_no = v_no; + + if (dict_index_is_clust(index)) { + templ->rec_field_no = templ->clust_rec_field_no; + } else { + templ->rec_field_no + = dict_index_get_nth_col_or_prefix_pos( + index, v_no, FALSE, true, + &templ->rec_prefix_field_no); + } + templ->icp_rec_field_no = ULINT_UNDEFINED; + } + + if (field->real_maybe_null()) { + templ->mysql_null_byte_offset = + field->null_offset(); + + templ->mysql_null_bit_mask = (ulint) field->null_bit; + } else { + templ->mysql_null_bit_mask = 0; + } + + + templ->mysql_col_offset = (ulint) get_field_offset(table, field); + templ->mysql_col_len = (ulint) field->pack_length(); + templ->type = col->mtype; + templ->mysql_type = (ulint) field->type(); + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + templ->mysql_length_bytes = (ulint) + (((Field_varstring*) field)->length_bytes); + } else { + templ->mysql_length_bytes = 0; + } + + templ->charset = dtype_get_charset_coll(col->prtype); + templ->mbminlen = dict_col_get_mbminlen(col); + templ->mbmaxlen = dict_col_get_mbmaxlen(col); + templ->is_unsigned = col->prtype & DATA_UNSIGNED; + + if (!dict_index_is_clust(index) + && templ->rec_field_no == ULINT_UNDEFINED) { + prebuilt->need_to_access_clustered = TRUE; + + if (templ->rec_prefix_field_no != ULINT_UNDEFINED) { + dict_field_t* field = dict_index_get_nth_field( + index, + templ->rec_prefix_field_no); + templ->rec_field_is_prefix = (field->prefix_len != 0); + } + } + + /* For spatial index, we need to access cluster index. */ + if (dict_index_is_spatial(index)) { + prebuilt->need_to_access_clustered = TRUE; + } + + if (prebuilt->mysql_prefix_len < templ->mysql_col_offset + + templ->mysql_col_len) { + prebuilt->mysql_prefix_len = templ->mysql_col_offset + + templ->mysql_col_len; + } + + if (DATA_LARGE_MTYPE(templ->type)) { + prebuilt->templ_contains_blob = TRUE; + } + + return(templ); +} + +/**************************************************************//** +Builds a 'template' to the m_prebuilt struct. The template is used in fast +retrieval of just those column values MySQL needs in its processing. 
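+
+A hedged example of the effect: for SELECT a FROM t WHERE pk = 1,
+m_prebuilt->mysql_template[] receives entries only for column "a" and
+for any primary key or pushed-condition columns the plan needs, rather
+than one entry per column of t, so row_search_mvcc() has to convert
+only those fields to MySQL format.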
*/ + +void +ha_innobase::build_template( +/*========================*/ + bool whole_row) /*!< in: true=ROW_MYSQL_WHOLE_ROW, + false=ROW_MYSQL_REC_FIELDS */ +{ + dict_index_t* index; + dict_index_t* clust_index; + ibool fetch_all_in_key = FALSE; + ibool fetch_primary_key_cols = FALSE; + + if (m_prebuilt->select_lock_type == LOCK_X || m_prebuilt->table->no_rollback()) { + /* We always retrieve the whole clustered index record if we + use exclusive row level locks, for example, if the read is + done in an UPDATE statement or if we are using a no rollback + table */ + + whole_row = true; + } else if (!whole_row) { + if (m_prebuilt->hint_need_to_fetch_extra_cols + == ROW_RETRIEVE_ALL_COLS) { + + /* We know we must at least fetch all columns in the + key, or all columns in the table */ + + if (m_prebuilt->read_just_key) { + /* MySQL has instructed us that it is enough + to fetch the columns in the key; looks like + MySQL can set this flag also when there is + only a prefix of the column in the key: in + that case we retrieve the whole column from + the clustered index */ + + fetch_all_in_key = TRUE; + } else { + whole_row = true; + } + } else if (m_prebuilt->hint_need_to_fetch_extra_cols + == ROW_RETRIEVE_PRIMARY_KEY) { + /* We must at least fetch all primary key cols. Note + that if the clustered index was internally generated + by InnoDB on the row id (no primary key was + defined), then row_search_mvcc() will always + retrieve the row id to a special buffer in the + m_prebuilt struct. */ + + fetch_primary_key_cols = TRUE; + } + } + + clust_index = dict_table_get_first_index(m_prebuilt->table); + + index = whole_row ? clust_index : m_prebuilt->index; + + m_prebuilt->versioned_write = table->versioned_write(VERS_TRX_ID); + m_prebuilt->need_to_access_clustered = (index == clust_index); + + if (m_prebuilt->in_fts_query) { + /* Do clustered index lookup to fetch the FTS_DOC_ID */ + m_prebuilt->need_to_access_clustered = true; + } + + /* Either m_prebuilt->index should be a secondary index, or it + should be the clustered index. */ + ut_ad(dict_index_is_clust(index) == (index == clust_index)); + + /* Below we check column by column if we need to access + the clustered index. */ + + if (pushed_rowid_filter && rowid_filter_is_active) { + fetch_primary_key_cols = TRUE; + m_prebuilt->pk_filter = this; + } else { + m_prebuilt->pk_filter = NULL; + } + + const bool skip_virtual = omits_virtual_cols(*table_share); + const ulint n_fields = table_share->fields; + + if (!m_prebuilt->mysql_template) { + m_prebuilt->mysql_template = (mysql_row_templ_t*) + ut_malloc_nokey(n_fields * sizeof(mysql_row_templ_t)); + } + + m_prebuilt->template_type = whole_row + ? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS; + m_prebuilt->null_bitmap_len = table->s->null_bytes + & dict_index_t::MAX_N_FIELDS; + + /* Prepare to build m_prebuilt->mysql_template[]. */ + m_prebuilt->templ_contains_blob = FALSE; + m_prebuilt->mysql_prefix_len = 0; + m_prebuilt->n_template = 0; + m_prebuilt->idx_cond_n_cols = 0; + + /* Note that in InnoDB, i is the column number in the table. + MySQL calls columns 'fields'. */ + + ulint num_v = 0; + + if (active_index != MAX_KEY + && active_index == pushed_idx_cond_keyno) { + m_prebuilt->idx_cond = this; + goto icp; + } else if (pushed_rowid_filter && rowid_filter_is_active) { +icp: + /* Push down an index condition or an end_range check. 
*/ + for (ulint i = 0; i < n_fields; i++) { + const Field* field = table->field[i]; + const bool is_v = !field->stored_in_db(); + if (is_v && skip_virtual) { + num_v++; + continue; + } + bool index_contains = index->contains_col_or_prefix( + is_v ? num_v : i - num_v, is_v); + if (is_v && index_contains) { + m_prebuilt->n_template = 0; + num_v = 0; + goto no_icp; + } + + /* Test if an end_range or an index condition + refers to the field. Note that "index" and + "index_contains" may refer to the clustered index. + Index condition pushdown is relative to + m_prebuilt->index (the index that is being + looked up first). */ + + /* When join_read_always_key() invokes this + code via handler::ha_index_init() and + ha_innobase::index_init(), end_range is not + yet initialized. Because of that, we must + always check for index_contains, instead of + the subset + field->part_of_key.is_set(active_index) + which would be acceptable if end_range==NULL. */ + if (build_template_needs_field_in_icp( + index, m_prebuilt, index_contains, + is_v ? num_v : i - num_v, is_v)) { + if (!whole_row) { + field = build_template_needs_field( + index_contains, + m_prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, num_v); + if (!field) { + if (is_v) { + num_v++; + } + continue; + } + } + + ut_ad(!is_v); + + mysql_row_templ_t* templ= build_template_field( + m_prebuilt, clust_index, index, + table, field, i - num_v, 0); + + ut_ad(!templ->is_virtual); + + m_prebuilt->idx_cond_n_cols++; + ut_ad(m_prebuilt->idx_cond_n_cols + == m_prebuilt->n_template); + + if (index == m_prebuilt->index) { + templ->icp_rec_field_no + = templ->rec_field_no; + } else { + templ->icp_rec_field_no + = dict_index_get_nth_col_pos( + m_prebuilt->index, + i - num_v, + &templ->rec_prefix_field_no); + } + + if (dict_index_is_clust(m_prebuilt->index)) { + ut_ad(templ->icp_rec_field_no + != ULINT_UNDEFINED); + /* If the primary key includes + a column prefix, use it in + index condition pushdown, + because the condition is + evaluated before fetching any + off-page (externally stored) + columns. */ + if (templ->icp_rec_field_no + < m_prebuilt->index->n_uniq) { + /* This is a key column; + all set. */ + continue; + } + } else if (templ->icp_rec_field_no + != ULINT_UNDEFINED) { + continue; + } + + /* This is a column prefix index. + The column prefix can be used in + an end_range comparison. */ + + templ->icp_rec_field_no + = dict_index_get_nth_col_or_prefix_pos( + m_prebuilt->index, i - num_v, + true, false, + &templ->rec_prefix_field_no); + ut_ad(templ->icp_rec_field_no + != ULINT_UNDEFINED); + + /* Index condition pushdown can be used on + all columns of a secondary index, and on + the PRIMARY KEY columns. On the clustered + index, it must never be used on other than + PRIMARY KEY columns, because those columns + may be stored off-page, and we will not + fetch externally stored columns before + checking the index condition. */ + /* TODO: test the above with an assertion + like this. Note that index conditions are + currently pushed down as part of the + "optimizer phase" while end_range is done + as part of the execution phase. Therefore, + we were unable to use an accurate condition + for end_range in the "if" condition above, + and the following assertion would fail. 
+ ut_ad(!dict_index_is_clust(m_prebuilt->index) + || templ->rec_field_no + < m_prebuilt->index->n_uniq); + */ + } + + if (is_v) { + num_v++; + } + } + + ut_ad(m_prebuilt->idx_cond_n_cols > 0); + ut_ad(m_prebuilt->idx_cond_n_cols == m_prebuilt->n_template); + + num_v = 0; + + /* Include the fields that are not needed in index condition + pushdown. */ + for (ulint i = 0; i < n_fields; i++) { + const Field* field = table->field[i]; + const bool is_v = !field->stored_in_db(); + if (is_v && skip_virtual) { + num_v++; + continue; + } + + bool index_contains = index->contains_col_or_prefix( + is_v ? num_v : i - num_v, is_v); + + if (!build_template_needs_field_in_icp( + index, m_prebuilt, index_contains, + is_v ? num_v : i - num_v, is_v)) { + /* Not needed in ICP */ + if (!whole_row) { + field = build_template_needs_field( + index_contains, + m_prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, num_v); + if (!field) { + if (is_v) { + num_v++; + } + continue; + } + } + + ut_d(mysql_row_templ_t* templ =) + build_template_field( + m_prebuilt, clust_index, index, + table, field, i - num_v, num_v); + ut_ad(templ->is_virtual == (ulint)is_v); + + if (is_v) { + num_v++; + } + } + } + } else { +no_icp: + /* No index condition pushdown */ + m_prebuilt->idx_cond = NULL; + ut_ad(num_v == 0); + + for (ulint i = 0; i < n_fields; i++) { + const Field* field = table->field[i]; + const bool is_v = !field->stored_in_db(); + + if (whole_row) { + if (is_v && skip_virtual) { + num_v++; + continue; + } + /* Even if this is whole_row, if the search is + on a virtual column, read_just_key is + set, and the field is not in this index, we + will not try to fill in the value, since it + is stored neither in this index nor in the + clustered index. */ + if (is_v + && m_prebuilt->read_just_key + && !m_prebuilt->index->contains_col_or_prefix( + num_v, true)) + { + /* Turn off ROW_MYSQL_WHOLE_ROW */ + m_prebuilt->template_type = + ROW_MYSQL_REC_FIELDS; + num_v++; + continue; + } + } else { + if (is_v + && (skip_virtual || index->is_primary())) { + num_v++; + continue; + } + + bool contain = index->contains_col_or_prefix( + is_v ? num_v: i - num_v, is_v); + + field = build_template_needs_field( + contain, + m_prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, num_v); + if (!field) { + if (is_v) { + num_v++; + } + continue; + } + } + + ut_d(mysql_row_templ_t* templ =) + build_template_field( + m_prebuilt, clust_index, index, + table, field, i - num_v, num_v); + ut_ad(templ->is_virtual == (ulint)is_v); + if (is_v) { + num_v++; + } + } + } + + if (index != clust_index && m_prebuilt->need_to_access_clustered) { + /* Change rec_field_no's to correspond to the clustered index + record */ + for (ulint i = 0; i < m_prebuilt->n_template; i++) { + mysql_row_templ_t* templ + = &m_prebuilt->mysql_template[i]; + + templ->rec_field_no = templ->clust_rec_field_no; + } + } +} + +/********************************************************************//** +This special handling is really to overcome the limitations of MySQL's +binlogging. We need to eliminate the non-determinism that will arise in +INSERT ... SELECT type of statements, since MySQL binlog only stores the +min value of the autoinc interval. Once that is fixed we can get rid of +the special lock handling.
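As a rough aside, the decision made by the function below can be compressed into a few lines; the names here are invented for illustration and elide the real locking details.

enum class Mode { NO_LOCKING, NEW_STYLE, OLD_STYLE };
enum class Lock { MUTEX_ONLY, TABLE_AUTOINC };

static Lock choose_autoinc_lock(Mode m, bool simple_insert,
                                bool table_lock_already_held)
{
    if (m == Mode::NO_LOCKING)
        return Lock::MUTEX_ONLY;          // interleaved values, mutex only
    if (m == Mode::NEW_STYLE && simple_insert && !table_lock_already_held)
        return Lock::MUTEX_ONLY;          // fast path for plain INSERT
    return Lock::TABLE_AUTOINC;           // statement-scope AUTO-INC lock
}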
+@return DB_SUCCESS if all OK else error code */ + +dberr_t +ha_innobase::innobase_lock_autoinc(void) +/*====================================*/ +{ + DBUG_ENTER("ha_innobase::innobase_lock_autoinc"); + dberr_t error = DB_SUCCESS; + + ut_ad(!srv_read_only_mode); + + switch (innobase_autoinc_lock_mode) { + case AUTOINC_NO_LOCKING: + /* Acquire only the AUTOINC mutex. */ + m_prebuilt->table->autoinc_mutex.wr_lock(); + break; + + case AUTOINC_NEW_STYLE_LOCKING: + /* For simple (single/multi) row INSERTs/REPLACEs and RBR + events, we fallback to the old style only if another + transaction has already acquired the AUTOINC lock on + behalf of a LOAD FILE or INSERT ... SELECT etc. type of + statement. */ + switch (thd_sql_command(m_user_thd)) { + case SQLCOM_INSERT: + case SQLCOM_REPLACE: + case SQLCOM_END: // RBR event + /* Acquire the AUTOINC mutex. */ + m_prebuilt->table->autoinc_mutex.wr_lock(); + /* We need to check that another transaction isn't + already holding the AUTOINC lock on the table. */ + if (!m_prebuilt->table->n_waiting_or_granted_auto_inc_locks) { + /* Do not fall back to old style locking. */ + DBUG_RETURN(error); + } + m_prebuilt->table->autoinc_mutex.wr_unlock(); + } + /* Use old style locking. */ + /* fall through */ + case AUTOINC_OLD_STYLE_LOCKING: + DBUG_EXECUTE_IF("die_if_autoinc_old_lock_style_used", + ut_ad(0);); + error = row_lock_table_autoinc_for_mysql(m_prebuilt); + + if (error == DB_SUCCESS) { + + /* Acquire the AUTOINC mutex. */ + m_prebuilt->table->autoinc_mutex.wr_lock(); + } + break; + + default: + ut_error; + } + + DBUG_RETURN(error); +} + +/********************************************************************//** +Store the autoinc value in the table. The autoinc value is only set if +it's greater than the existing autoinc value in the table. +@return DB_SUCCESS if all went well else error code */ + +dberr_t +ha_innobase::innobase_set_max_autoinc( +/*==================================*/ + ulonglong auto_inc) /*!< in: value to store */ +{ + dberr_t error; + + error = innobase_lock_autoinc(); + + if (error == DB_SUCCESS) { + + dict_table_autoinc_update_if_greater(m_prebuilt->table, auto_inc); + m_prebuilt->table->autoinc_mutex.wr_unlock(); + } + + return(error); +} + +/** @return whether the table is read-only */ +bool ha_innobase::is_read_only(bool altering_to_supported) const +{ + ut_ad(m_prebuilt->trx == thd_to_trx(m_user_thd)); + + if (high_level_read_only) + { + ib_senderrf(m_user_thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + return true; + } + + if (altering_to_supported) + return false; + + if (!DICT_TF_GET_ZIP_SSIZE(m_prebuilt->table->flags) || + !innodb_read_only_compressed) + return false; + + ib_senderrf(m_user_thd, IB_LOG_LEVEL_WARN, ER_UNSUPPORTED_COMPRESSED_TABLE); + return true; +} + +/********************************************************************//** +Stores a row in an InnoDB database, to the table specified in this +handle. +@return error code */ + +int +ha_innobase::write_row( +/*===================*/ + const uchar* record) /*!< in: a row in MySQL format */ +{ + dberr_t error; +#ifdef WITH_WSREP + bool wsrep_auto_inc_inserted= false; +#endif + int error_result = 0; + bool auto_inc_used = false; + mariadb_set_stats set_stats_temporary(handler_stats); + + DBUG_ENTER("ha_innobase::write_row"); + + trx_t* trx = thd_to_trx(m_user_thd); + + /* Validation checks before we commence write_row operation. 
*/ + if (is_read_only()) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + ins_mode_t vers_set_fields; + /* Handling of Auto-Increment Columns. */ + if (table->next_number_field && record == table->record[0]) { + + /* Reset the error code before calling + innobase_get_auto_increment(). */ + m_prebuilt->autoinc_error = DB_SUCCESS; + +#ifdef WITH_WSREP + wsrep_auto_inc_inserted = trx->is_wsrep() + && wsrep_drupal_282555_workaround + && table->next_number_field->val_int() == 0; +#endif + + if ((error_result = update_auto_increment())) { + /* We don't want to mask autoinc overflow errors. */ + + /* Handle the case where the AUTOINC sub-system + failed during initialization. */ + if (m_prebuilt->autoinc_error == DB_UNSUPPORTED) { + error_result = ER_AUTOINC_READ_FAILED; + /* Set the error message to report too. */ + my_error(ER_AUTOINC_READ_FAILED, MYF(0)); + goto func_exit; + } else if (m_prebuilt->autoinc_error != DB_SUCCESS) { + error = m_prebuilt->autoinc_error; + goto report_error; + } + + /* MySQL errors are passed straight back. */ + goto func_exit; + } + + auto_inc_used = true; + } + + /* Prepare INSERT graph that will be executed for actual INSERT + (this is a one-time operation) */ + if (m_prebuilt->mysql_template == NULL + || m_prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) { + + /* Build the template used in converting quickly between + the two database formats */ + + build_template(true); + } + + vers_set_fields = table->versioned_write(VERS_TRX_ID) ? + ROW_INS_VERSIONED : ROW_INS_NORMAL; + + /* Execute insert graph that will result in actual insert. */ + error = row_insert_for_mysql((byte*) record, m_prebuilt, vers_set_fields); + + DEBUG_SYNC(m_user_thd, "ib_after_row_insert"); + + /* Handling of errors related to auto-increment. */ + if (auto_inc_used) { + ulonglong auto_inc; + + /* Note the number of rows processed for this statement, used + by get_auto_increment() to determine the number of AUTO-INC + values to reserve. This is only useful for a multi-value INSERT + and is a statement-level counter. */ + if (trx->n_autoinc_rows > 0) { + --trx->n_autoinc_rows; + } + + /* Get the value that MySQL attempted to store in the table.*/ + auto_inc = table->next_number_field->val_uint(); + + switch (error) { + case DB_DUPLICATE_KEY: + + /* A REPLACE command and LOAD DATA INFILE REPLACE + handle a duplicate key error themselves, but we + must update the autoinc counter if we are performing + those statements.
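For orientation, here is a simplified stand-alone model of the computation done by innobase_next_autoinc() below, assuming need == 1 and ignoring overflow handling (the real function handles both): the value is advanced to the next point on the (offset + N * increment) grid, clamped to the column maximum.

static unsigned long long next_autoinc(unsigned long long current,
                                       unsigned long long increment,
                                       unsigned long long offset,
                                       unsigned long long col_max)
{
    unsigned long long next =
        current < offset
        ? offset
        : offset + ((current - offset) / increment + 1) * increment;
    return next > col_max ? col_max : next;
}

// e.g. current = 7, increment = 5, offset = 1  ->  11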
*/ + + switch (thd_sql_command(m_user_thd)) { + case SQLCOM_LOAD: + if (!trx->duplicates) { + break; + } + + case SQLCOM_REPLACE: + case SQLCOM_INSERT_SELECT: + case SQLCOM_REPLACE_SELECT: + goto set_max_autoinc; + +#ifdef WITH_WSREP + /* workaround for LP bug #355000, retrying the insert */ + case SQLCOM_INSERT: + + WSREP_DEBUG("DUPKEY error for autoinc\n" + "THD %ld, value %llu, off %llu inc %llu", + thd_get_thread_id(m_user_thd), + auto_inc, + m_prebuilt->autoinc_offset, + m_prebuilt->autoinc_increment); + + if (wsrep_auto_inc_inserted && + wsrep_thd_retry_counter(m_user_thd) == 0 && + !thd_test_options(m_user_thd, + OPTION_NOT_AUTOCOMMIT | + OPTION_BEGIN)) { + WSREP_DEBUG( + "retrying insert: %s", + wsrep_thd_query(m_user_thd)); + error= DB_SUCCESS; + wsrep_thd_self_abort(m_user_thd); + /* jump straight to func exit over + * later wsrep hooks */ + goto func_exit; + } + break; +#endif /* WITH_WSREP */ + + default: + break; + } + + break; + + case DB_SUCCESS: + /* If the actual value inserted is greater than + the upper limit of the interval, then we try and + update the table upper limit. Note: last_value + will be 0 if get_auto_increment() was not called. */ + + if (auto_inc >= m_prebuilt->autoinc_last_value) { +set_max_autoinc: + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + ulonglong col_max_value = + table->next_number_field->get_max_int_value(); + + /* This should filter out the negative + values set explicitly by the user. */ + if (auto_inc <= col_max_value) { + ut_ad(m_prebuilt->autoinc_increment > 0); + + ulonglong offset; + ulonglong increment; + dberr_t err; + + offset = m_prebuilt->autoinc_offset; + increment = m_prebuilt->autoinc_increment; + + auto_inc = innobase_next_autoinc( + auto_inc, 1, increment, offset, + col_max_value); + + err = innobase_set_max_autoinc( + auto_inc); + + if (err != DB_SUCCESS) { + error = err; + } + } + } + break; + default: + break; + } + } + +report_error: + /* Cleanup and exit. */ + if (error == DB_TABLESPACE_DELETED) { + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + } + + error_result = convert_error_code_to_mysql( + error, m_prebuilt->table->flags, m_user_thd); + +#ifdef WITH_WSREP + if (!error_result && trx->is_wsrep() + && !trx->is_bulk_insert() + && wsrep_thd_is_local(m_user_thd) + && !wsrep_thd_ignore_table(m_user_thd) + && !wsrep_consistency_check(m_user_thd) + && (thd_sql_command(m_user_thd) != SQLCOM_CREATE_TABLE) + && (thd_sql_command(m_user_thd) != SQLCOM_LOAD || + thd_binlog_format(m_user_thd) == BINLOG_FORMAT_ROW)) { + if (wsrep_append_keys(m_user_thd, WSREP_SERVICE_KEY_EXCLUSIVE, + record, + NULL)) { + DBUG_PRINT("wsrep", ("row key failed")); + error_result = HA_ERR_INTERNAL_ERROR; + goto func_exit; + } + } +#endif /* WITH_WSREP */ + + if (error_result == HA_FTS_INVALID_DOCID) { + my_error(HA_FTS_INVALID_DOCID, MYF(0)); + } + +func_exit: + DBUG_RETURN(error_result); +} + +/** Fill the update vector's "old_vrow" field for those non-updated, +but indexed columns. Such columns could still be present in the virtual +index rec fields even if they are not updated (when some other fields are), +so they need to be logged.
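A stand-alone illustration of why the old value matters (f, update_vcol_index and the multimap are inventions for this example, not InnoDB structures): the index entry keyed by the old virtual value cannot be located without it.

#include <map>

// Hypothetical generated column v = f(base).
static int f(int base) { return base % 10; }

// When base changes, the index on v must remove the entry keyed by the OLD
// value f(old_base); without the logged old value it cannot be found.
static void update_vcol_index(std::multimap<int, long>& v_index, long row_id,
                              int old_base, int new_base)
{
    auto range = v_index.equal_range(f(old_base));
    for (auto it = range.first; it != range.second; ++it)
        if (it->second == row_id) { v_index.erase(it); break; }
    v_index.emplace(f(new_base), row_id);
}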
+@param[in] prebuilt InnoDB prebuilt struct +@param[in,out] vfield field to be filled +@param[in] o_len actual column length +@param[in,out] col column to be filled +@param[in] old_mysql_row_col MySQL old field ptr +@param[in] col_pack_len MySQL field col length +@param[in,out] buf buffer for a converted integer value +@return used buffer ptr from row_mysql_store_col_in_innobase_format() */ +static +byte* +innodb_fill_old_vcol_val( + row_prebuilt_t* prebuilt, + dfield_t* vfield, + ulint o_len, + dict_col_t* col, + const byte* old_mysql_row_col, + ulint col_pack_len, + byte* buf) +{ + dict_col_copy_type( + col, dfield_get_type(vfield)); + if (o_len != UNIV_SQL_NULL) { + + buf = row_mysql_store_col_in_innobase_format( + vfield, + buf, + TRUE, + old_mysql_row_col, + col_pack_len, + dict_table_is_comp(prebuilt->table)); + } else { + dfield_set_null(vfield); + } + + return(buf); +} + +/** Calculate an update vector corresponding to the changes +between old_row and new_row. +@param[out] uvect update vector +@param[in] old_row current row in MySQL format +@param[in] new_row intended updated row in MySQL format +@param[in] table MySQL table handle +@param[in,out] upd_buff buffer to use for converted values +@param[in] buff_len length of upd_buff +@param[in,out] prebuilt InnoDB execution context +@param[out] auto_inc updated AUTO_INCREMENT value, or 0 if none +@return DB_SUCCESS or error code */ +static +dberr_t +calc_row_difference( + upd_t* uvect, + const uchar* old_row, + const uchar* new_row, + TABLE* table, + uchar* upd_buff, + ulint buff_len, + row_prebuilt_t* prebuilt, + ib_uint64_t& auto_inc) +{ + uchar* original_upd_buff = upd_buff; + Field* field; + enum_field_types field_mysql_type; + ulint o_len; + ulint n_len; + ulint col_pack_len; + const byte* new_mysql_row_col; + const byte* old_mysql_row_col; + const byte* o_ptr; + const byte* n_ptr; + byte* buf; + upd_field_t* ufield; + ulint col_type; + ulint n_changed = 0; + dfield_t dfield; + dict_index_t* clust_index; + ibool changes_fts_column = FALSE; + ibool changes_fts_doc_col = FALSE; + trx_t* const trx = prebuilt->trx; + doc_id_t doc_id = FTS_NULL_DOC_ID; + uint16_t num_v = 0; +#ifndef DBUG_OFF + uint vers_fields = 0; +#endif + prebuilt->versioned_write = table->versioned_write(VERS_TRX_ID); + const bool skip_virtual = ha_innobase::omits_virtual_cols(*table->s); + + ut_ad(!srv_read_only_mode); + + clust_index = dict_table_get_first_index(prebuilt->table); + auto_inc = 0; + + /* We use upd_buff to convert changed fields */ + buf = (byte*) upd_buff; + + for (uint i = 0; i < table->s->fields; i++) { + field = table->field[i]; + +#ifndef DBUG_OFF + if (!field->vers_sys_field() + && !field->vers_update_unversioned()) { + ++vers_fields; + } +#endif + + const bool is_virtual = !field->stored_in_db(); + if (is_virtual && skip_virtual) { + num_v++; + continue; + } + dict_col_t* col = is_virtual + ? &prebuilt->table->v_cols[num_v].m_col + : &prebuilt->table->cols[i - num_v]; + + o_ptr = (const byte*) old_row + get_field_offset(table, field); + n_ptr = (const byte*) new_row + get_field_offset(table, field); + + /* We use new_mysql_row_col and col_pack_len to save the values */ + + new_mysql_row_col = n_ptr; + old_mysql_row_col = o_ptr; + col_pack_len = field->pack_length(); + + o_len = col_pack_len; + n_len = col_pack_len; + + /* We use o_ptr and n_ptr to dig up the actual data for + comparison.
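As a side note, the true-VARCHAR decoding performed below by row_mysql_read_true_varchar() can be pictured with this simplified stand-in; that the 2-byte length form is little-endian is an assumption matching the MySQL row format.

#include <cstddef>

static const unsigned char* read_true_varchar(std::size_t* len,
                                              const unsigned char* field,
                                              unsigned length_bytes)
{
    *len = (length_bytes == 1)
        ? std::size_t(field[0])
        : std::size_t(field[0]) | (std::size_t(field[1]) << 8);
    return field + length_bytes;          // points at the actual payload
}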
*/ + + field_mysql_type = field->type(); + + col_type = col->mtype; + + switch (col_type) { + + case DATA_BLOB: + case DATA_GEOMETRY: + o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len); + n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len); + + break; + + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_VARMYSQL: + if (field_mysql_type == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR where + the real payload data length is stored in + 1 or 2 bytes */ + + o_ptr = row_mysql_read_true_varchar( + &o_len, o_ptr, + (ulint) + (((Field_varstring*) field)->length_bytes)); + + n_ptr = row_mysql_read_true_varchar( + &n_len, n_ptr, + (ulint) + (((Field_varstring*) field)->length_bytes)); + } + + break; + default: + ; + } + + if (field_mysql_type == MYSQL_TYPE_LONGLONG + && prebuilt->table->fts + && innobase_strcasecmp( + field->field_name.str, FTS_DOC_ID_COL_NAME) == 0) { + doc_id = mach_read_uint64_little_endian(n_ptr); + if (doc_id == 0) { + return(DB_FTS_INVALID_DOCID); + } + } + + if (field->real_maybe_null()) { + if (field->is_null_in_record(old_row)) { + o_len = UNIV_SQL_NULL; + } + + if (field->is_null_in_record(new_row)) { + n_len = UNIV_SQL_NULL; + } + } + + if (is_virtual) { + /* If the virtual column is not indexed, + we shall ignore it for update */ + if (!col->ord_part) { + next: + num_v++; + continue; + } + + if (!uvect->old_vrow) { + uvect->old_vrow = dtuple_create_with_vcol( + uvect->heap, 0, prebuilt->table->n_v_cols); + } + + ulint max_field_len = DICT_MAX_FIELD_LEN_BY_FORMAT( + prebuilt->table); + + /* For virtual columns we only materialize + the index fields, and an index field length + never exceeds max_field_len. So continue if + the first max_field_len bytes match. */ + if (o_len != UNIV_SQL_NULL + && n_len != UNIV_SQL_NULL + && o_len >= max_field_len + && n_len >= max_field_len + && memcmp(o_ptr, n_ptr, max_field_len) == 0) { + dfield_t* vfield = dtuple_get_nth_v_field( + uvect->old_vrow, num_v); + buf = innodb_fill_old_vcol_val( + prebuilt, vfield, o_len, + col, old_mysql_row_col, + col_pack_len, buf); + goto next; + } + } + + if (o_len != n_len || (o_len != 0 && o_len != UNIV_SQL_NULL + && 0 != memcmp(o_ptr, n_ptr, o_len))) { + /* The field has changed */ + + ufield = uvect->fields + n_changed; + MEM_UNDEFINED(ufield, sizeof *ufield); + + /* Let us use a dummy dfield to make the conversion + from the MySQL column format to the InnoDB format */ + + + /* If the length of the new geometry object is 0, + the object is an invalid geometry object, and + we must reject it.
*/ + if (DATA_GEOMETRY_MTYPE(col_type) + && o_len != 0 && n_len == 0) { + return(DB_CANT_CREATE_GEOMETRY_OBJECT); + } + + if (n_len != UNIV_SQL_NULL) { + dict_col_copy_type( + col, dfield_get_type(&dfield)); + + buf = row_mysql_store_col_in_innobase_format( + &dfield, + (byte*) buf, + TRUE, + new_mysql_row_col, + col_pack_len, + dict_table_is_comp(prebuilt->table)); + dfield_copy(&ufield->new_val, &dfield); + } else { + dict_col_copy_type( + col, dfield_get_type(&ufield->new_val)); + dfield_set_null(&ufield->new_val); + } + + ufield->exp = NULL; + ufield->orig_len = 0; + if (is_virtual) { + dfield_t* vfield = dtuple_get_nth_v_field( + uvect->old_vrow, num_v); + upd_fld_set_virtual_col(ufield); + ufield->field_no = num_v; + + ut_ad(col->ord_part); + ufield->old_v_val = static_cast<dfield_t*>( + mem_heap_alloc( + uvect->heap, + sizeof *ufield->old_v_val)); + + if (!field->is_null_in_record(old_row)) { + if (n_len == UNIV_SQL_NULL) { + dict_col_copy_type( + col, dfield_get_type( + &dfield)); + } + + buf = row_mysql_store_col_in_innobase_format( + &dfield, + (byte*) buf, + TRUE, + old_mysql_row_col, + col_pack_len, + dict_table_is_comp( + prebuilt->table)); + dfield_copy(ufield->old_v_val, + &dfield); + dfield_copy(vfield, &dfield); + } else { + dict_col_copy_type( + col, dfield_get_type( + ufield->old_v_val)); + dfield_set_null(ufield->old_v_val); + dfield_set_null(vfield); + } + num_v++; + ut_ad(field != table->found_next_number_field); + } else { + ufield->field_no = static_cast<uint16_t>( + dict_col_get_clust_pos( + &prebuilt->table->cols + [i - num_v], + clust_index)); + ufield->old_v_val = NULL; + if (field != table->found_next_number_field + || dfield_is_null(&ufield->new_val)) { + } else { + auto_inc = field->val_uint(); + } + } + n_changed++; + + /* If an FTS indexed column was changed by this + UPDATE then we need to inform the FTS sub-system. + + NOTE: Currently we re-index all FTS indexed columns + even if only a subset of the FTS indexed columns + have been updated. That is the reason we are + checking only once here. Later we will need to + note which columns have been updated and do + selective processing. */ + if (prebuilt->table->fts != NULL && !is_virtual) { + ulint offset; + dict_table_t* innodb_table; + + innodb_table = prebuilt->table; + + if (!changes_fts_column) { + offset = row_upd_changes_fts_column( + innodb_table, ufield); + + if (offset != ULINT_UNDEFINED) { + changes_fts_column = TRUE; + } + } + + if (!changes_fts_doc_col) { + changes_fts_doc_col = + row_upd_changes_doc_id( + innodb_table, ufield); + } + } + } else if (is_virtual) { + dfield_t* vfield = dtuple_get_nth_v_field( + uvect->old_vrow, num_v); + buf = innodb_fill_old_vcol_val( + prebuilt, vfield, o_len, + col, old_mysql_row_col, + col_pack_len, buf); + ut_ad(col->ord_part); + num_v++; + } + } + + /* If the update changes a column with an FTS index on it, we + then add an update column node with a new document id to the + other changes. We piggy back our changes on the normal UPDATE + to reduce processing and IO overhead. */ + if (!prebuilt->table->fts) { + trx->fts_next_doc_id = 0; + } else if (changes_fts_column || changes_fts_doc_col) { + dict_table_t* innodb_table = prebuilt->table; + + ufield = uvect->fields + n_changed; + + if (!DICT_TF2_FLAG_IS_SET( + innodb_table, DICT_TF2_FTS_HAS_DOC_ID)) { + + /* If Doc ID is managed by user, and if any + FTS indexed column has been updated, its corresponding + Doc ID must also be updated.
Otherwise, return + error */ + if (changes_fts_column && !changes_fts_doc_col) { + ib::warn() << "A new Doc ID must be supplied" + " while updating FTS indexed columns."; + return(DB_FTS_INVALID_DOCID); + } + + /* Doc ID must monotonically increase */ + ut_ad(innodb_table->fts->cache); + if (doc_id < prebuilt->table->fts->cache->next_doc_id) { + + ib::warn() << "FTS Doc ID must be larger than " + << innodb_table->fts->cache->next_doc_id + - 1 << " for table " + << innodb_table->name; + + return(DB_FTS_INVALID_DOCID); + } + + + trx->fts_next_doc_id = doc_id; + } else { + /* If the Doc ID is a hidden column, it can't be + changed by user */ + ut_ad(!changes_fts_doc_col); + + /* Doc ID column is hidden, a new Doc ID will be + generated by following fts_update_doc_id() call */ + trx->fts_next_doc_id = 0; + } + + fts_update_doc_id( + innodb_table, ufield, &trx->fts_next_doc_id); + + ++n_changed; + } else { + /* We have a Doc ID column, but none of FTS indexed + columns are touched, nor the Doc ID column, so set + fts_next_doc_id to UINT64_UNDEFINED, which means do not + update the Doc ID column */ + trx->fts_next_doc_id = UINT64_UNDEFINED; + } + + uvect->n_fields = n_changed; + uvect->info_bits = 0; + + ut_a(buf <= (byte*) original_upd_buff + buff_len); + + const TABLE_LIST *tl= table->pos_in_table_list; + const uint8 op_map= tl->trg_event_map | tl->slave_fk_event_map; + /* Used to avoid reading history in FK check on DELETE (see MDEV-16210). */ + prebuilt->upd_node->is_delete = + (op_map & trg2bit(TRG_EVENT_DELETE) + && table->versioned(VERS_TIMESTAMP)) + ? VERSIONED_DELETE : NO_DELETE; + + if (prebuilt->versioned_write) { + /* Guaranteed by CREATE TABLE, but anyway we make sure we + generate history only when there are versioned fields. */ + DBUG_ASSERT(vers_fields); + prebuilt->upd_node->vers_make_update(trx); + } + + ut_ad(uvect->validate()); + return(DB_SUCCESS); +} + +#ifdef WITH_WSREP +static +int +wsrep_calc_row_hash( +/*================*/ + byte* digest, /*!< in/out: md5 sum */ + const uchar* row, /*!< in: row in MySQL format */ + TABLE* table, /*!< in: table in MySQL data + dictionary */ + row_prebuilt_t* prebuilt) /*!< in: InnoDB prebuilt struct */ +{ + void *ctx = alloca(my_md5_context_size()); + my_md5_init(ctx); + + for (uint i = 0; i < table->s->fields; i++) { + byte null_byte=0; + byte true_byte=1; + unsigned is_unsigned; + + const Field* field = table->field[i]; + if (!field->stored_in_db()) { + continue; + } + + auto ptr = row + get_field_offset(table, field); + ulint len = field->pack_length(); + + switch (get_innobase_type_from_mysql_type(&is_unsigned, + field)) { + case DATA_BLOB: + ptr = row_mysql_read_blob_ref(&len, ptr, len); + + break; + + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_VARMYSQL: + if (field->type() == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR where + the real payload data length is stored in + 1 or 2 bytes */ + + ptr = row_mysql_read_true_varchar( + &len, ptr, + (ulint) + (((Field_varstring*)field)->length_bytes)); + + } + + break; + default: + ; + } + /* + if (field->null_ptr && + field_in_record_is_null(table, field, (char*) row)) { + */ + + if (field->is_null_in_record(row)) { + my_md5_input(ctx, &null_byte, 1); + } else { + my_md5_input(ctx, &true_byte, 1); + my_md5_input(ctx, ptr, len); + } + } + + my_md5_result(ctx, digest); + + return(0); +} + +/** Append table-level exclusive key. 
+@param thd MySQL thread handle +@param table table +@retval false on success +@retval true on failure */ +ATTRIBUTE_COLD bool wsrep_append_table_key(MYSQL_THD thd, const dict_table_t &table) +{ + char db_buf[NAME_LEN + 1]; + char tbl_buf[NAME_LEN + 1]; + ulint db_buf_len, tbl_buf_len; + + if (!table.parse_name(db_buf, tbl_buf, &db_buf_len, &tbl_buf_len)) + { + WSREP_ERROR("Parse_name for table key append failed: %s", + wsrep_thd_query(thd)); + return true; + } + + /* Append table-level exclusive key */ + const int rcode = wsrep_thd_append_table_key(thd, db_buf, + tbl_buf, WSREP_SERVICE_KEY_EXCLUSIVE); + if (rcode) + { + WSREP_ERROR("Appending table key failed: %s, %d", + wsrep_thd_query(thd), rcode); + return true; + } + + return false; +} +#endif /* WITH_WSREP */ + +/** +Updates a row given as a parameter to a new value. Note that we are given +whole rows, not just the fields which are updated: this incurs some +overhead for CPU when we check which fields are actually updated. +TODO: currently InnoDB does not prevent the 'Halloween problem': +in a searched update a single row can get updated several times +if its index columns are updated! +@param[in] old_row Old row contents in MySQL format +@param[out] new_row Updated row contents in MySQL format +@return error number or 0 */ + +int +ha_innobase::update_row( + const uchar* old_row, + const uchar* new_row) +{ + int err; + + dberr_t error; + trx_t* trx = thd_to_trx(m_user_thd); + mariadb_set_stats set_stats_temporary(handler_stats); + + DBUG_ENTER("ha_innobase::update_row"); + + if (is_read_only()) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + if (m_upd_buf == NULL) { + ut_ad(m_upd_buf_size == 0); + + /* Create a buffer for packing the fields of a record. Why + did table->reclength not work here? Because CHAR fields, when + packed, actually become 1 byte longer when we also store the + string length as the first byte. */ + + m_upd_buf_size = table->s->reclength + table->s->max_key_length + + MAX_REF_PARTS * 3; + + m_upd_buf = reinterpret_cast<uchar*>( + my_malloc(PSI_INSTRUMENT_ME, + m_upd_buf_size, + MYF(MY_WME))); + + if (m_upd_buf == NULL) { + m_upd_buf_size = 0; + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + } + } + + upd_t* uvect = row_get_prebuilt_update_vector(m_prebuilt); + ib_uint64_t autoinc; + + /* Build an update vector from the modified fields in the rows + (uses m_upd_buf of the handle) */ + + error = calc_row_difference( + uvect, old_row, new_row, table, m_upd_buf, m_upd_buf_size, + m_prebuilt, autoinc); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (!uvect->n_fields) { + /* This is the same as success, but instructs + MySQL that the row is not really updated and it + should not increase the count of updated rows. + This is the fix for http://bugs.mysql.com/29157 */ + DBUG_RETURN(HA_ERR_RECORD_IS_THE_SAME); + } else { + if (m_prebuilt->upd_node->is_delete) { + trx->fts_next_doc_id = 0; + } + + /* row_start was updated by vers_make_update() + in calc_row_difference() */ + error = row_update_for_mysql(m_prebuilt); + + if (error == DB_SUCCESS && m_prebuilt->versioned_write + /* Multiple UPDATEs of the same row in a single transaction + create a historical row only once. */ + && trx->id != table->vers_start_id()) { + /* UPDATE is not used by ALTER TABLE. This is just a precaution, + as we don't need history generation for ALTER TABLE.
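For intuition, here is a simplified stand-alone model of the two-step versioned UPDATE (types invented for the sketch, not InnoDB structures): the pre-image is closed and kept as history, then the current row is updated, matching the extra history-row insert performed just below.

#include <vector>

struct VersRow { int pk; int val; unsigned long long row_start, row_end; };

static void versioned_update(std::vector<VersRow>& history, VersRow& current,
                             int new_val, unsigned long long trx_id)
{
    VersRow old_version = current;        // pre-image of the row
    old_version.row_end = trx_id;         // close its validity period
    history.push_back(old_version);       // the extra history-row insert
    current.val = new_val;
    current.row_start = trx_id;           // set by vers_make_update()
}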
*/ + ut_ad(thd_sql_command(m_user_thd) != SQLCOM_ALTER_TABLE); + error = row_insert_for_mysql((byte*) old_row, + m_prebuilt, + ROW_INS_HISTORICAL); + } + } + + if (error == DB_SUCCESS && autoinc) { + /* A value for an AUTO_INCREMENT column + was specified in the UPDATE statement. */ + + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + ulonglong col_max_value = + table->found_next_number_field->get_max_int_value(); + + /* This should filter out the negative + values set explicitly by the user. */ + if (autoinc <= col_max_value) { + ulonglong offset; + ulonglong increment; + + offset = m_prebuilt->autoinc_offset; + increment = m_prebuilt->autoinc_increment; + + autoinc = innobase_next_autoinc( + autoinc, 1, increment, offset, + col_max_value); + + error = innobase_set_max_autoinc(autoinc); + + if (m_prebuilt->table->persistent_autoinc) { + /* Update the PAGE_ROOT_AUTO_INC. Yes, we do + this even if dict_table_t::autoinc already was + greater than autoinc, because we cannot know + if any INSERT actually used (and wrote to + PAGE_ROOT_AUTO_INC) a value bigger than our + autoinc. */ + btr_write_autoinc(dict_table_get_first_index( + m_prebuilt->table), + autoinc); + } + } + } + +func_exit: + if (error == DB_FTS_INVALID_DOCID) { + err = HA_FTS_INVALID_DOCID; + my_error(HA_FTS_INVALID_DOCID, MYF(0)); + } else { + err = convert_error_code_to_mysql( + error, m_prebuilt->table->flags, m_user_thd); + } + +#ifdef WITH_WSREP + if (error == DB_SUCCESS && trx->is_wsrep() + && wsrep_thd_is_local(m_user_thd) + && !wsrep_thd_ignore_table(m_user_thd)) { + DBUG_PRINT("wsrep", ("update row key")); + + /* We use table-level exclusive key for SEQUENCES + and normal key append for others. */ + if (table->s->table_type == TABLE_TYPE_SEQUENCE) { + if (wsrep_append_table_key(m_user_thd, *m_prebuilt->table)) + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } else if (wsrep_append_keys(m_user_thd, + wsrep_protocol_version >= 4 + ? WSREP_SERVICE_KEY_UPDATE + : WSREP_SERVICE_KEY_EXCLUSIVE, + old_row, new_row)) { + WSREP_DEBUG("WSREP: UPDATE_ROW_KEY FAILED"); + DBUG_PRINT("wsrep", ("row key failed")); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + } +#endif /* WITH_WSREP */ + + DBUG_RETURN(err); +} + +/**********************************************************************//** +Deletes a row given as the parameter. +@return error number or 0 */ + +int +ha_innobase::delete_row( +/*====================*/ + const uchar* record) /*!< in: a row in MySQL format */ +{ + dberr_t error; + trx_t* trx = thd_to_trx(m_user_thd); + mariadb_set_stats set_stats_temporary(handler_stats); + + DBUG_ENTER("ha_innobase::delete_row"); + + if (is_read_only()) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + if (!m_prebuilt->upd_node) { + row_get_prebuilt_update_vector(m_prebuilt); + } + + /* This is a delete */ + m_prebuilt->upd_node->is_delete = table->versioned_write(VERS_TRX_ID) + && table->vers_end_field()->is_max() + && trx->id != table->vers_start_id() + ? 
VERSIONED_DELETE + : PLAIN_DELETE; + trx->fts_next_doc_id = 0; + + error = row_update_for_mysql(m_prebuilt); + +#ifdef WITH_WSREP + if (error == DB_SUCCESS && trx->is_wsrep() + && wsrep_thd_is_local(m_user_thd) + && !wsrep_thd_ignore_table(m_user_thd)) { + if (wsrep_append_keys(m_user_thd, WSREP_SERVICE_KEY_EXCLUSIVE, + record, + NULL)) { + DBUG_PRINT("wsrep", ("delete fail")); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + } +#endif /* WITH_WSREP */ + DBUG_RETURN(convert_error_code_to_mysql( + error, m_prebuilt->table->flags, m_user_thd)); +} + +/**********************************************************************//** +Removes a new lock set on a row, if it was not read optimistically. This can +be called after a row has been read in the processing of an UPDATE or a DELETE +query. */ + +void +ha_innobase::unlock_row(void) +/*=========================*/ +{ + DBUG_ENTER("ha_innobase::unlock_row"); + + if (m_prebuilt->select_lock_type == LOCK_NONE) { + DBUG_VOID_RETURN; + } + + ut_ad(trx_state_eq(m_prebuilt->trx, TRX_STATE_ACTIVE, true)); + + switch (m_prebuilt->row_read_type) { + case ROW_READ_WITH_LOCKS: + if (m_prebuilt->trx->isolation_level > TRX_ISO_READ_COMMITTED) + break; + /* fall through */ + case ROW_READ_TRY_SEMI_CONSISTENT: + row_unlock_for_mysql(m_prebuilt, FALSE); + break; + case ROW_READ_DID_SEMI_CONSISTENT: + m_prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + break; + } + + DBUG_VOID_RETURN; +} + +/* See handler.h and row0mysql.h for docs on this function. */ + +bool +ha_innobase::was_semi_consistent_read(void) +/*=======================================*/ +{ + return(m_prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT); +} + +/* See handler.h and row0mysql.h for docs on this function. */ +void ha_innobase::try_semi_consistent_read(bool yes) +{ + ut_ad(m_prebuilt->trx == thd_to_trx(ha_thd())); + /* Row read type is set to semi consistent read if this was + requested by the SQL layer and the transaction isolation level is + READ UNCOMMITTED or READ COMMITTED. */ + m_prebuilt->row_read_type = yes + && m_prebuilt->trx->isolation_level <= TRX_ISO_READ_COMMITTED + ? ROW_READ_TRY_SEMI_CONSISTENT + : ROW_READ_WITH_LOCKS; +} + +/******************************************************************//** +Initializes a handle to use an index. +@return 0 or error number */ + +int +ha_innobase::index_init( +/*====================*/ + uint keynr, /*!< in: key (index) number */ + bool) +{ + DBUG_ENTER("index_init"); + + DBUG_RETURN(change_active_index(keynr)); +} + +/******************************************************************//** +Currently does nothing. +@return 0 */ + +int +ha_innobase::index_end(void) +/*========================*/ +{ + DBUG_ENTER("index_end"); + + active_index = MAX_KEY; + + in_range_check_pushed_down = FALSE; + + m_ds_mrr.dsmrr_close(); + + DBUG_RETURN(0); +} + +/*********************************************************************//** +Converts a search mode flag understood by MySQL to a flag understood +by InnoDB. 
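As a toy illustration of the idea (the enums are invented for this sketch, not the real types): a comparison on the first key part picks the initial B-tree positioning mode, and match_mode afterwards decides which positioned rows count as hits.

enum class Cmp { GT, GE, EQ, LT, LE };
enum class PageCur { G, GE, L, LE };

static PageCur position_for(Cmp c)
{
    switch (c) {
    case Cmp::GT: return PageCur::G;      // WHERE a > 5  : after key (5)
    case Cmp::EQ:                         // exact match also positions at GE;
    case Cmp::GE: return PageCur::GE;     // match_mode filters afterwards
    case Cmp::LT: return PageCur::L;
    case Cmp::LE: return PageCur::LE;
    }
    return PageCur::GE;                   // not reached
}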
*/ +page_cur_mode_t +convert_search_mode_to_innobase( +/*============================*/ + ha_rkey_function find_flag) +{ + switch (find_flag) { + case HA_READ_KEY_EXACT: + /* this does not require the index to be UNIQUE */ + case HA_READ_KEY_OR_NEXT: + return(PAGE_CUR_GE); + case HA_READ_AFTER_KEY: + return(PAGE_CUR_G); + case HA_READ_BEFORE_KEY: + return(PAGE_CUR_L); + case HA_READ_KEY_OR_PREV: + case HA_READ_PREFIX_LAST: + case HA_READ_PREFIX_LAST_OR_PREV: + return(PAGE_CUR_LE); + case HA_READ_MBR_CONTAIN: + return(PAGE_CUR_CONTAIN); + case HA_READ_MBR_INTERSECT: + return(PAGE_CUR_INTERSECT); + case HA_READ_MBR_WITHIN: + return(PAGE_CUR_WITHIN); + case HA_READ_MBR_DISJOINT: + return(PAGE_CUR_DISJOINT); + case HA_READ_MBR_EQUAL: + return(PAGE_CUR_MBR_EQUAL); + case HA_READ_PREFIX: + return(PAGE_CUR_UNSUPP); + /* do not use "default:" in order to produce a gcc warning: + enumeration value '...' not handled in switch + (if -Wswitch or -Wall is used) */ + } + + my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality"); + + return(PAGE_CUR_UNSUPP); +} + +/* + BACKGROUND INFO: HOW A SELECT SQL QUERY IS EXECUTED + --------------------------------------------------- +The following does not cover all the details, but explains how we determine +the start of a new SQL statement, and what is associated with it. + +For each table in the database the MySQL interpreter may have several +table handle instances in use, also in a single SQL query. For each table +handle instance there is an InnoDB 'm_prebuilt' struct which contains most +of the InnoDB data associated with this table handle instance. + + A) if the user has not explicitly set any MySQL table level locks: + + 1) MySQL calls ::external_lock to set an 'intention' table level lock on +the table of the handle instance. There we set +m_prebuilt->sql_stat_start = TRUE. The flag sql_stat_start should be set +true if we are taking this table handle instance to use in a new SQL +statement issued by the user. We also increment trx->n_mysql_tables_in_use. + + 2) If m_prebuilt->sql_stat_start == TRUE we 'pre-compile' the MySQL search +instructions to m_prebuilt->template of the table handle instance in +::index_read. The template is used to save CPU time in large joins. + + 3) In row_search_mvcc(), if m_prebuilt->sql_stat_start is true, we +allocate a new consistent read view for the trx if it does not yet have one, +or in the case of a locking read, set an InnoDB 'intention' table level +lock on the table. + + 4) We do the SELECT. MySQL may repeatedly call ::index_read for the +same table handle instance, if it is a join. + + 5) When the SELECT ends, MySQL removes its intention table level locks +in ::external_lock. When trx->n_mysql_tables_in_use drops to zero, + (a) we execute a COMMIT there if the autocommit is on, + (b) we also release possible 'SQL statement level resources' InnoDB may +have for this SQL statement. The MySQL interpreter does NOT execute +autocommit for pure read transactions, though it should. That is why the +table handler in that case has to execute the COMMIT in ::external_lock. + + B) If the user has explicitly set MySQL table level locks, then MySQL +does NOT call ::external_lock at the start of the statement. To determine +when we are at the start of a new SQL statement we at the start of +::index_read also compare the query id to the latest query id where the +table handle instance was used. If it has changed, we know we are at the +start of a new SQL statement. 
Since the query id can theoretically +wrap around, we use this test only as a secondary way of determining the +start of a new SQL statement. */ + + +/**********************************************************************//** +Positions an index cursor to the index specified in the handle. Fetches the +row if any. +@return 0, HA_ERR_KEY_NOT_FOUND, or error number */ + +int +ha_innobase::index_read( +/*====================*/ + uchar* buf, /*!< in/out: buffer for the returned + row */ + const uchar* key_ptr, /*!< in: key value; if this is NULL + we position the cursor at the + start or end of index; this can + also contain an InnoDB row id, in + which case key_len is the InnoDB + row id length; the key value can + also be a prefix of a full key value, + and the last column can be a prefix + of a full column */ + uint key_len,/*!< in: key value length */ + enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */ +{ + DBUG_ENTER("index_read"); + mariadb_set_stats set_stats_temporary(handler_stats); + DEBUG_SYNC_C("ha_innobase_index_read_begin"); + + ut_a(m_prebuilt->trx == thd_to_trx(m_user_thd)); + ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT); + + dict_index_t* index = m_prebuilt->index; + + if (index == NULL || index->is_corrupted()) { + m_prebuilt->index_usable = FALSE; + DBUG_RETURN(HA_ERR_CRASHED); + } + + if (!m_prebuilt->index_usable) { + DBUG_RETURN(index->is_corrupted() + ? HA_ERR_INDEX_CORRUPT + : HA_ERR_TABLE_DEF_CHANGED); + } + + if (index->type & DICT_FTS) { + DBUG_RETURN(HA_ERR_KEY_NOT_FOUND); + } + + /* For an R-Tree index, we will always place the page lock on + the pages being searched */ + if (index->is_spatial() && !m_prebuilt->trx->will_lock) { + if (trx_is_started(m_prebuilt->trx)) { + DBUG_RETURN(HA_ERR_READ_ONLY_TRANSACTION); + } else { + m_prebuilt->trx->will_lock = true; + } + } + + /* Note that the index for which the search template is built is + not necessarily m_prebuilt->index; it can also be the clustered + index */ + + if (m_prebuilt->sql_stat_start) { + build_template(false); + } + + if (key_ptr != NULL) { + /* Convert the search key value to InnoDB format into + m_prebuilt->search_tuple */ + + row_sel_convert_mysql_key_to_innobase( + m_prebuilt->search_tuple, + m_prebuilt->srch_key_val1, + m_prebuilt->srch_key_val_len, + index, + (byte*) key_ptr, + (ulint) key_len); + + DBUG_ASSERT(m_prebuilt->search_tuple->n_fields > 0); + } else { + /* We position the cursor to the last or the first entry + in the index */ + + dtuple_set_n_fields(m_prebuilt->search_tuple, 0); + } + + page_cur_mode_t mode = convert_search_mode_to_innobase(find_flag); + + ulint match_mode = 0; + + if (find_flag == HA_READ_KEY_EXACT) { + + match_mode = ROW_SEL_EXACT; + + } else if (find_flag == HA_READ_PREFIX_LAST) { + + match_mode = ROW_SEL_EXACT_PREFIX; + } + + m_last_match_mode = (uint) match_mode; + + dberr_t ret = mode == PAGE_CUR_UNSUPP ?
DB_UNSUPPORTED + : row_search_mvcc(buf, mode, m_prebuilt, match_mode, 0); + + DBUG_EXECUTE_IF("ib_select_query_failure", ret = DB_ERROR;); + + int error; + + switch (ret) { + case DB_SUCCESS: + error = 0; + table->status = 0; + break; + + case DB_RECORD_NOT_FOUND: + error = HA_ERR_KEY_NOT_FOUND; + table->status = STATUS_NOT_FOUND; + break; + + case DB_END_OF_INDEX: + error = HA_ERR_KEY_NOT_FOUND; + table->status = STATUS_NOT_FOUND; + break; + + case DB_TABLESPACE_DELETED: + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + + default: + error = convert_error_code_to_mysql( + ret, m_prebuilt->table->flags, m_user_thd); + + table->status = STATUS_NOT_FOUND; + break; + } + + DBUG_RETURN(error); +} + +/*******************************************************************//** +The following function works like index_read, but it finds the last +row with the current key value or prefix. +@return 0, HA_ERR_KEY_NOT_FOUND, or an error code */ + +int +ha_innobase::index_read_last( +/*=========================*/ + uchar* buf, /*!< out: fetched row */ + const uchar* key_ptr,/*!< in: key value, or a prefix of a full + key value */ + uint key_len)/*!< in: length of the key val or prefix + in bytes */ +{ + return(index_read(buf, key_ptr, key_len, HA_READ_PREFIX_LAST)); +} + +/********************************************************************//** +Get the index for a handle. Does not change active index. +@return NULL or index instance. */ + +dict_index_t* +ha_innobase::innobase_get_index( +/*============================*/ + uint keynr) /*!< in: use this index; MAX_KEY means always + clustered index, even if it was internally + generated by InnoDB */ +{ + KEY* key = NULL; + dict_table_t* ib_table = m_prebuilt->table; + dict_index_t* index; + + DBUG_ENTER("innobase_get_index"); + + if (keynr != MAX_KEY && table->s->keys > 0) { + key = &table->key_info[keynr]; + index = dict_table_get_index_on_name(ib_table, key->name.str); + } else { + index = dict_table_get_first_index(ib_table); + } + + if (index == NULL) { + sql_print_error( + "InnoDB could not find key no %u with name %s" + " from dict cache for table %s", + keynr, key ? key->name.str : "NULL", + ib_table->name.m_name); + } + + DBUG_RETURN(index); +} + +/********************************************************************//** +Changes the active index of a handle.
+@return 0 or error code */ + +int +ha_innobase::change_active_index( +/*=============================*/ + uint keynr) /*!< in: use this index; MAX_KEY means always clustered + index, even if it was internally generated by + InnoDB */ +{ + DBUG_ENTER("change_active_index"); + + ut_ad(m_user_thd == ha_thd()); + ut_a(m_prebuilt->trx == thd_to_trx(m_user_thd)); + + active_index = keynr; + + m_prebuilt->index = innobase_get_index(keynr); + + if (m_prebuilt->index == NULL) { + sql_print_warning("InnoDB: change_active_index(%u) failed", + keynr); + m_prebuilt->index_usable = FALSE; + DBUG_RETURN(1); + } + + m_prebuilt->index_usable = row_merge_is_index_usable( + m_prebuilt->trx, m_prebuilt->index); + + if (!m_prebuilt->index_usable) { + if (m_prebuilt->index->is_corrupted()) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof table_name, + m_prebuilt->index->table->name.m_name); + + if (m_prebuilt->index->is_primary()) { + ut_ad(m_prebuilt->index->table->corrupted); + push_warning_printf( + m_user_thd, Sql_condition::WARN_LEVEL_WARN, + ER_TABLE_CORRUPT, + "InnoDB: Table %s is corrupted.", + table_name); + DBUG_RETURN(ER_TABLE_CORRUPT); + } else { + push_warning_printf( + m_user_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_INDEX_CORRUPT, + "InnoDB: Index %s for table %s is" + " marked as corrupted", + m_prebuilt->index->name(), + table_name); + DBUG_RETURN(HA_ERR_INDEX_CORRUPT); + } + } else { + push_warning_printf( + m_user_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_TABLE_DEF_CHANGED, + "InnoDB: insufficient history for index %u", + keynr); + } + + /* The caller seems to ignore this. Thus, we must check + this again in row_search_mvcc(). */ + DBUG_RETURN(convert_error_code_to_mysql(DB_MISSING_HISTORY, + 0, NULL)); + } + + ut_a(m_prebuilt->search_tuple != 0); + + /* Initialization of search_tuple is not needed for FT index + since FT search returns rank only. In addition engine should + be able to retrieve FTS_DOC_ID column value if necessary. */ + if (m_prebuilt->index->type & DICT_FTS) { + for (uint i = 0; i < table->s->fields; i++) { + if (m_prebuilt->read_just_key + && bitmap_is_set(table->read_set, i) + && !strcmp(table->s->field[i]->field_name.str, + FTS_DOC_ID_COL_NAME)) { + m_prebuilt->fts_doc_id_in_read_set = true; + break; + } + } + } else { + ulint n_fields = dict_index_get_n_unique_in_tree( + m_prebuilt->index); + + dtuple_set_n_fields(m_prebuilt->search_tuple, n_fields); + + dict_index_copy_types( + m_prebuilt->search_tuple, m_prebuilt->index, + n_fields); + + /* If it's FTS query and FTS_DOC_ID exists FTS_DOC_ID field is + always added to read_set. */ + m_prebuilt->fts_doc_id_in_read_set = m_prebuilt->in_fts_query + && m_prebuilt->read_just_key + && m_prebuilt->index->contains_col_or_prefix( + m_prebuilt->table->fts->doc_col, false); + } + + /* MySQL changes the active index for a handle also during some + queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX() + and then calculates the sum. Previously we played safe and used + the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary + copying. Starting from MySQL-4.1 we use a more efficient flag here. 
*/ + + build_template(false); + + DBUG_RETURN(0); +} + +/* @return true if it's necessary to switch current statement log format from +STATEMENT to ROW if binary log format is MIXED and autoincrement values +are changed in the statement */ +bool ha_innobase::autoinc_lock_mode_stmt_unsafe() const +{ + return innobase_autoinc_lock_mode == AUTOINC_NO_LOCKING; +} + +/***********************************************************************//** +Reads the next or previous row from a cursor, which must have previously been +positioned using index_read. +@return 0, HA_ERR_END_OF_FILE, or error number */ + +int +ha_innobase::general_fetch( +/*=======================*/ + uchar* buf, /*!< in/out: buffer for next row in MySQL + format */ + uint direction, /*!< in: ROW_SEL_NEXT or ROW_SEL_PREV */ + uint match_mode) /*!< in: 0, ROW_SEL_EXACT, or + ROW_SEL_EXACT_PREFIX */ +{ + DBUG_ENTER("general_fetch"); + + mariadb_set_stats set_stats_temporary(handler_stats); + const trx_t* trx = m_prebuilt->trx; + + ut_ad(trx == thd_to_trx(m_user_thd)); + + if (m_prebuilt->table->is_readable()) { + } else if (m_prebuilt->table->corrupted) { + DBUG_RETURN(HA_ERR_CRASHED); + } else { + DBUG_RETURN(m_prebuilt->table->space + ? HA_ERR_DECRYPTION_FAILED + : HA_ERR_NO_SUCH_TABLE); + } + + int error; + + switch (dberr_t ret = row_search_mvcc(buf, PAGE_CUR_UNSUPP, m_prebuilt, + match_mode, direction)) { + case DB_SUCCESS: + error = 0; + table->status = 0; + break; + case DB_RECORD_NOT_FOUND: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_END_OF_INDEX: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_TABLESPACE_DELETED: + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + default: + error = convert_error_code_to_mysql( + ret, m_prebuilt->table->flags, m_user_thd); + + table->status = STATUS_NOT_FOUND; + break; + } + + DBUG_RETURN(error); +} + +/***********************************************************************//** +Reads the next row from a cursor, which must have previously been +positioned using index_read. +@return 0, HA_ERR_END_OF_FILE, or error number */ + +int +ha_innobase::index_next( +/*====================*/ + uchar* buf) /*!< in/out: buffer for next row in MySQL + format */ +{ + return(general_fetch(buf, ROW_SEL_NEXT, 0)); +} + +/*******************************************************************//** +Reads the next row matching to the key value given as the parameter. +@return 0, HA_ERR_END_OF_FILE, or error number */ + +int +ha_innobase::index_next_same( +/*=========================*/ + uchar* buf, /*!< in/out: buffer for the row */ + const uchar*, uint) +{ + return(general_fetch(buf, ROW_SEL_NEXT, m_last_match_mode)); +} + +/***********************************************************************//** +Reads the previous row from a cursor, which must have previously been +positioned using index_read. 
+@return 0, HA_ERR_END_OF_FILE, or error number */ + +int +ha_innobase::index_prev( +/*====================*/ + uchar* buf) /*!< in/out: buffer for previous row in MySQL format */ +{ + return(general_fetch(buf, ROW_SEL_PREV, 0)); +} + +/********************************************************************//** +Positions a cursor on the first record in an index and reads the +corresponding row to buf. +@return 0, HA_ERR_END_OF_FILE, or error code */ + +int +ha_innobase::index_first( +/*=====================*/ + uchar* buf) /*!< in/out: buffer for the row */ +{ + DBUG_ENTER("index_first"); + + int error = index_read(buf, NULL, 0, HA_READ_AFTER_KEY); + + /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */ + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + DBUG_RETURN(error); +} + +/********************************************************************//** +Positions a cursor on the last record in an index and reads the +corresponding row to buf. +@return 0, HA_ERR_END_OF_FILE, or error code */ + +int +ha_innobase::index_last( +/*====================*/ + uchar* buf) /*!< in/out: buffer for the row */ +{ + DBUG_ENTER("index_last"); + + int error = index_read(buf, NULL, 0, HA_READ_BEFORE_KEY); + + /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */ + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + DBUG_RETURN(error); +} + +/****************************************************************//** +Initialize a table scan. +@return 0 or error number */ + +int +ha_innobase::rnd_init( +/*==================*/ + bool scan) /*!< in: true if table/index scan FALSE otherwise */ +{ + int err; + + /* Store the active index value so that we can restore the original + value after a scan */ + + if (m_prebuilt->clust_index_was_generated) { + err = change_active_index(MAX_KEY); + } else { + err = change_active_index(m_primary_key); + } + + /* Don't use semi-consistent read in random row reads (by position). + This means we must disable semi_consistent_read if scan is false */ + + if (!scan) { + try_semi_consistent_read(0); + } + + m_start_of_scan = true; + + return(err); +} + +/*****************************************************************//** +Ends a table scan. +@return 0 or error number */ + +int +ha_innobase::rnd_end(void) +/*======================*/ +{ + return(index_end()); +} + +/*****************************************************************//** +Reads the next row in a table scan (also used to read the FIRST row +in a table scan). +@return 0, HA_ERR_END_OF_FILE, or error number */ + +int +ha_innobase::rnd_next( +/*==================*/ + uchar* buf) /*!< in/out: returns the row in this buffer, + in MySQL format */ +{ + int error; + DBUG_ENTER("rnd_next"); + + if (m_start_of_scan) { + error = index_first(buf); + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + m_start_of_scan = false; + } else { + error = general_fetch(buf, ROW_SEL_NEXT, 0); + } + + DBUG_RETURN(error); +} + +/**********************************************************************//** +Fetches a row from the table based on a row reference. 
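For orientation, here is a stand-alone model of the position()/rnd_pos() contract assumed by the function below (RowRef and the helpers are invented for this sketch): a fixed-length reference, holding primary key bytes or the internal row id when no primary key exists, must refetch exactly one row via an exact index lookup.

#include <array>
#include <cstring>

using RowRef = std::array<unsigned char, 8>;   // stand-in for ref_length

static RowRef save_position(unsigned long long pk)          // position()
{
    RowRef r{};
    std::memcpy(r.data(), &pk, sizeof pk);
    return r;
}

static unsigned long long restore_position(const RowRef& r) // rnd_pos()
{
    unsigned long long pk;
    std::memcpy(&pk, r.data(), sizeof pk);
    return pk;   // then index_read(buf, ref, ref_length, HA_READ_KEY_EXACT)
}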
+@return 0, HA_ERR_KEY_NOT_FOUND, or error code */ + +int +ha_innobase::rnd_pos( +/*=================*/ + uchar* buf, /*!< in/out: buffer for the row */ + uchar* pos) /*!< in: primary key value of the row in the + MySQL format, or the row id if the clustered + index was internally generated by InnoDB; the + length of data in pos has to be ref_length */ +{ + DBUG_ENTER("rnd_pos"); + DBUG_DUMP("key", pos, ref_length); + + ut_a(m_prebuilt->trx == thd_to_trx(ha_thd())); + + /* Note that we assume the length of the row reference is fixed + for the table, and it is == ref_length */ + + int error = index_read(buf, pos, (uint)ref_length, HA_READ_KEY_EXACT); + + if (error != 0) { + DBUG_PRINT("error", ("Got error: %d", error)); + } + + DBUG_RETURN(error); +} + +/**********************************************************************//** +Initialize FT index scan +@return 0 or error number */ + +int +ha_innobase::ft_init() +/*==================*/ +{ + DBUG_ENTER("ft_init"); + + trx_t* trx = check_trx_exists(ha_thd()); + + /* FTS queries are not treated as autocommit non-locking selects. + This is because the FTS implementation can acquire locks behind + the scenes. This has not been verified but it is safer to treat + them as regular read only transactions for now. */ + + if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + DBUG_RETURN(rnd_init(false)); +} + +/**********************************************************************//** +Initialize FT index scan +@return FT_INFO structure if successful or NULL */ + +FT_INFO* +ha_innobase::ft_init_ext( +/*=====================*/ + uint flags, /* in: */ + uint keynr, /* in: */ + String* key) /* in: */ +{ + NEW_FT_INFO* fts_hdl = NULL; + dict_index_t* index; + fts_result_t* result; + char buf_tmp[8192]; + ulint buf_tmp_used; + uint num_errors; + ulint query_len = key->length(); + const CHARSET_INFO* char_set = key->charset(); + const char* query = key->ptr(); + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + { + ib::info out; + out << "keynr=" << keynr << ", '"; + out.write(key->ptr(), key->length()); + } + + if (flags & FT_BOOL) { + ib::info() << "BOOL search"; + } else { + ib::info() << "NL search"; + } + } + + /* Multi-byte character sets like utf32 and utf16 are not + compatible with some of the string functions used, so we convert + the query to utf8 before we proceed. */ + if (char_set->mbminlen != 1) { + buf_tmp_used = innobase_convert_string( + buf_tmp, sizeof(buf_tmp) - 1, + &my_charset_utf8mb3_general_ci, + query, query_len, (CHARSET_INFO*) char_set, + &num_errors); + + buf_tmp[buf_tmp_used] = 0; + query = buf_tmp; + query_len = buf_tmp_used; + } + + trx_t* trx = m_prebuilt->trx; + + /* FTS queries are not treated as autocommit non-locking selects. + This is because the FTS implementation can acquire locks behind + the scenes. This has not been verified but it is safer to treat + them as regular read only transactions for now.
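A rough stand-alone picture of the result handling that follows (types simplified for the sketch; not the real fts_result_t layout): fts_query() produces ranked doc ids, and ft_read() walks them in rank order, refetching each document through FTS_DOC_ID_INDEX.

#include <functional>
#include <map>
#include <vector>

static std::vector<unsigned long long> ranked_doc_ids(
    const std::multimap<double, unsigned long long,
                        std::greater<double>>& by_rank)
{
    std::vector<unsigned long long> out;
    for (const auto& entry : by_rank)
        out.push_back(entry.second);      // each id drives one index lookup
    return out;
}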
*/ + + if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + dict_table_t* ft_table = m_prebuilt->table; + + /* Table does not have an FTS index */ + if (!ft_table->fts || ib_vector_is_empty(ft_table->fts->indexes)) { + my_error(ER_TABLE_HAS_NO_FT, MYF(0)); + return(NULL); + } + + /* If the tablespace is discarded, we should return here */ + if (!ft_table->space) { + my_error(ER_TABLESPACE_MISSING, MYF(0), table->s->db.str, + table->s->table_name.str); + return(NULL); + } + + if (keynr == NO_SUCH_KEY) { + /* FIXME: Investigate the NO_SUCH_KEY usage */ + index = reinterpret_cast<dict_index_t*> + (ib_vector_getp(ft_table->fts->indexes, 0)); + } else { + index = innobase_get_index(keynr); + } + + if (index == NULL || index->type != DICT_FTS) { + my_error(ER_TABLE_HAS_NO_FT, MYF(0)); + return(NULL); + } + + if (!(ft_table->fts->added_synced)) { + fts_init_index(ft_table, FALSE); + + ft_table->fts->added_synced = true; + } + + const byte* q = reinterpret_cast<const byte*>( + const_cast<char*>(query)); + + // FIXME: support ft_init_ext_with_hints(), pass LIMIT + dberr_t error = fts_query(trx, index, flags, q, query_len, &result); + + if (error != DB_SUCCESS) { + my_error(convert_error_code_to_mysql(error, 0, NULL), MYF(0)); + return(NULL); + } + + /* Allocate FTS handler, and instantiate it before return */ + fts_hdl = reinterpret_cast<NEW_FT_INFO*>( + my_malloc(PSI_INSTRUMENT_ME, sizeof(NEW_FT_INFO), MYF(0))); + + fts_hdl->please = const_cast<_ft_vft*>(&ft_vft_result); + fts_hdl->could_you = const_cast<_ft_vft_ext*>(&ft_vft_ext_result); + fts_hdl->ft_prebuilt = m_prebuilt; + fts_hdl->ft_result = result; + + /* FIXME: Re-evaluate the condition when Bug 14469540 is resolved */ + m_prebuilt->in_fts_query = true; + + return(reinterpret_cast<FT_INFO*>(fts_hdl)); +} + +/*****************************************************************//** +Set up search tuple for a query through FTS_DOC_ID_INDEX on +supplied Doc ID. This is used by MySQL to retrieve the documents +once the search result (Doc IDs) is available + +@return DB_SUCCESS or DB_INDEX_CORRUPT +*/ +static +dberr_t +innobase_fts_create_doc_id_key( +/*===========================*/ + dtuple_t* tuple, /* in/out: m_prebuilt->search_tuple */ + const dict_index_t* + index, /* in: index (FTS_DOC_ID_INDEX) */ + doc_id_t* doc_id) /* in/out: doc id to search, value + could be changed to storage format + used for search.
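A stand-alone illustration of that conversion (a simplified analogue of what mach_write_to_8() achieves, not the real implementation): InnoDB key bytes compare as an unsigned big-endian string, so the most significant byte is written first regardless of the host byte order.

#include <cstdint>

static void write_be64(unsigned char* dst, std::uint64_t v)
{
    for (int i = 7; i >= 0; i--) {
        dst[i] = static_cast<unsigned char>(v & 0xff);
        v >>= 8;
    }
}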
*/ +{ + doc_id_t temp_doc_id; + dfield_t* dfield = dtuple_get_nth_field(tuple, 0); + const ulint n_uniq = index->table->fts_n_uniq(); + + if (dict_index_get_n_unique(index) != n_uniq) + return DB_INDEX_CORRUPT; + + dtuple_set_n_fields(tuple, index->n_fields); + dict_index_copy_types(tuple, index, index->n_fields); + +#ifdef UNIV_DEBUG + /* The unique Doc ID field should be an eight-bytes integer */ + dict_field_t* field = dict_index_get_nth_field(index, 0); + ut_a(field->col->mtype == DATA_INT); + ut_ad(sizeof(*doc_id) == field->fixed_len); + ut_ad(!strcmp(index->name, FTS_DOC_ID_INDEX_NAME)); +#endif /* UNIV_DEBUG */ + + /* Convert to storage byte order */ + mach_write_to_8(reinterpret_cast<byte*>(&temp_doc_id), *doc_id); + *doc_id = temp_doc_id; + dfield_set_data(dfield, doc_id, sizeof(*doc_id)); + + if (n_uniq == 2) { + ut_ad(index->table->versioned()); + dfield = dtuple_get_nth_field(tuple, 1); + if (index->table->versioned_by_id()) { + dfield_set_data(dfield, trx_id_max_bytes, + sizeof(trx_id_max_bytes)); + } else { + dfield_set_data(dfield, timestamp_max_bytes, + sizeof(timestamp_max_bytes)); + } + } + + dtuple_set_n_fields_cmp(tuple, n_uniq); + + for (ulint i = n_uniq; i < index->n_fields; i++) { + dfield = dtuple_get_nth_field(tuple, i); + dfield_set_null(dfield); + } + return DB_SUCCESS; +} + +/**********************************************************************//** +Fetch next result from the FT result set +@return error code */ + +int +ha_innobase::ft_read( +/*=================*/ + uchar* buf) /*!< in/out: buf contain result row */ +{ + row_prebuilt_t* ft_prebuilt; + mariadb_set_stats set_stats_temporary(handler_stats); + + ft_prebuilt = reinterpret_cast<NEW_FT_INFO*>(ft_handler)->ft_prebuilt; + + ut_a(ft_prebuilt == m_prebuilt); + + fts_result_t* result; + + result = reinterpret_cast<NEW_FT_INFO*>(ft_handler)->ft_result; + + if (result->current == NULL) { + /* This is the case where the FTS query did not + contain any matching documents. */ + if (result->rankings_by_id != NULL) { + /* Now that we have the complete result, we + need to sort the document ids on their rank + calculation. */ + + fts_query_sort_result_on_rank(result); + + result->current = const_cast<ib_rbt_node_t*>( + rbt_first(result->rankings_by_rank)); + } else { + ut_a(result->current == NULL); + } + } else { + result->current = const_cast<ib_rbt_node_t*>( + rbt_next(result->rankings_by_rank, result->current)); + } + +next_record: + + if (result->current != NULL) { + doc_id_t search_doc_id; + dtuple_t* tuple = m_prebuilt->search_tuple; + + /* If we only need information from result we can return + without fetching the table row */ + if (ft_prebuilt->read_just_key) { +#ifdef MYSQL_STORE_FTS_DOC_ID + if (m_prebuilt->fts_doc_id_in_read_set) { + fts_ranking_t* ranking; + ranking = rbt_value(fts_ranking_t, + result->current); + innobase_fts_store_docid( + table, ranking->doc_id); + } +#endif + table->status= 0; + return(0); + } + + dict_index_t* index; + + index = m_prebuilt->table->fts_doc_id_index; + + /* Must find the index */ + ut_a(index != NULL); + + /* Switch to the FTS doc id index */ + m_prebuilt->index = index; + + fts_ranking_t* ranking = rbt_value( + fts_ranking_t, result->current); + + search_doc_id = ranking->doc_id; + + /* We pass a pointer of search_doc_id because it will be + converted to storage byte order used in the search + tuple.
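(For illustration, the enclosing fetch loop visits matches in rank
order via rbt_first()/rbt_next(); a minimal standalone sketch of that
access pattern, with std::multimap standing in for the rb-tree of
fts_ranking_t nodes:

    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <map>

    int main()
    {
        // rank -> doc id, highest rank first
        std::multimap<double, uint64_t, std::greater<double>> by_rank{
            {0.31, 7}, {0.90, 3}, {0.31, 12}};

        for (const auto& r : by_rank)
            std::printf("doc %llu rank %.2f\n",
                        (unsigned long long) r.second, r.first);
    }
)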
*/ + dberr_t ret = innobase_fts_create_doc_id_key( + tuple, index, &search_doc_id); + + if (ret == DB_SUCCESS) { + ret = row_search_mvcc( + buf, PAGE_CUR_GE, m_prebuilt, + ROW_SEL_EXACT, 0); + } + + int error; + + switch (ret) { + case DB_SUCCESS: + error = 0; + table->status = 0; + break; + case DB_RECORD_NOT_FOUND: + result->current = const_cast<ib_rbt_node_t*>( + rbt_next(result->rankings_by_rank, + result->current)); + + if (!result->current) { + /* exhausted the result set; should return + HA_ERR_END_OF_FILE just like + ha_innobase::general_fetch() and/or + ha_innobase::index_first() etc. */ + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + } else { + goto next_record; + } + break; + case DB_END_OF_INDEX: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_TABLESPACE_DELETED: + + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + default: + error = convert_error_code_to_mysql( + ret, 0, m_user_thd); + + table->status = STATUS_NOT_FOUND; + break; + } + + return(error); + } + + return(HA_ERR_END_OF_FILE); +} + +#ifdef WITH_WSREP +inline +const char* +wsrep_key_type_to_str(Wsrep_service_key_type type) +{ + switch (type) { + case WSREP_SERVICE_KEY_SHARED: + return "shared"; + case WSREP_SERVICE_KEY_REFERENCE: + return "reference"; + case WSREP_SERVICE_KEY_UPDATE: + return "update"; + case WSREP_SERVICE_KEY_EXCLUSIVE: + return "exclusive"; + }; + return "unknown"; +} + +extern dberr_t +wsrep_append_foreign_key( +/*===========================*/ + trx_t* trx, /*!< in: trx */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!< in: clustered index record */ + dict_index_t* index, /*!< in: clustered index */ + bool referenced, /*!< in: is check for referenced table */ + upd_node_t* upd_node, /*!< in: update node */ + bool pa_disable, /*!< in: disable parallel apply ? */ + Wsrep_service_key_type key_type) /*!< in: access type of this key + (shared, exclusive, semi...) */ +{ + ut_a(trx->is_wsrep()); + + if (!wsrep_thd_is_local(trx->mysql_thd)) + return DB_SUCCESS; + + if (upd_node && wsrep_protocol_version < 4) { + key_type = WSREP_SERVICE_KEY_SHARED; + } + + THD* thd = trx->mysql_thd; + + if (!foreign || + (!foreign->referenced_table && !foreign->foreign_table)) { + WSREP_INFO("FK: %s missing in: %s", + (!foreign ? "constraint" : + (!foreign->referenced_table ? + "referenced table" : "foreign table")), + wsrep_thd_query(thd)); + return DB_ERROR; + } + + ulint rcode = DB_SUCCESS; + char cache_key[513] = {'\0'}; + size_t cache_key_len = 0; + + if ( !((referenced) ? + foreign->referenced_table : foreign->foreign_table)) { + WSREP_DEBUG("pulling %s table into cache", + (referenced) ?
"referenced" : "foreign"); + dict_sys.lock(SRW_LOCK_CALL); + + if (referenced) { + foreign->referenced_table = + dict_sys.load_table( + {foreign->referenced_table_name_lookup, + strlen(foreign-> + referenced_table_name_lookup) + }); + if (foreign->referenced_table) { + foreign->referenced_index = + dict_foreign_find_index( + foreign->referenced_table, NULL, + foreign->referenced_col_names, + foreign->n_fields, + foreign->foreign_index, + TRUE, FALSE); + } + } else { + foreign->foreign_table = + dict_sys.load_table( + {foreign->foreign_table_name_lookup, + strlen(foreign-> + foreign_table_name_lookup)}); + + if (foreign->foreign_table) { + foreign->foreign_index = + dict_foreign_find_index( + foreign->foreign_table, NULL, + foreign->foreign_col_names, + foreign->n_fields, + foreign->referenced_index, + TRUE, FALSE); + } + } + dict_sys.unlock(); + } + + if ( !((referenced) ? + foreign->referenced_table : foreign->foreign_table)) { + WSREP_WARN("FK: %s missing in query: %s", + (!foreign->referenced_table) ? + "referenced table" : "foreign table", + wsrep_thd_query(thd)); + return DB_ERROR; + } + + byte key[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + ulint len = WSREP_MAX_SUPPORTED_KEY_LENGTH; + + dict_index_t *idx_target = (referenced) ? + foreign->referenced_index : index; + dict_index_t *idx = (referenced) ? + UT_LIST_GET_FIRST(foreign->referenced_table->indexes) : + UT_LIST_GET_FIRST(foreign->foreign_table->indexes); + int i = 0; + + while (idx != NULL && idx != idx_target) { + if (innobase_strcasecmp (idx->name, innobase_index_reserve_name) != 0) { + i++; + } + idx = UT_LIST_GET_NEXT(indexes, idx); + } + + ut_a(idx); + key[0] = byte(i); + + rcode = wsrep_rec_get_foreign_key( + &key[1], &len, rec, index, idx, + wsrep_protocol_version > 1); + + if (rcode != DB_SUCCESS) { + WSREP_ERROR( + "FK key set failed: " ULINTPF + " (" ULINTPF "%s), index: %s %s, %s", + rcode, referenced, wsrep_key_type_to_str(key_type), + (index) ? index->name() : "void index", + (index && index->table) ? index->table->name.m_name : + "void table", + wsrep_thd_query(thd)); + return DB_ERROR; + } + + strncpy(cache_key, + (wsrep_protocol_version > 1) ? + ((referenced) ? + foreign->referenced_table->name.m_name : + foreign->foreign_table->name.m_name) : + foreign->foreign_table->name.m_name, sizeof(cache_key) - 1); + cache_key_len = strlen(cache_key); + +#ifdef WSREP_DEBUG_PRINT + ulint j; + fprintf(stderr, "FK parent key, table: %s %s len: %lu ", + cache_key, wsrep_key_type_to_str(key_type), len+1); + for (j=0; jreferenced_table->name.m_name, + foreign->foreign_table->name.m_name); + } + + wsrep_buf_t wkey_part[3]; + wsrep_key_t wkey = {wkey_part, 3}; + + if (!wsrep_prepare_key_for_innodb( + thd, + (const uchar*)cache_key, + cache_key_len + 1, + (const uchar*)key, len+1, + wkey_part, + (size_t*)&wkey.key_parts_num)) { + WSREP_WARN("key prepare failed for cascaded FK: %s", + wsrep_thd_query(thd)); + return DB_ERROR; + } + + rcode = wsrep_thd_append_key(thd, &wkey, 1, key_type); + + if (rcode) { + WSREP_ERROR("Appending cascaded fk row key failed: %s, " + ULINTPF, + wsrep_thd_query(thd), + rcode); + return DB_ERROR; + } + + if (pa_disable) { + wsrep_thd_set_PA_unsafe(trx->mysql_thd); + } + + return DB_SUCCESS; +} + +static int +wsrep_append_key( +/*=============*/ + THD *thd, + trx_t *trx, + TABLE_SHARE *table_share, + const char* key, + uint16_t key_len, + Wsrep_service_key_type key_type /*!< in: access type of this key + (shared, exclusive, semi...) 
*/ +) +{ + ut_ad(!trx->is_bulk_insert()); + + DBUG_ENTER("wsrep_append_key"); + DBUG_PRINT("enter", + ("thd: %lu trx: %lld", thd_get_thread_id(thd), + (long long)trx->id)); +#ifdef WSREP_DEBUG_PRINT + fprintf(stderr, "%s conn %lu, trx " TRX_ID_FMT ", keylen %d, key %s.%s\n", + wsrep_key_type_to_str(key_type), + thd_get_thread_id(thd), trx->id, key_len, + table_share->table_name.str, key); + for (int i=0; i<key_len; i++) { + fprintf(stderr, "%hhX, ", key[i]); + } + fprintf(stderr, "\n"); +#endif + wsrep_buf_t wkey_part[3]; + wsrep_key_t wkey = {wkey_part, 3}; + + if (!wsrep_prepare_key_for_innodb( + thd, + (const uchar*)table_share->table_cache_key.str, + table_share->table_cache_key.length, + (const uchar*)key, key_len, + wkey_part, + (size_t*)&wkey.key_parts_num)) { + WSREP_WARN("key prepare failed for: %s", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + + int rcode = wsrep_thd_append_key(thd, &wkey, 1, key_type); + if (rcode) { + DBUG_PRINT("wsrep", ("row key failed: %d", rcode)); + WSREP_WARN("Appending row key failed: %s, %d", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void", rcode); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + + DBUG_RETURN(0); +} + +static bool +referenced_by_foreign_key2( +/*=======================*/ + dict_table_t* table, + dict_index_t* index) +{ + ut_ad(table != NULL); + ut_ad(index != NULL); + + const dict_foreign_set* fks = &table->referenced_set; + + for (dict_foreign_set::const_iterator it = fks->begin(); + it != fks->end(); + ++it) { + dict_foreign_t* foreign = *it; + + if (foreign->referenced_index != index) { + continue; + } + ut_ad(table == foreign->referenced_table); + return true; + } + return false; +} + +int +ha_innobase::wsrep_append_keys( +/*===========================*/ + THD *thd, + Wsrep_service_key_type key_type, /*!< in: access type of this row + operation: + (shared, exclusive, reference...) */ + const uchar* record0, /* in: row in MySQL format */ + const uchar* record1) /* in: row in MySQL format */ +{ + /* Sanity check: newly inserted records should always be passed with + EXCLUSIVE key type, all the rest are expected to carry a pre-image + */ + ut_a(record1 != NULL || key_type == WSREP_SERVICE_KEY_EXCLUSIVE); + + int rcode; + DBUG_ENTER("wsrep_append_keys"); + + bool key_appended = false; + trx_t *trx = thd_to_trx(thd); + +#ifdef WSREP_DEBUG_PRINT + fprintf(stderr, "%s conn %lu, trx " TRX_ID_FMT ", table %s\nSQL: %s\n", + wsrep_key_type_to_str(key_type), + thd_get_thread_id(thd), trx->id, + table_share->table_name.str, wsrep_thd_query(thd)); +#endif + + if (table_share && table_share->tmp_table != NO_TMP_TABLE) { + WSREP_DEBUG("skipping tmp table DML: THD: %lu tmp: %d SQL: %s", + thd_get_thread_id(thd), + table_share->tmp_table, + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + DBUG_RETURN(0); + } + + if (wsrep_protocol_version == 0) { + char keyval[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + char *key = &keyval[0]; + bool is_null; + + auto len = wsrep_store_key_val_for_row( + thd, table, 0, key, WSREP_MAX_SUPPORTED_KEY_LENGTH, + record0, &is_null); + + if (!is_null) { + rcode = wsrep_append_key( + thd, trx, table_share, keyval, + len, key_type); + + if (rcode) { + DBUG_RETURN(rcode); + } + } else { + WSREP_DEBUG("NULL key skipped (proto 0): %s", + wsrep_thd_query(thd)); + } + } else { + ut_a(table->s->keys <= 256); + uint i; + bool hasPK= false; + + for (i=0; i<table->s->keys; ++i) { + KEY* key_info = table->key_info + i; + if (key_info->flags & HA_NOSAME) { + hasPK = true; + break; + } + } + + for (i=0; i<table->s->keys; ++i) { + KEY* key_info = table->key_info + i; + + dict_index_t* idx = innobase_get_index(i); + dict_table_t* tab = (idx) ?
idx->table : NULL; + + /* keyval[] shall contain an ordinal number at byte 0 + and the actual key data shall be written at byte 1. + Hence the total data length is the key length + 1 */ + char keyval0[WSREP_MAX_SUPPORTED_KEY_LENGTH+1]= {'\0'}; + char keyval1[WSREP_MAX_SUPPORTED_KEY_LENGTH+1]= {'\0'}; + keyval0[0] = (char)i; + keyval1[0] = (char)i; + char* key0 = &keyval0[1]; + char* key1 = &keyval1[1]; + + if (!tab) { + WSREP_WARN("MariaDB-InnoDB key mismatch %s %s", + table->s->table_name.str, + key_info->name.str); + } + /* !hasPK == table with no PK, + must append all non-unique keys */ + if (!hasPK || key_info->flags & HA_NOSAME || + ((tab && + referenced_by_foreign_key2(tab, idx)) || + (!tab && referenced_by_foreign_key()))) { + + bool is_null0; + auto len0 = wsrep_store_key_val_for_row( + thd, table, i, key0, + WSREP_MAX_SUPPORTED_KEY_LENGTH, + record0, &is_null0); + + if (record1) { + bool is_null1; + auto len1= wsrep_store_key_val_for_row( + thd, table, i, key1, + WSREP_MAX_SUPPORTED_KEY_LENGTH, + record1, &is_null1); + + if (is_null0 != is_null1 || + len0 != len1 || + memcmp(key0, key1, len0)) { + /* This key has changed. If it + is unique, this is an exclusive + operation -> upgrade key type */ + if (key_info->flags & HA_NOSAME) { + key_type = WSREP_SERVICE_KEY_EXCLUSIVE; + } + + if (!is_null1) { + rcode = wsrep_append_key( + thd, trx, table_share, + keyval1, + /* for len1+1 see keyval1 + initialization comment */ + uint16_t(len1+1), + key_type); + if (rcode) + DBUG_RETURN(rcode); + } + } + } + + if (!is_null0) { + rcode = wsrep_append_key( + thd, trx, table_share, + /* for len0+1 see keyval0 + initialization comment */ + keyval0, uint16_t(len0+1), + key_type); + if (rcode) + DBUG_RETURN(rcode); + + if (key_info->flags & HA_NOSAME || + key_type == WSREP_SERVICE_KEY_SHARED|| + key_type == WSREP_SERVICE_KEY_REFERENCE) + key_appended = true; + } else { + WSREP_DEBUG("NULL key skipped: %s", + wsrep_thd_query(thd)); + } + } + } + } + + /* if no PK, calculate hash of full row, to be the key value */ + if (!key_appended && wsrep_certify_nonPK) { + uchar digest[16]; + + wsrep_calc_row_hash(digest, record0, table, m_prebuilt); + + if (int rcode = wsrep_append_key(thd, trx, table_share, + reinterpret_cast<char*> + (digest), 16, key_type)) { + DBUG_RETURN(rcode); + } + + if (record1) { + wsrep_calc_row_hash( + digest, record1, table, m_prebuilt); + if (int rcode = wsrep_append_key( + thd, trx, table_share, + reinterpret_cast<char*>(digest), 16, + key_type)) { + DBUG_RETURN(rcode); + } + } + DBUG_RETURN(0); + } + + DBUG_RETURN(0); +} +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Stores a reference to the current row to 'ref' field of the handle. Note +that in the case where we have generated the clustered index for the +table, the function parameter is illogical: we MUST ASSUME that 'record' +is the current 'position' of the handle, because if row ref is actually +the row id internally generated in InnoDB, then 'record' does not contain +it. We just guess that the row id must be for the record where the handle +was positioned the last time.
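For illustration, when no usable key was appended, wsrep_append_keys()
above falls back to a 16-byte digest of the whole row image. A minimal
standalone sketch of that idea follows (a toy FNV-1a-based digest; the
digest the server actually computes is different): */

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Two FNV-1a passes with different basis values yield 16 bytes.
static void row_digest(const unsigned char* row, std::size_t len,
                       unsigned char out[16])
{
    static const uint64_t basis[2] =
        {0xcbf29ce484222325ULL, 0x84222325cbf29ce4ULL};
    for (int h = 0; h < 2; h++) {
        uint64_t v = basis[h];
        for (std::size_t i = 0; i < len; i++) {
            v ^= row[i];
            v *= 0x100000001b3ULL;
        }
        for (int b = 0; b < 8; b++)
            out[h * 8 + b] = (unsigned char)(v >> (b * 8));
    }
}

int main()
{
    const unsigned char row[] = "r1|alice|42";
    unsigned char d[16];
    row_digest(row, sizeof row - 1, d);
    for (int i = 0; i < 16; i++) std::printf("%02x", d[i]);
    std::printf("\n");
}

/*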
*/ + +void +ha_innobase::position( +/*==================*/ + const uchar* record) /*!< in: row in MySQL format */ +{ + uint len; + + ut_a(m_prebuilt->trx == thd_to_trx(ha_thd())); + + if (m_prebuilt->clust_index_was_generated) { + /* No primary key was defined for the table and we + generated the clustered index from row id: the + row reference will be the row id, not any key value + that MySQL knows of */ + + len = DATA_ROW_ID_LEN; + + memcpy(ref, m_prebuilt->row_id, len); + } else { + + /* Copy primary key as the row reference */ + KEY* key_info = table->key_info + m_primary_key; + key_copy(ref, (uchar*)record, key_info, key_info->key_length); + len = key_info->key_length; + } + + ut_ad(len == ref_length); +} + +/*****************************************************************//** +Check whether there exist a column named as "FTS_DOC_ID", which is +reserved for InnoDB FTS Doc ID +@return true if there exist a "FTS_DOC_ID" column */ +static +bool +create_table_check_doc_id_col( +/*==========================*/ + trx_t* trx, /*!< in: InnoDB transaction handle */ + const TABLE* form, /*!< in: information on table + columns and indexes */ + ulint* doc_id_col) /*!< out: Doc ID column number if + there exist a FTS_DOC_ID column, + ULINT_UNDEFINED if column is of the + wrong type/name/size */ +{ + for (ulint i = 0; i < form->s->fields; i++) { + const Field* field = form->field[i]; + if (!field->stored_in_db()) { + continue; + } + + unsigned unsigned_type; + + auto col_type = get_innobase_type_from_mysql_type( + &unsigned_type, field); + + auto col_len = field->pack_length(); + + if (innobase_strcasecmp(field->field_name.str, + FTS_DOC_ID_COL_NAME) == 0) { + + /* Note the name is case sensitive due to + our internal query parser */ + if (col_type == DATA_INT + && !field->real_maybe_null() + && col_len == sizeof(doc_id_t) + && (strcmp(field->field_name.str, + FTS_DOC_ID_COL_NAME) == 0)) { + *doc_id_col = i; + } else { + push_warning_printf( + trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: FTS_DOC_ID column must be" + " of BIGINT NOT NULL type, and named" + " in all capitalized characters"); + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + field->field_name.str); + *doc_id_col = ULINT_UNDEFINED; + } + + return(true); + } + } + + return(false); +} + + +/** Finds all base columns needed to compute a given generated column. +This is returned as a bitmap, in field->table->tmp_set. +Works for both dict_v_col_t and dict_s_col_t columns. 
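For illustration, a minimal standalone sketch of the bitmap walk this
function performs, with std::bitset standing in for MY_BITMAP/tmp_set
and an explicit dependency list standing in for the expression walk: */

#include <bitset>
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    const std::size_t MAX_FIELDS = 64;
    std::bitset<MAX_FIELDS> read_set;          // cleared, like tmp_set

    const int vcol_reads[] = {0, 3};           // e.g. v = a + d
    for (int f : vcol_reads) read_set.set(f);  // mark base columns read

    std::vector<int> base_cols;                // sized from the bit count
    base_cols.reserve(read_set.count());
    for (std::size_t i = 0; i < MAX_FIELDS; i++)
        if (read_set.test(i)) base_cols.push_back((int) i);

    std::printf("num_base=%zu\n", base_cols.size());  // prints: num_base=2
}

/*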
+@param[in] table InnoDB table +@param[in] field MySQL field +@param[in,out] col virtual or stored column */ +template <typename T> +void +prepare_vcol_for_base_setup( +/*========================*/ + const dict_table_t* table, + const Field* field, + T* col) +{ + ut_ad(col->num_base == 0); + ut_ad(col->base_col == NULL); + + MY_BITMAP *old_read_set = field->table->read_set; + + field->table->read_set = &field->table->tmp_set; + + bitmap_clear_all(&field->table->tmp_set); + field->vcol_info->expr->walk( + &Item::register_field_in_read_map, 1, field->table); + col->num_base= bitmap_bits_set(&field->table->tmp_set) + & dict_index_t::MAX_N_FIELDS; + if (col->num_base != 0) { + col->base_col = static_cast<dict_col_t**>(mem_heap_zalloc( + table->heap, col->num_base * sizeof( + * col->base_col))); + } + field->table->read_set= old_read_set; +} + + +/** Set up base columns for virtual column +@param[in] table InnoDB table +@param[in] field MySQL field +@param[in,out] v_col virtual column */ +void +innodb_base_col_setup( + dict_table_t* table, + const Field* field, + dict_v_col_t* v_col) +{ + uint16_t n = 0; + + prepare_vcol_for_base_setup(table, field, v_col); + + for (uint i= 0; i < field->table->s->fields; ++i) { + const Field* base_field = field->table->field[i]; + if (base_field->stored_in_db() + && bitmap_is_set(&field->table->tmp_set, i)) { + ulint z; + + for (z = 0; z < table->n_cols; z++) { + const char* name = dict_table_get_col_name(table, z); + if (!innobase_strcasecmp(name, + base_field->field_name.str)) { + break; + } + } + + ut_ad(z != table->n_cols); + + v_col->base_col[n] = dict_table_get_nth_col(table, z); + ut_ad(v_col->base_col[n]->ind == z); + n++; + } + } + v_col->num_base= n & dict_index_t::MAX_N_FIELDS; +} + +/** Set up base columns for stored column +@param[in] table InnoDB table +@param[in] field MySQL field +@param[in,out] s_col stored column */ +void +innodb_base_col_setup_for_stored( + const dict_table_t* table, + const Field* field, + dict_s_col_t* s_col) +{ + ulint n = 0; + + prepare_vcol_for_base_setup(table, field, s_col); + + for (uint i= 0; i < field->table->s->fields; ++i) { + const Field* base_field = field->table->field[i]; + + if (base_field->stored_in_db() + && bitmap_is_set(&field->table->tmp_set, i)) { + ulint z; + for (z = 0; z < table->n_cols; z++) { + const char* name = dict_table_get_col_name( + table, z); + if (!innobase_strcasecmp( + name, base_field->field_name.str)) { + break; + } + } + + ut_ad(z != table->n_cols); + + s_col->base_col[n] = dict_table_get_nth_col(table, z); + n++; + + if (n == s_col->num_base) { + break; + } + } + } + s_col->num_base= n; +} + +/** Create a table definition in an InnoDB database. +@return ER_* level error */ +inline MY_ATTRIBUTE((warn_unused_result)) +int +create_table_info_t::create_table_def() +{ + dict_table_t* table; + ulint nulls_allowed; + unsigned unsigned_type; + ulint binary_type; + ulint long_true_varchar; + ulint charset_no; + ulint doc_id_col = 0; + ibool has_doc_id_col = FALSE; + mem_heap_t* heap; + ha_table_option_struct *options= m_form->s->option_struct; + dberr_t err = DB_SUCCESS; + + DBUG_ENTER("create_table_def"); + DBUG_PRINT("enter", ("table_name: %s", m_table_name)); + + DBUG_ASSERT(m_trx->mysql_thd == m_thd); + + /* MySQL does the name length check.
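(For illustration, a minimal standalone sketch of the two checks made
here, with a hypothetical limit in place of MAX_FULL_NAME_LEN:

    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    static const char* check_name(const char* name, std::size_t max_len)
    {
        std::size_t len = std::strlen(name);
        if (len > max_len) return "name too long";
        if (len == 0 || name[len - 1] == '/') return "empty table name";
        return nullptr;  // OK
    }

    int main()
    {
        const char* err = check_name("test/", 512);
        std::printf("%s\n", err ? err : "ok");  // prints: empty table name
    }
)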
But we do additional check + on the name length here */ + const size_t table_name_len = strlen(m_table_name); + if (table_name_len > MAX_FULL_NAME_LEN) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_TABLE_NAME, + "InnoDB: Table Name or Database Name is too long"); + + DBUG_RETURN(ER_TABLE_NAME); + } + + if (m_table_name[table_name_len - 1] == '/') { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_TABLE_NAME, + "InnoDB: Table name is empty"); + + DBUG_RETURN(ER_WRONG_TABLE_NAME); + } + + /* Find out the number of virtual columns. */ + ulint num_v = 0; + const bool omit_virtual = ha_innobase::omits_virtual_cols(*m_form->s); + const ulint n_cols = omit_virtual + ? m_form->s->stored_fields : m_form->s->fields; + + if (!omit_virtual) { + for (ulint i = 0; i < n_cols; i++) { + num_v += !m_form->field[i]->stored_in_db(); + } + } + + /* Check whether there already exists a FTS_DOC_ID column */ + if (create_table_check_doc_id_col(m_trx, m_form, &doc_id_col)){ + + /* Raise error if the Doc ID column is of wrong type or name */ + if (doc_id_col == ULINT_UNDEFINED) { + DBUG_RETURN(HA_ERR_GENERIC); + } else { + has_doc_id_col = TRUE; + } + } + + /* Adjust the number of columns for the FTS hidden field */ + const ulint actual_n_cols = n_cols + + (m_flags2 & DICT_TF2_FTS && !has_doc_id_col); + + table = dict_table_t::create({m_table_name,table_name_len}, nullptr, + actual_n_cols, num_v, m_flags, m_flags2); + + /* Set the hidden doc_id column. */ + if (m_flags2 & DICT_TF2_FTS) { + table->fts->doc_col = has_doc_id_col + ? doc_id_col : n_cols - num_v; + } + + if (DICT_TF_HAS_DATA_DIR(m_flags)) { + ut_a(strlen(m_remote_path)); + + table->data_dir_path = mem_heap_strdup( + table->heap, m_remote_path); + + } else { + table->data_dir_path = NULL; + } + + heap = mem_heap_create(1000); + auto _ = make_scope_exit([heap]() { mem_heap_free(heap); }); + + ut_d(bool have_vers_start = false); + ut_d(bool have_vers_end = false); + + for (ulint i = 0, j = 0; j < n_cols; i++) { + Field* field = m_form->field[i]; + ulint vers_row = 0; + + if (m_form->versioned()) { + if (i == m_form->s->vers.start_fieldno) { + vers_row = DATA_VERS_START; + ut_d(have_vers_start = true); + } else if (i == m_form->s->vers.end_fieldno) { + vers_row = DATA_VERS_END; + ut_d(have_vers_end = true); + } else if (!(field->flags + & VERS_UPDATE_UNVERSIONED_FLAG)) { + vers_row = DATA_VERSIONED; + } + } + + auto col_type = get_innobase_type_from_mysql_type( + &unsigned_type, field); + + if (!col_type) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_CREATE_TABLE, + "Error creating table '%s' with" + " column '%s'. Please check its" + " column type and try to re-create" + " the table with an appropriate" + " column type.", + table->name.m_name, field->field_name.str); +err_col: + dict_mem_table_free(table); + DBUG_RETURN(HA_ERR_GENERIC); + } + + nulls_allowed = field->real_maybe_null() ? 0 : DATA_NOT_NULL; + binary_type = field->binary() ? DATA_BINARY_TYPE : 0; + + charset_no = 0; + + if (dtype_is_string_type(col_type)) { + + charset_no = (ulint) field->charset()->number; + + DBUG_EXECUTE_IF("simulate_max_char_col", + charset_no = MAX_CHAR_COLL_NUM + 1; + ); + + if (charset_no > MAX_CHAR_COLL_NUM) { + /* in data0type.h we assume that the + number fits in one byte in prtype */ + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_CREATE_TABLE, + "In InnoDB, charset-collation codes" + " must be below 256." 
+ " Unsupported code " ULINTPF ".", + charset_no); + dict_mem_table_free(table); + + DBUG_RETURN(ER_CANT_CREATE_TABLE); + } + } + + auto col_len = field->pack_length(); + + /* The MySQL pack length contains 1 or 2 bytes length field + for a true VARCHAR. Let us subtract that, so that the InnoDB + column length in the InnoDB data dictionary is the real + maximum byte length of the actual data. */ + + long_true_varchar = 0; + + if (field->type() == MYSQL_TYPE_VARCHAR) { + col_len -= ((Field_varstring*) field)->length_bytes; + + if (((Field_varstring*) field)->length_bytes == 2) { + long_true_varchar = DATA_LONG_TRUE_VARCHAR; + } + } + + /* First check whether the column to be added has a + system reserved name. */ + if (dict_col_name_is_reserved(field->field_name.str)){ + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + field->field_name.str); + goto err_col; + } + + ulint is_virtual = !field->stored_in_db() ? DATA_VIRTUAL : 0; + + if (!is_virtual) { + dict_mem_table_add_col(table, heap, + field->field_name.str, col_type, + dtype_form_prtype( + (ulint) field->type() + | nulls_allowed | unsigned_type + | binary_type | long_true_varchar + | vers_row, + charset_no), + col_len); + } else if (!omit_virtual) { + dict_mem_table_add_v_col(table, heap, + field->field_name.str, col_type, + dtype_form_prtype( + (ulint) field->type() + | nulls_allowed | unsigned_type + | binary_type | long_true_varchar + | vers_row + | is_virtual, + charset_no), + col_len, i, 0); + } + + if (innobase_is_s_fld(field)) { + ut_ad(!is_virtual); + /* Added stored column in m_s_cols list. */ + dict_mem_table_add_s_col( + table, 0); + } + + if (is_virtual && omit_virtual) { + continue; + } + + j++; + } + + ut_ad(have_vers_start == have_vers_end); + ut_ad(table->versioned() == have_vers_start); + ut_ad(!table->versioned() || table->vers_start != table->vers_end); + + if (num_v) { + for (ulint i = 0, j = 0; i < n_cols; i++) { + dict_v_col_t* v_col; + + const Field* field = m_form->field[i]; + + if (field->stored_in_db()) { + continue; + } + + v_col = dict_table_get_nth_v_col(table, j); + + j++; + + innodb_base_col_setup(table, field, v_col); + } + } + + /** Fill base columns for the stored column present in the list. */ + if (table->s_cols && !table->s_cols->empty()) { + for (ulint i = 0; i < n_cols; i++) { + Field* field = m_form->field[i]; + + if (!innobase_is_s_fld(field)) { + continue; + } + + dict_s_col_list::iterator it; + for (it = table->s_cols->begin(); + it != table->s_cols->end(); ++it) { + dict_s_col_t s_col = *it; + + if (s_col.s_pos == i) { + innodb_base_col_setup_for_stored( + table, field, &s_col); + break; + } + } + } + } + + /* Add the FTS doc_id hidden column. 
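(For illustration, the VARCHAR adjustment made earlier in this function
as a standalone sketch; pack_length includes the 1- or 2-byte length
prefix that InnoDB strips off, and a 2-byte prefix marks a long true
VARCHAR:

    #include <cstdio>

    struct varchar_spec { unsigned pack_length; unsigned length_bytes; };

    static unsigned innodb_data_len(const varchar_spec& v)
    {
        return v.pack_length - v.length_bytes;
    }

    int main()
    {
        varchar_spec v{258, 2};  // VARCHAR(256): 256 data + 2 prefix bytes
        std::printf("len=%u long_true_varchar=%d\n",
                    innodb_data_len(v), v.length_bytes == 2);
    }
)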
*/ + if (m_flags2 & DICT_TF2_FTS && !has_doc_id_col) { + fts_add_doc_id_column(table, heap); + } + + dict_table_add_system_columns(table, heap); + + if (table->is_temporary()) { + if ((options->encryption == 1 + && !innodb_encrypt_temporary_tables) + || (options->encryption == 2 + && innodb_encrypt_temporary_tables)) { + push_warning_printf(m_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "Ignoring encryption parameter during " + "temporary table creation."); + } + + table->id = dict_sys.acquire_temporary_table_id(); + ut_ad(dict_tf_get_rec_format(table->flags) + != REC_FORMAT_COMPRESSED); + table->space_id = SRV_TMP_SPACE_ID; + table->space = fil_system.temp_space; + table->add_to_cache(); + } else { + ut_ad(dict_sys.sys_tables_exist()); + + err = row_create_table_for_mysql(table, m_trx); + } + + switch (err) { + case DB_SUCCESS: + ut_ad(table); + m_table = table; + DBUG_RETURN(0); + default: + break; + case DB_DUPLICATE_KEY: + char display_name[FN_REFLEN]; + char* buf_end = innobase_convert_identifier( + display_name, sizeof(display_name) - 1, + m_table_name, strlen(m_table_name), + m_thd); + + *buf_end = '\0'; + + my_error(ER_TABLE_EXISTS_ERROR, MYF(0), display_name); + } + + DBUG_RETURN(convert_error_code_to_mysql(err, m_flags, m_thd)); +} + +/*****************************************************************//** +Creates an index in an InnoDB database. */ +inline +int +create_index( +/*=========*/ + trx_t* trx, /*!< in: InnoDB transaction handle */ + const TABLE* form, /*!< in: information on table + columns and indexes */ + dict_table_t* table, /*!< in,out: table */ + uint key_num) /*!< in: index number */ +{ + dict_index_t* index; + int error; + const KEY* key; + ulint* field_lengths; + + DBUG_ENTER("create_index"); + + key = form->key_info + key_num; + + /* Assert that "GEN_CLUST_INDEX" cannot be used as non-primary index */ + ut_a(innobase_strcasecmp(key->name.str, innobase_index_reserve_name) != 0); + const ha_table_option_struct& o = *form->s->option_struct; + + if (key->flags & (HA_SPATIAL | HA_FULLTEXT)) { + /* Only one of these can be specified at a time. */ + ut_ad(~key->flags & (HA_SPATIAL | HA_FULLTEXT)); + ut_ad(!(key->flags & HA_NOSAME)); + index = dict_mem_index_create(table, key->name.str, + (key->flags & HA_SPATIAL) + ? 
DICT_SPATIAL : DICT_FTS, + key->user_defined_key_parts); + + for (ulint i = 0; i < key->user_defined_key_parts; i++) { + const Field* field = key->key_part[i].field; + + /* We do not support special (Fulltext or Spatial) + index on virtual columns */ + if (!field->stored_in_db()) { + ut_ad(0); + DBUG_RETURN(HA_ERR_UNSUPPORTED); + } + + dict_mem_index_add_field(index, field->field_name.str, + 0, + key->key_part->key_part_flag + & HA_REVERSE_SORT); + } + + DBUG_RETURN(convert_error_code_to_mysql( + row_create_index_for_mysql( + index, trx, NULL, + fil_encryption_t(o.encryption), + uint32_t(o.encryption_key_id)), + table->flags, NULL)); + } + + ulint ind_type = 0; + + if (key_num == form->s->primary_key) { + ind_type |= DICT_CLUSTERED; + } + + if (key->flags & HA_NOSAME) { + ind_type |= DICT_UNIQUE; + } + + field_lengths = (ulint*) my_malloc(PSI_INSTRUMENT_ME, + key->user_defined_key_parts * sizeof * + field_lengths, MYF(MY_FAE)); + + /* We pass 0 as the space id, and determine at a lower level the space + id where to store the table */ + + index = dict_mem_index_create(table, key->name.str, + ind_type, key->user_defined_key_parts); + + for (ulint i = 0; i < key->user_defined_key_parts; i++) { + KEY_PART_INFO* key_part = key->key_part + i; + ulint prefix_len; + unsigned is_unsigned; + + + /* (The flag HA_PART_KEY_SEG denotes in MySQL a + column prefix field in an index: we only store a + specified number of first bytes of the column to + the index field.) The flag does not seem to be + properly set by MySQL. Let us fall back on testing + the length of the key part versus the column. + We first reach to the table's column; if the index is on a + prefix, key_part->field is not the table's column (it's a + "fake" field forged in open_table_from_share() with length + equal to the length of the prefix); so we have to go to + form->field. */ + Field* field= form->field[key_part->field->field_index]; + if (field == NULL) + ut_error; + + const char* field_name = key_part->field->field_name.str; + + auto col_type = get_innobase_type_from_mysql_type( + &is_unsigned, key_part->field); + + if (DATA_LARGE_MTYPE(col_type) + || (key_part->length < field->pack_length() + && field->type() != MYSQL_TYPE_VARCHAR) + || (field->type() == MYSQL_TYPE_VARCHAR + && key_part->length < field->pack_length() + - ((Field_varstring*) field)->length_bytes)) { + + switch (col_type) { + default: + prefix_len = key_part->length; + break; + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + sql_print_error( + "MariaDB is trying to create a column" + " prefix index field, on an" + " inappropriate data type. Table" + " name %s, column name %s.", + form->s->table_name.str, + key_part->field->field_name.str); + + prefix_len = 0; + } + } else { + prefix_len = 0; + } + + ut_ad(prefix_len % field->charset()->mbmaxlen == 0); + + field_lengths[i] = key_part->length; + + if (!key_part->field->stored_in_db()) { + index->type |= DICT_VIRTUAL; + } + + dict_mem_index_add_field(index, field_name, prefix_len, + key_part->key_part_flag + & HA_REVERSE_SORT); + } + + ut_ad(key->flags & HA_FULLTEXT || !(index->type & DICT_FTS)); + + /* Even though we've defined max_supported_key_part_length, we + still do our own checking using field_lengths to be absolutely + sure we don't create too long indexes.
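(For illustration, the prefix decision above reduced to a standalone
sketch; col_data_len is hypothetical shorthand for the column's maximum
byte length net of any VARCHAR length prefix:

    #include <cstdio>

    static unsigned prefix_len_for(unsigned key_part_len,
                                   unsigned col_data_len)
    {
        return key_part_len < col_data_len ? key_part_len : 0;
    }

    int main()
    {
        std::printf("%u\n", prefix_len_for(10, 255));   // INDEX(c(10)) -> 10
        std::printf("%u\n", prefix_len_for(255, 255));  // whole column -> 0
    }
)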
*/ + ulint flags = table->flags; + + error = convert_error_code_to_mysql( + row_create_index_for_mysql(index, trx, field_lengths, + fil_encryption_t(o.encryption), + uint32_t(o.encryption_key_id)), + flags, NULL); + + my_free(field_lengths); + + DBUG_RETURN(error); +} + +/** Return a display name for the row format +@param[in] row_format Row Format +@return row format name */ +static +const char* +get_row_format_name( + enum row_type row_format) +{ + switch (row_format) { + case ROW_TYPE_COMPACT: + return("COMPACT"); + case ROW_TYPE_COMPRESSED: + return("COMPRESSED"); + case ROW_TYPE_DYNAMIC: + return("DYNAMIC"); + case ROW_TYPE_REDUNDANT: + return("REDUNDANT"); + case ROW_TYPE_DEFAULT: + return("DEFAULT"); + case ROW_TYPE_FIXED: + return("FIXED"); + case ROW_TYPE_PAGE: + case ROW_TYPE_NOT_USED: + break; + } + return("NOT USED"); +} + +/** Validate DATA DIRECTORY option. +@return true if valid, false if not. */ +bool +create_table_info_t::create_option_data_directory_is_valid() +{ + bool is_valid = true; + + ut_ad(m_create_info->data_file_name + && m_create_info->data_file_name[0] != '\0'); + + /* Use DATA DIRECTORY only with file-per-table. */ + if (!m_allow_file_per_table) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: DATA DIRECTORY requires" + " innodb_file_per_table."); + is_valid = false; + } + + /* Do not use DATA DIRECTORY with TEMPORARY TABLE. */ + if (m_create_info->tmp_table()) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: DATA DIRECTORY cannot be used" + " for TEMPORARY tables."); + is_valid = false; + } + + /* We check for a DATA DIRECTORY mixed with TABLESPACE in + create_option_tablespace_is_valid(), no need to here. */ + + return(is_valid); +} + +/** Validate the create options. Check that the options KEY_BLOCK_SIZE, +ROW_FORMAT, DATA DIRECTORY, TEMPORARY are compatible with +each other and other settings. These CREATE OPTIONS are not validated +here unless innodb_strict_mode is on. With strict mode, this function +will report each problem it finds using a custom message with error +code ER_ILLEGAL_HA_CREATE_OPTION, not its built-in message. +@return NULL if valid, string name of bad option if not. */ +const char* +create_table_info_t::create_options_are_invalid() +{ + bool has_key_block_size = (m_create_info->key_block_size != 0); + + const char* ret = NULL; + enum row_type row_format = m_create_info->row_type; + const bool is_temp = m_create_info->tmp_table(); + + ut_ad(m_thd != NULL); + + /* If innodb_strict_mode is not set don't do any more validation. */ + if (!THDVAR(m_thd, strict_mode)) { + return(NULL); + } + + /* Check if a non-zero KEY_BLOCK_SIZE was specified. */ + if (has_key_block_size) { + if (is_temp || innodb_read_only_compressed) { + my_error(ER_UNSUPPORTED_COMPRESSED_TABLE, MYF(0)); + return("KEY_BLOCK_SIZE"); + } + + switch (m_create_info->key_block_size) { + ulint kbs_max; + case 1: + case 2: + case 4: + case 8: + case 16: + /* The maximum KEY_BLOCK_SIZE (KBS) is + UNIV_PAGE_SIZE_MAX. But if srv_page_size is + smaller than UNIV_PAGE_SIZE_MAX, the maximum + KBS is also smaller. 
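(For illustration, the cap computed below as a standalone sketch, with
stand-in shift values; a shift counts log2 steps from 512 bytes:

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const unsigned PAGE_SSIZE_MAX = 6;  // stand-in: 16K pages
        const unsigned ZIP_SSIZE_MAX = 5;   // stand-in: largest zip page
        unsigned kbs_max = std::min(1U << (PAGE_SSIZE_MAX - 1),
                                    1U << (ZIP_SSIZE_MAX - 1));
        std::printf("max KEY_BLOCK_SIZE = %u\n", kbs_max);  // prints: 16
    }
)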
*/ + kbs_max = ut_min( + 1U << (UNIV_PAGE_SSIZE_MAX - 1), + 1U << (PAGE_ZIP_SSIZE_MAX - 1)); + if (m_create_info->key_block_size > kbs_max) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE=%ld" + " cannot be larger than %ld.", + m_create_info->key_block_size, + kbs_max); + ret = "KEY_BLOCK_SIZE"; + } + + /* Valid KEY_BLOCK_SIZE, check its dependencies. */ + if (!m_allow_file_per_table) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE requires" + " innodb_file_per_table."); + ret = "KEY_BLOCK_SIZE"; + } + break; + default: + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: invalid KEY_BLOCK_SIZE = %u." + " Valid values are [1, 2, 4, 8, 16]", + (uint) m_create_info->key_block_size); + ret = "KEY_BLOCK_SIZE"; + break; + } + } + + /* Check for a valid InnoDB ROW_FORMAT specifier and + other incompatibilities. */ + switch (row_format) { + case ROW_TYPE_COMPRESSED: + if (is_temp || innodb_read_only_compressed) { + my_error(ER_UNSUPPORTED_COMPRESSED_TABLE, MYF(0)); + return("ROW_FORMAT"); + } + if (!m_allow_file_per_table) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=%s requires" + " innodb_file_per_table.", + get_row_format_name(row_format)); + ret = "ROW_FORMAT"; + } + break; + case ROW_TYPE_DYNAMIC: + case ROW_TYPE_COMPACT: + case ROW_TYPE_REDUNDANT: + if (has_key_block_size) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: cannot specify ROW_FORMAT = %s" + " with KEY_BLOCK_SIZE.", + get_row_format_name(row_format)); + ret = "KEY_BLOCK_SIZE"; + } + break; + case ROW_TYPE_DEFAULT: + break; + case ROW_TYPE_FIXED: + case ROW_TYPE_PAGE: + case ROW_TYPE_NOT_USED: + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: invalid ROW_FORMAT specifier."); + ret = "ROW_TYPE"; + break; + } + + if (!m_create_info->data_file_name + || !m_create_info->data_file_name[0]) { + } else if (!my_use_symdir) { + my_error(WARN_OPTION_IGNORED, MYF(ME_WARNING), + "DATA DIRECTORY"); + } else if (!create_option_data_directory_is_valid()) { + ret = "DATA DIRECTORY"; + } + + /* Do not allow INDEX_DIRECTORY */ + if (m_create_info->index_file_name) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: INDEX DIRECTORY is not supported"); + ret = "INDEX DIRECTORY"; + } + + /* Don't support compressed table when page size > 16k. */ + if ((has_key_block_size || row_format == ROW_TYPE_COMPRESSED) + && srv_page_size > UNIV_PAGE_SIZE_DEF) { + push_warning(m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: Cannot create a COMPRESSED table" + " when innodb_page_size > 16k."); + + if (has_key_block_size) { + ret = "KEY_BLOCK_SIZE"; + } else { + ret = "ROW_TYPE"; + } + } + + return(ret); +} + +/*****************************************************************//** +Check engine specific table options not handled by SQL-parser. 
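Both validators here share one contract: return NULL when the options
are consistent, else the name of the first offending option. For
illustration, a minimal standalone sketch of that shape, with a
hypothetical option struct and two of the rules enforced below: */

#include <cstdio>

struct table_opts {
    bool     page_compressed;
    unsigned page_compression_level;  // 0 = unset
    unsigned key_block_size;          // 0 = unset
};

static const char* check_opts(const table_opts& o)
{
    if (o.page_compression_level && !o.page_compressed)
        return "PAGE_COMPRESSION_LEVEL";  // requires PAGE_COMPRESSED
    if (o.page_compressed && o.key_block_size)
        return "PAGE_COMPRESSED";         // incompatible with KEY_BLOCK_SIZE
    return nullptr;
}

int main()
{
    const char* bad = check_opts({false, 6, 0});
    std::printf("%s\n", bad ? bad : "ok");  // prints: PAGE_COMPRESSION_LEVEL
}

/*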
+@return NULL if valid, string if not */ +const char* +create_table_info_t::check_table_options() +{ + enum row_type row_format = m_create_info->row_type; + const ha_table_option_struct *options= m_form->s->option_struct; + + switch (options->encryption) { + case FIL_ENCRYPTION_OFF: + if (options->encryption_key_id != FIL_DEFAULT_ENCRYPTION_KEY) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ENCRYPTED=NO implies" + " ENCRYPTION_KEY_ID=1"); + compile_time_assert(FIL_DEFAULT_ENCRYPTION_KEY == 1); + } + if (srv_encrypt_tables != 2) { + break; + } + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ENCRYPTED=NO cannot be used with" + " innodb_encrypt_tables=FORCE"); + return "ENCRYPTED"; + case FIL_ENCRYPTION_DEFAULT: + if (!srv_encrypt_tables) { + break; + } + /* fall through */ + case FIL_ENCRYPTION_ON: + const uint32_t key_id = uint32_t(options->encryption_key_id); + if (!encryption_key_id_exists(key_id)) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ENCRYPTION_KEY_ID %u not available", + key_id); + return "ENCRYPTION_KEY_ID"; + } + + /* We do not support encryption for spatial indexes, + except if innodb_checksum_algorithm=full_crc32. + Do not allow ENCRYPTED=YES if any SPATIAL INDEX exists. */ + if (options->encryption != FIL_ENCRYPTION_ON + || srv_checksum_algorithm + >= SRV_CHECKSUM_ALGORITHM_FULL_CRC32) { + break; + } + for (ulint i = 0; i < m_form->s->keys; i++) { + if (m_form->key_info[i].flags & HA_SPATIAL) { + push_warning(m_thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: ENCRYPTED=YES is not" + " supported for SPATIAL INDEX"); + return "ENCRYPTED"; + } + } + } + + if (!m_allow_file_per_table + && options->encryption != FIL_ENCRYPTION_DEFAULT) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ENCRYPTED requires innodb_file_per_table"); + return "ENCRYPTED"; + } + + /* Check page compression requirements */ + if (options->page_compressed) { + + if (row_format == ROW_TYPE_COMPRESSED) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=COMPRESSED"); + return "PAGE_COMPRESSED"; + } + + switch (row_format) { + default: + break; + case ROW_TYPE_DEFAULT: + if (m_default_row_format + != DEFAULT_ROW_FORMAT_REDUNDANT) { + break; + } + /* fall through */ + case ROW_TYPE_REDUNDANT: + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=REDUNDANT"); + return "PAGE_COMPRESSED"; + } + + if (!m_allow_file_per_table) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_per_table."); + return "PAGE_COMPRESSED"; + } + + if (m_create_info->key_block_size) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " key_block_size"); + return "PAGE_COMPRESSED"; + } + } + + /* Check page compression level requirements, some of them are + already checked above */ + if (options->page_compression_level != 0) { + if (options->page_compressed == false) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSION_LEVEL requires" + " PAGE_COMPRESSED"); + return "PAGE_COMPRESSION_LEVEL"; + } + + if 
(options->page_compression_level < 1 || options->page_compression_level > 9) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu." + " Valid values are [1, 2, 3, 4, 5, 6, 7, 8, 9]", + options->page_compression_level); + return "PAGE_COMPRESSION_LEVEL"; + } + } + + return NULL; +} + +/*****************************************************************//** +Update create_info. Used in SHOW CREATE TABLE et al. */ + +void +ha_innobase::update_create_info( +/*============================*/ + HA_CREATE_INFO* create_info) /*!< in/out: create info */ +{ + if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) { + info(HA_STATUS_AUTO); + create_info->auto_increment_value = stats.auto_increment_value; + } + + if (m_prebuilt->table->is_temporary()) { + return; + } + + dict_get_and_save_data_dir_path(m_prebuilt->table); + + if (m_prebuilt->table->data_dir_path) { + create_info->data_file_name = m_prebuilt->table->data_dir_path; + } +} + +/*****************************************************************//** +Initialize the table FTS stopword list +@return TRUE if success */ +ibool +innobase_fts_load_stopword( +/*=======================*/ + dict_table_t* table, /*!< in: Table has the FTS */ + trx_t* trx, /*!< in: transaction */ + THD* thd) /*!< in: current thread */ +{ + ut_ad(dict_sys.locked()); + + const char *stopword_table= THDVAR(thd, ft_user_stopword_table); + if (!stopword_table) + { + mysql_mutex_lock(&LOCK_global_system_variables); + if (innobase_server_stopword_table) + stopword_table= thd_strdup(thd, innobase_server_stopword_table); + mysql_mutex_unlock(&LOCK_global_system_variables); + } + + table->fts->dict_locked= true; + bool success= fts_load_stopword(table, trx, stopword_table, + THDVAR(thd, ft_enable_stopword), false); + table->fts->dict_locked= false; + return success; +} + +/** Parse the table name into normal name and remote path if needed. +@param[in] name Table name (db/table or full path). +@return 0 if successful, otherwise, error number */ +int +create_table_info_t::parse_table_name( + const char* +#ifdef _WIN32 + name +#endif + ) +{ + DBUG_ENTER("parse_table_name"); + +#ifdef _WIN32 + /* Names passed in from server are in two formats: + 1. /: for normal table creation + 2. full path: for temp table creation, or DATA DIRECTORY. + + When srv_file_per_table is on and mysqld_embedded is off, + check for full path pattern, i.e. + X:\dir\..., X is a driver letter, or + \\dir1\dir2\..., UNC path + returns error if it is in full path format, but not creating a temp. + table. Currently InnoDB does not support symbolic link on Windows. */ + + if (m_innodb_file_per_table + && !mysqld_embedded + && !m_create_info->tmp_table()) { + + if ((name[1] == ':') + || (name[0] == '\\' && name[1] == '\\')) { + sql_print_error("Cannot create table %s\n", name); + DBUG_RETURN(HA_ERR_GENERIC); + } + } +#endif + + m_remote_path[0] = '\0'; + + /* Make sure DATA DIRECTORY is compatible with other options + and set the remote path. In the case of either; + CREATE TEMPORARY TABLE ... DATA DIRECTORY={path} ... ; + CREATE TABLE ... DATA DIRECTORY={path} TABLESPACE={name}... ; + we ignore the DATA DIRECTORY. 
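(For illustration, the Windows full-path test described above as a
standalone sketch with a hypothetical helper:

    #include <cstdio>

    // "X:\..." (drive letter) or "\\host\share\..." (UNC path)
    static bool is_full_path(const char* name)
    {
        return (name[0] != '\0' && name[1] == ':')
            || (name[0] == '\\' && name[1] == '\\');
    }

    int main()
    {
        std::printf("%d %d %d\n",
                    is_full_path("C:\\tmp\\t1"),     // 1
                    is_full_path("\\\\srv\\share"),  // 1
                    is_full_path("test/t1"));        // 0
    }
)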
*/ + if (m_create_info->data_file_name + && m_create_info->data_file_name[0] + && my_use_symdir) { + if (!create_option_data_directory_is_valid()) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + WARN_OPTION_IGNORED, + ER_DEFAULT(WARN_OPTION_IGNORED), + "DATA DIRECTORY"); + + m_flags &= ~DICT_TF_MASK_DATA_DIR; + } else { + strncpy(m_remote_path, + m_create_info->data_file_name, + FN_REFLEN - 1); + } + } + + if (m_create_info->index_file_name) { + my_error(WARN_OPTION_IGNORED, ME_WARNING, + "INDEX DIRECTORY"); + } + + DBUG_RETURN(0); +} + +/** @return whether innodb_strict_mode is active */ +bool ha_innobase::is_innodb_strict_mode(THD *thd) +{ + return THDVAR(thd, strict_mode); +} + +/** Determine InnoDB table flags. +If strict_mode=OFF, this will adjust the flags to what should be assumed. +@retval true on success +@retval false on error */ +bool create_table_info_t::innobase_table_flags() +{ + DBUG_ENTER("innobase_table_flags"); + + const char* fts_doc_id_index_bad = NULL; + ulint zip_ssize = 0; + enum row_type row_type; + rec_format_t innodb_row_format = + get_row_format(m_default_row_format); + const bool is_temp = m_create_info->tmp_table(); + bool zip_allowed = !is_temp; + + const ulint zip_ssize_max = + ut_min(static_cast<ulint>(UNIV_PAGE_SSIZE_MAX), + static_cast<ulint>(PAGE_ZIP_SSIZE_MAX)); + + ha_table_option_struct *options= m_form->s->option_struct; + + m_flags = 0; + m_flags2 = 0; + + /* Check if there are any FTS indexes defined on this table. */ + const uint fts_n_uniq= m_form->versioned() ? 2 : 1; + for (uint i = 0; i < m_form->s->keys; i++) { + const KEY* key = &m_form->key_info[i]; + + if (key->flags & HA_FULLTEXT) { + m_flags2 |= DICT_TF2_FTS; + + /* We don't support FTS indexes in temporary + tables. */ + if (is_temp) { + my_error(ER_INNODB_NO_FT_TEMP_TABLE, MYF(0)); + DBUG_RETURN(false); + } + + if (fts_doc_id_index_bad) { + goto index_bad; + } + } + + if (innobase_strcasecmp(key->name.str, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + /* Do a pre-check on FTS DOC ID index */ + if (!(key->flags & HA_NOSAME) + || key->user_defined_key_parts != fts_n_uniq + || (key->key_part[0].key_part_flag & HA_REVERSE_SORT) + || strcmp(key->name.str, FTS_DOC_ID_INDEX_NAME) + || strcmp(key->key_part[0].field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + fts_doc_id_index_bad = key->name.str; + } + + if (fts_doc_id_index_bad && (m_flags2 & DICT_TF2_FTS)) { +index_bad: + my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0), + fts_doc_id_index_bad); + DBUG_RETURN(false); + } + } + + if (m_create_info->key_block_size > 0) { + /* The requested compressed page size (key_block_size) + is given in kilobytes. If it is a valid number, store + that value as the number of log2 shifts from 512 in + zip_ssize. Zero means it is not compressed. */ + ulint zssize; /* Zip Shift Size */ + ulint kbsize; /* Key Block Size */ + for (zssize = kbsize = 1; + zssize <= zip_ssize_max; + zssize++, kbsize <<= 1) { + if (kbsize == m_create_info->key_block_size) { + zip_ssize = zssize; + break; + } + } + + /* Make sure compressed row format is allowed.
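(For illustration, the loop above maps KEY_BLOCK_SIZE in KiB to a shift
size: 1 -> 1, 2 -> 2, 4 -> 3, 8 -> 4, 16 -> 5, anything else -> 0,
meaning not compressed. A standalone sketch:

    #include <cstdio>

    static unsigned zip_ssize_of(unsigned kbs, unsigned ssize_max)
    {
        unsigned kbsize = 1;
        for (unsigned zssize = 1; zssize <= ssize_max;
             zssize++, kbsize <<= 1)
            if (kbsize == kbs) return zssize;
        return 0;
    }

    int main()
    {
        std::printf("%u %u %u\n", zip_ssize_of(4, 5),
                    zip_ssize_of(16, 5), zip_ssize_of(3, 5));  // 3 5 0
    }
)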
*/ + if (is_temp) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE is ignored" + " for TEMPORARY TABLE."); + zip_allowed = false; + } else if (!m_allow_file_per_table) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE requires" + " innodb_file_per_table."); + zip_allowed = false; + } + + if (!zip_allowed + || zssize > zip_ssize_max) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ignoring KEY_BLOCK_SIZE=%u.", + (uint) m_create_info->key_block_size); + } + } + + row_type = m_create_info->row_type; + + if (zip_ssize && zip_allowed) { + /* if ROW_FORMAT is set to default, + automatically change it to COMPRESSED. */ + if (row_type == ROW_TYPE_DEFAULT) { + row_type = ROW_TYPE_COMPRESSED; + } else if (row_type != ROW_TYPE_COMPRESSED) { + /* ROW_FORMAT other than COMPRESSED + ignores KEY_BLOCK_SIZE. It does not + make sense to reject conflicting + KEY_BLOCK_SIZE and ROW_FORMAT, because + such combinations can be obtained + with ALTER TABLE anyway. */ + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ignoring KEY_BLOCK_SIZE=%u" + " unless ROW_FORMAT=COMPRESSED.", + (uint) m_create_info->key_block_size); + zip_allowed = false; + } + } else { + /* zip_ssize == 0 means no KEY_BLOCK_SIZE. */ + if (row_type == ROW_TYPE_COMPRESSED && zip_allowed) { + /* ROW_FORMAT=COMPRESSED without KEY_BLOCK_SIZE + implies half the maximum KEY_BLOCK_SIZE(*1k) or + srv_page_size, whichever is less. */ + zip_ssize = zip_ssize_max - 1; + } + } + + /* Validate the row format. Correct it if necessary */ + + switch (row_type) { + case ROW_TYPE_REDUNDANT: + innodb_row_format = REC_FORMAT_REDUNDANT; + break; + case ROW_TYPE_COMPACT: + innodb_row_format = REC_FORMAT_COMPACT; + break; + case ROW_TYPE_COMPRESSED: + if (is_temp) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=%s is ignored for" + " TEMPORARY TABLE.", + get_row_format_name(row_type)); + } else if (!m_allow_file_per_table) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=COMPRESSED requires" + " innodb_file_per_table."); + } else { + innodb_row_format = REC_FORMAT_COMPRESSED; + break; + } + zip_allowed = false; + /* Set ROW_FORMAT = COMPACT */ + /* fall through */ + case ROW_TYPE_NOT_USED: + case ROW_TYPE_FIXED: + case ROW_TYPE_PAGE: + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: assuming ROW_FORMAT=DYNAMIC."); + /* fall through */ + case ROW_TYPE_DYNAMIC: + innodb_row_format = REC_FORMAT_DYNAMIC; + break; + case ROW_TYPE_DEFAULT: + ; + } + + /* Don't support compressed table when page size > 16k. */ + if (zip_allowed && zip_ssize && srv_page_size > UNIV_PAGE_SIZE_DEF) { + push_warning(m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: Cannot create a COMPRESSED table" + " when innodb_page_size > 16k." 
+ " Assuming ROW_FORMAT=DYNAMIC."); + zip_allowed = false; + } + + ut_ad(!is_temp || !zip_allowed); + ut_ad(!is_temp || innodb_row_format != REC_FORMAT_COMPRESSED); + + /* Set the table flags */ + if (!zip_allowed) { + zip_ssize = 0; + } + + ulint level = 0; + + if (is_temp) { + m_flags2 |= DICT_TF2_TEMPORARY; + } else { + if (m_use_file_per_table) { + m_flags2 |= DICT_TF2_USE_FILE_PER_TABLE; + } + + level = ulint(options->page_compression_level); + if (!level) { + level = page_zip_level; + if (!level && options->page_compressed) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " PAGE_COMPRESSION_LEVEL or" + " innodb_compression_level > 0"); + DBUG_RETURN(false); + } + } + } + + /* Set the table flags */ + dict_tf_set(&m_flags, innodb_row_format, zip_ssize, + m_use_data_dir, level && options->page_compressed, level); + + if (m_form->s->table_type == TABLE_TYPE_SEQUENCE) { + m_flags |= DICT_TF_MASK_NO_ROLLBACK; + } + + /* Set the flags2 when create table or alter tables */ + m_flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + m_flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;); + + DBUG_RETURN(true); +} + +/** Parse MERGE_THRESHOLD value from the string. +@param[in] thd connection +@param[in] str string which might include 'MERGE_THRESHOLD=' +@return value parsed. 0 means not found or invalid value. */ +static +unsigned +innobase_parse_merge_threshold( + THD* thd, + const char* str) +{ + static const char* label = "MERGE_THRESHOLD="; + static const size_t label_len = strlen(label); + const char* pos = str; + + pos = strstr(str, label); + + if (pos == NULL) { + return(0); + } + + pos += label_len; + + lint ret = atoi(pos); + + if (ret > 0 && ret <= 50) { + return(static_cast(ret)); + } + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: Invalid value for MERGE_THRESHOLD in the CREATE TABLE" + " statement. The value is ignored."); + + return(0); +} + +/** Parse hint for table and its indexes, and update the information +in dictionary. 
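For illustration, a standalone sketch of the comment parsing this
function relies on, following the same rule as
innobase_parse_merge_threshold() above (find the label, read an
integer, accept only 1..50): */

#include <cstdio>
#include <cstdlib>
#include <cstring>

static unsigned parse_merge_threshold(const char* comment)
{
    static const char label[] = "MERGE_THRESHOLD=";
    const char* pos = std::strstr(comment, label);
    if (!pos) return 0;  // label not present
    long v = std::atol(pos + sizeof(label) - 1);
    return (v > 0 && v <= 50) ? (unsigned) v : 0;
}

int main()
{
    std::printf("%u\n", parse_merge_threshold("MERGE_THRESHOLD=40"));  // 40
    std::printf("%u\n", parse_merge_threshold("MERGE_THRESHOLD=90"));  // 0
}

/*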
+@param[in] thd connection +@param[in,out] table target table +@param[in] table_share table definition */ +void +innobase_parse_hint_from_comment( + THD* thd, + dict_table_t* table, + const TABLE_SHARE* table_share) +{ + unsigned merge_threshold_table; + unsigned merge_threshold_index[MAX_KEY]; + bool is_found[MAX_KEY]; + + if (table_share->comment.str != NULL) { + merge_threshold_table + = innobase_parse_merge_threshold( + thd, table_share->comment.str); + } else { + merge_threshold_table = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; + } + + if (merge_threshold_table == 0) { + merge_threshold_table = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; + } + + for (uint i = 0; i < table_share->keys; i++) { + KEY* key_info = &table_share->key_info[i]; + + ut_ad(i < sizeof(merge_threshold_index) + / sizeof(merge_threshold_index[0])); + + if (key_info->flags & HA_USES_COMMENT + && key_info->comment.str != NULL) { + merge_threshold_index[i] + = innobase_parse_merge_threshold( + thd, key_info->comment.str); + } else { + merge_threshold_index[i] = merge_threshold_table; + } + + if (merge_threshold_index[i] == 0) { + merge_threshold_index[i] = merge_threshold_table; + } + } + + /* update SYS_INDEX table */ + if (!table->is_temporary()) { + for (uint i = 0; i < table_share->keys; i++) { + is_found[i] = false; + } + + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != NULL; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (dict_index_is_auto_gen_clust(index)) { + + /* GEN_CLUST_INDEX should use + merge_threshold_table */ + dict_index_set_merge_threshold( + index, merge_threshold_table); + continue; + } + + for (uint i = 0; i < table_share->keys; i++) { + if (is_found[i]) { + continue; + } + + KEY* key_info = &table_share->key_info[i]; + + if (innobase_strcasecmp( + index->name, key_info->name.str) == 0) { + + dict_index_set_merge_threshold( + index, + merge_threshold_index[i]); + is_found[i] = true; + break; + } + } + } + } + + for (uint i = 0; i < table_share->keys; i++) { + is_found[i] = false; + } + + /* update in memory */ + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != NULL; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (dict_index_is_auto_gen_clust(index)) { + + /* GEN_CLUST_INDEX should use merge_threshold_table */ + + /* x-lock index is needed to exclude concurrent + pessimistic tree operations */ + index->lock.x_lock(SRW_LOCK_CALL); + index->merge_threshold = merge_threshold_table + & ((1U << 6) - 1); + index->lock.x_unlock(); + + continue; + } + + for (uint i = 0; i < table_share->keys; i++) { + if (is_found[i]) { + continue; + } + + KEY* key_info = &table_share->key_info[i]; + + if (innobase_strcasecmp( + index->name, key_info->name.str) == 0) { + + /* x-lock index is needed to exclude concurrent + pessimistic tree operations */ + index->lock.x_lock(SRW_LOCK_CALL); + index->merge_threshold + = merge_threshold_index[i] + & ((1U << 6) - 1); + index->lock.x_unlock(); + is_found[i] = true; + + break; + } + } + } +} + +/** Set m_use_* flags. */ +void +create_table_info_t::set_tablespace_type( + bool table_being_altered_is_file_per_table) +{ + /** Allow file_per_table for this table either because: + 1) the setting innodb_file_per_table=on, + 2) the table being altered is currently file_per_table */ + m_allow_file_per_table = + m_innodb_file_per_table + || table_being_altered_is_file_per_table; + + /* Ignore the current innodb-file-per-table setting if we are + creating a temporary table. 
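(For illustration, the flag derivation below as a pure function; a
standalone sketch that omits the my_use_symdir condition also required
for the data-directory flag:

    #include <cstdio>

    struct ts_flags { bool allow_fpt, use_fpt, use_data_dir; };

    static ts_flags derive(bool file_per_table, bool altering_fpt,
                           bool tmp_table, bool has_data_dir)
    {
        ts_flags f;
        f.allow_fpt = file_per_table || altering_fpt;
        f.use_fpt = f.allow_fpt && !tmp_table;
        f.use_data_dir = f.use_fpt && has_data_dir;
        return f;
    }

    int main()
    {
        ts_flags f = derive(true, false, true, true);  // TEMPORARY table
        std::printf("%d %d %d\n", f.allow_fpt, f.use_fpt, f.use_data_dir);
    }
)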
*/ + m_use_file_per_table = m_allow_file_per_table + && !m_create_info->tmp_table(); + + /* DATA DIRECTORY must have m_use_file_per_table but cannot be + used with TEMPORARY tables. */ + m_use_data_dir = + m_use_file_per_table + && m_create_info->data_file_name + && m_create_info->data_file_name[0] + && my_use_symdir; +} + +/** Initialize the create_table_info_t object. +@return error number */ +int +create_table_info_t::initialize() +{ + DBUG_ENTER("create_table_info_t::initialize"); + + ut_ad(m_thd != NULL); + ut_ad(m_create_info != NULL); + + if (m_form->s->fields > REC_MAX_N_USER_FIELDS) { + DBUG_RETURN(HA_ERR_TOO_MANY_FIELDS); + } + + /* Check for name conflicts (with reserved name) for + any user indices to be created. */ + if (innobase_index_name_is_reserved(m_thd, m_form->key_info, + m_form->s->keys)) { + DBUG_RETURN(HA_ERR_WRONG_INDEX); + } + + /* Get the transaction associated with the current thd, or create one + if not yet created */ + + check_trx_exists(m_thd); + + DBUG_RETURN(0); +} + + +/** Check if a virtual column is part of a fulltext or spatial index. */ +bool +create_table_info_t::gcols_in_fulltext_or_spatial() +{ + for (ulint i = 0; i < m_form->s->keys; i++) { + const KEY* key = m_form->key_info + i; + if (!(key->flags & (HA_SPATIAL | HA_FULLTEXT))) { + continue; + } + for (ulint j = 0; j < key->user_defined_key_parts; j++) { + /* We do not support special (Fulltext or + Spatial) index on virtual columns */ + if (!key->key_part[j].field->stored_in_db()) { + my_error(ER_UNSUPPORTED_ACTION_ON_GENERATED_COLUMN, MYF(0)); + return true; + } + } + } + return false; +} + + +/** Prepare to create a new table in an InnoDB database. +@param[in] name Table name +@param[in] strict whether to validate the create options in strict mode +@return error number */ +int create_table_info_t::prepare_create_table(const char* name, bool strict) +{ + DBUG_ENTER("prepare_create_table"); + + ut_ad(m_thd != NULL); + ut_ad(m_create_info != NULL); + + set_tablespace_type(false); + + normalize_table_name(m_table_name, name); + + /* Validate table options not handled by the SQL-parser */ + if (check_table_options()) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + + /* Validate the create options if innodb_strict_mode is set. + Do not use the regular message for ER_ILLEGAL_HA_CREATE_OPTION + because InnoDB might actually support the option, but not under + the current conditions. The messages revealing the specific + problems are reported inside this function. */ + if (strict && create_options_are_invalid()) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + + /* Create the table flags and flags2 */ + if (!innobase_table_flags()) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + + if (high_level_read_only) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + if (gcols_in_fulltext_or_spatial()) { + DBUG_RETURN(HA_ERR_UNSUPPORTED); + } + + for (uint i = 0; i < m_form->s->keys; i++) { + const size_t max_field_len + = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(m_flags); + const KEY& key = m_form->key_info[i]; + + if (key.algorithm == HA_KEY_ALG_FULLTEXT) { + continue; + } + + if (too_big_key_part_length(max_field_len, key)) { + DBUG_RETURN(convert_error_code_to_mysql( + DB_TOO_BIG_INDEX_COL, m_flags, NULL)); + } + } + + DBUG_RETURN(parse_table_name(name)); +} + +/** Push warning message to the SQL layer based on foreign key constraint index +match error.
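+For example, FK_INDEX_NOT_FOUND means that the referenced table has no index starting with the referenced columns in the given order.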
+@param[in] trx Current transaction +@param[in] operation Operation ("Create" or "Alter") +@param[in] create_name Table name as specified in SQL +@param[in] fk_text Foreign key name or column list text for the message +@param[in] columns Foreign key column names array +@param[in] index_error Index error code +@param[in] err_col Column where error happened +@param[in] err_index Index where error happened +@param[in] table Table object */ +static void +foreign_push_index_error(trx_t* trx, const char* operation, + const char* create_name, const char* fk_text, + const char** columns, fkerr_t index_error, + ulint err_col, dict_index_t* err_index, + dict_table_t* table) +{ + switch (index_error) { + case FK_SUCCESS: + break; + case FK_INDEX_NOT_FOUND: + ib_foreign_warn(trx, DB_CANNOT_ADD_CONSTRAINT, create_name, + "%s table %s with foreign key %s constraint" + " failed. There is no index in the referenced" + " table where the referenced columns appear" + " as the first columns.", + operation, create_name, fk_text); + return; + case FK_IS_PREFIX_INDEX: + ib_foreign_warn( + trx, DB_CANNOT_ADD_CONSTRAINT, create_name, + "%s table %s with foreign key %s constraint" + " failed. There is only a prefix index in the referenced" + " table where the referenced columns appear" + " as the first columns.", + operation, create_name, fk_text); + return; + case FK_COL_NOT_NULL: + ib_foreign_warn( + trx, DB_CANNOT_ADD_CONSTRAINT, create_name, + "%s table %s with foreign key %s constraint" + " failed. You have defined a SET NULL condition but " + "column '%s' on index is defined as NOT NULL.", + operation, create_name, fk_text, columns[err_col]); + return; + case FK_COLS_NOT_EQUAL: + dict_field_t* field; + const char* col_name; + field = dict_index_get_nth_field(err_index, err_col); + + col_name = field->col->is_virtual() + ? "(null)" + : dict_table_get_col_name( + table, dict_col_get_no(field->col)); + ib_foreign_warn( + trx, DB_CANNOT_ADD_CONSTRAINT, create_name, + "%s table %s with foreign key %s constraint" + " failed. Field type or character set for column '%s' " + "does not match referenced column '%s'.", + operation, create_name, fk_text, columns[err_col], + col_name); + return; + } + DBUG_ASSERT("unknown error" == 0); +} + +/** Find column or virtual column in table by its name. +@param[in] table Table where column is searched +@param[in,out] name Name to search for; on success, overwritten with the column name in the dictionary's letter case +@retval true if found +@retval false if not found */ +static bool +find_col(dict_table_t* table, const char** name) +{ + ulint i; + for (i = 0; i < dict_table_get_n_cols(table); i++) { + + const char* col_name = dict_table_get_col_name(table, i); + + if (0 == innobase_strcasecmp(col_name, *name)) { + /* Found */ + strcpy((char*)*name, col_name); + return true; + } + } + + for (i = 0; i < dict_table_get_n_v_cols(table); i++) { + + const char* col_name = dict_table_get_v_col_name(table, i); + + if (0 == innobase_strcasecmp(col_name, *name)) { + /* Found */ + strcpy((char*)*name, col_name); + return true; + } + } + return false; +} + +/** Foreign key printer for error messages. Prints the FK name if it exists or +the key part list in the form (col1, col2, col3, ...)
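+For example: a named constraint prints as `fk_name`; an unnamed one prints its columns as (col1, col2, ...); output longer than MAX_TEXT (48) characters is truncated with "...".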
*/ +class key_text +{ + static const size_t MAX_TEXT = 48; + char buf[MAX_TEXT + 1]; + +public: + key_text(Key* key) + { + char* ptr = buf; + if (key->name.str) { + size_t len = std::min(key->name.length, MAX_TEXT - 2); + *(ptr++) = '`'; + memcpy(ptr, key->name.str, len); + ptr += len; + *(ptr++) = '`'; + *ptr = '\0'; + return; + } + *(ptr++) = '('; + List_iterator_fast<Key_part_spec> it(key->columns); + while (Key_part_spec* k = it++) { + /* 3 is the continuation ellipsis ("..."); + 2 is the comma separator (", ") when a next column exists; + 1 is the terminating ')' */ + if (MAX_TEXT - (size_t)(ptr - buf) + >= (it.peek() ? 3 + 2 + 1 : 3 + 1) + + k->field_name.length) { + memcpy(ptr, k->field_name.str, + k->field_name.length); + ptr += k->field_name.length; + if (it.peek()) { + *(ptr++) = ','; + *(ptr++) = ' '; + } + } else { + ut_ad((size_t)(ptr - buf) <= MAX_TEXT - 4); + memcpy(ptr, "...", 3); + ptr += 3; + break; + } + } + *(ptr++) = ')'; + *ptr = '\0'; + } + const char* str() { return buf; } +}; + +/** Create InnoDB foreign keys from MySQL alter_info. Collect all +dict_foreign_t items into local_fk_set and then add into system table. +@return DB_SUCCESS or specific error code */ +dberr_t +create_table_info_t::create_foreign_keys() +{ + dict_foreign_set local_fk_set; + dict_foreign_set_free local_fk_set_free(local_fk_set); + dberr_t error; + ulint number = 1; + static const unsigned MAX_COLS_PER_FK = 500; + const char* column_names[MAX_COLS_PER_FK]; + const char* ref_column_names[MAX_COLS_PER_FK]; + char create_name[MAX_DATABASE_NAME_LEN + 1 + + MAX_TABLE_NAME_LEN + 1]; + dict_index_t* index = NULL; + fkerr_t index_error = FK_SUCCESS; + dict_index_t* err_index = NULL; + ulint err_col; + const bool tmp_table = m_flags2 & DICT_TF2_TEMPORARY; + const CHARSET_INFO* cs = thd_charset(m_thd); + const char* operation = "Create "; + const char* name = m_table_name; + + enum_sql_command sqlcom = enum_sql_command(thd_sql_command(m_thd)); + + if (sqlcom == SQLCOM_ALTER_TABLE) { + dict_table_t* table_to_alter; + mem_heap_t* heap = mem_heap_create(10000); + ulint highest_id_so_far; + char* n = dict_get_referenced_table( + name, LEX_STRING_WITH_LEN(m_form->s->db), + LEX_STRING_WITH_LEN(m_form->s->table_name), + &table_to_alter, heap, cs); + + /* Starting from 4.0.18 and 4.1.2, we generate foreign key id's + in the format databasename/tablename_ibfk_[number], where + [number] is local to the table; look for the highest [number] + for table_to_alter, so that we can assign to new constraints + higher numbers. */ + + /* If we are altering a temporary table, the table name after + ALTER TABLE does not correspond to the internal table name, and + table_to_alter is NULL. TODO: should we fix this somehow?
*/ + + if (table_to_alter) { + n = table_to_alter->name.m_name; + highest_id_so_far = dict_table_get_highest_foreign_id( + table_to_alter); + } else { + highest_id_so_far = 0; + } + + char* bufend = innobase_convert_name( + create_name, sizeof create_name, n, strlen(n), m_thd); + create_name[bufend - create_name] = '\0'; + number = highest_id_so_far + 1; + mem_heap_free(heap); + operation = "Alter "; + } else if (strstr(name, "#P#") || strstr(name, "#p#")) { + /* Partitioned table */ + create_name[0] = '\0'; + } else { + char* bufend = innobase_convert_name(create_name, + sizeof create_name, + name, + strlen(name), m_thd); + create_name[bufend - create_name] = '\0'; + } + + Alter_info* alter_info = m_create_info->alter_info; + ut_ad(alter_info); + List_iterator_fast<Key> key_it(alter_info->key_list); + + dict_table_t* table = dict_sys.find_table({name, strlen(name)}); + if (!table) { + ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT, create_name, + "%s table %s foreign key constraint" + " failed. Table not found.", + operation, create_name); + + return (DB_CANNOT_ADD_CONSTRAINT); + } + + while (Key* key = key_it++) { + if (key->type != Key::FOREIGN_KEY || key->old) + continue; + + if (tmp_table) { + ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table `%s`.`%s` with foreign key " + "constraint failed. " + "Temporary tables can't have " + "foreign key constraints.", + operation, m_form->s->db.str, + m_form->s->table_name.str); + + return (DB_CANNOT_ADD_CONSTRAINT); + } else if (!*create_name) { + ut_ad("should be unreachable" == 0); + return DB_CANNOT_ADD_CONSTRAINT; + } + + Foreign_key* fk = static_cast<Foreign_key*>(key); + Key_part_spec* col; + bool success; + + dict_foreign_t* foreign = dict_mem_foreign_create(); + if (!foreign) { + return (DB_OUT_OF_MEMORY); + } + + List_iterator_fast<Key_part_spec> col_it(fk->columns); + unsigned i = 0, j = 0; + while ((col = col_it++)) { + column_names[i] = mem_heap_strdupl( + foreign->heap, col->field_name.str, + col->field_name.length); + success = find_col(table, column_names + i); + if (!success) { + key_text k(fk); + ib_foreign_warn( + m_trx, DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table %s foreign key %s constraint" + " failed. Column %s was not found.", + operation, create_name, k.str(), + column_names[i]); + dict_foreign_free(foreign); + return (DB_CANNOT_ADD_CONSTRAINT); + } + ++i; + if (i >= MAX_COLS_PER_FK) { + key_text k(fk); + ib_foreign_warn( + m_trx, DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table %s foreign key %s constraint" + " failed. Too many columns: %u (%u " + "allowed).", + operation, create_name, k.str(), i, + MAX_COLS_PER_FK); + dict_foreign_free(foreign); + return (DB_CANNOT_ADD_CONSTRAINT); + } + } + + index = dict_foreign_find_index( + table, NULL, column_names, i, NULL, TRUE, FALSE, + &index_error, &err_col, &err_index); + + if (!index) { + key_text k(fk); + foreign_push_index_error(m_trx, operation, create_name, + k.str(), column_names, + index_error, err_col, + err_index, table); + dict_foreign_free(foreign); + return (DB_CANNOT_ADD_CONSTRAINT); + } + + if (fk->constraint_name.str) { + ulint db_len; + + /* Catenate 'databasename/' to the constraint name + specified by the user: we conceive the constraint as + belonging to the same MySQL 'database' as the table + itself. We store the name to foreign->id.
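+For example, a constraint named fk1 on a table in database test is stored with the identifier "test/fk1".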
*/ + + db_len = dict_get_db_name_len(table->name.m_name); + + foreign->id = static_cast<char*>(mem_heap_alloc( + foreign->heap, + db_len + fk->constraint_name.length + 2)); + + memcpy(foreign->id, table->name.m_name, db_len); + foreign->id[db_len] = '/'; + strcpy(foreign->id + db_len + 1, + fk->constraint_name.str); + } + + if (foreign->id == NULL) { + error = dict_create_add_foreign_id( + &number, table->name.m_name, foreign); + if (error != DB_SUCCESS) { + dict_foreign_free(foreign); + return (error); + } + } + + std::pair<dict_foreign_set::iterator, bool> ret + = local_fk_set.insert(foreign); + + if (!ret.second) { + /* A duplicate foreign key name has been found */ + dict_foreign_free(foreign); + return (DB_CANNOT_ADD_CONSTRAINT); + } + + foreign->foreign_table = table; + foreign->foreign_table_name + = mem_heap_strdup(foreign->heap, table->name.m_name); + if (!foreign->foreign_table_name) { + return (DB_OUT_OF_MEMORY); + } + + dict_mem_foreign_table_name_lookup_set(foreign, TRUE); + + foreign->foreign_index = index; + foreign->n_fields = i & dict_index_t::MAX_N_FIELDS; + + foreign->foreign_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, i * sizeof(void*))); + if (!foreign->foreign_col_names) { + return (DB_OUT_OF_MEMORY); + } + + memcpy(foreign->foreign_col_names, column_names, + i * sizeof(void*)); + + foreign->referenced_table_name = dict_get_referenced_table( + name, LEX_STRING_WITH_LEN(fk->ref_db), + LEX_STRING_WITH_LEN(fk->ref_table), + &foreign->referenced_table, foreign->heap, cs); + + if (!foreign->referenced_table_name) { + return (DB_OUT_OF_MEMORY); + } + + if (!foreign->referenced_table && m_trx->check_foreigns) { + char buf[MAX_TABLE_NAME_LEN + 1] = ""; + char* bufend; + + bufend = innobase_convert_name( + buf, MAX_TABLE_NAME_LEN, + foreign->referenced_table_name, + strlen(foreign->referenced_table_name), m_thd); + buf[bufend - buf] = '\0'; + key_text k(fk); + ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table %s with foreign key %s " + "constraint failed. Referenced table " + "%s not found in the data dictionary.", + operation, create_name, k.str(), buf); + return (DB_CANNOT_ADD_CONSTRAINT); + } + + /* Don't allow foreign keys on partitioned tables yet. */ + if (foreign->referenced_table + && dict_table_is_partition(foreign->referenced_table)) { + /* How could a referenced table be a partition? */ + ut_ad(0); + my_error(ER_FEATURE_NOT_SUPPORTED_WITH_PARTITIONING, + MYF(0), "FOREIGN KEY"); + return (DB_CANNOT_ADD_CONSTRAINT); + } + + col_it.init(fk->ref_columns); + while ((col = col_it++)) { + ref_column_names[j] = mem_heap_strdupl( + foreign->heap, col->field_name.str, + col->field_name.length); + if (foreign->referenced_table) { + success = find_col(foreign->referenced_table, + ref_column_names + j); + if (!success) { + key_text k(fk); + ib_foreign_warn( + m_trx, + DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table %s foreign key %s " + "constraint failed.
" + "Column %s was not found.", + operation, create_name, + k.str(), ref_column_names[j]); + + return (DB_CANNOT_ADD_CONSTRAINT); + } + } + ++j; + } + /* See ER_WRONG_FK_DEF in mysql_prepare_create_table() */ + ut_ad(i == j); + + /* Try to find an index which contains the columns as the first + fields and in the right order, and the types are the same as in + foreign->foreign_index */ + + if (foreign->referenced_table) { + index = dict_foreign_find_index( + foreign->referenced_table, NULL, + ref_column_names, i, foreign->foreign_index, + TRUE, FALSE, &index_error, &err_col, + &err_index); + + if (!index) { + key_text k(fk); + foreign_push_index_error( + m_trx, operation, create_name, k.str(), + column_names, index_error, err_col, + err_index, foreign->referenced_table); + + return (DB_CANNOT_ADD_CONSTRAINT); + } + } else { + ut_a(m_trx->check_foreigns == FALSE); + index = NULL; + } + + foreign->referenced_index = index; + dict_mem_referenced_table_name_lookup_set(foreign, TRUE); + + foreign->referenced_col_names = static_cast( + mem_heap_alloc(foreign->heap, i * sizeof(void*))); + if (!foreign->referenced_col_names) { + return (DB_OUT_OF_MEMORY); + } + + memcpy(foreign->referenced_col_names, ref_column_names, + i * sizeof(void*)); + + if (fk->delete_opt == FK_OPTION_SET_NULL + || fk->update_opt == FK_OPTION_SET_NULL) { + for (j = 0; j < foreign->n_fields; j++) { + if ((dict_index_get_nth_col( + foreign->foreign_index, j) + ->prtype) + & DATA_NOT_NULL) { + const dict_col_t* col + = dict_index_get_nth_col( + foreign->foreign_index, + j); + const char* col_name + = dict_table_get_col_name( + foreign->foreign_index + ->table, + dict_col_get_no(col)); + + /* It is not sensible to define SET + NULL + if the column is not allowed to be + NULL! */ + key_text k(fk); + ib_foreign_warn( + m_trx, + DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table %s with foreign key " + "%s constraint failed. 
You have" + " defined a SET NULL condition " + "but column '%s' is defined as " + "NOT NULL.", + operation, create_name, + k.str(), col_name); + + return (DB_CANNOT_ADD_CONSTRAINT); + } + } + } + + switch (fk->delete_opt) { + case FK_OPTION_UNDEF: + case FK_OPTION_RESTRICT: + break; + case FK_OPTION_CASCADE: + foreign->type |= DICT_FOREIGN_ON_DELETE_CASCADE; + break; + case FK_OPTION_SET_NULL: + foreign->type |= DICT_FOREIGN_ON_DELETE_SET_NULL; + break; + case FK_OPTION_NO_ACTION: + foreign->type |= DICT_FOREIGN_ON_DELETE_NO_ACTION; + break; + case FK_OPTION_SET_DEFAULT: + // TODO: MDEV-10393 Foreign keys SET DEFAULT action + break; + default: + ut_ad(0); + break; + } + + switch (fk->update_opt) { + case FK_OPTION_UNDEF: + case FK_OPTION_RESTRICT: + break; + case FK_OPTION_CASCADE: + foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE; + break; + case FK_OPTION_SET_NULL: + foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL; + break; + case FK_OPTION_NO_ACTION: + foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION; + break; + case FK_OPTION_SET_DEFAULT: + // TODO: MDEV-10393 Foreign keys SET DEFAULT action + break; + default: + ut_ad(0); + break; + } + } + + if (dict_foreigns_has_s_base_col(local_fk_set, table)) { + return (DB_NO_FK_ON_S_BASE_COL); + } + + /**********************************************************/ + /* The following call adds the foreign key constraints + to the data dictionary system tables on disk */ + m_trx->op_info = "adding foreign keys"; + + trx_start_if_not_started_xa(m_trx, true); + + m_trx->dict_operation = true; + + error = dict_create_add_foreigns_to_dictionary(local_fk_set, table, + m_trx); + + if (error == DB_SUCCESS) { + + table->foreign_set.insert(local_fk_set.begin(), + local_fk_set.end()); + std::for_each(local_fk_set.begin(), local_fk_set.end(), + dict_foreign_add_to_referenced_table()); + local_fk_set.clear(); + + dict_mem_table_fill_foreign_vcol_set(table); + } + return (error); +} + +/** Create the internal innodb table. +@param create_fk whether to add FOREIGN KEY constraints */ +int create_table_info_t::create_table(bool create_fk) +{ + int error; + int primary_key_no; + uint i; + + DBUG_ENTER("create_table"); + + /* Look for a primary key */ + primary_key_no = (m_form->s->primary_key != MAX_KEY ? + (int) m_form->s->primary_key : -1); + + /* Our function innobase_get_mysql_key_number_for_index assumes + the primary key is always number 0, if it exists */ + ut_a(primary_key_no == -1 || primary_key_no == 0); + + error = create_table_def(); + + if (error) { + DBUG_RETURN(error); + } + + /* Create the keys */ + + if (m_form->s->keys == 0 || primary_key_no == -1) { + /* Create an index which is used as the clustered index; + order the rows by their row id which is internally generated + by InnoDB */ + ulint flags = m_table->flags; + dict_index_t* index = dict_mem_index_create( + m_table, innobase_index_reserve_name, + DICT_CLUSTERED, 0); + const ha_table_option_struct& o = *m_form->s->option_struct; + error = convert_error_code_to_mysql( + row_create_index_for_mysql( + index, m_trx, NULL, + fil_encryption_t(o.encryption), + uint32_t(o.encryption_key_id)), + flags, m_thd); + if (error) { + DBUG_RETURN(error); + } + } + + if (primary_key_no != -1) { + /* In InnoDB the clustered index must always be created + first */ + if ((error = create_index(m_trx, m_form, m_table, + (uint) primary_key_no))) { + DBUG_RETURN(error); + } + } + + /* Create the ancillary tables that are common to all FTS indexes on + this table. 
*/ + if (m_flags2 & DICT_TF2_FTS) { + fts_doc_id_index_enum ret; + + /* Check whether there already exists FTS_DOC_ID_INDEX */ + ret = innobase_fts_check_doc_id_index_in_def( + m_form->s->keys, m_form->key_info); + + switch (ret) { + case FTS_INCORRECT_DOC_ID_INDEX: + push_warning_printf(m_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_NAME_FOR_INDEX, + " InnoDB: Index name %s is reserved" + " for the unique index on" + " FTS_DOC_ID column for FTS" + " Document ID indexing" + " on table %s. Please check" + " the index definition to" + " make sure it is of the correct" + " type\n", + FTS_DOC_ID_INDEX_NAME, + m_table->name.m_name); + + if (m_table->fts) { + m_table->fts->~fts_t(); + m_table->fts = nullptr; + } + + my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), + FTS_DOC_ID_INDEX_NAME); + DBUG_RETURN(-1); + case FTS_EXIST_DOC_ID_INDEX: + case FTS_NOT_EXIST_DOC_ID_INDEX: + break; + } + + dberr_t err = fts_create_common_tables( + m_trx, m_table, + (ret == FTS_EXIST_DOC_ID_INDEX)); + + error = convert_error_code_to_mysql(err, 0, NULL); + + if (error) { + DBUG_RETURN(error); + } + } + + for (i = 0; i < m_form->s->keys; i++) { + if (i != uint(primary_key_no) + && (error = create_index(m_trx, m_form, m_table, i))) { + DBUG_RETURN(error); + } + } + + /* Cache all the FTS indexes on this table in the FTS specific + structure. They are used for FTS indexed column update handling. */ + if (m_flags2 & DICT_TF2_FTS) { + fts_t* fts = m_table->fts; + + ut_a(fts != NULL); + + dict_table_get_all_fts_indexes(m_table, fts->indexes); + } + + dberr_t err = create_fk ? create_foreign_keys() : DB_SUCCESS; + + if (err == DB_SUCCESS) { + const dict_err_ignore_t ignore_err = m_trx->check_foreigns + ? DICT_ERR_IGNORE_NONE : DICT_ERR_IGNORE_FK_NOKEY; + + /* Check that the referencing constraints are also OK */ + dict_names_t fk_tables; + err = dict_load_foreigns(m_table_name, nullptr, + m_trx->id, true, + ignore_err, fk_tables); + while (err == DB_SUCCESS && !fk_tables.empty()) { + dict_sys.load_table( + {fk_tables.front(), strlen(fk_tables.front())}, + ignore_err); + fk_tables.pop_front(); + } + } + + switch (err) { + case DB_PARENT_NO_INDEX: + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_CANNOT_ADD_FOREIGN, + "Create table '%s' with foreign key constraint" + " failed. There is no index in the referenced" + " table where the referenced columns appear" + " as the first columns.\n", m_table_name); + break; + + case DB_CHILD_NO_INDEX: + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_CANNOT_ADD_FOREIGN, + "Create table '%s' with foreign key constraint" + " failed. There is no index in the referencing" + " table where referencing columns appear" + " as the first columns.\n", m_table_name); + break; + case DB_NO_FK_ON_S_BASE_COL: + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_CANNOT_ADD_FOREIGN, + "Create table '%s' with foreign key constraint" + " failed. Cannot add foreign key constraint" + " placed on the base column of a stored" + " column.\n", + m_table_name); + default: + break; + } + + if (err != DB_SUCCESS) { + DBUG_RETURN(convert_error_code_to_mysql( + err, m_flags, NULL)); + } + + /* In TRUNCATE TABLE, we will merely warn about the maximum + row size being too large.
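+(TRUNCATE recreates the table through this code with create_fk=false, so the strict variant of the check below is disabled and an oversized row definition only raises a warning.)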
*/ + if (!row_size_is_acceptable(*m_table, create_fk)) { + DBUG_RETURN(convert_error_code_to_mysql( + DB_TOO_BIG_RECORD, m_flags, NULL)); + } + + DBUG_RETURN(0); +} + +bool create_table_info_t::row_size_is_acceptable( + const dict_table_t &table, bool strict) const +{ + for (dict_index_t *index= dict_table_get_first_index(&table); index; + index= dict_table_get_next_index(index)) + if (!row_size_is_acceptable(*index, strict)) + return false; + return true; +} + +dict_index_t::record_size_info_t dict_index_t::record_size_info() const +{ + ut_ad(!(type & DICT_FTS)); + + /* maximum allowed size of a node pointer record */ + ulint page_ptr_max; + const bool comp= table->not_redundant(); + /* table->space == NULL after DISCARD TABLESPACE */ + const ulint zip_size= dict_tf_get_zip_size(table->flags); + record_size_info_t result; + + if (zip_size && zip_size < srv_page_size) + { + /* On a ROW_FORMAT=COMPRESSED page, two records must fit in the + uncompressed page modification log. On compressed pages + with size.physical() == univ_page_size.physical(), + this limit will never be reached. */ + ut_ad(comp); + /* The maximum allowed record size is the size of + an empty page, minus a byte for recording the heap + number in the page modification log. The maximum + allowed node pointer size is half that. */ + result.max_leaf_size= page_zip_empty_size(n_fields, zip_size); + if (result.max_leaf_size) + { + result.max_leaf_size--; + } + page_ptr_max= result.max_leaf_size / 2; + /* On a compressed page, there is a two-byte entry in + the dense page directory for every record. But there + is no record header. */ + result.shortest_size= 2; + } + else + { + /* The maximum allowed record size is half a B-tree + page (or 16KiB for the 64KiB page size). No additional sparse + page directory entry will be generated for the first + few user records. */ + result.max_leaf_size= (comp || srv_page_size < UNIV_PAGE_SIZE_MAX) + ? page_get_free_space_of_empty(comp) / 2 + : REDUNDANT_REC_MAX_DATA_SIZE; + + page_ptr_max= result.max_leaf_size; + /* Each record has a header. */ + result.shortest_size= comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES; + } + + if (comp) + { + /* Include the "null" flags in the + maximum possible record size. */ + result.shortest_size+= UT_BITS_IN_BYTES(n_nullable); + } + else + { + /* For each column, include a 2-byte offset and a + "null" flag. The 1-byte format is only used in short + records that do not contain externally stored columns. + Such records could never exceed the page limit, even + when using the 2-byte format. */ + result.shortest_size+= 2 * n_fields; + } + + const ulint max_local_len= table->get_overflow_field_local_len(); + + /* Compute the maximum possible record size. */ + for (unsigned i= 0; i < n_fields; i++) + { + const dict_field_t &f= fields[i]; + const dict_col_t &col= *f.col; + + /* In dtuple_convert_big_rec(), variable-length columns + that are longer than BTR_EXTERN_LOCAL_STORED_MAX_SIZE + may be chosen for external storage. + + Fixed-length columns, and all columns of secondary + index records are always stored inline. */ + + /* Determine the maximum length of the index field. + The field_ext_max_size should be computed as the worst + case in rec_get_converted_size_comp() for + REC_STATUS_ORDINARY records.
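+For reference, with the default 16KiB page size the leaf-page limit computed above works out to 8126 bytes, and on a 4KiB page to 1982 bytes; these are the figures quoted in "Row size too large" errors.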
*/ + + size_t field_max_size= dict_col_get_fixed_size(&col, comp); + if (field_max_size && f.fixed_len != 0) + { + /* dict_index_add_col() should guarantee this */ + ut_ad(!f.prefix_len || f.fixed_len == f.prefix_len); + if (f.prefix_len) + field_max_size= f.prefix_len; + /* Fixed lengths are not encoded + in ROW_FORMAT=COMPACT. */ + goto add_field_size; + } + + field_max_size= dict_col_get_max_size(&col); + + if (f.prefix_len) + { + if (f.prefix_len < field_max_size) + { + field_max_size= f.prefix_len; + } + + /* These conditions were copied from dtuple_convert_big_rec(). */ + } + else if (field_max_size > max_local_len && + field_max_size > BTR_EXTERN_LOCAL_STORED_MAX_SIZE && + DATA_BIG_COL(&col) && dict_index_is_clust(this)) + { + + /* In the worst case, we have a locally stored + column of BTR_EXTERN_LOCAL_STORED_MAX_SIZE bytes. + The length can be stored in one byte. If the + column were stored externally, the lengths in + the clustered index page would be + BTR_EXTERN_FIELD_REF_SIZE and 2. */ + field_max_size= max_local_len; + } + + if (comp) + { + /* Add the extra size for ROW_FORMAT=COMPACT. + For ROW_FORMAT=REDUNDANT, these bytes were + added to result.shortest_size before this loop. */ + result.shortest_size+= field_max_size < 256 ? 1 : 2; + } + add_field_size: + result.shortest_size+= field_max_size; + + /* Check the size limit on leaf pages. */ + if (result.shortest_size >= result.max_leaf_size) + { + result.set_too_big(i); + } + + /* Check the size limit on non-leaf pages. Records + stored in non-leaf B-tree pages consist of the unique + columns of the record (the key columns of the B-tree) + and a node pointer field. When we have processed the + unique columns, result.shortest_size equals the size of the + node pointer record minus the node pointer column. */ + if (i + 1 == dict_index_get_n_unique_in_tree(this) && + result.shortest_size + REC_NODE_PTR_SIZE + (comp ? 0 : 2) >= + page_ptr_max) + { + result.set_too_big(i); + } + } + + return result; +} + +/** Issue a warning that the row is too big. */ +static void ib_warn_row_too_big(THD *thd, const dict_table_t *table) +{ + /* FIXME: this row size check should be improved */ + /* If prefix is true then a 768-byte prefix is stored + locally for BLOB fields. Refer to dict_table_get_format() */ + const bool prefix= !dict_table_has_atomic_blobs(table); + + const ulint free_space= + page_get_free_space_of_empty(table->flags & DICT_TF_COMPACT) / 2; + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_TO_BIG_ROW, + "Row size too large (> " ULINTPF "). Changing some columns to TEXT" + " or BLOB %smay help. In current row format, BLOB prefix of" + " %d bytes is stored inline.", + free_space, + prefix ? "or using ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED " : "", + prefix ? DICT_MAX_FIXED_COL_LEN : 0); +} + +bool create_table_info_t::row_size_is_acceptable( + const dict_index_t &index, bool strict) const +{ + if ((index.type & DICT_FTS) || index.table->is_system_db) + { + /* Skip the check for system tables, because the maximum + row size of innodb_table_stats cannot fit on a 4KiB page.
*/ + return true; + } + + const bool innodb_strict_mode= THDVAR(m_thd, strict_mode); + dict_index_t::record_size_info_t info= index.record_size_info(); + + if (info.row_is_too_big()) + { + ut_ad(info.get_overrun_size() != 0); + + const size_t idx= info.get_first_overrun_field_index(); + const dict_field_t *field= dict_index_get_nth_field(&index, idx); + + ut_ad((!field->name) == field->col->is_dropped()); + if (innodb_strict_mode || global_system_variables.log_warnings > 2) + { + ib::error_or_warn eow(strict && innodb_strict_mode); + if (field->name) + eow << "Cannot add field " << field->name << " in table "; + else + eow << "Cannot add an instantly dropped column in table "; + eow << "`" << m_form->s->db.str << "`.`" << m_form->s->table_name.str + << "`" " because after adding it, the row size is " + << info.get_overrun_size() + << " which is greater than maximum allowed size (" + << info.max_leaf_size << " bytes) for a record on index leaf page."; + } + + if (strict && innodb_strict_mode) + return false; + + ib_warn_row_too_big(m_thd, index.table); + } + + return true; +} + +void create_table_info_t::create_table_update_dict(dict_table_t *table, + THD *thd, + const HA_CREATE_INFO &info, + const TABLE &t) +{ + ut_ad(dict_sys.locked()); + + DBUG_ASSERT(table->get_ref_count()); + if (table->fts) + { + if (!table->fts_doc_id_index) + table->fts_doc_id_index= + dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME); + else + DBUG_ASSERT(table->fts_doc_id_index == + dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME)); + } + + DBUG_ASSERT(!table->fts == !table->fts_doc_id_index); + + innobase_copy_frm_flags_from_create_info(table, &info); + + /* Load the server stopword list into the FTS cache */ + if (table->flags2 & DICT_TF2_FTS && + innobase_fts_load_stopword(table, nullptr, thd)) + fts_optimize_add_table(table); + + if (const Field *ai = t.found_next_number_field) + { + ut_ad(ai->stored_in_db()); + ib_uint64_t autoinc= info.auto_increment_value; + if (autoinc == 0) + autoinc= 1; + + table->autoinc_mutex.wr_lock(); + dict_table_autoinc_initialize(table, autoinc); + + if (!table->is_temporary()) + { + const unsigned col_no= innodb_col_no(ai); + table->persistent_autoinc= static_cast<uint16_t> + (dict_table_get_nth_col_pos(table, col_no, nullptr) + 1) & + dict_index_t::MAX_N_FIELDS; + /* Persist the "last used" value, which typically is AUTO_INCREMENT - 1. + In btr_create(), the value 0 was already written. */ + if (--autoinc) + btr_write_autoinc(dict_table_get_first_index(table), autoinc); + } + + table->autoinc_mutex.wr_unlock(); + } + + innobase_parse_hint_from_comment(thd, table, t.s); +} + +/** Allocate a new trx. */ +void +create_table_info_t::allocate_trx() +{ + m_trx = innobase_trx_allocate(m_thd); + m_trx->will_lock = true; +} + +/** Create a new table in an InnoDB database. +@param[in] name Table name, format: "db/table_name". +@param[in] form Table format; columns and index information. +@param[in] create_info Create info (including create statement string).
+@param[in] file_per_table whether to create .ibd file +@param[in,out] trx dictionary transaction, or NULL to create new +@return error code +@retval 0 on success */ +int +ha_innobase::create(const char *name, TABLE *form, HA_CREATE_INFO *create_info, + bool file_per_table, trx_t *trx= nullptr) +{ + char norm_name[FN_REFLEN]; /* {database}/{tablename} */ + char remote_path[FN_REFLEN]; /* Absolute path of table */ + + DBUG_ENTER("ha_innobase::create"); + DBUG_ASSERT(form->s == table_share); + DBUG_ASSERT(table_share->table_type == TABLE_TYPE_SEQUENCE || + table_share->table_type == TABLE_TYPE_NORMAL); + + create_table_info_t info(ha_thd(), form, create_info, norm_name, + remote_path, file_per_table, trx); + + int error= info.initialize(); + if (!error) + error= info.prepare_create_table(name, !trx); + if (error) + DBUG_RETURN(error); + + const bool own_trx= !trx; + if (own_trx) + { + info.allocate_trx(); + trx= info.trx(); + DBUG_ASSERT(trx_state_eq(trx, TRX_STATE_NOT_STARTED)); + + if (!(info.flags2() & DICT_TF2_TEMPORARY)) + { + trx_start_for_ddl(trx); + if (dberr_t err= lock_sys_tables(trx)) + error= convert_error_code_to_mysql(err, 0, nullptr); + } + row_mysql_lock_data_dictionary(trx); + } + + if (!error) + error= info.create_table(own_trx); + + if (own_trx || (info.flags2() & DICT_TF2_TEMPORARY)) + { + if (error) + trx_rollback_for_mysql(trx); + else + { + std::vector<pfs_os_file_t> deleted; + trx->commit(deleted); + ut_ad(deleted.empty()); + info.table()->acquire(); + info.create_table_update_dict(info.table(), info.thd(), + *create_info, *form); + } + + if (own_trx) + { + row_mysql_unlock_data_dictionary(trx); + + if (!error) + { + dict_stats_update(info.table(), DICT_STATS_EMPTY_TABLE); + if (!info.table()->is_temporary()) + log_write_up_to(trx->commit_lsn, true); + info.table()->release(); + } + trx->free(); + } + } + else if (!error && m_prebuilt) + m_prebuilt->table= info.table(); + + DBUG_RETURN(error); +} + +/** Create a new table in an InnoDB database. +@param[in] name Table name, format: "db/table_name". +@param[in] form Table format; columns and index information. +@param[in] create_info Create info (including create statement string). +@return 0 if success else error number. */ +int ha_innobase::create(const char *name, TABLE *form, + HA_CREATE_INFO *create_info) +{ + return create(name, form, create_info, srv_file_per_table); +} + +/*****************************************************************//** +Discards or imports an InnoDB tablespace. +@return 0 == success, -1 == error */ + +int +ha_innobase::discard_or_import_tablespace( +/*======================================*/ + my_bool discard) /*!< in: TRUE if discard, else import */ +{ + + DBUG_ENTER("ha_innobase::discard_or_import_tablespace"); + + ut_a(m_prebuilt->trx != NULL); + ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N); + ut_a(m_prebuilt->trx == thd_to_trx(ha_thd())); + + if (is_read_only()) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + if (m_prebuilt->table->is_temporary()) { + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_CANNOT_DISCARD_TEMPORARY_TABLE); + + DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE); + } + + if (m_prebuilt->table->space == fil_system.sys_space) { + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_IN_SYSTEM_TABLESPACE, + m_prebuilt->table->name.m_name); + + DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE); + } + + trx_start_if_not_started(m_prebuilt->trx, true); + m_prebuilt->trx->dict_operation = true; + + /* Obtain an exclusive lock on the table.
*/ + dberr_t err = lock_table_for_trx(m_prebuilt->table, + m_prebuilt->trx, LOCK_X); + if (err == DB_SUCCESS) { + err = lock_sys_tables(m_prebuilt->trx); + } + + if (err != DB_SUCCESS) { + /* unable to lock the table: do nothing */ + m_prebuilt->trx->commit(); + } else if (discard) { + + /* Discarding an already discarded tablespace should be an + idempotent operation. Also, if the .ibd file is missing the + user may want to set the DISCARD flag in order to IMPORT + a new tablespace. */ + + if (!m_prebuilt->table->is_readable()) { + ib_senderrf( + m_prebuilt->trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_TABLESPACE_MISSING, + m_prebuilt->table->name.m_name); + } + + err = row_discard_tablespace_for_mysql( + m_prebuilt->table, m_prebuilt->trx); + } else if (m_prebuilt->table->is_readable()) { + /* Commit the transaction in order to + release the table lock. */ + trx_commit_for_mysql(m_prebuilt->trx); + + ib::error() << "Unable to import tablespace " + << m_prebuilt->table->name << " because it already" + " exists. Please DISCARD the tablespace" + " before IMPORT."; + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_EXISTS, m_prebuilt->table->name.m_name); + + DBUG_RETURN(HA_ERR_TABLE_EXIST); + } else { + err = row_import_for_mysql(m_prebuilt->table, m_prebuilt); + + if (err == DB_SUCCESS) { + + info(HA_STATUS_TIME + | HA_STATUS_CONST + | HA_STATUS_VARIABLE + | HA_STATUS_AUTO); + + fil_crypt_set_encrypt_tables(srv_encrypt_tables); + } + } + + ut_ad(m_prebuilt->trx->state == TRX_STATE_NOT_STARTED); + + if (discard || err != DB_SUCCESS) { + DBUG_RETURN(convert_error_code_to_mysql( + err, m_prebuilt->table->flags, NULL)); + } + + if (dict_stats_is_persistent_enabled(m_prebuilt->table)) { + dberr_t ret; + + /* Adjust the persistent statistics. 
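+(The freshly imported data invalidates any statistics collected before the tablespace was discarded, so a full persistent recalculation is requested.)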
*/ + ret = dict_stats_update(m_prebuilt->table, + DICT_STATS_RECALC_PERSISTENT); + + if (ret != DB_SUCCESS) { + push_warning_printf( + ha_thd(), + Sql_condition::WARN_LEVEL_WARN, + ER_ALTER_INFO, + "Error updating stats for table '%s'" + " after table rebuild: %s", + m_prebuilt->table->name.m_name, + ut_strerr(ret)); + } + } + + DBUG_RETURN(0); +} + + +/** DROP TABLE (possibly as part of DROP DATABASE, CREATE/ALTER TABLE) +@param name table name +@return error number */ +int ha_innobase::delete_table(const char *name) +{ + DBUG_ENTER("ha_innobase::delete_table"); + if (high_level_read_only) + DBUG_RETURN(HA_ERR_TABLE_READONLY); + + THD *thd= ha_thd(); + + DBUG_EXECUTE_IF("test_normalize_table_name_low", + test_normalize_table_name_low();); + DBUG_EXECUTE_IF("test_ut_format_name", test_ut_format_name();); + + trx_t *parent_trx= check_trx_exists(thd); + dict_table_t *table; + + { + char norm_name[FN_REFLEN]; + normalize_table_name(norm_name, name); + span<const char> n{norm_name, strlen(norm_name)}; + + dict_sys.lock(SRW_LOCK_CALL); + table= dict_sys.load_table(n, DICT_ERR_IGNORE_DROP); +#ifdef WITH_PARTITION_STORAGE_ENGINE + if (!table && lower_case_table_names == 1 && is_partition(norm_name)) + { + IF_WIN(normalize_table_name_c_low(norm_name, name, false), + innobase_casedn_str(norm_name)); + table= dict_sys.load_table(n, DICT_ERR_IGNORE_DROP); + } +#endif + if (!table) + { + dict_sys.unlock(); + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + } + + if (table->is_temporary()) + { + dict_sys.unlock(); + parent_trx->mod_tables.erase(table); /* CREATE...SELECT error handling */ + btr_drop_temporary_table(*table); + dict_sys.lock(SRW_LOCK_CALL); + dict_sys.remove(table); + dict_sys.unlock(); + DBUG_RETURN(0); + } + + table->acquire(); + dict_sys.unlock(); + + trx_t *trx= parent_trx; + dberr_t err= DB_SUCCESS; + if (!trx->lock.table_locks.empty() && + thd_ddl_options(trx->mysql_thd)->is_create_select()) + { + /* CREATE TABLE...PRIMARY KEY...SELECT ought to be dropping the + table because a duplicate key was detected or a timeout occurred. + + We shall hijack the existing transaction to drop the table and + commit the transaction. If this is a partitioned table, one + partition will use this hijacked transaction; others will use a + separate transaction, one per partition. */ + ut_ad(!trx->dict_operation_lock_mode); + ut_ad(trx->will_lock); + ut_ad(trx->state == TRX_STATE_ACTIVE); + trx->dict_operation= true; + } + else + { + trx= innobase_trx_allocate(thd); + trx_start_for_ddl(trx); + + if (table->name.is_temporary()) + /* There is no need to lock any FOREIGN KEY child tables. */; +#ifdef WITH_PARTITION_STORAGE_ENGINE + else if (table->name.part()) + /* FOREIGN KEY constraints cannot exist on partitioned tables.
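+(Hence only the plain, non-partitioned and non-temporary case below needs to X-lock the referencing child tables.)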
*/; +#endif + else + { + dict_sys.freeze(SRW_LOCK_CALL); + for (const dict_foreign_t* f : table->referenced_set) + if (dict_table_t* child= f->foreign_table) + if ((err= lock_table_for_trx(child, trx, LOCK_X)) != DB_SUCCESS) + break; + dict_sys.unfreeze(); + } + } + + dict_table_t *table_stats= nullptr, *index_stats= nullptr; + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + if (err == DB_SUCCESS) + err= lock_table_for_trx(table, trx, LOCK_X); + + const bool fts= err == DB_SUCCESS && + (table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)); + const enum_sql_command sqlcom= enum_sql_command(thd_sql_command(thd)); + + if (fts) + { + fts_optimize_remove_table(table); + purge_sys.stop_FTS(*table); + err= fts_lock_tables(trx, *table); + } + +#ifdef WITH_PARTITION_STORAGE_ENGINE + const bool rollback_add_partition= + (sqlcom == SQLCOM_ALTER_TABLE && table->name.part()); + + if (rollback_add_partition) + { + if (!fts) + purge_sys.stop_FTS(); + /* This looks like the rollback of ALTER TABLE...ADD PARTITION + that was caused by MDL timeout. We could have written undo log + for inserting the data into the new partitions. */ + if (table->stat_persistent != DICT_STATS_PERSISTENT_OFF) + { + /* We do not really know if we are holding MDL_EXCLUSIVE. Even + though this code is handling the case that we are not holding + it, we might actually hold it. We want to avoid a deadlock + with dict_stats_process_entry_from_recalc_pool(). */ + dict_stats_recalc_pool_del(table->id, true); + /* If statistics calculation is still using this table, we will + catch it below while waiting for purge to stop using this table. */ + } + } +#endif + + DEBUG_SYNC(thd, "before_delete_table_stats"); + + if (err == DB_SUCCESS && dict_stats_is_persistent_enabled(table) && + !table->is_stats_table()) + { + table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared<false>(table_stats, + thd, &mdl_table); + dict_sys.unfreeze(); + } + + index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared<false>(index_stats, + thd, &mdl_index); + dict_sys.unfreeze(); + } + + const bool skip_wait{table->name.is_temporary()}; + + if (table_stats && index_stats && + !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && + !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && + !(err= lock_table_for_trx(table_stats, trx, LOCK_X, skip_wait))) + err= lock_table_for_trx(index_stats, trx, LOCK_X, skip_wait); + + if (err != DB_SUCCESS && skip_wait) + { + /* We may skip deleting statistics if we cannot lock the tables, + when the table carries a temporary name. */ + ut_ad(err == DB_LOCK_WAIT); + ut_ad(trx->error_state == DB_SUCCESS); + err= DB_SUCCESS; + dict_table_close(table_stats, false, thd, mdl_table); + dict_table_close(index_stats, false, thd, mdl_index); + table_stats= nullptr; + index_stats= nullptr; + } + } + + if (err == DB_SUCCESS) + { + if (!table->space) + { + const char *data_dir_path= DICT_TF_HAS_DATA_DIR(table->flags) + ?
table->data_dir_path : nullptr; + char *path= fil_make_filepath(data_dir_path, table->name, CFG, + data_dir_path != nullptr); + os_file_delete_if_exists(innodb_data_file_key, path, nullptr); + ut_free(path); + path= fil_make_filepath(data_dir_path, table->name, IBD, + data_dir_path != nullptr); + os_file_delete_if_exists(innodb_data_file_key, path, nullptr); + ut_free(path); + if (data_dir_path) + { + path= fil_make_filepath(nullptr, table->name, ISL, false); + os_file_delete_if_exists(innodb_data_file_key, path, nullptr); + ut_free(path); + } + } + err= lock_sys_tables(trx); + } + + dict_sys.lock(SRW_LOCK_CALL); + + if (!table->release() && err == DB_SUCCESS) + { + /* Wait for purge threads to stop using the table. */ + for (uint n= 15;;) + { + dict_sys.unlock(); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + dict_sys.lock(SRW_LOCK_CALL); + + if (!--n) + { + err= DB_LOCK_WAIT_TIMEOUT; + break; + } + if (!table->get_ref_count()) + break; + } + } + + trx->dict_operation_lock_mode= true; + + if (err != DB_SUCCESS) + { +err_exit: + trx->dict_operation_lock_mode= false; + trx->rollback(); + switch (err) { + case DB_CANNOT_DROP_CONSTRAINT: + case DB_LOCK_WAIT_TIMEOUT: + break; + default: + ib::error() << "DROP TABLE " << table->name << ": " << err; + } + if (fts) + { + fts_optimize_add_table(table); + purge_sys.resume_FTS(); + } +#ifdef WITH_PARTITION_STORAGE_ENGINE + else if (rollback_add_partition) + purge_sys.resume_FTS(); +#endif + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + dict_sys.unlock(); + if (trx != parent_trx) + trx->free(); + DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL)); + } + + if (!table->no_rollback() && trx->check_foreigns) + { + const bool drop_db= sqlcom == SQLCOM_DROP_DB; + for (auto foreign : table->referenced_set) + { + /* We should allow dropping a referenced table if creating + that referenced table has failed for some reason. For example, + the referenced table may have been created, but the column + types that are referenced do not match.
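+(Without this allowance, such a half-created parent table would be undroppable, because the FOREIGN KEY metadata already names it as referenced.)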
*/ + if (foreign->foreign_table == table || + (drop_db && + dict_tables_have_same_db(table->name.m_name, + foreign->foreign_table_name_lookup))) + continue; + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(dict_foreign_err_file); + ut_print_timestamp(dict_foreign_err_file); + fputs(" Cannot drop table ", dict_foreign_err_file); + ut_print_name(dict_foreign_err_file, trx, table->name.m_name); + fputs("\nbecause it is referenced by ", dict_foreign_err_file); + ut_print_name(dict_foreign_err_file, trx, foreign->foreign_table_name); + putc('\n', dict_foreign_err_file); + mysql_mutex_unlock(&dict_foreign_err_mutex); + err= DB_CANNOT_DROP_CONSTRAINT; + goto err_exit; + } + } + + if (!table->no_rollback()) + err= trx->drop_table_foreign(table->name); + + if (err == DB_SUCCESS && table_stats && index_stats) + err= trx->drop_table_statistics(table->name); + if (err != DB_SUCCESS) + goto err_exit; + + err= trx->drop_table(*table); + if (err != DB_SUCCESS) + goto err_exit; + + std::vector<pfs_os_file_t> deleted; + trx->commit(deleted); + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + row_mysql_unlock_data_dictionary(trx); + for (pfs_os_file_t d : deleted) + os_file_close(d); + log_write_up_to(trx->commit_lsn, true); + if (trx != parent_trx) + trx->free(); + if (!fts) +#ifdef WITH_PARTITION_STORAGE_ENGINE + if (!rollback_add_partition) +#endif + DBUG_RETURN(0); + purge_sys.resume_FTS(); + DBUG_RETURN(0); +} + +/** Rename an InnoDB table. +@param[in,out] trx InnoDB data dictionary transaction +@param[in] from old table name +@param[in] to new table name +@param[in] use_fk whether to enforce FOREIGN KEY +@return DB_SUCCESS or error code */ +static dberr_t innobase_rename_table(trx_t *trx, const char *from, + const char *to, bool use_fk) +{ + dberr_t error; + char norm_to[FN_REFLEN]; + char norm_from[FN_REFLEN]; + + DBUG_ENTER("innobase_rename_table"); + DBUG_ASSERT(trx->dict_operation); + + ut_ad(!srv_read_only_mode); + + normalize_table_name(norm_to, to); + normalize_table_name(norm_from, from); + + DEBUG_SYNC_C("innodb_rename_table_ready"); + + ut_ad(trx->will_lock); + + error = row_rename_table_for_mysql(norm_from, norm_to, trx, use_fk); + + if (error != DB_SUCCESS) { + if (error == DB_TABLE_NOT_FOUND + && lower_case_table_names == 1) { + char* is_part = is_partition(norm_from); + + if (is_part) { + char par_case_name[FN_REFLEN]; +#ifndef _WIN32 + /* Check for the table using lower + case name, including the partition + separator "P" */ + strcpy(par_case_name, norm_from); + innobase_casedn_str(par_case_name); +#else + /* On Windows, check whether there + exists a table name in the system + tables whose name was not normalized + to lower case */ + normalize_table_name_c_low( + par_case_name, from, false); +#endif /* _WIN32 */ + trx_start_if_not_started(trx, true); + error = row_rename_table_for_mysql( + par_case_name, norm_to, trx, false); + } + } + + if (error == DB_SUCCESS) { +#ifndef _WIN32 + sql_print_warning("Rename partition table %s" + " succeeds after converting to lower" + " case. The table may have" + " been moved from a case" + " insensitive file system.\n", + norm_from); +#else + sql_print_warning("Rename partition table %s" + " succeeds after skipping the step to" + " lower case the table name."
+ " The table may have been" + " moved from a case sensitive" + " file system.\n", + norm_from); +#endif /* _WIN32 */ + } + } + + DBUG_RETURN(error); +} + +/** TRUNCATE TABLE +@return error code +@retval 0 on success */ +int ha_innobase::truncate() +{ + mariadb_set_stats set_stats_temporary(handler_stats); + DBUG_ENTER("ha_innobase::truncate"); + + update_thd(); + + if (is_read_only()) + DBUG_RETURN(HA_ERR_TABLE_READONLY); + + HA_CREATE_INFO info; + dict_table_t *ib_table= m_prebuilt->table; + info.init(); + update_create_info_from_table(&info, table); + switch (dict_tf_get_rec_format(ib_table->flags)) { + case REC_FORMAT_REDUNDANT: + info.row_type= ROW_TYPE_REDUNDANT; + break; + case REC_FORMAT_COMPACT: + info.row_type= ROW_TYPE_COMPACT; + break; + case REC_FORMAT_COMPRESSED: + info.row_type= ROW_TYPE_COMPRESSED; + break; + case REC_FORMAT_DYNAMIC: + info.row_type= ROW_TYPE_DYNAMIC; + break; + } + + const auto stored_lock= m_prebuilt->stored_select_lock_type; + trx_t *trx= innobase_trx_allocate(m_user_thd); + trx_start_for_ddl(trx); + + if (ib_table->is_temporary()) + { + info.options|= HA_LEX_CREATE_TMP_TABLE; + btr_drop_temporary_table(*ib_table); + m_prebuilt->table= nullptr; + row_prebuilt_free(m_prebuilt); + m_prebuilt= nullptr; + my_free(m_upd_buf); + m_upd_buf= nullptr; + m_upd_buf_size= 0; + + row_mysql_lock_data_dictionary(trx); + ib_table->release(); + dict_sys.remove(ib_table, false, true); + int err= create(ib_table->name.m_name, table, &info, true, trx); + row_mysql_unlock_data_dictionary(trx); + + ut_ad(!err); + if (!err) + { + err= open(ib_table->name.m_name, 0, 0); + m_prebuilt->table->release(); + m_prebuilt->stored_select_lock_type= stored_lock; + } + + trx->free(); + +#ifdef BTR_CUR_HASH_ADAPT + if (UT_LIST_GET_LEN(ib_table->freed_indexes)) + { + ib_table->vc_templ= nullptr; + ib_table->id= 0; + } + else +#endif /* BTR_CUR_HASH_ADAPT */ + dict_mem_table_free(ib_table); + + DBUG_RETURN(err); + } + + mem_heap_t *heap= mem_heap_create(1000); + + if (!ib_table->space) + ib_senderrf(m_user_thd, IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + dict_get_and_save_data_dir_path(ib_table); + info.data_file_name= ib_table->data_dir_path; + const char *temp_name= + dict_mem_create_temporary_tablename(heap, + ib_table->name.m_name, ib_table->id); + const char *name= mem_heap_strdup(heap, ib_table->name.m_name); + + dict_table_t *table_stats = nullptr, *index_stats = nullptr; + MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + + dberr_t error= DB_SUCCESS; + + dict_sys.freeze(SRW_LOCK_CALL); + for (const dict_foreign_t *f : ib_table->referenced_set) + if (dict_table_t *child= f->foreign_table) + if ((error= lock_table_for_trx(child, trx, LOCK_X)) != DB_SUCCESS) + break; + dict_sys.unfreeze(); + + if (error == DB_SUCCESS) + error= lock_table_for_trx(ib_table, trx, LOCK_X); + + const bool fts= error == DB_SUCCESS && + ib_table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + + if (fts) + { + fts_optimize_remove_table(ib_table); + purge_sys.stop_FTS(*ib_table); + error= fts_lock_tables(trx, *ib_table); + } + + /* Wait for purge threads to stop using the table. 
*/ + for (uint n = 15; ib_table->get_ref_count() > 1; ) + { + if (!--n) + { + error= DB_LOCK_WAIT_TIMEOUT; + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + if (error == DB_SUCCESS && dict_stats_is_persistent_enabled(ib_table) && + !ib_table->is_stats_table()) + { + table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared<false>(table_stats, m_user_thd, + &mdl_table); + dict_sys.unfreeze(); + } + index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared<false>(index_stats, m_user_thd, + &mdl_index); + dict_sys.unfreeze(); + } + + if (table_stats && index_stats && + !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && + !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && + !(error= lock_table_for_trx(table_stats, trx, LOCK_X))) + error= lock_table_for_trx(index_stats, trx, LOCK_X); + } + + if (error == DB_SUCCESS) + error= lock_sys_tables(trx); + + std::vector<pfs_os_file_t> deleted; + + row_mysql_lock_data_dictionary(trx); + + if (error == DB_SUCCESS) + { + error= innobase_rename_table(trx, ib_table->name.m_name, temp_name, false); + if (error == DB_SUCCESS) + error= trx->drop_table(*ib_table); + } + + int err = convert_error_code_to_mysql(error, ib_table->flags, m_user_thd); + const auto update_time = ib_table->update_time; + + if (err) + { + trx_rollback_for_mysql(trx); + if (fts) + fts_optimize_add_table(ib_table); + } + else + { + const auto def_trx_id= ib_table->def_trx_id; + ib_table->release(); + m_prebuilt->table= nullptr; + + err= create(name, table, &info, dict_table_is_file_per_table(ib_table), + trx); + if (!err) + { + m_prebuilt->table->acquire(); + create_table_info_t::create_table_update_dict(m_prebuilt->table, + m_user_thd, info, *table); + trx->commit(deleted); + } + else + { + trx_rollback_for_mysql(trx); + m_prebuilt->table= dict_table_open_on_name(name, true, + DICT_ERR_IGNORE_FK_NOKEY); + m_prebuilt->table->def_trx_id= def_trx_id; + } + dict_names_t fk_tables; + dict_load_foreigns(m_prebuilt->table->name.m_name, nullptr, 1, true, + DICT_ERR_IGNORE_FK_NOKEY, fk_tables); + for (const char *f : fk_tables) + dict_sys.load_table({f, strlen(f)}); + } + + if (fts) + purge_sys.resume_FTS(); + + row_mysql_unlock_data_dictionary(trx); + for (pfs_os_file_t d : deleted) os_file_close(d); + + if (!err) + { + dict_stats_update(m_prebuilt->table, DICT_STATS_EMPTY_TABLE); + log_write_up_to(trx->commit_lsn, true); + row_prebuilt_t *prebuilt= m_prebuilt; + uchar *upd_buf= m_upd_buf; + ulint upd_buf_size= m_upd_buf_size; + /* Mimic ha_innobase::close(). */ + m_prebuilt= nullptr; + m_upd_buf= nullptr; + m_upd_buf_size= 0; + + err= open(name, 0, 0); + if (!err) + { + m_prebuilt->stored_select_lock_type= stored_lock; + m_prebuilt->table->update_time= update_time; + row_prebuilt_free(prebuilt); + my_free(upd_buf); + } + else + { + /* Revert to the old table. */ + m_prebuilt= prebuilt; + m_upd_buf= upd_buf; + m_upd_buf_size= upd_buf_size; + } + } + + trx->free(); + + mem_heap_free(heap); + + if (table_stats) + dict_table_close(table_stats, false, m_user_thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, false, m_user_thd, mdl_index); + + DBUG_RETURN(err); +} + +/*********************************************************************//** +Renames an InnoDB table.
+@return 0 or error code */ + +int +ha_innobase::rename_table( +/*======================*/ + const char* from, /*!< in: old name of the table */ + const char* to) /*!< in: new name of the table */ +{ + THD* thd = ha_thd(); + + DBUG_ENTER("ha_innobase::rename_table"); + + if (high_level_read_only) { + ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + trx_t* trx = innobase_trx_allocate(thd); + trx_start_for_ddl(trx); + + dict_table_t *table_stats = nullptr, *index_stats = nullptr; + MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + char norm_from[MAX_FULL_NAME_LEN]; + char norm_to[MAX_FULL_NAME_LEN]; + + normalize_table_name(norm_from, from); + normalize_table_name(norm_to, to); + + dberr_t error = DB_SUCCESS; + const bool from_temp = dict_table_t::is_temporary_name(norm_from); + + if (from_temp) { + /* There is no need to lock any FOREIGN KEY child tables. */ + } else if (dict_table_t *table = dict_table_open_on_name( + norm_from, false, DICT_ERR_IGNORE_FK_NOKEY)) { + dict_sys.freeze(SRW_LOCK_CALL); + for (const dict_foreign_t* f : table->referenced_set) { + if (dict_table_t* child = f->foreign_table) { + error = lock_table_for_trx(child, trx, LOCK_X); + if (error != DB_SUCCESS) { + break; + } + } + } + dict_sys.unfreeze(); + if (error == DB_SUCCESS) { + error = lock_table_for_trx(table, trx, LOCK_X); + } + table->release(); + } + + if (strcmp(norm_from, TABLE_STATS_NAME) + && strcmp(norm_from, INDEX_STATS_NAME) + && strcmp(norm_to, TABLE_STATS_NAME) + && strcmp(norm_to, INDEX_STATS_NAME)) { + table_stats = dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats = dict_acquire_mdl_shared<false>( + table_stats, thd, &mdl_table); + dict_sys.unfreeze(); + } + index_stats = dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats = dict_acquire_mdl_shared<false>( + index_stats, thd, &mdl_index); + dict_sys.unfreeze(); + } + + if (error == DB_SUCCESS && table_stats && index_stats + && !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) + && !strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { + error = lock_table_for_trx(table_stats, trx, LOCK_X, + from_temp); + if (error == DB_SUCCESS) { + error = lock_table_for_trx(index_stats, trx, + LOCK_X, from_temp); + } + if (error != DB_SUCCESS && from_temp) { + ut_ad(error == DB_LOCK_WAIT); + ut_ad(trx->error_state == DB_SUCCESS); + error = DB_SUCCESS; + /* We may skip renaming statistics if + we cannot lock the tables, when the + table is being renamed from a + temporary name.
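+(Such #sql- prefixed intermediate names are used by ALTER TABLE while swapping the old and new versions of a table.)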
+                dict_table_close(table_stats, false, thd,
+                                 mdl_table);
+                dict_table_close(index_stats, false, thd,
+                                 mdl_index);
+                table_stats = nullptr;
+                index_stats = nullptr;
+            }
+        }
+    }
+
+    if (error == DB_SUCCESS) {
+        error = lock_table_for_trx(dict_sys.sys_tables, trx, LOCK_X);
+        if (error == DB_SUCCESS) {
+            error = lock_table_for_trx(dict_sys.sys_foreign, trx,
+                                       LOCK_X);
+            if (error == DB_SUCCESS) {
+                error = lock_table_for_trx(
+                    dict_sys.sys_foreign_cols,
+                    trx, LOCK_X);
+            }
+        }
+    }
+
+    row_mysql_lock_data_dictionary(trx);
+
+    if (error == DB_SUCCESS) {
+        error = innobase_rename_table(trx, from, to, true);
+    }
+
+    DEBUG_SYNC(thd, "after_innobase_rename_table");
+
+    if (error == DB_SUCCESS && table_stats && index_stats) {
+        error = dict_stats_rename_table(norm_from, norm_to, trx);
+        if (error == DB_DUPLICATE_KEY) {
+            /* The duplicate may also occur in
+            mysql.innodb_index_stats. */
+            my_error(ER_DUP_KEY, MYF(0),
+                     "mysql.innodb_table_stats");
+            error = DB_ERROR;
+        }
+    }
+
+    if (error == DB_SUCCESS) {
+        trx->flush_log_later = true;
+        innobase_commit_low(trx);
+    } else {
+        trx->rollback();
+    }
+
+    if (table_stats) {
+        dict_table_close(table_stats, true, thd, mdl_table);
+    }
+    if (index_stats) {
+        dict_table_close(index_stats, true, thd, mdl_index);
+    }
+    row_mysql_unlock_data_dictionary(trx);
+    if (error == DB_SUCCESS) {
+        log_write_up_to(trx->commit_lsn, true);
+    }
+    trx->flush_log_later = false;
+    trx->free();
+
+    if (error == DB_DUPLICATE_KEY) {
+        /* We are not able to deal with handler::get_dup_key()
+        during DDL operations, because the duplicate key would
+        exist in metadata tables, not in the user table. */
+        my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to);
+        error = DB_ERROR;
+    } else if (error == DB_LOCK_WAIT_TIMEOUT) {
+        my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0), to);
+        error = DB_LOCK_WAIT;
+    }
+
+    DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*********************************************************************//**
+Estimates the number of index records in a range.
+@return estimated number of rows */
+
+ha_rows
+ha_innobase::records_in_range(
+/*==========================*/
+    uint            keynr,      /*!< in: index number */
+    const key_range *min_key,   /*!< in: start key value of the
+                                range, may also be 0 */
+    const key_range *max_key,   /*!< in: range end key value, may
+                                also be 0 */
+    page_range      *pages)
+{
+    KEY*            key;
+    dict_index_t*   index;
+    dtuple_t*       range_start;
+    dtuple_t*       range_end;
+    ha_rows         n_rows;
+    page_cur_mode_t mode1;
+    page_cur_mode_t mode2;
+    mem_heap_t*     heap;
+
+    DBUG_ENTER("records_in_range");
+
+    ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
+
+    m_prebuilt->trx->op_info = "estimating records in index range";
+
+    active_index = keynr;
+
+    key = table->key_info + active_index;
+
+    index = innobase_get_index(keynr);
+
+    /* It is possible that we cannot find the requested
+    index due to an inconsistency between the MySQL and InnoDB dictionary info.
+ Necessary message should have been printed in innobase_get_index() */ + if (!m_prebuilt->table->space) { + n_rows = HA_POS_ERROR; + goto func_exit; + } + if (!index) { + n_rows = HA_POS_ERROR; + goto func_exit; + } + if (index->is_corrupted()) { + n_rows = HA_ERR_INDEX_CORRUPT; + goto func_exit; + } + if (!row_merge_is_index_usable(m_prebuilt->trx, index)) { + n_rows = HA_ERR_TABLE_DEF_CHANGED; + goto func_exit; + } + + heap = mem_heap_create(2 * (key->ext_key_parts * sizeof(dfield_t) + + sizeof(dtuple_t))); + + range_start = dtuple_create(heap, key->ext_key_parts); + dict_index_copy_types(range_start, index, key->ext_key_parts); + + range_end = dtuple_create(heap, key->ext_key_parts); + dict_index_copy_types(range_end, index, key->ext_key_parts); + + row_sel_convert_mysql_key_to_innobase( + range_start, + m_prebuilt->srch_key_val1, + m_prebuilt->srch_key_val_len, + index, + (byte*) (min_key ? min_key->key : (const uchar*) 0), + (ulint) (min_key ? min_key->length : 0)); + + DBUG_ASSERT(min_key + ? range_start->n_fields > 0 + : range_start->n_fields == 0); + + row_sel_convert_mysql_key_to_innobase( + range_end, + m_prebuilt->srch_key_val2, + m_prebuilt->srch_key_val_len, + index, + (byte*) (max_key ? max_key->key : (const uchar*) 0), + (ulint) (max_key ? max_key->length : 0)); + + DBUG_ASSERT(max_key + ? range_end->n_fields > 0 + : range_end->n_fields == 0); + + mode1 = convert_search_mode_to_innobase( + min_key ? min_key->flag : HA_READ_KEY_EXACT); + + mode2 = convert_search_mode_to_innobase( + max_key ? max_key->flag : HA_READ_KEY_EXACT); + + if (mode1 != PAGE_CUR_UNSUPP && mode2 != PAGE_CUR_UNSUPP) { + + if (dict_index_is_spatial(index)) { + /*Only min_key used in spatial index. */ + n_rows = rtr_estimate_n_rows_in_range( + index, range_start, mode1); + } else { + btr_pos_t tuple1(range_start, mode1, pages->first_page); + btr_pos_t tuple2(range_end, mode2, pages->last_page); + n_rows = btr_estimate_n_rows_in_range( + index, &tuple1, &tuple2); + pages->first_page= tuple1.page_id.raw(); + pages->last_page= tuple2.page_id.raw(); + } + } else { + + n_rows = HA_POS_ERROR; + } + + mem_heap_free(heap); + + DBUG_EXECUTE_IF( + "print_btr_estimate_n_rows_in_range_return_value", + push_warning_printf( + ha_thd(), Sql_condition::WARN_LEVEL_WARN, + ER_NO_DEFAULT, + "btr_estimate_n_rows_in_range(): %lld", + (longlong) n_rows); + ); + +func_exit: + + m_prebuilt->trx->op_info = (char*)""; + + /* The MySQL optimizer seems to believe an estimate of 0 rows is + always accurate and may return the result 'Empty set' based on that. + The accuracy is not guaranteed, and even if it were, for a locking + read we should anyway perform the search to set the next-key lock. + Add 1 to the value to make sure MySQL does not make the assumption! */ + + if (n_rows == 0) { + n_rows = 1; + } + + DBUG_RETURN((ha_rows) n_rows); +} + +/*********************************************************************//** +Gives an UPPER BOUND to the number of rows in a table. This is used in +filesort.cc. +@return upper bound of rows */ + +ha_rows +ha_innobase::estimate_rows_upper_bound() +/*====================================*/ +{ + const dict_index_t* index; + ulonglong estimate; + ulonglong local_data_file_length; + mariadb_set_stats set_stats_temporary(handler_stats); + DBUG_ENTER("estimate_rows_upper_bound"); + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. 
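+    (As a hypothetical illustration of the estimate computed below:
+    1000 leaf pages of 16KiB give a data file length of 16384000 bytes;
+    with a minimum clustered index record length of 64 bytes, the bound
+    is 2 * 16384000 / 64 = 512000 rows.)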
+    */
+
+    update_thd(ha_thd());
+
+    m_prebuilt->trx->op_info = "calculating upper bound for table rows";
+
+    index = dict_table_get_first_index(m_prebuilt->table);
+
+    ulint stat_n_leaf_pages = index->stat_n_leaf_pages;
+
+    ut_a(stat_n_leaf_pages > 0);
+
+    local_data_file_length = ulonglong(stat_n_leaf_pages)
+        << srv_page_size_shift;
+
+    /* Calculate a minimum length for a clustered index record and from
+    that an upper bound for the number of rows. Since we only calculate
+    new statistics in row0mysql.cc when a table has grown by a threshold
+    factor, we must add a safety factor 2 in front of the formula below. */
+
+    estimate = 2 * local_data_file_length
+        / dict_index_calc_min_rec_len(index);
+
+    m_prebuilt->trx->op_info = "";
+
+    /* Set num_rows less than MERGEBUFF to simulate the case where we do
+    not have enough space to merge the externally sorted file blocks. */
+    DBUG_EXECUTE_IF("set_num_rows_lt_MERGEBUFF",
+                    estimate = 2;
+                    DBUG_SET("-d,set_num_rows_lt_MERGEBUFF");
+    );
+
+    DBUG_RETURN((ha_rows) estimate);
+}
+
+/*********************************************************************//**
+How many seeks it will take to read through the table. This is to be
+comparable to the number returned by records_in_range so that we can
+decide if we should scan the table or use keys.
+@return estimated time measured in disk seeks */
+
+double
+ha_innobase::scan_time()
+/*====================*/
+{
+    /* Since MySQL seems to favor table scans too much over index
+    searches, we pretend that a sequential read takes the same time
+    as a random disk read, that is, we do not divide the following
+    by 10, which would be physically realistic. */
+
+    /* The locking below is disabled for performance reasons. Without
+    it we could end up returning an uninitialized value to the caller,
+    which in the worst case could make some query plan go bogus or
+    issue a Valgrind warning. */
+    if (m_prebuilt == NULL) {
+        /* In the case of a derived table, the optimizer may try to
+        fetch statistics for the table even before it has been
+        created or opened. In such cases, return a default estimate.
+        TODO: This can be improved to return a more accurate
+        estimate, but that would also need pre-population of the
+        stats structure. As of now, the approach is in sync with
+        MyISAM. */
+        return(ulonglong2double(stats.data_file_length) / IO_SIZE + 2);
+    }
+
+    ulint   stat_clustered_index_size;
+
+    ut_a(m_prebuilt->table->stat_initialized);
+
+    stat_clustered_index_size =
+        m_prebuilt->table->stat_clustered_index_size;
+
+    return((double) stat_clustered_index_size);
+}
+
+/******************************************************************//**
+Calculate the time it takes to read a set of ranges through an index.
+This enables us to optimise reads for clustered indexes.
+@return estimated time measured in disk seeks */
+
+double
+ha_innobase::read_time(
+/*===================*/
+    uint    index,  /*!< in: key number */
+    uint    ranges, /*!< in: how many ranges */
+    ha_rows rows)   /*!< in: estimated number of rows in the ranges */
+{
+    ha_rows total_rows;
+
+    if (index != table->s->primary_key) {
+        /* Not clustered */
+        return(handler::read_time(index, ranges, rows));
+    }
+
+    /* Assume that the read time is proportional to the scan time for all
+    rows + at most one seek per range.
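+    (Illustrative numbers: with scan_time() = 50.0, total_rows = 1000,
+    ranges = 1 and rows = 100, the return value below is
+    1 + (100/1000) * 50.0 = 6.0.)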
*/ + + double time_for_scan = scan_time(); + + if ((total_rows = estimate_rows_upper_bound()) < rows) { + + return(time_for_scan); + } + + return(ranges + (double) rows / (double) total_rows * time_for_scan); +} + +/*********************************************************************//** +Calculates the key number used inside MySQL for an Innobase index. +@return the key number used inside MySQL */ +static +unsigned +innobase_get_mysql_key_number_for_index( +/*====================================*/ + const TABLE* table, /*!< in: table in MySQL data + dictionary */ + dict_table_t* ib_table,/*!< in: table in InnoDB data + dictionary */ + const dict_index_t* index) /*!< in: index */ +{ + const dict_index_t* ind; + unsigned int i; + + /* If index does not belong to the table object of share structure + (ib_table comes from the share structure) search the index->table + object instead */ + if (index->table != ib_table) { + i = 0; + ind = dict_table_get_first_index(index->table); + + while (index != ind) { + ind = dict_table_get_next_index(ind); + i++; + } + + if (dict_index_is_auto_gen_clust(index)) { + ut_a(i > 0); + i--; + } + + return(i); + } + + /* Directly find matching index with information from mysql TABLE + structure and InnoDB dict_index_t list */ + for (i = 0; i < table->s->keys; i++) { + ind = dict_table_get_index_on_name( + ib_table, table->key_info[i].name.str); + + if (index == ind) { + return(i); + } + } + + /* Loop through each index of the table and lock them */ + for (ind = dict_table_get_first_index(ib_table); + ind != NULL; + ind = dict_table_get_next_index(ind)) { + if (index == ind) { + /* Temp index is internal to InnoDB, that is + not present in the MySQL index list, so no + need to print such mismatch warning. */ + if (index->is_committed()) { + sql_print_warning( + "Found index %s in InnoDB index list" + " but not its MariaDB index number." + " It could be an InnoDB internal" + " index.", + index->name()); + } + return(~0U); + } + } + + ut_error; + + return(~0U); +} + +/*********************************************************************//** +Calculate Record Per Key value. Need to exclude the NULL value if +innodb_stats_method is set to "nulls_ignored" +@return estimated record per key value */ +rec_per_key_t +innodb_rec_per_key( +/*===============*/ + dict_index_t* index, /*!< in: dict_index_t structure */ + ulint i, /*!< in: the column we are + calculating rec per key */ + ha_rows records) /*!< in: estimated total records */ +{ + rec_per_key_t rec_per_key; + ib_uint64_t n_diff; + + ut_a(index->table->stat_initialized); + + ut_ad(i < dict_index_get_n_unique(index)); + ut_ad(!dict_index_is_spatial(index)); + + if (records == 0) { + /* "Records per key" is meaningless for empty tables. + Return 1.0 because that is most convenient to the Optimizer. */ + return(1.0); + } + + n_diff = index->stat_n_diff_key_vals[i]; + + if (n_diff == 0) { + + rec_per_key = static_cast(records); + } else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) { + ib_uint64_t n_null; + ib_uint64_t n_non_null; + + n_non_null = index->stat_n_non_null_key_vals[i]; + + /* In theory, index->stat_n_non_null_key_vals[i] + should always be less than the number of records. + Since this is statistics value, the value could + have slight discrepancy. But we will make sure + the number of null values is not a negative number. 
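+        (Illustrative numbers: records = 100 and n_non_null = 90
+        give n_null = 10; with n_diff = 30 distinct non-NULL values,
+        the estimate below is (100 - 10) / (30 - 10) = 4.5 rows per
+        key value.)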
+        */
+        if (records < n_non_null) {
+            n_null = 0;
+        } else {
+            n_null = records - n_non_null;
+        }
+
+        /* If the number of NULL values is the same as or
+        larger than that of the distinct values, we could
+        consider that the table consists mostly of NULL
+        values. Set rec_per_key to 1. */
+        if (n_diff <= n_null) {
+            rec_per_key = 1.0;
+        } else {
+            /* Need to exclude rows with NULL values from
+            the rec_per_key calculation */
+            rec_per_key
+                = static_cast<rec_per_key_t>(records - n_null)
+                / static_cast<rec_per_key_t>(n_diff - n_null);
+        }
+    } else {
+        DEBUG_SYNC_C("after_checking_for_0");
+        rec_per_key = static_cast<rec_per_key_t>(records)
+            / static_cast<rec_per_key_t>(n_diff);
+    }
+
+    if (rec_per_key < 1.0) {
+        /* Values below 1.0 are meaningless and must be due to the
+        stats being imprecise. */
+        rec_per_key = 1.0;
+    }
+
+    return(rec_per_key);
+}
+
+/** Calculate how many KiB of new data we will be able to insert into the
+tablespace without running out of space. Start with a space object that has
+been acquired by the caller, who holds it for the duration of the calculation.
+@param[in]  space   tablespace object from fil_space_acquire()
+@return available space in KiB */
+static uintmax_t
+fsp_get_available_space_in_free_extents(const fil_space_t& space)
+{
+    ulint size_in_header = space.size_in_header;
+    if (size_in_header < FSP_EXTENT_SIZE) {
+        return 0;       /* TODO: count free frag pages and
+                        return a value based on that */
+    }
+
+    /* Below we play safe when counting free extents above the free limit:
+    some of them will contain extent descriptor pages, and therefore
+    will not be free extents */
+    ut_ad(size_in_header >= space.free_limit);
+    ulint n_free_up =
+        (size_in_header - space.free_limit) / FSP_EXTENT_SIZE;
+
+    const ulint size = space.physical_size();
+    if (n_free_up > 0) {
+        n_free_up--;
+        n_free_up -= n_free_up / (size / FSP_EXTENT_SIZE);
+    }
+
+    /* We reserve 1 extent + 0.5 % of the space size to undo logs
+    and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+    code is duplicated in the function above! */
+
+    ulint reserve = 2 + ((size_in_header / FSP_EXTENT_SIZE) * 2) / 200;
+    ulint n_free = space.free_len + n_free_up;
+
+    if (reserve > n_free) {
+        return(0);
+    }
+
+    return(static_cast<uintmax_t>(n_free - reserve)
+           * FSP_EXTENT_SIZE * (size / 1024));
+}
+
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
+
+int
+ha_innobase::info_low(
+/*==================*/
+    uint    flag,   /*!< in: what information is requested */
+    bool    is_analyze)
+{
+    dict_table_t*   ib_table;
+    ib_uint64_t     n_rows;
+    char            path[FN_REFLEN];
+    os_file_stat_t  stat_info;
+
+    DBUG_ENTER("info");
+
+    DEBUG_SYNC_C("ha_innobase_info_low");
+
+    /* If we are forcing recovery at a high level, we will suppress
+    statistics calculation on tables, because that may crash the
+    server if an index is badly corrupted. */
+
+    /* We do not know if MySQL can call this function before calling
+    external_lock(). To be safe, update the thd of the current table
+    handle.
*/ + + update_thd(ha_thd()); + + m_prebuilt->trx->op_info = "returning various info to MariaDB"; + + ib_table = m_prebuilt->table; + DBUG_ASSERT(ib_table->get_ref_count() > 0); + + if (!ib_table->is_readable()) { + ib_table->stats_mutex_lock(); + ib_table->stat_initialized = true; + ib_table->stat_n_rows = 0; + ib_table->stat_clustered_index_size = 0; + ib_table->stat_sum_of_other_index_sizes = 0; + ib_table->stats_mutex_unlock(); + } + + if (flag & HA_STATUS_TIME) { + if (is_analyze || innobase_stats_on_metadata) { + + dict_stats_upd_option_t opt; + dberr_t ret; + + m_prebuilt->trx->op_info = "updating table statistics"; + + if (dict_stats_is_persistent_enabled(ib_table)) { + if (is_analyze) { + if (!srv_read_only_mode) { + dict_stats_recalc_pool_del( + ib_table->id, false); + } + opt = DICT_STATS_RECALC_PERSISTENT; + } else { + /* This is e.g. 'SHOW INDEXES', fetch + the persistent stats from disk. */ + opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; + } + } else { + opt = DICT_STATS_RECALC_TRANSIENT; + } + + ret = dict_stats_update(ib_table, opt); + + if (ret != DB_SUCCESS) { + m_prebuilt->trx->op_info = ""; + DBUG_RETURN(HA_ERR_GENERIC); + } + + m_prebuilt->trx->op_info = + "returning various info to MariaDB"; + } + + + stats.update_time = (ulong) ib_table->update_time; + } + + dict_stats_init(ib_table); + + if (flag & HA_STATUS_VARIABLE) { + + ulint stat_clustered_index_size; + ulint stat_sum_of_other_index_sizes; + + ib_table->stats_mutex_lock(); + + ut_a(ib_table->stat_initialized); + + n_rows = ib_table->stat_n_rows; + + stat_clustered_index_size + = ib_table->stat_clustered_index_size; + + stat_sum_of_other_index_sizes + = ib_table->stat_sum_of_other_index_sizes; + + ib_table->stats_mutex_unlock(); + + /* + The MySQL optimizer seems to assume in a left join that n_rows + is an accurate estimate if it is zero. Of course, it is not, + since we do not have any locks on the rows yet at this phase. + Since SHOW TABLE STATUS seems to call this function with the + HA_STATUS_TIME flag set, while the left join optimizer does not + set that flag, we add one to a zero value if the flag is not + set. That way SHOW TABLE STATUS will show the best estimate, + while the optimizer never sees the table empty. */ + + if (n_rows == 0 && !(flag & (HA_STATUS_TIME | HA_STATUS_OPEN))) { + n_rows++; + } + + /* Fix bug#40386: Not flushing query cache after truncate. + n_rows can not be 0 unless the table is empty, set to 1 + instead. The original problem of bug#29507 is actually + fixed in the server code. */ + if (thd_sql_command(m_user_thd) == SQLCOM_TRUNCATE) { + + n_rows = 1; + + /* We need to reset the m_prebuilt value too, otherwise + checks for values greater than the last value written + to the table will fail and the autoinc counter will + not be updated. This will force write_row() into + attempting an update of the table's AUTOINC counter. 
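+        (For example, after TRUNCATE TABLE t the next INSERT must be
+        able to produce the first AUTO_INCREMENT value again; a stale
+        cached autoinc_last_value would defeat that check.)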
*/ + + m_prebuilt->autoinc_last_value = 0; + } + + stats.records = (ha_rows) n_rows; + stats.deleted = 0; + if (fil_space_t* space = ib_table->space) { + const ulint size = space->physical_size(); + stats.data_file_length + = ulonglong(stat_clustered_index_size) + * size; + stats.index_file_length + = ulonglong(stat_sum_of_other_index_sizes) + * size; + space->s_lock(); + stats.delete_length = 1024 + * fsp_get_available_space_in_free_extents( + *space); + space->s_unlock(); + } + stats.check_time = 0; + stats.mrr_length_per_rec= (uint)ref_length + 8; // 8 = max(sizeof(void *)); + + if (stats.records == 0) { + stats.mean_rec_length = 0; + } else { + stats.mean_rec_length = (ulong) + (stats.data_file_length / stats.records); + } + } + + if (flag & HA_STATUS_CONST) { + /* Verify the number of index in InnoDB and MySQL + matches up. If m_prebuilt->clust_index_was_generated + holds, InnoDB defines GEN_CLUST_INDEX internally */ + ulint num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes) + - m_prebuilt->clust_index_was_generated; + if (table->s->keys < num_innodb_index) { + /* If there are too many indexes defined + inside InnoDB, ignore those that are being + created, because MySQL will only consider + the fully built indexes here. */ + + for (const dict_index_t* index + = UT_LIST_GET_FIRST(ib_table->indexes); + index != NULL; + index = UT_LIST_GET_NEXT(indexes, index)) { + + /* First, online index creation is + completed inside InnoDB, and then + MySQL attempts to upgrade the + meta-data lock so that it can rebuild + the .frm file. If we get here in that + time frame, dict_index_is_online_ddl() + would not hold and the index would + still not be included in TABLE_SHARE. */ + if (!index->is_committed()) { + num_innodb_index--; + } + } + + if (table->s->keys < num_innodb_index + && innobase_fts_check_doc_id_index( + ib_table, NULL, NULL) + == FTS_EXIST_DOC_ID_INDEX) { + num_innodb_index--; + } + } + + if (table->s->keys != num_innodb_index) { + ib_table->dict_frm_mismatch = DICT_FRM_INCONSISTENT_KEYS; + ib_push_frm_error(m_user_thd, ib_table, table, num_innodb_index, true); + } + + snprintf(path, sizeof(path), "%s/%s%s", + mysql_data_home, table->s->normalized_path.str, + reg_ext); + + unpack_filename(path,path); + + /* Note that we do not know the access time of the table, + nor the CHECK TABLE time, nor the UPDATE or INSERT time. */ + + if (os_file_get_status( + path, &stat_info, false, + srv_read_only_mode) == DB_SUCCESS) { + stats.create_time = (ulong) stat_info.ctime; + } + + ib_table->stats_mutex_lock(); + auto _ = make_scope_exit([ib_table]() { + ib_table->stats_mutex_unlock(); }); + + ut_a(ib_table->stat_initialized); + + for (uint i = 0; i < table->s->keys; i++) { + ulong j; + + dict_index_t* index = innobase_get_index(i); + + if (index == NULL) { + ib_table->dict_frm_mismatch = DICT_FRM_INCONSISTENT_KEYS; + ib_push_frm_error(m_user_thd, ib_table, table, num_innodb_index, true); + break; + } + + KEY* key = &table->key_info[i]; + + for (j = 0; j < key->ext_key_parts; j++) { + + if ((key->flags & HA_FULLTEXT) + || (key->flags & HA_SPATIAL)) { + + /* The record per key does not apply to + FTS or Spatial indexes. */ + /* + key->rec_per_key[j] = 1; + key->set_records_per_key(j, 1.0); + */ + continue; + } + + if (j + 1 > index->n_uniq) { + sql_print_error( + "Index %s of %s has %u columns" + " unique inside InnoDB, but " + "server is asking statistics for" + " %lu columns. Have you mixed " + "up .frm files from different " + " installations? 
%s",
+                    index->name(),
+                    ib_table->name.m_name,
+                    index->n_uniq, j + 1,
+                    TROUBLESHOOTING_MSG);
+                break;
+            }
+
+            /* innodb_rec_per_key() will use
+            index->stat_n_diff_key_vals[] and the value we
+            pass index->table->stat_n_rows. Both are
+            calculated by ANALYZE and by the background
+            stats gathering thread (which kicks in when too
+            much of the table has been changed). In
+            addition table->stat_n_rows is adjusted with
+            each DML (e.g. ++ on row insert). Those
+            adjustments are not MVCC'ed and not even
+            reversed on rollback. So,
+            index->stat_n_diff_key_vals[] and
+            index->table->stat_n_rows could have been
+            calculated at different times. This is
+            acceptable. */
+
+            ulong rec_per_key_int = static_cast<ulong>(
+                innodb_rec_per_key(index, j,
+                                   stats.records));
+
+            /* Since MySQL seems to favor table scans
+            too much over index searches, we pretend
+            index selectivity is 2 times better than
+            our estimate: */
+
+            rec_per_key_int = rec_per_key_int / 2;
+
+            if (rec_per_key_int == 0) {
+                rec_per_key_int = 1;
+            }
+
+            key->rec_per_key[j] = rec_per_key_int;
+        }
+    }
+    }
+
+    if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
+
+        goto func_exit;
+
+    } else if (flag & HA_STATUS_ERRKEY) {
+        const dict_index_t* err_index;
+
+        ut_a(m_prebuilt->trx);
+        ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N);
+
+        err_index = trx_get_error_info(m_prebuilt->trx);
+
+        if (err_index) {
+            errkey = innobase_get_mysql_key_number_for_index(
+                table, ib_table, err_index);
+        } else {
+            errkey = (unsigned int) (
+                (m_prebuilt->trx->error_key_num
+                 == ULINT_UNDEFINED)
+                ? ~0U
+                : m_prebuilt->trx->error_key_num);
+        }
+    }
+
+    if ((flag & HA_STATUS_AUTO) && table->found_next_number_field) {
+        stats.auto_increment_value = innobase_peek_autoinc();
+    }
+
+func_exit:
+    m_prebuilt->trx->op_info = (char*)"";
+
+    DBUG_RETURN(0);
+}
+
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
+
+int
+ha_innobase::info(
+/*==============*/
+    uint    flag)   /*!< in: what information is requested */
+{
+    return(info_low(flag, false /* not ANALYZE */));
+}
+
+/*
+Updates index cardinalities of the table, based on random dives into
+each index tree. This does NOT calculate exact statistics on the table.
+@return HA_ADMIN_* error code or HA_ADMIN_OK */
+
+int
+ha_innobase::analyze(THD*, HA_CHECK_OPT*)
+{
+    /* Simply call info_low() with all the flags
+    and request recalculation of the statistics */
+    int ret = info_low(
+        HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE,
+        true /* this is ANALYZE */);
+
+    if (ret != 0) {
+        return(HA_ADMIN_FAILED);
+    }
+
+    return(HA_ADMIN_OK);
+}
+
+/*****************************************************************//**
+Defragment table.
+@return error number */
+inline int ha_innobase::defragment_table()
+{
+  for (dict_index_t *index= dict_table_get_first_index(m_prebuilt->table);
+       index; index= dict_table_get_next_index(index))
+  {
+    if (!index->is_btree())
+      continue;
+
+    if (btr_defragment_find_index(index))
+    {
+      // We borrow this error code. When the same index is already in
+      // the defragmentation queue, issuing another defragmentation
+      // only introduces overhead. We return an error here to let the
+      // user know this is not necessary. Note that this will fail a
+      // query that is trying to defragment a full table if one of the
+      // indices in that table is already being defragmented. We
+      // choose this behavior so that the user is aware of it rather
+      // than silently defragmenting the other indices of that table.
+      return ER_SP_ALREADY_EXISTS;
+    }
+
+    btr_pcur_t pcur;
+
+    mtr_t mtr;
+    mtr.start();
+    if (dberr_t err= pcur.open_leaf(true, index, BTR_SEARCH_LEAF, &mtr))
+    {
+      mtr.commit();
+      return convert_error_code_to_mysql(err, 0, m_user_thd);
+    }
+    else if (btr_pcur_get_block(&pcur)->page.id().page_no() == index->page)
+    {
+      mtr.commit();
+      continue;
+    }
+
+    btr_pcur_move_to_next(&pcur, &mtr);
+    btr_pcur_store_position(&pcur, &mtr);
+    mtr.commit();
+    ut_ad(pcur.index() == index);
+    const bool interrupted= btr_defragment_add_index(&pcur, m_user_thd);
+    ut_free(pcur.old_rec_buf);
+    if (interrupted)
+      return ER_QUERY_INTERRUPTED;
+  }
+
+  return 0;
+}
+
+/**********************************************************************//**
+This is mapped to "ALTER TABLE tablename ENGINE=InnoDB", which rebuilds
+the table in MySQL. */
+
+int
+ha_innobase::optimize(
+/*==================*/
+    THD*        thd,    /*!< in: connection thread handle */
+    HA_CHECK_OPT*)
+{
+
+    /* FTS-FIXME: Since MySQL doesn't support engine-specific commands,
+    we have to hijack some existing command in order to be able to test
+    the new admin commands added in InnoDB's FTS support. For now, we
+    use MySQL's OPTIMIZE command, normally mapped to ALTER TABLE in
+    InnoDB (so it recreates the table anew), and map it to OPTIMIZE.
+
+    This works OK otherwise, but MySQL locks the entire table during
+    calls to OPTIMIZE, which is undesirable. */
+    bool try_alter = true;
+
+    if (!m_prebuilt->table->is_temporary()
+        && m_prebuilt->table->is_readable()
+        && srv_defragment) {
+        int err = defragment_table();
+
+        if (err == 0) {
+            try_alter = false;
+        } else {
+            push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                                uint(err),
+                                "InnoDB: Cannot defragment table %s: returned error code %d\n",
+                                m_prebuilt->table->name.m_name, err);
+
+            if (err == ER_SP_ALREADY_EXISTS) {
+                try_alter = false;
+            }
+        }
+    }
+
+    if (innodb_optimize_fulltext_only) {
+        if (m_prebuilt->table->fts && m_prebuilt->table->fts->cache
+            && m_prebuilt->table->space) {
+            fts_sync_table(m_prebuilt->table);
+            fts_optimize_table(m_prebuilt->table);
+        }
+        try_alter = false;
+    }
+
+    return try_alter ? HA_ADMIN_TRY_ALTER : HA_ADMIN_OK;
+}
+
+/*******************************************************************//**
+Tries to check that an InnoDB table is not corrupted. If corruption is
+noticed, prints information about it to stderr. In case of corruption
+it may also assert a failure and crash the server.
+@return HA_ADMIN_CORRUPT or HA_ADMIN_OK */ + +int +ha_innobase::check( +/*===============*/ + THD* thd, /*!< in: user thread handle */ + HA_CHECK_OPT* check_opt) /*!< in: check options */ +{ + ulint n_rows; + ulint n_rows_in_table = ULINT_UNDEFINED; + bool is_ok = true; + dberr_t ret; + + DBUG_ENTER("ha_innobase::check"); + DBUG_ASSERT(thd == ha_thd()); + DBUG_ASSERT(thd == m_user_thd); + ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N); + ut_a(m_prebuilt->trx == thd_to_trx(thd)); + ut_ad(m_prebuilt->trx->mysql_thd == thd); + + if (m_prebuilt->mysql_template == NULL) { + /* Build the template; we will use a dummy template + in index scans done in checking */ + + build_template(true); + } + + if (!m_prebuilt->table->space) { + ib_senderrf( + thd, + IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + DBUG_RETURN(HA_ADMIN_CORRUPT); + } else if (!m_prebuilt->table->is_readable()) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + DBUG_RETURN(HA_ADMIN_CORRUPT); + } + + m_prebuilt->trx->op_info = "checking table"; + + uint old_isolation_level = m_prebuilt->trx->isolation_level; + + /* We must run the index record counts at an isolation level + >= READ COMMITTED, because a dirty read can see a wrong number + of records in some index; to play safe, we normally use + REPEATABLE READ here */ + m_prebuilt->trx->isolation_level = high_level_read_only + && !m_prebuilt->table->is_temporary() + ? TRX_ISO_READ_UNCOMMITTED + : TRX_ISO_REPEATABLE_READ; + + trx_start_if_not_started(m_prebuilt->trx, false); + m_prebuilt->trx->read_view.open(m_prebuilt->trx); + + for (dict_index_t* index + = dict_table_get_first_index(m_prebuilt->table); + index; + index = dict_table_get_next_index(index)) { + /* If this is an index being created or dropped, skip */ + if (!index->is_committed()) { + continue; + } + if (index->type & DICT_FTS) { + /* We do not check any FULLTEXT INDEX. */ + continue; + } + + if ((check_opt->flags & T_QUICK) || index->is_corrupted()) { + } else if (trx_id_t bulk_trx_id = + m_prebuilt->table->bulk_trx_id) { + if (!m_prebuilt->trx->read_view.changes_visible( + bulk_trx_id)) { + is_ok = true; + goto func_exit; + } + + if (btr_validate_index(index, m_prebuilt->trx) + != DB_SUCCESS) { + is_ok = false; + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The B-tree of" + " index %s is corrupted.", + index->name()); + continue; + } + } + + /* Instead of invoking change_active_index(), set up + a dummy template for non-locking reads, disabling + access to the clustered index. 
*/ + m_prebuilt->index = index; + + m_prebuilt->index_usable = row_merge_is_index_usable( + m_prebuilt->trx, m_prebuilt->index); + + DBUG_EXECUTE_IF( + "dict_set_index_corrupted", + if (!index->is_primary()) { + m_prebuilt->index_usable = FALSE; + dict_set_corrupted(index, + "dict_set_index_corrupted"); + }); + + if (UNIV_UNLIKELY(!m_prebuilt->index_usable)) { + if (index->is_corrupted()) { + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_INDEX_CORRUPT, + "InnoDB: Index %s is marked as" + " corrupted", + index->name()); + is_ok = false; + } else { + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_TABLE_DEF_CHANGED, + "InnoDB: Insufficient history for" + " index %s", + index->name()); + } + continue; + } + + m_prebuilt->sql_stat_start = TRUE; + m_prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE; + m_prebuilt->n_template = 0; + m_prebuilt->read_just_key = 0; + m_prebuilt->autoinc_error = DB_SUCCESS; + m_prebuilt->need_to_access_clustered = + !!(check_opt->flags & T_EXTEND); + + dtuple_set_n_fields(m_prebuilt->search_tuple, 0); + + m_prebuilt->select_lock_type = LOCK_NONE; + + /* Scan this index. */ + if (index->is_spatial()) { + ret = row_count_rtree_recs(m_prebuilt, &n_rows); + } else if (index->type & DICT_FTS) { + ret = DB_SUCCESS; + } else { + ret = row_check_index(m_prebuilt, &n_rows); + } + + DBUG_EXECUTE_IF( + "dict_set_index_corrupted", + if (!index->is_primary()) { + ret = DB_CORRUPTION; + }); + + if (ret == DB_INTERRUPTED || thd_killed(thd)) { + /* Do not report error since this could happen + during shutdown */ + break; + } + + if (ret == DB_SUCCESS + && m_prebuilt->autoinc_error != DB_MISSING_HISTORY) { + /* See if any non-fatal errors were reported. */ + ret = m_prebuilt->autoinc_error; + } + + if (ret != DB_SUCCESS) { + /* Assume some kind of corruption. */ + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The B-tree of" + " index %s is corrupted.", + index->name()); + is_ok = false; + dict_set_corrupted(index, "CHECK TABLE-check index"); + } + + + if (index == dict_table_get_first_index(m_prebuilt->table)) { + n_rows_in_table = n_rows; + } else if (!(index->type & DICT_FTS) + && (n_rows != n_rows_in_table)) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: Index '%-.200s' contains " ULINTPF + " entries, should be " ULINTPF ".", + index->name(), n_rows, n_rows_in_table); + is_ok = false; + dict_set_corrupted(index, "CHECK TABLE; Wrong count"); + } + } + + /* Restore the original isolation level */ + m_prebuilt->trx->isolation_level = old_isolation_level; +#ifdef BTR_CUR_HASH_ADAPT +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + /* We validate the whole adaptive hash index for all tables + at every CHECK TABLE only when QUICK flag is not present. */ + + if (!(check_opt->flags & T_QUICK) + && !btr_search_validate(m_prebuilt->trx->mysql_thd)) { + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The adaptive hash index is corrupted."); + is_ok = false; + } +# endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ +#endif /* BTR_CUR_HASH_ADAPT */ +func_exit: + m_prebuilt->trx->op_info = ""; + + DBUG_RETURN(is_ok ? HA_ADMIN_OK : HA_ADMIN_CORRUPT); +} + +/*******************************************************************//** +Gets the foreign key create info for a table stored in InnoDB. 
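+This is the text that the server appends to SHOW CREATE TABLE output.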
+@return own: character string in the form that can be inserted into the
+CREATE TABLE statement, MUST be freed with
+ha_innobase::free_foreign_key_create_info */
+
+char*
+ha_innobase::get_foreign_key_create_info(void)
+/*==========================================*/
+{
+    ut_a(m_prebuilt != NULL);
+
+    /* We do not know if MySQL can call this function before calling
+    external_lock(). To be safe, update the thd of the current table
+    handle. */
+
+    update_thd(ha_thd());
+
+    m_prebuilt->trx->op_info = "getting info on foreign keys";
+
+    /* Output the data to a temporary string */
+    std::string str = dict_print_info_on_foreign_keys(
+        TRUE, m_prebuilt->trx,
+        m_prebuilt->table);
+
+    m_prebuilt->trx->op_info = "";
+
+    /* Allocate buffer for the string */
+    char* fk_str = reinterpret_cast<char*>(
+        my_malloc(PSI_INSTRUMENT_ME, str.length() + 1, MYF(0)));
+
+    if (fk_str) {
+        memcpy(fk_str, str.c_str(), str.length());
+        fk_str[str.length()]='\0';
+    }
+
+    return(fk_str);
+}
+
+
+/***********************************************************************//**
+Maps an InnoDB foreign key constraint to an equivalent MySQL foreign key info.
+@return pointer to foreign key info */
+static
+FOREIGN_KEY_INFO*
+get_foreign_key_info(
+/*=================*/
+    THD*            thd,    /*!< in: user thread handle */
+    dict_foreign_t* foreign)/*!< in: foreign key constraint */
+{
+    FOREIGN_KEY_INFO    f_key_info;
+    FOREIGN_KEY_INFO*   pf_key_info;
+    uint                i = 0;
+    size_t              len;
+    char                tmp_buff[NAME_LEN+1];
+    char                name_buff[NAME_LEN+1];
+    const char*         ptr;
+    LEX_CSTRING*        referenced_key_name;
+    LEX_CSTRING*        name = NULL;
+
+    if (dict_table_t::is_temporary_name(foreign->foreign_table_name)) {
+        return NULL;
+    }
+
+    ptr = dict_remove_db_name(foreign->id);
+    f_key_info.foreign_id = thd_make_lex_string(
+        thd, 0, ptr, strlen(ptr), 1);
+
+    /* Name format: database name, '/', table name, '\0' */
+
+    /* Referenced (parent) database name */
+    len = dict_get_db_name_len(foreign->referenced_table_name);
+    ut_a(len < sizeof(tmp_buff));
+    memcpy(tmp_buff, foreign->referenced_table_name, len);
+    tmp_buff[len] = 0;
+
+    len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff));
+    f_key_info.referenced_db = thd_make_lex_string(
+        thd, 0, name_buff, len, 1);
+
+    /* Referenced (parent) table name */
+    ptr = dict_remove_db_name(foreign->referenced_table_name);
+    len = filename_to_tablename(ptr, name_buff, sizeof(name_buff), 1);
+    f_key_info.referenced_table = thd_make_lex_string(
+        thd, 0, name_buff, len, 1);
+
+    /* Dependent (child) database name */
+    len = dict_get_db_name_len(foreign->foreign_table_name);
+    ut_a(len < sizeof(tmp_buff));
+    memcpy(tmp_buff, foreign->foreign_table_name, len);
+    tmp_buff[len] = 0;
+
+    len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff));
+    f_key_info.foreign_db = thd_make_lex_string(
+        thd, 0, name_buff, len, 1);
+
+    /* Dependent (child) table name */
+    ptr = dict_remove_db_name(foreign->foreign_table_name);
+    len = filename_to_tablename(ptr, name_buff, sizeof(name_buff), 1);
+    f_key_info.foreign_table = thd_make_lex_string(
+        thd, 0, name_buff, len, 1);
+
+    do {
+        ptr = foreign->foreign_col_names[i];
+        name = thd_make_lex_string(thd, name, ptr,
+                                   strlen(ptr), 1);
+        f_key_info.foreign_fields.push_back(name);
+        ptr = foreign->referenced_col_names[i];
+        name = thd_make_lex_string(thd, name, ptr,
+                                   strlen(ptr), 1);
+        f_key_info.referenced_fields.push_back(name);
+    } while (++i < foreign->n_fields);
+
+    if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) {
+        f_key_info.delete_method = FK_OPTION_CASCADE;
+    } else if
(foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) { + f_key_info.delete_method = FK_OPTION_SET_NULL; + } else if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + f_key_info.delete_method = FK_OPTION_NO_ACTION; + } else { + f_key_info.delete_method = FK_OPTION_RESTRICT; + } + + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + f_key_info.update_method = FK_OPTION_CASCADE; + } else if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + f_key_info.update_method = FK_OPTION_SET_NULL; + } else if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + f_key_info.update_method = FK_OPTION_NO_ACTION; + } else { + f_key_info.update_method = FK_OPTION_RESTRICT; + } + + /* Load referenced table to update FK referenced key name. */ + if (foreign->referenced_table == NULL) { + + dict_table_t* ref_table = dict_table_open_on_name( + foreign->referenced_table_name_lookup, + true, DICT_ERR_IGNORE_NONE); + + if (ref_table == NULL) { + + if (!thd_test_options( + thd, OPTION_NO_FOREIGN_KEY_CHECKS)) { + ib::info() + << "Foreign Key referenced table " + << foreign->referenced_table_name + << " not found for foreign table " + << foreign->foreign_table_name; + } + } else { + dict_table_close(ref_table, true); + } + } + + if (foreign->referenced_index + && foreign->referenced_index->name != NULL) { + referenced_key_name = thd_make_lex_string( + thd, + f_key_info.referenced_key_name, + foreign->referenced_index->name, + strlen(foreign->referenced_index->name), + 1); + } else { + referenced_key_name = NULL; + } + + f_key_info.referenced_key_name = referenced_key_name; + + pf_key_info = (FOREIGN_KEY_INFO*) thd_memdup(thd, &f_key_info, + sizeof(FOREIGN_KEY_INFO)); + + return(pf_key_info); +} + +/*******************************************************************//** +Gets the list of foreign keys in this table. +@return always 0, that is, always succeeds */ + +int +ha_innobase::get_foreign_key_list( +/*==============================*/ + THD* thd, /*!< in: user thread handle */ + List* f_key_list) /*!< out: foreign key list */ +{ + update_thd(ha_thd()); + + m_prebuilt->trx->op_info = "getting list of foreign keys"; + + dict_sys.lock(SRW_LOCK_CALL); + + for (dict_foreign_set::iterator it + = m_prebuilt->table->foreign_set.begin(); + it != m_prebuilt->table->foreign_set.end(); + ++it) { + + FOREIGN_KEY_INFO* pf_key_info; + dict_foreign_t* foreign = *it; + + pf_key_info = get_foreign_key_info(thd, foreign); + + if (pf_key_info != NULL) { + f_key_list->push_back(pf_key_info); + } + } + + dict_sys.unlock(); + + m_prebuilt->trx->op_info = ""; + + return(0); +} + +/*******************************************************************//** +Gets the set of foreign keys where this table is the referenced table. 
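+In other words, lists the FOREIGN KEY constraints of child tables that
+point to this table.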
+@return always 0, that is, always succeeds */ + +int +ha_innobase::get_parent_foreign_key_list( +/*=====================================*/ + THD* thd, /*!< in: user thread handle */ + List* f_key_list) /*!< out: foreign key list */ +{ + update_thd(ha_thd()); + + m_prebuilt->trx->op_info = "getting list of referencing foreign keys"; + + dict_sys.freeze(SRW_LOCK_CALL); + + for (dict_foreign_set::iterator it + = m_prebuilt->table->referenced_set.begin(); + it != m_prebuilt->table->referenced_set.end(); + ++it) { + + FOREIGN_KEY_INFO* pf_key_info; + dict_foreign_t* foreign = *it; + + pf_key_info = get_foreign_key_info(thd, foreign); + + if (pf_key_info != NULL) { + f_key_list->push_back(pf_key_info); + } + } + + dict_sys.unfreeze(); + + m_prebuilt->trx->op_info = ""; + + return(0); +} + +/** Table list item structure is used to store only the table +and name. It is used by get_cascade_foreign_key_table_list to store +the intermediate result for fetching the table set. */ +struct table_list_item { + /** InnoDB table object */ + const dict_table_t* table; + /** Table name */ + const char* name; +}; + +/** @return whether ALTER TABLE may change the storage engine of the table */ +bool ha_innobase::can_switch_engines() +{ + DBUG_ENTER("ha_innobase::can_switch_engines"); + update_thd(); + DBUG_RETURN(m_prebuilt->table->foreign_set.empty() && + m_prebuilt->table->referenced_set.empty()); +} + +/*******************************************************************//** +Checks if a table is referenced by a foreign key. The MySQL manual states that +a REPLACE is either equivalent to an INSERT, or DELETE(s) + INSERT. Only a +delete is then allowed internally to resolve a duplicate key conflict in +REPLACE, not an update. +@return > 0 if referenced by a FOREIGN KEY */ + +uint ha_innobase::referenced_by_foreign_key() +{ + dict_sys.freeze(SRW_LOCK_CALL); + const bool empty= m_prebuilt->table->referenced_set.empty(); + dict_sys.unfreeze(); + return !empty; +} + +/*******************************************************************//** +Tells something additional to the handler about how to do things. +@return 0 or error number */ + +int +ha_innobase::extra( +/*===============*/ + enum ha_extra_function operation) + /*!< in: HA_EXTRA_FLUSH or some other flag */ +{ + /* Warning: since it is not sure that MariaDB calls external_lock() + before calling this function, m_prebuilt->trx can be obsolete! */ + trx_t* trx = check_trx_exists(ha_thd()); + + switch (operation) { + case HA_EXTRA_FLUSH: + if (m_prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(m_prebuilt); + } + break; + case HA_EXTRA_RESET_STATE: + reset_template(); + trx->duplicates = 0; + stmt_boundary: + trx->bulk_insert_apply(); + trx->end_bulk_insert(*m_prebuilt->table); + trx->bulk_insert = false; + break; + case HA_EXTRA_NO_KEYREAD: + m_prebuilt->read_just_key = 0; + break; + case HA_EXTRA_KEYREAD: + m_prebuilt->read_just_key = 1; + break; + case HA_EXTRA_KEYREAD_PRESERVE_FIELDS: + m_prebuilt->keep_other_fields_on_keyread = 1; + break; + case HA_EXTRA_INSERT_WITH_UPDATE: + trx->duplicates |= TRX_DUP_IGNORE; + goto stmt_boundary; + case HA_EXTRA_NO_IGNORE_DUP_KEY: + trx->duplicates &= ~TRX_DUP_IGNORE; + if (trx->is_bulk_insert()) { + /* Allow a subsequent INSERT into an empty table + if !unique_checks && !foreign_key_checks. 
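+        (That is, the bulk insert state created by SET
+        unique_checks=0, foreign_key_checks=0 before inserting into
+        an empty table.)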
*/ + if (dberr_t err = trx->bulk_insert_apply()) { + return err; + } + break; + } + goto stmt_boundary; + case HA_EXTRA_WRITE_CAN_REPLACE: + trx->duplicates |= TRX_DUP_REPLACE; + goto stmt_boundary; + case HA_EXTRA_WRITE_CANNOT_REPLACE: + trx->duplicates &= ~TRX_DUP_REPLACE; + if (trx->is_bulk_insert()) { + /* Allow a subsequent INSERT into an empty table + if !unique_checks && !foreign_key_checks. */ + break; + } + goto stmt_boundary; + case HA_EXTRA_BEGIN_ALTER_COPY: + m_prebuilt->table->skip_alter_undo = 1; + if (m_prebuilt->table->is_temporary() + || !m_prebuilt->table->versioned_by_id()) { + break; + } + ut_ad(trx == m_prebuilt->trx); + trx_start_if_not_started(trx, true); + trx->mod_tables.emplace( + const_cast(m_prebuilt->table), 0) + .first->second.set_versioned(0); + break; + case HA_EXTRA_END_ALTER_COPY: + m_prebuilt->table->skip_alter_undo = 0; + if (!m_prebuilt->table->is_temporary()) { + log_buffer_flush_to_disk(); + } + break; + default:/* Do nothing */ + ; + } + + return(0); +} + +/** +MySQL calls this method at the end of each statement */ +int +ha_innobase::reset() +{ + if (m_prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(m_prebuilt); + } + + reset_template(); + + m_ds_mrr.dsmrr_close(); + + /* TODO: This should really be reset in reset_template() but for now + it's safer to do it explicitly here. */ + + /* This is a statement level counter. */ + m_prebuilt->autoinc_last_value = 0; + + m_prebuilt->skip_locked = false; + return(0); +} + +/******************************************************************//** +MySQL calls this function at the start of each SQL statement inside LOCK +TABLES. Inside LOCK TABLES the ::external_lock method does not work to +mark SQL statement borders. Note also a special case: if a temporary table +is created inside LOCK TABLES, MySQL has not called external_lock() at all +on that table. +MySQL-5.0 also calls this before each statement in an execution of a stored +procedure. To make the execution more deterministic for binlogging, MySQL-5.0 +locks all tables involved in a stored procedure with full explicit table +locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the +procedure. +@return 0 or error code */ + +int +ha_innobase::start_stmt( +/*====================*/ + THD* thd, /*!< in: handle to the user thread */ + thr_lock_type lock_type) +{ + trx_t* trx = m_prebuilt->trx; + + DBUG_ENTER("ha_innobase::start_stmt"); + + update_thd(thd); + + ut_ad(m_prebuilt->table != NULL); + + trx = m_prebuilt->trx; + + /* Reset the AUTOINC statement level counter for multi-row INSERTs. */ + trx->n_autoinc_rows = 0; + + const auto sql_command = thd_sql_command(thd); + + m_prebuilt->hint_need_to_fetch_extra_cols = 0; + reset_template(); + + switch (sql_command) { + case SQLCOM_INSERT: + case SQLCOM_INSERT_SELECT: + if (trx->is_bulk_insert()) { + /* Allow a subsequent INSERT into an empty table + if !unique_checks && !foreign_key_checks. */ + break; + } + /* fall through */ + default: + trx->end_bulk_insert(*m_prebuilt->table); + if (!trx->bulk_insert) { + break; + } + + /* Trigger could've initiated another stmt. 
+ So apply all bulk operation and mark as + end bulk insert for all tables */ + trx->bulk_insert_apply(); + trx->end_bulk_insert(); + trx->bulk_insert = false; + trx->last_sql_stat_start.least_undo_no = trx->undo_no; + } + + m_prebuilt->sql_stat_start = TRUE; + + if (m_prebuilt->table->is_temporary() + && m_mysql_has_locked + && m_prebuilt->select_lock_type == LOCK_NONE) { + switch (sql_command) { + case SQLCOM_INSERT: + case SQLCOM_UPDATE: + case SQLCOM_DELETE: + case SQLCOM_REPLACE: + init_table_handle_for_HANDLER(); + m_prebuilt->select_lock_type = LOCK_X; + m_prebuilt->stored_select_lock_type = LOCK_X; + if (dberr_t error = row_lock_table(m_prebuilt)) { + DBUG_RETURN(convert_error_code_to_mysql( + error, 0, thd)); + } + break; + } + } + + if (!m_mysql_has_locked) { + /* This handle is for a temporary table created inside + this same LOCK TABLES; since MySQL does NOT call external_lock + in this case, we must use x-row locks inside InnoDB to be + prepared for an update of a row */ + + m_prebuilt->select_lock_type = LOCK_X; + + } else if (sql_command == SQLCOM_SELECT + && lock_type == TL_READ + && trx->isolation_level != TRX_ISO_SERIALIZABLE) { + + /* For other than temporary tables, we obtain + no lock for consistent read (plain SELECT). */ + + m_prebuilt->select_lock_type = LOCK_NONE; + } else { + /* Not a consistent read: restore the + select_lock_type value. The value of + stored_select_lock_type was decided in: + 1) ::store_lock(), + 2) ::external_lock(), + 3) ::init_table_handle_for_HANDLER(). */ + + ut_a(m_prebuilt->stored_select_lock_type != LOCK_NONE_UNSET); + + m_prebuilt->select_lock_type = + m_prebuilt->stored_select_lock_type; + } + + *trx->detailed_error = 0; + + innobase_register_trx(ht, thd, trx); + + if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + DBUG_RETURN(0); +} + +/******************************************************************//** +Maps a MySQL trx isolation level code to the InnoDB isolation level code +@return InnoDB isolation level */ +static inline +uint +innobase_map_isolation_level( +/*=========================*/ + enum_tx_isolation iso) /*!< in: MySQL isolation level code */ +{ + if (UNIV_UNLIKELY(srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) + || UNIV_UNLIKELY(srv_read_only_mode)) { + return TRX_ISO_READ_UNCOMMITTED; + } + switch (iso) { + case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ); + case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED); + case ISO_SERIALIZABLE: return(TRX_ISO_SERIALIZABLE); + case ISO_READ_UNCOMMITTED: return(TRX_ISO_READ_UNCOMMITTED); + } + + ut_error; + + return(0); +} + +/******************************************************************//** +As MySQL will execute an external lock for every new table it uses when it +starts to process an SQL statement (an exception is when MySQL calls +start_stmt for the handle) we can use this function to store the pointer to +the THD in the handle. We will also use this function to communicate +to InnoDB that a new SQL statement has started and that we must store a +savepoint to our transaction handle, so that we are able to roll back +the SQL statement in case of an error. 
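+For example, for a single-table statement the SQL layer calls
+external_lock(thd, F_WRLCK or F_RDLCK) before any rows are accessed and
+external_lock(thd, F_UNLCK) when the statement ends; outside of LOCK TABLES,
+the F_UNLCK call is how InnoDB detects the statement boundary.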
+@return 0 */ + +int +ha_innobase::external_lock( +/*=======================*/ + THD* thd, /*!< in: handle to the user thread */ + int lock_type) /*!< in: lock type */ +{ + DBUG_ENTER("ha_innobase::external_lock"); + DBUG_PRINT("enter",("lock_type: %d", lock_type)); + + update_thd(thd); + trx_t* trx = m_prebuilt->trx; + ut_ad(m_prebuilt->table); + + /* Statement based binlogging does not work in isolation level + READ UNCOMMITTED and READ COMMITTED since the necessary + locks cannot be taken. In this case, we print an + informative error message and return with an error. + Note: decide_logging_format would give the same error message, + except it cannot give the extra details. */ + + if (lock_type == F_WRLCK + && !(table_flags() & HA_BINLOG_STMT_CAPABLE) + && thd_binlog_format(thd) == BINLOG_FORMAT_STMT + && thd_binlog_filter_ok(thd) + && thd_sqlcom_can_generate_row_events(thd)) { + bool skip = false; +#ifdef WITH_WSREP + skip = trx->is_wsrep() && !wsrep_thd_is_local(thd); +#endif /* WITH_WSREP */ + /* used by test case */ + DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = true;); + + if (!skip) { + my_error(ER_BINLOG_STMT_MODE_AND_ROW_ENGINE, MYF(0), + " InnoDB is limited to row-logging when" + " transaction isolation level is" + " READ COMMITTED or READ UNCOMMITTED."); + + DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE); + } + } + + const auto sql_command = thd_sql_command(thd); + + /* Check for UPDATEs in read-only mode. */ + if (srv_read_only_mode) { + switch (sql_command) { + case SQLCOM_CREATE_TABLE: + if (lock_type != F_WRLCK) { + break; + } + /* fall through */ + case SQLCOM_UPDATE: + case SQLCOM_INSERT: + case SQLCOM_REPLACE: + case SQLCOM_DROP_TABLE: + case SQLCOM_ALTER_TABLE: + case SQLCOM_OPTIMIZE: + case SQLCOM_CREATE_INDEX: + case SQLCOM_DROP_INDEX: + case SQLCOM_CREATE_SEQUENCE: + case SQLCOM_DROP_SEQUENCE: + case SQLCOM_DELETE: + ib_senderrf(thd, IB_LOG_LEVEL_WARN, + ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + } + + m_prebuilt->sql_stat_start = TRUE; + m_prebuilt->hint_need_to_fetch_extra_cols = 0; + + reset_template(); + switch (sql_command) { + case SQLCOM_INSERT: + case SQLCOM_INSERT_SELECT: + if (trx->is_bulk_insert()) { + /* Allow a subsequent INSERT into an empty table + if !unique_checks && !foreign_key_checks. */ + break; + } + /* fall through */ + default: + trx->end_bulk_insert(*m_prebuilt->table); + if (!trx->bulk_insert) { + break; + } + trx->bulk_insert = false; + trx->last_sql_stat_start.least_undo_no = trx->undo_no; + } + + switch (m_prebuilt->table->quiesce) { + case QUIESCE_START: + /* Check for FLUSH TABLE t WITH READ LOCK; */ + if (!srv_read_only_mode + && sql_command == SQLCOM_FLUSH + && lock_type == F_RDLCK) { + + if (!m_prebuilt->table->space) { + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + DBUG_RETURN(HA_ERR_TABLESPACE_MISSING); + } + + row_quiesce_table_start(m_prebuilt->table, trx); + + /* Use the transaction instance to track UNLOCK + TABLES. It can be done via START TRANSACTION; too + implicitly. */ + + ++trx->flush_tables; + } + break; + + case QUIESCE_COMPLETE: + /* Check for UNLOCK TABLES; implicit or explicit + or trx interruption. */ + if (trx->flush_tables > 0 + && (lock_type == F_UNLCK || trx_is_interrupted(trx))) { + + row_quiesce_table_complete(m_prebuilt->table, trx); + + ut_a(trx->flush_tables > 0); + --trx->flush_tables; + } + + break; + + case QUIESCE_NONE: + break; + } + + if (lock_type == F_WRLCK) { + + /* If this is a SELECT, then it is in UPDATE TABLE ... 
+ or SELECT ... FOR UPDATE */ + m_prebuilt->select_lock_type = LOCK_X; + m_prebuilt->stored_select_lock_type = LOCK_X; + } + + if (lock_type != F_UNLCK) { + /* MySQL is setting a new table lock */ + + *trx->detailed_error = 0; + + innobase_register_trx(ht, thd, trx); + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE + && m_prebuilt->select_lock_type == LOCK_NONE + && thd_test_options( + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + /* To get serializable execution, we let InnoDB + conceptually add 'LOCK IN SHARE MODE' to all SELECTs + which otherwise would have been consistent reads. An + exception is consistent reads in the AUTOCOMMIT=1 mode: + we know that they are read-only transactions, and they + can be serialized also if performed as consistent + reads. */ + + m_prebuilt->select_lock_type = LOCK_S; + m_prebuilt->stored_select_lock_type = LOCK_S; + } + + /* Starting from 4.1.9, no InnoDB table lock is taken in LOCK + TABLES if AUTOCOMMIT=1. It does not make much sense to acquire + an InnoDB table lock if it is released immediately at the end + of LOCK TABLES, and InnoDB's table locks in that case cause + VERY easily deadlocks. + + We do not set InnoDB table locks if user has not explicitly + requested a table lock. Note that thd_in_lock_tables(thd) + can hold in some cases, e.g., at the start of a stored + procedure call (SQLCOM_CALL). */ + + if (m_prebuilt->select_lock_type != LOCK_NONE) { + + if (sql_command == SQLCOM_LOCK_TABLES + && THDVAR(thd, table_locks) + && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT) + && thd_in_lock_tables(thd)) { + + dberr_t error = row_lock_table(m_prebuilt); + + if (error != DB_SUCCESS) { + + DBUG_RETURN( + convert_error_code_to_mysql( + error, 0, thd)); + } + } + + trx->mysql_n_tables_locked++; + } + + trx->n_mysql_tables_in_use++; + m_mysql_has_locked = true; + + if (!trx_is_started(trx) + && (m_prebuilt->select_lock_type != LOCK_NONE + || m_prebuilt->stored_select_lock_type != LOCK_NONE)) { + + trx->will_lock = true; + } + + DBUG_RETURN(0); + } else { + DEBUG_SYNC_C("ha_innobase_end_statement"); + } + + /* MySQL is releasing a table lock */ + + trx->n_mysql_tables_in_use--; + m_mysql_has_locked = false; + + /* If the MySQL lock count drops to zero we know that the current SQL + statement has ended */ + + if (trx->n_mysql_tables_in_use == 0) { + + trx->mysql_n_tables_locked = 0; + m_prebuilt->used_in_HANDLER = FALSE; + + if (!thd_test_options( + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + if (trx_is_started(trx)) { + + innobase_commit(ht, thd, TRUE); + } + + } else if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + trx->read_view.close(); + } + } + + if (!trx_is_started(trx) + && lock_type != F_UNLCK + && (m_prebuilt->select_lock_type != LOCK_NONE + || m_prebuilt->stored_select_lock_type != LOCK_NONE)) { + + trx->will_lock = true; + } + + DBUG_RETURN(0); +} + +/************************************************************************//** +Here we export InnoDB status variables to MySQL. */ +static +void +innodb_export_status() +/*==================*/ +{ + if (srv_was_started) { + srv_export_innodb_status(); + } +} + +/************************************************************************//** +Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the +InnoDB Monitor to the client. 
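+At most MAX_STATUS_SIZE (1MiB) of monitor output is returned; longer output
+is truncated, preferably by omitting the beginning of the list of active
+transactions rather than the end of the report.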
+@return 0 on success */ +static +int +innodb_show_status( +/*===============*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of the caller */ + stat_print_fn* stat_print) +{ + static const char truncated_msg[] = "... truncated...\n"; + const long MAX_STATUS_SIZE = 1048576; + ulint trx_list_start = ULINT_UNDEFINED; + ulint trx_list_end = ULINT_UNDEFINED; + bool ret_val; + + DBUG_ENTER("innodb_show_status"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + /* We don't create the temp files or associated + mutexes in read-only-mode */ + + if (srv_read_only_mode) { + DBUG_RETURN(0); + } + + purge_sys.wake_if_not_active(); + + /* We let the InnoDB Monitor to output at most MAX_STATUS_SIZE + bytes of text. */ + + char* str; + size_t flen; + + mysql_mutex_lock(&srv_monitor_file_mutex); + rewind(srv_monitor_file); + + srv_printf_innodb_monitor(srv_monitor_file, FALSE, + &trx_list_start, &trx_list_end); + + os_file_set_eof(srv_monitor_file); + + flen = size_t(ftell(srv_monitor_file)); + if (ssize_t(flen) < 0) { + flen = 0; + } + + size_t usable_len; + + if (flen > MAX_STATUS_SIZE) { + usable_len = MAX_STATUS_SIZE; + truncated_status_writes++; + } else { + usable_len = flen; + } + + /* allocate buffer for the string, and + read the contents of the temporary file */ + + if (!(str = (char*) my_malloc(PSI_INSTRUMENT_ME, + usable_len + 1, MYF(0)))) { + mysql_mutex_unlock(&srv_monitor_file_mutex); + DBUG_RETURN(1); + } + + rewind(srv_monitor_file); + + if (flen < MAX_STATUS_SIZE) { + /* Display the entire output. */ + flen = fread(str, 1, flen, srv_monitor_file); + } else if (trx_list_end < flen + && trx_list_start < trx_list_end + && trx_list_start + flen - trx_list_end + < MAX_STATUS_SIZE - sizeof truncated_msg - 1) { + + /* Omit the beginning of the list of active transactions. */ + size_t len = fread(str, 1, trx_list_start, srv_monitor_file); + + memcpy(str + len, truncated_msg, sizeof truncated_msg - 1); + len += sizeof truncated_msg - 1; + usable_len = (MAX_STATUS_SIZE - 1) - len; + fseek(srv_monitor_file, long(flen - usable_len), SEEK_SET); + len += fread(str + len, 1, usable_len, srv_monitor_file); + flen = len; + } else { + /* Omit the end of the output. */ + flen = fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file); + } + + mysql_mutex_unlock(&srv_monitor_file_mutex); + + ret_val= stat_print( + thd, innobase_hton_name, + static_cast(strlen(innobase_hton_name)), + STRING_WITH_LEN(""), str, static_cast(flen)); + + my_free(str); + + DBUG_RETURN(ret_val); +} + +/************************************************************************//** +Return 0 on success and non-zero on failure. Note: the bool return type +seems to be abused here, should be an int. */ +static +bool +innobase_show_status( +/*=================*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread + of the caller */ + stat_print_fn* stat_print, + enum ha_stat_type stat_type) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + switch (stat_type) { + case HA_ENGINE_STATUS: + /* Non-zero return value means there was an error. */ + return(innodb_show_status(hton, thd, stat_print) != 0); + + case HA_ENGINE_MUTEX: + case HA_ENGINE_LOGS: + /* Not handled */ + break; + } + + /* Success */ + return(false); +} + +/*********************************************************************//** +Returns number of THR_LOCK locks used for one instance of InnoDB table. +InnoDB no longer relies on THR_LOCK locks so 0 value is returned. 
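+(So, for example, LOCK TABLES t1 WRITE, t2 READ on two illustrative
+InnoDB tables allocates no THR_LOCK slots; the locking work happens
+in ::store_lock() and ::external_lock() instead.)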
+Instead of THR_LOCK locks InnoDB relies on combination of metadata locks +(e.g. for LOCK TABLES and DDL) and its own locking subsystem. +Note that even though this method returns 0, SQL-layer still calls +::store_lock(), ::start_stmt() and ::external_lock() methods for InnoDB +tables. */ + +uint +ha_innobase::lock_count(void) const +/*===============================*/ +{ + return 0; +} + +/*****************************************************************//** +Supposed to convert a MySQL table lock stored in the 'lock' field of the +handle to a proper type before storing pointer to the lock into an array +of pointers. +In practice, since InnoDB no longer relies on THR_LOCK locks and its +lock_count() method returns 0 it just informs storage engine about type +of THR_LOCK which SQL-layer would have acquired for this specific statement +on this specific table. +MySQL also calls this if it wants to reset some table locks to a not-locked +state during the processing of an SQL query. An example is that during a +SELECT the read lock is released early on the 'const' tables where we only +fetch one row. MySQL does not call this when it releases all locks at the +end of an SQL statement. +@return pointer to the current element in the 'to' array. */ + +THR_LOCK_DATA** +ha_innobase::store_lock( +/*====================*/ + THD* thd, /*!< in: user thread handle */ + THR_LOCK_DATA** to, /*!< in: pointer to the current + element in an array of pointers + to lock structs; + only used as return value */ + thr_lock_type lock_type) /*!< in: lock type to store in + 'lock'; this may also be + TL_IGNORE */ +{ + /* Note that trx in this function is NOT necessarily m_prebuilt->trx + because we call update_thd() later, in ::external_lock()! Failure to + understand this caused a serious memory corruption bug in 5.1.11. */ + + trx_t* trx = check_trx_exists(thd); + + /* NOTE: MySQL can call this function with lock 'type' TL_IGNORE! + Be careful to ignore TL_IGNORE if we are going to do something with + only 'real' locks! */ + + /* If no MySQL table is in use, we need to set the isolation level + of the transaction. */ + + if (lock_type != TL_IGNORE + && trx->n_mysql_tables_in_use == 0) { + trx->isolation_level = innobase_map_isolation_level( + (enum_tx_isolation) thd_tx_isolation(thd)); + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + + /* At low transaction isolation levels we let + each consistent read set its own snapshot */ + trx->read_view.close(); + } + } + + DBUG_ASSERT(EQ_CURRENT_THD(thd)); + const bool in_lock_tables = thd_in_lock_tables(thd); + const int sql_command = thd_sql_command(thd); + + if (srv_read_only_mode + && (sql_command == SQLCOM_UPDATE + || sql_command == SQLCOM_INSERT + || sql_command == SQLCOM_REPLACE + || sql_command == SQLCOM_DROP_TABLE + || sql_command == SQLCOM_ALTER_TABLE + || sql_command == SQLCOM_OPTIMIZE + || (sql_command == SQLCOM_CREATE_TABLE + && (lock_type >= TL_WRITE_CONCURRENT_INSERT + && lock_type <= TL_WRITE)) + || sql_command == SQLCOM_CREATE_INDEX + || sql_command == SQLCOM_DROP_INDEX + || sql_command == SQLCOM_CREATE_SEQUENCE + || sql_command == SQLCOM_DROP_SEQUENCE + || sql_command == SQLCOM_DELETE)) { + + ib_senderrf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + + } else if (sql_command == SQLCOM_FLUSH + && lock_type == TL_READ_NO_INSERT) { + + /* Check for FLUSH TABLES ... WITH READ LOCK */ + + /* Note: This call can fail, but there is no way to return + the error to the caller. 
We simply ignore it for now here + and push the error code to the caller where the error is + detected in the function. */ + + dberr_t err = row_quiesce_set_state( + m_prebuilt->table, QUIESCE_START, trx); + + ut_a(err == DB_SUCCESS || err == DB_UNSUPPORTED); + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE) { + m_prebuilt->select_lock_type = LOCK_S; + m_prebuilt->stored_select_lock_type = LOCK_S; + } else { + m_prebuilt->select_lock_type = LOCK_NONE; + m_prebuilt->stored_select_lock_type = LOCK_NONE; + } + + /* Check for DROP TABLE */ + } else if (sql_command == SQLCOM_DROP_TABLE || + sql_command == SQLCOM_DROP_SEQUENCE) { + + /* MySQL calls this function in DROP TABLE though this table + handle may belong to another thd that is running a query. Let + us in that case skip any changes to the m_prebuilt struct. */ + + /* Check for LOCK TABLE t1,...,tn WITH SHARED LOCKS */ + } else if ((lock_type == TL_READ && in_lock_tables) + || (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) + || lock_type == TL_READ_WITH_SHARED_LOCKS + || lock_type == TL_READ_SKIP_LOCKED + || lock_type == TL_READ_NO_INSERT + || (lock_type != TL_IGNORE + && sql_command != SQLCOM_SELECT)) { + + /* The OR cases above are in this order: + 1) MySQL is doing LOCK TABLES ... READ LOCAL, or we + are processing a stored procedure or function, or + 2) (we do not know when TL_READ_HIGH_PRIORITY is used), or + 3) this is a SELECT ... IN SHARE MODE, or + 4) this is a SELECT ... IN SHARE MODE SKIP LOCKED, or + 5) we are doing a complex SQL statement like + INSERT INTO ... SELECT ... and the logical logging (MySQL + binlog) requires the use of a locking read, or + MySQL is doing LOCK TABLES ... READ. + 6) we let InnoDB do locking reads for all SQL statements that + are not simple SELECTs; note that select_lock_type in this + case may get strengthened in ::external_lock() to LOCK_X. + Note that we MUST use a locking read in all data modifying + SQL statements, because otherwise the execution would not be + serializable, and also the results from the update could be + unexpected if an obsolete consistent read view would be + used. */ + + /* Use consistent read for checksum table */ + + if (sql_command == SQLCOM_CHECKSUM + || sql_command == SQLCOM_CREATE_SEQUENCE + || (sql_command == SQLCOM_ANALYZE && lock_type == TL_READ) + || (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && (lock_type == TL_READ + || lock_type == TL_READ_NO_INSERT) + && (sql_command == SQLCOM_INSERT_SELECT + || sql_command == SQLCOM_REPLACE_SELECT + || sql_command == SQLCOM_UPDATE + || sql_command == SQLCOM_CREATE_SEQUENCE + || sql_command == SQLCOM_CREATE_TABLE))) { + + /* If the transaction isolation level is + READ UNCOMMITTED or READ COMMITTED and we are executing + INSERT INTO...SELECT or REPLACE INTO...SELECT + or UPDATE ... = (SELECT ...) or CREATE ... + SELECT... without FOR UPDATE or IN SHARE + MODE in select, then we use consistent read + for select. */ + + m_prebuilt->select_lock_type = LOCK_NONE; + m_prebuilt->stored_select_lock_type = LOCK_NONE; + } else { + m_prebuilt->select_lock_type = LOCK_S; + m_prebuilt->stored_select_lock_type = LOCK_S; + } + + } else if (lock_type != TL_IGNORE) { + + /* We set possible LOCK_X value in external_lock, not yet + here even if this would be SELECT ... 
FOR UPDATE */ + + m_prebuilt->select_lock_type = LOCK_NONE; + m_prebuilt->stored_select_lock_type = LOCK_NONE; + } + m_prebuilt->skip_locked= (lock_type == TL_WRITE_SKIP_LOCKED || + lock_type == TL_READ_SKIP_LOCKED); + + if (!trx_is_started(trx) + && (m_prebuilt->select_lock_type != LOCK_NONE + || m_prebuilt->stored_select_lock_type != LOCK_NONE)) { + + trx->will_lock = true; + } + + return(to); +} + +/*********************************************************************//** +Read the next autoinc value. Acquire the relevant locks before reading +the AUTOINC value. If SUCCESS then the table AUTOINC mutex will be locked +on return and all relevant locks acquired. +@return DB_SUCCESS or error code */ + +dberr_t +ha_innobase::innobase_get_autoinc( +/*==============================*/ + ulonglong* value) /*!< out: autoinc value */ +{ + *value = 0; + + m_prebuilt->autoinc_error = innobase_lock_autoinc(); + + if (m_prebuilt->autoinc_error == DB_SUCCESS) { + + /* Determine the first value of the interval */ + *value = dict_table_autoinc_read(m_prebuilt->table); + + /* It should have been initialized during open. */ + if (*value == 0) { + m_prebuilt->autoinc_error = DB_UNSUPPORTED; + m_prebuilt->table->autoinc_mutex.wr_unlock(); + } + } + + return(m_prebuilt->autoinc_error); +} + +/*******************************************************************//** +This function reads the global auto-inc counter. It doesn't use the +AUTOINC lock even if the lock mode is set to TRADITIONAL. +@return the autoinc value */ + +ulonglong +ha_innobase::innobase_peek_autoinc(void) +/*====================================*/ +{ + ulonglong auto_inc; + dict_table_t* innodb_table; + + ut_a(m_prebuilt != NULL); + ut_a(m_prebuilt->table != NULL); + + innodb_table = m_prebuilt->table; + + innodb_table->autoinc_mutex.wr_lock(); + + auto_inc = dict_table_autoinc_read(innodb_table); + + if (auto_inc == 0) { + ib::info() << "AUTOINC next value generation is disabled for" + " '" << innodb_table->name << "'"; + } + + innodb_table->autoinc_mutex.wr_unlock(); + + return(auto_inc); +} + +/*********************************************************************//** +Returns the value of the auto-inc counter in *first_value and ~0 on failure. */ + +void +ha_innobase::get_auto_increment( +/*============================*/ + ulonglong offset, /*!< in: table autoinc offset */ + ulonglong increment, /*!< in: table autoinc + increment */ + ulonglong nb_desired_values, /*!< in: number of values + reqd */ + ulonglong* first_value, /*!< out: the autoinc value */ + ulonglong* nb_reserved_values) /*!< out: count of reserved + values */ +{ + trx_t* trx; + dberr_t error; + ulonglong autoinc = 0; + mariadb_set_stats set_stats_temporary(handler_stats); + + /* Prepare m_prebuilt->trx in the table handle */ + update_thd(ha_thd()); + + error = innobase_get_autoinc(&autoinc); + + if (error != DB_SUCCESS) { + *first_value = (~(ulonglong) 0); + return; + } + + /* This is a hack, since nb_desired_values seems to be accurate only + for the first call to get_auto_increment() for multi-row INSERT and + meaningless for other statements e.g, LOAD etc. Subsequent calls to + this method for the same statement results in different values which + don't make sense. Therefore we store the value the first time we are + called and count down from that as rows are written (see write_row()). 
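+For example, on an illustrative table t1, a multi-row
+  INSERT INTO t1(c1) VALUES (NULL), (NULL), (NULL);
+arrives here with nb_desired_values == 3 on the first call, so three
+values are reserved at once and write_row() consumes one per row.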
+ */ + + trx = m_prebuilt->trx; + + /* Note: We can't rely on *first_value since some MySQL engines, + in particular the partition engine, don't initialize it to 0 when + invoking this method. So we are not sure if it's guaranteed to + be 0 or not. */ + + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + ulonglong col_max_value = + table->next_number_field->get_max_int_value(); + + /** The following logic is needed to avoid duplicate key error + for autoincrement column. + + (1) InnoDB gives the current autoincrement value with respect + to increment and offset value. + + (2) Basically it does compute_next_insert_id() logic inside InnoDB + to avoid the current auto increment value changed by handler layer. + + (3) It is restricted only for insert operations. */ + + if (increment > 1 && increment <= ~autoinc && autoinc < col_max_value + && thd_sql_command(m_user_thd) != SQLCOM_ALTER_TABLE) { + + ulonglong prev_auto_inc = autoinc; + + autoinc = ((autoinc - 1) + increment - offset)/ increment; + + autoinc = autoinc * increment + offset; + + /* If autoinc exceeds the col_max_value then reset + to old autoinc value. Because in case of non-strict + sql mode, boundary value is not considered as error. */ + + if (autoinc >= col_max_value) { + autoinc = prev_auto_inc; + } + + ut_ad(autoinc > 0); + } + + /* Called for the first time ? */ + if (trx->n_autoinc_rows == 0) { + + trx->n_autoinc_rows = (ulint) nb_desired_values; + + /* It's possible for nb_desired_values to be 0: + e.g., INSERT INTO T1(C) SELECT C FROM T2; */ + if (nb_desired_values == 0) { + + trx->n_autoinc_rows = 1; + } + + set_if_bigger(*first_value, autoinc); + /* Not in the middle of a mult-row INSERT. */ + } else if (m_prebuilt->autoinc_last_value == 0) { + set_if_bigger(*first_value, autoinc); + } + + if (*first_value > col_max_value) { + /* Out of range number. Let handler::update_auto_increment() + take care of this */ + m_prebuilt->autoinc_last_value = 0; + m_prebuilt->table->autoinc_mutex.wr_unlock(); + *nb_reserved_values= 0; + return; + } + + *nb_reserved_values = trx->n_autoinc_rows; + + /* With old style AUTOINC locking we only update the table's + AUTOINC counter after attempting to insert the row. */ + if (innobase_autoinc_lock_mode != AUTOINC_OLD_STYLE_LOCKING) { + ulonglong current; + ulonglong next_value; + + current = *first_value; + + /* Compute the last value in the interval */ + next_value = innobase_next_autoinc( + current, *nb_reserved_values, increment, offset, + col_max_value); + + m_prebuilt->autoinc_last_value = next_value; + + if (m_prebuilt->autoinc_last_value < *first_value) { + *first_value = (~(ulonglong) 0); + } else { + /* Update the table autoinc variable */ + dict_table_autoinc_update_if_greater( + m_prebuilt->table, + m_prebuilt->autoinc_last_value); + } + } else { + /* This will force write_row() into attempting an update + of the table's AUTOINC counter. */ + m_prebuilt->autoinc_last_value = 0; + } + + /* The increment to be used to increase the AUTOINC value, we use + this in write_row() and update_row() to increase the autoinc counter + for columns that are filled by the user. We need the offset and + the increment. 
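+For example, with @@auto_increment_increment = 10 and
+@@auto_increment_offset = 5 the generated sequence is 5, 15, 25, ...;
+a counter value of 7 is rounded up to 15 by the interval arithmetic
+above: ((7 - 1) + 10 - 5) / 10 * 10 + 5 = 15.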
*/ + m_prebuilt->autoinc_offset = offset; + m_prebuilt->autoinc_increment = increment; + + m_prebuilt->table->autoinc_mutex.wr_unlock(); +} + +/*******************************************************************//** +See comment in handler.cc */ + +bool +ha_innobase::get_error_message( +/*===========================*/ + int error, + String* buf) +{ + trx_t* trx = check_trx_exists(ha_thd()); + + if (error == HA_ERR_DECRYPTION_FAILED) { + const char *msg = "Table encrypted but decryption failed. This could be because correct encryption management plugin is not loaded, used encryption key is not available or encryption method does not match."; + buf->copy(msg, (uint)strlen(msg), system_charset_info); + } else { + buf->copy(trx->detailed_error, (uint) strlen(trx->detailed_error), + system_charset_info); + } + + return(FALSE); +} + +/** Retrieves the names of the table and the key for which there was a +duplicate entry in the case of HA_ERR_FOREIGN_DUPLICATE_KEY. + +If any of the names is not available, then this method will return +false and will not change any of child_table_name or child_key_name. + +@param[out] child_table_name Table name +@param[in] child_table_name_len Table name buffer size +@param[out] child_key_name Key name +@param[in] child_key_name_len Key name buffer size + +@retval true table and key names were available and were written into the +corresponding out parameters. +@retval false table and key names were not available, the out parameters +were not touched. */ +bool +ha_innobase::get_foreign_dup_key( +/*=============================*/ + char* child_table_name, + uint child_table_name_len, + char* child_key_name, + uint child_key_name_len) +{ + const dict_index_t* err_index; + + ut_a(m_prebuilt->trx != NULL); + ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N); + + err_index = trx_get_error_info(m_prebuilt->trx); + + if (err_index == NULL) { + return(false); + } + /* else */ + + /* copy table name (and convert from filename-safe encoding to + system_charset_info) */ + char* p = strchr(err_index->table->name.m_name, '/'); + + /* strip ".../" prefix if any */ + if (p != NULL) { + p++; + } else { + p = err_index->table->name.m_name; + } + + size_t len; + + len = filename_to_tablename(p, child_table_name, child_table_name_len); + + child_table_name[len] = '\0'; + + /* copy index name */ + snprintf(child_key_name, child_key_name_len, "%s", + err_index->name()); + + return(true); +} + +/*******************************************************************//** +Compares two 'refs'. A 'ref' is the (internal) primary key value of the row. +If there is no explicitly declared non-null unique key or a primary key, then +InnoDB internally uses the row id as the primary key. +@return < 0 if ref1 < ref2, 0 if equal, else > 0 */ + +int +ha_innobase::cmp_ref( +/*=================*/ + const uchar* ref1, /*!< in: an (internal) primary key value in the + MySQL key value format */ + const uchar* ref2) /*!< in: an (internal) primary key value in the + MySQL key value format */ +{ + enum_field_types mysql_type; + Field* field; + KEY_PART_INFO* key_part; + KEY_PART_INFO* key_part_end; + uint len1; + uint len2; + int result; + + if (m_prebuilt->clust_index_was_generated) { + /* The 'ref' is an InnoDB row id */ + + return(memcmp(ref1, ref2, DATA_ROW_ID_LEN)); + } + + /* Do a type-aware comparison of primary key fields. PK fields + are always NOT NULL, so no checks for NULL are performed. 
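+For example, with PRIMARY KEY(a, b) the loop below compares the a
+fields first and looks at b only when the a fields are equal; a DESC
+key part (HA_REVERSE_SORT) negates that part's comparison result.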
*/ + + key_part = table->key_info[table->s->primary_key].key_part; + + key_part_end = key_part + + table->key_info[table->s->primary_key].user_defined_key_parts; + + for (; key_part != key_part_end; ++key_part) { + field = key_part->field; + mysql_type = field->type(); + + if (mysql_type == MYSQL_TYPE_TINY_BLOB + || mysql_type == MYSQL_TYPE_MEDIUM_BLOB + || mysql_type == MYSQL_TYPE_BLOB + || mysql_type == MYSQL_TYPE_LONG_BLOB) { + + /* In the MySQL key value format, a column prefix of + a BLOB is preceded by a 2-byte length field */ + + len1 = innobase_read_from_2_little_endian(ref1); + len2 = innobase_read_from_2_little_endian(ref2); + + result = ((Field_blob*) field)->cmp( + ref1 + 2, len1, ref2 + 2, len2); + } else { + result = field->key_cmp(ref1, ref2); + } + + if (result) { + if (key_part->key_part_flag & HA_REVERSE_SORT) + result = -result; + return(result); + } + + ref1 += key_part->store_length; + ref2 += key_part->store_length; + } + + return(0); +} + +/*******************************************************************//** +Ask InnoDB if a query to a table can be cached. +@return TRUE if query caching of the table is permitted */ + +my_bool +ha_innobase::register_query_cache_table( +/*====================================*/ + THD* thd, /*!< in: user thread handle */ + const char* table_key, /*!< in: normalized path to the + table */ + uint key_length, /*!< in: length of the normalized + path to the table */ + qc_engine_callback* + call_back, /*!< out: pointer to function for + checking if query caching + is permitted */ + ulonglong *engine_data) /*!< in/out: data to call_back */ +{ + *engine_data = 0; + *call_back = innobase_query_caching_of_table_permitted; + + return(innobase_query_caching_of_table_permitted( + thd, table_key, + static_cast(key_length), + engine_data)); +} + +/******************************************************************//** +This function is used to find the storage length in bytes of the first n +characters for prefix indexes using a multibyte character set. The function +finds charset information and returns length of prefix_len characters in the +index field in bytes. +@return number of bytes occupied by the first n characters */ +ulint +innobase_get_at_most_n_mbchars( +/*===========================*/ + ulint charset_id, /*!< in: character set id */ + ulint prefix_len, /*!< in: prefix length in bytes of the index + (this has to be divided by mbmaxlen to get the + number of CHARACTERS n in the prefix) */ + ulint data_len, /*!< in: length of the string in bytes */ + const char* str) /*!< in: character string */ +{ + ulint char_length; /*!< character length in bytes */ + ulint n_chars; /*!< number of characters in prefix */ + CHARSET_INFO* charset; /*!< charset used in the field */ + + charset = get_charset((uint) charset_id, MYF(MY_WME)); + + ut_ad(charset); + ut_ad(charset->mbmaxlen); + + /* Calculate how many characters at most the prefix index contains */ + + n_chars = prefix_len / charset->mbmaxlen; + + /* If the charset is multi-byte, then we must find the length of the + first at most n chars in the string. If the string contains less + characters than n, then we return the length to the end of the last + character. */ + + if (charset->mbmaxlen > 1) { + /* charpos() returns the byte length of the first n_chars + characters, or a value bigger than the length of str, if + there were not enough full characters in str. + + Why does the code below work: + Suppose that we are looking for n UTF-8 characters. 
+ + 1) If the string is long enough, then the prefix contains at + least n complete UTF-8 characters + maybe some extra + characters + an incomplete UTF-8 character. No problem in + this case. The function returns the pointer to the + end of the nth character. + + 2) If the string is not long enough, then the string contains + the complete value of a column, that is, only complete UTF-8 + characters, and we can store in the column prefix index the + whole string. */ + + char_length= charset->charpos(str, str + data_len, n_chars); + if (char_length > data_len) { + char_length = data_len; + } + } else if (data_len < prefix_len) { + + char_length = data_len; + + } else { + + char_length = prefix_len; + } + + return(char_length); +} + +/*******************************************************************//** +This function is used to prepare an X/Open XA distributed transaction. +@return 0 or error number */ +static +int +innobase_xa_prepare( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be prepared */ + bool prepare_trx) /*!< in: true - prepare transaction + false - the current SQL statement + ended */ +{ + trx_t* trx = check_trx_exists(thd); + + DBUG_ASSERT(hton == innodb_hton_ptr); + + thd_get_xid(thd, &reinterpret_cast(trx->xid)); + + if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) { + + sql_print_error("Transaction not registered for MariaDB 2PC," + " but transaction is active"); + } + + if (prepare_trx + || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + + /* We were instructed to prepare the whole transaction, or + this is an SQL statement end and autocommit is on */ + + ut_ad(trx_is_registered_for_2pc(trx)); + + trx_prepare_for_mysql(trx); + } else { + /* We just mark the SQL statement ended and do not do a + transaction prepare */ + + /* If we had reserved the auto-inc lock for some + table in this SQL statement we release it now */ + + lock_unlock_table_autoinc(trx); + + /* Store the current undo_no of the transaction so that we + know where to roll back if we have to roll back the next + SQL statement */ + if (UNIV_UNLIKELY(end_of_statement(trx))) { + return 1; + } + } + + if (thd_sql_command(thd) != SQLCOM_XA_PREPARE + && (prepare_trx + || !thd_test_options( + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + + /* For mysqlbackup to work the order of transactions in binlog + and InnoDB must be the same. Consider the situation + + thread1> prepare; write to binlog; ... + + thread2> prepare; write to binlog; commit + thread1> ... commit + + The server guarantees that writes to the binary log + and commits are in the same order, so we do not have + to handle this case. */ + } + + return(0); +} + +/*******************************************************************//** +This function is used to recover X/Open XA distributed transactions. 
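+For example, an external transaction coordinator would drive:
+  XA START 'trx1'; ... ; XA END 'trx1'; XA PREPARE 'trx1';
+  -- server restart --
+  XA RECOVER;        -- lists 'trx1' via this function
+  XA COMMIT 'trx1';  -- handled by innobase_commit_by_xid() below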
+@return number of prepared transactions stored in xid_list */ +static +int +innobase_xa_recover( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid_list,/*!< in/out: prepared transactions */ + uint len) /*!< in: number of slots in xid_list */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (len == 0 || xid_list == NULL) { + + return(0); + } + + return(trx_recover_for_mysql(xid_list, len)); +} + +/*******************************************************************//** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state +@return 0 or error number */ +static +int +innobase_commit_by_xid( +/*===================*/ + handlerton* hton, + XID* xid) /*!< in: X/Open XA transaction identification */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + DBUG_EXECUTE_IF("innobase_xa_fail", + return XAER_RMFAIL;); + + if (high_level_read_only) { + return(XAER_RMFAIL); + } + + if (trx_t* trx = trx_get_trx_by_xid(xid)) { + /* use cases are: disconnected xa, slave xa, recovery */ + innobase_commit_low(trx); + ut_ad(trx->mysql_thd == NULL); + trx_deregister_from_2pc(trx); + ut_ad(!trx->will_lock); /* trx cache requirement */ + trx->free(); + + return(XA_OK); + } else { + return(XAER_NOTA); + } +} + +/** This function is used to rollback one X/Open XA distributed transaction +which is in the prepared state + +@param[in] hton InnoDB handlerton +@param[in] xid X/Open XA transaction identification + +@return 0 or error number */ +int innobase_rollback_by_xid(handlerton* hton, XID* xid) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + DBUG_EXECUTE_IF("innobase_xa_fail", + return XAER_RMFAIL;); + + if (high_level_read_only) { + return(XAER_RMFAIL); + } + + if (trx_t* trx = trx_get_trx_by_xid(xid)) { +#ifdef WITH_WSREP + /* If a wsrep transaction is being rolled back during + the recovery, we must clear the xid in order to avoid + writing serialisation history for rolled back transaction. */ + if (wsrep_is_wsrep_xid(&trx->xid)) { + trx->xid.null(); + } +#endif /* WITH_WSREP */ + int ret = innobase_rollback_trx(trx); + ut_ad(!trx->will_lock); + trx->free(); + + return(ret); + } else { + return(XAER_NOTA); + } +} + +bool +ha_innobase::check_if_incompatible_data( +/*====================================*/ + HA_CREATE_INFO* info, + uint table_changes) +{ + ha_table_option_struct *param_old, *param_new; + + /* Cache engine specific options */ + param_new = info->option_struct; + param_old = table->s->option_struct; + + innobase_copy_frm_flags_from_create_info(m_prebuilt->table, info); + + if (table_changes != IS_EQUAL_YES) { + + return(COMPATIBLE_DATA_NO); + } + + /* Check that auto_increment value was not changed */ + if ((info->used_fields & HA_CREATE_USED_AUTO) + && info->auto_increment_value != 0) { + + return(COMPATIBLE_DATA_NO); + } + + /* Check that row format didn't change */ + if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT) + && info->row_type != get_row_type()) { + + return(COMPATIBLE_DATA_NO); + } + + /* Specifying KEY_BLOCK_SIZE requests a rebuild of the table. */ + if (info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE) { + return(COMPATIBLE_DATA_NO); + } + + /* Changes on engine specific table options requests a rebuild of the table. 
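+For example, on an illustrative table t1,
+  ALTER TABLE t1 PAGE_COMPRESSED = 1;
+changes param_new->page_compressed below and therefore reports
+COMPATIBLE_DATA_NO, forcing the table to be rebuilt.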
*/ + if (param_new->page_compressed != param_old->page_compressed || + param_new->page_compression_level != param_old->page_compression_level) + { + return(COMPATIBLE_DATA_NO); + } + + return(COMPATIBLE_DATA_YES); +} + +/****************************************************************//** +Update the system variable innodb_io_capacity_max using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_io_capacity_max_update( +/*===========================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, void*, + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast(save); + + if (in_val < srv_io_capacity) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_io_capacity_max %lu" + " lower than innodb_io_capacity %lu.", + in_val, srv_io_capacity); + + srv_io_capacity = in_val; + + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_io_capacity to %lu", + srv_io_capacity); + } + + srv_max_io_capacity = in_val; +} + +/****************************************************************//** +Update the system variable innodb_io_capacity using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_io_capacity_update( +/*======================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, void*, + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast(save); + + if (in_val > srv_max_io_capacity) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_io_capacity to %lu" + " higher than innodb_io_capacity_max %lu", + in_val, srv_max_io_capacity); + + srv_max_io_capacity = (in_val & ~(~0UL >> 1)) + ? in_val : in_val * 2; + + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_max_io_capacity to %lu", + srv_max_io_capacity); + } + + srv_io_capacity = in_val; +} + +/****************************************************************//** +Update the system variable innodb_max_dirty_pages_pct using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_max_dirty_pages_pct_update( +/*==============================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, void*, + const void* save) /*!< in: immediate result + from check function */ +{ + double in_val = *static_cast(save); + if (in_val < srv_max_dirty_pages_pct_lwm) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_max_dirty_pages_pct cannot be" + " set lower than" + " innodb_max_dirty_pages_pct_lwm."); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Lowering" + " innodb_max_dirty_page_pct_lwm to %lf", + in_val); + + srv_max_dirty_pages_pct_lwm = in_val; + } + + srv_max_buf_pool_modified_pct = in_val; + + mysql_mutex_unlock(&LOCK_global_system_variables); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.page_cleaner_wakeup(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_lock(&LOCK_global_system_variables); +} + +/****************************************************************//** +Update the system variable innodb_max_dirty_pages_pct_lwm using the +"saved" value. This function is registered as a callback with MySQL. 
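+For example:
+  SET GLOBAL innodb_max_dirty_pages_pct = 40;
+  SET GLOBAL innodb_max_dirty_pages_pct_lwm = 50;
+clamps the low-water mark to 40 with a warning, because it may not
+exceed innodb_max_dirty_pages_pct.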
 */
+static
+void
+innodb_max_dirty_pages_pct_lwm_update(
+/*==================================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*, void*,
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ double in_val = *static_cast<const double*>(save);
+ if (in_val > srv_max_buf_pool_modified_pct) {
+ in_val = srv_max_buf_pool_modified_pct;
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_max_dirty_pages_pct_lwm"
+ " cannot be set higher than"
+ " innodb_max_dirty_pages_pct.");
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Setting innodb_max_dirty_page_pct_lwm"
+ " to %lf",
+ in_val);
+ }
+
+ srv_max_dirty_pages_pct_lwm = in_val;
+
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_pool.page_cleaner_wakeup();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/*************************************************************//**
+Do not allow setting innodb_fast_shutdown=0 if the purge threads are
+already down.
+@return 0 if innodb_fast_shutdown can be set */
+static
+int
+fast_shutdown_validate(
+/*=============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ if (check_sysvar_int(thd, var, save, value)) {
+ return(1);
+ }
+
+ uint new_val = *reinterpret_cast<uint*>(save);
+
+ if (srv_fast_shutdown && !new_val
+ && !srv_read_only_mode && abort_loop) {
+ return(1);
+ }
+
+ return(0);
+}
+
+/*************************************************************//**
+Check whether a valid argument was given to innobase_*_stopword_table.
+This function is registered as a callback with MySQL.
+@return 0 for valid stopword table */
+static
+int
+innodb_stopword_table_validate(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*,
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* stopword_table_name;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+ trx_t* trx;
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ stopword_table_name = value->val_str(value, buff, &len);
+
+ trx = check_trx_exists(thd);
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Validate the existence and format of the stopword table,
+ if one was supplied */
+ int ret = stopword_table_name && !fts_valid_stopword_table(
+ stopword_table_name, NULL);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ if (!ret) {
+ if (stopword_table_name == buff) {
+ ut_ad(static_cast<size_t>(len) < sizeof buff);
+ stopword_table_name = thd_strmake(thd,
+ stopword_table_name,
+ len);
+ }
+
+ *static_cast<const char**>(save) = stopword_table_name;
+ }
+
+ return(ret);
+}
+
+extern void buf_resize_start();
+
+/** Update the system variable innodb_buffer_pool_size using the "saved"
+value. This function is registered as a callback with MySQL.
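+For example,
+  SET GLOBAL innodb_buffer_pool_size = 268435456;
+only requests the resize here; progress is reported through the
+Innodb_buffer_pool_resize_status status variable and the actual work
+is done asynchronously after buf_resize_start().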
+@param[in] save immediate result from check function */
+static
+void
+innodb_buffer_pool_size_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ snprintf(export_vars.innodb_buffer_pool_resize_status,
+ sizeof(export_vars.innodb_buffer_pool_resize_status),
+ "Buffer pool resize requested");
+
+ buf_resize_start();
+}
+
+/** The latest assigned innodb_ft_aux_table name */
+static char* innodb_ft_aux_table;
+
+/** Update innodb_ft_aux_table_id on SET GLOBAL innodb_ft_aux_table.
+@param[in,out] thd connection
+@param[out] save new value of innodb_ft_aux_table
+@param[in] value user-specified value */
+static int innodb_ft_aux_table_validate(THD *thd, st_mysql_sys_var*,
+ void* save, st_mysql_value* value)
+{
+ char buf[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof buf;
+
+ if (const char* table_name = value->val_str(value, buf, &len)) {
+ if (dict_table_t* table = dict_table_open_on_name(
+ table_name, false, DICT_ERR_IGNORE_NONE)) {
+ const table_id_t id = dict_table_has_fts_index(table)
+ ? table->id : 0;
+ dict_table_close(table);
+ if (id) {
+ innodb_ft_aux_table_id = id;
+ if (table_name == buf) {
+ ut_ad(static_cast<size_t>(len)
+ < sizeof buf);
+ table_name = thd_strmake(thd,
+ table_name,
+ len);
+ }
+
+ *static_cast<const char**>(save) = table_name;
+ return 0;
+ }
+ }
+
+ return 1;
+ } else {
+ *static_cast<const char**>(save) = NULL;
+ innodb_ft_aux_table_id = 0;
+ return 0;
+ }
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/****************************************************************//**
+Update the system variable innodb_adaptive_hash_index using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_adaptive_hash_index_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ if (*(my_bool*) save) {
+ btr_search_enable();
+ } else {
+ btr_search_disable();
+ }
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/****************************************************************//**
+Update the system variable innodb_cmp_per_index using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_cmp_per_index_update(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+ /* Reset the stats whenever we enable the table
+ INFORMATION_SCHEMA.innodb_cmp_per_index. */
+ if (!srv_cmp_per_index_enabled && *(my_bool*) save) {
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ page_zip_reset_stat_per_index();
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ }
+
+ srv_cmp_per_index_enabled = !!(*(my_bool*) save);
+}
+
+/****************************************************************//**
+Update the system variable innodb_old_blocks_pct using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_old_blocks_pct_update(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ uint ratio = buf_LRU_old_ratio_update(*static_cast<const uint*>(save),
+ true);
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ innobase_old_blocks_pct = ratio;
+}
+
+/****************************************************************//**
+Update the system variable innodb_change_buffer_max_size using the
+"saved" value. This function is registered as a callback with MySQL.
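+For example,
+  SET GLOBAL innodb_change_buffer_max_size = 10;
+limits the change buffer to 10% of the buffer pool size;
+ibuf_max_size_update() recomputes the limit with
+LOCK_global_system_variables temporarily released.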
*/ +static +void +innodb_change_buffer_max_size_update(THD*, st_mysql_sys_var*, void*, + const void* save) +{ + srv_change_buffer_max_size = *static_cast(save); + mysql_mutex_unlock(&LOCK_global_system_variables); + ibuf_max_size_update(srv_change_buffer_max_size); + mysql_mutex_lock(&LOCK_global_system_variables); +} + +#ifdef UNIV_DEBUG +static uint srv_fil_make_page_dirty_debug = 0; +static uint srv_saved_page_number_debug; + +/****************************************************************//** +Make the first page of given user tablespace dirty. */ +static +void +innodb_make_page_dirty(THD*, st_mysql_sys_var*, void*, const void* save) +{ + mtr_t mtr; + uint space_id = *static_cast(save); + mysql_mutex_unlock(&LOCK_global_system_variables); + fil_space_t* space = fil_space_t::get(space_id); + + if (space == NULL) { +func_exit_no_space: + mysql_mutex_lock(&LOCK_global_system_variables); + return; + } + + if (srv_saved_page_number_debug >= space->size) { +func_exit: + space->release(); + goto func_exit_no_space; + } + + mtr.start(); + mtr.set_named_space(space); + + buf_block_t* block = buf_page_get( + page_id_t(space_id, srv_saved_page_number_debug), + space->zip_size(), RW_X_LATCH, &mtr); + + if (block != NULL) { + ib::info() << "Dirtying page: " << block->page.id(); + mtr.write<1,mtr_t::FORCED>(*block, + block->page.frame + + FIL_PAGE_SPACE_ID, + block->page.frame + [FIL_PAGE_SPACE_ID]); + } + mtr.commit(); + log_write_up_to(mtr.commit_lsn(), true); + goto func_exit; +} +#endif // UNIV_DEBUG + +/****************************************************************//** +Update the monitor counter according to the "set_option", turn +on/off or reset specified monitor counter. */ +static +void +innodb_monitor_set_option( +/*======================*/ + const monitor_info_t* monitor_info,/*!< in: monitor info for the monitor + to set */ + mon_option_t set_option) /*!< in: Turn on/off reset the + counter */ +{ + monitor_id_t monitor_id = monitor_info->monitor_id; + + /* If module type is MONITOR_GROUP_MODULE, it cannot be + turned on/off individually. It should never use this + function to set options */ + ut_a(!(monitor_info->monitor_type & MONITOR_GROUP_MODULE)); + + switch (set_option) { + case MONITOR_TURN_ON: + MONITOR_ON(monitor_id); + MONITOR_INIT(monitor_id); + MONITOR_SET_START(monitor_id); + + /* If the monitor to be turned on uses + exisitng monitor counter (status variable), + make special processing to remember existing + counter value. */ + if (monitor_info->monitor_type & MONITOR_EXISTING) { + srv_mon_process_existing_counter( + monitor_id, MONITOR_TURN_ON); + } + break; + + case MONITOR_TURN_OFF: + if (monitor_info->monitor_type & MONITOR_EXISTING) { + srv_mon_process_existing_counter( + monitor_id, MONITOR_TURN_OFF); + } + + MONITOR_OFF(monitor_id); + MONITOR_SET_OFF(monitor_id); + break; + + case MONITOR_RESET_VALUE: + srv_mon_reset(monitor_id); + break; + + case MONITOR_RESET_ALL_VALUE: + srv_mon_reset_all(monitor_id); + break; + + default: + ut_error; + } +} + +/****************************************************************//** +Find matching InnoDB monitor counters and update their status +according to the "set_option", turn on/off or reset specified +monitor counter. 
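+For example,
+  SET GLOBAL innodb_monitor_enable = 'buffer%';
+turns on every counter whose name matches the pattern, while
+  SET GLOBAL innodb_monitor_enable = 'module_buf_page';
+turns on the buf_page counters as a group (MONITOR_GROUP_MODULE).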
*/ +static +void +innodb_monitor_update_wildcard( +/*===========================*/ + const char* name, /*!< in: monitor name to match */ + mon_option_t set_option) /*!< in: the set option, whether + to turn on/off or reset the counter */ +{ + ut_a(name); + + for (ulint use = 0; use < NUM_MONITOR; use++) { + ulint type; + monitor_id_t monitor_id = static_cast(use); + monitor_info_t* monitor_info; + + if (!innobase_wildcasecmp( + srv_mon_get_name(monitor_id), name)) { + monitor_info = srv_mon_get_info(monitor_id); + + type = monitor_info->monitor_type; + + /* If the monitor counter is of MONITOR_MODULE + type, skip it. Except for those also marked with + MONITOR_GROUP_MODULE flag, which can be turned + on only as a module. */ + if (!(type & MONITOR_MODULE) + && !(type & MONITOR_GROUP_MODULE)) { + innodb_monitor_set_option(monitor_info, + set_option); + } + + /* Need to special handle counters marked with + MONITOR_GROUP_MODULE, turn on the whole module if + any one of it comes here. Currently, only + "module_buf_page" is marked with MONITOR_GROUP_MODULE */ + if (type & MONITOR_GROUP_MODULE) { + if ((monitor_id >= MONITOR_MODULE_BUF_PAGE) + && (monitor_id < MONITOR_MODULE_OS)) { + if (set_option == MONITOR_TURN_ON + && MONITOR_IS_ON( + MONITOR_MODULE_BUF_PAGE)) { + continue; + } + + srv_mon_set_module_control( + MONITOR_MODULE_BUF_PAGE, + set_option); + } else { + /* If new monitor is added with + MONITOR_GROUP_MODULE, it needs + to be added here. */ + ut_ad(0); + } + } + } + } +} + +/*************************************************************//** +Given a configuration variable name, find corresponding monitor counter +and return its monitor ID if found. +@return monitor ID if found, MONITOR_NO_MATCH if there is no match */ +static +ulint +innodb_monitor_id_by_name_get( +/*==========================*/ + const char* name) /*!< in: monitor counter namer */ +{ + ut_a(name); + + /* Search for wild character '%' in the name, if + found, we treat it as a wildcard match. We do not search for + single character wildcard '_' since our monitor names already contain + such character. To avoid confusion, we request user must include + at least one '%' character to activate the wildcard search. */ + if (strchr(name, '%')) { + return(MONITOR_WILDCARD_MATCH); + } + + /* Not wildcard match, check for an exact match */ + for (ulint i = 0; i < NUM_MONITOR; i++) { + if (!innobase_strcasecmp( + name, srv_mon_get_name(static_cast(i)))) { + return(i); + } + } + + return(MONITOR_NO_MATCH); +} +/*************************************************************//** +Validate that the passed in monitor name matches at least one +monitor counter name with wildcard compare. +@return TRUE if at least one monitor name matches */ +static +ibool +innodb_monitor_validate_wildcard_name( +/*==================================*/ + const char* name) /*!< in: monitor counter namer */ +{ + for (ulint i = 0; i < NUM_MONITOR; i++) { + if (!innobase_wildcasecmp( + srv_mon_get_name(static_cast(i)), name)) { + return(TRUE); + } + } + + return(FALSE); +} +/*************************************************************//** +Validate the passed in monitor name, find and save the +corresponding monitor name in the function parameter "save". 
+@return 0 if monitor name is valid */ +static int innodb_monitor_valid_byname(const char *name) +{ + ulint use; + monitor_info_t* monitor_info; + + if (!name) { + return(1); + } + + use = innodb_monitor_id_by_name_get(name); + + /* No monitor name matches, nor it is wildcard match */ + if (use == MONITOR_NO_MATCH) { + return(1); + } + + if (use < NUM_MONITOR) { + monitor_info = srv_mon_get_info((monitor_id_t) use); + + /* If the monitor counter is marked with + MONITOR_GROUP_MODULE flag, then this counter + cannot be turned on/off individually, instead + it shall be turned on/off as a group using + its module name */ + if ((monitor_info->monitor_type & MONITOR_GROUP_MODULE) + && (!(monitor_info->monitor_type & MONITOR_MODULE))) { + sql_print_warning( + "Monitor counter '%s' cannot" + " be turned on/off individually." + " Please use its module name" + " to turn on/off the counters" + " in the module as a group.\n", + name); + + return(1); + } + + } else { + ut_a(use == MONITOR_WILDCARD_MATCH); + + /* For wildcard match, if there is not a single monitor + counter name that matches, treat it as an invalid + value for the system configuration variables */ + if (!innodb_monitor_validate_wildcard_name(name)) { + return(1); + } + } + + return(0); +} +/*************************************************************//** +Validate passed-in "value" is a valid monitor counter name. +This function is registered as a callback with MySQL. +@return 0 for valid name */ +static +int +innodb_monitor_validate( +/*====================*/ + THD*, st_mysql_sys_var*, + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + int ret= 0; + + if (const char *name= value->val_str(value, nullptr, &ret)) + { + ret= innodb_monitor_valid_byname(name); + if (!ret) + *static_cast(save)= name; + } + else + ret= 1; + + return ret; +} + +/****************************************************************//** +Update the system variable innodb_enable(disable/reset/reset_all)_monitor +according to the "set_option" and turn on/off or reset specified monitor +counter. */ +static +void +innodb_monitor_update( +/*==================*/ + THD* thd, /*!< in: thread handle */ + void* var_ptr, /*!< out: where the + formal string goes */ + const void* save, /*!< in: immediate result + from check function */ + mon_option_t set_option) /*!< in: the set option, + whether to turn on/off or + reset the counter */ +{ + monitor_info_t* monitor_info; + ulint monitor_id; + ulint err_monitor = 0; + const char* name; + + ut_a(save != NULL); + + name = *static_cast(save); + + if (!name) { + monitor_id = MONITOR_DEFAULT_START; + } else { + monitor_id = innodb_monitor_id_by_name_get(name); + + /* Double check we have a valid monitor ID */ + if (monitor_id == MONITOR_NO_MATCH) { + return; + } + } + + if (monitor_id == MONITOR_DEFAULT_START) { + /* If user set the variable to "default", we will + print a message and make this set operation a "noop". + The check is being made here is because "set default" + does not go through validation function */ + if (thd) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_NO_DEFAULT, + "Default value is not defined for" + " this set option. Please specify" + " correct counter or module name."); + } else { + sql_print_error( + "Default value is not defined for" + " this set option. 
Please specify" + " correct counter or module name.\n"); + } + + if (var_ptr) { + *(const char**) var_ptr = NULL; + } + } else if (monitor_id == MONITOR_WILDCARD_MATCH) { + innodb_monitor_update_wildcard(name, set_option); + } else { + monitor_info = srv_mon_get_info( + static_cast(monitor_id)); + + ut_a(monitor_info); + + /* If monitor is already truned on, someone could already + collect monitor data, exit and ask user to turn off the + monitor before turn it on again. */ + if (set_option == MONITOR_TURN_ON + && MONITOR_IS_ON(monitor_id)) { + err_monitor = monitor_id; + goto exit; + } + + if (var_ptr) { + *(const char**) var_ptr = monitor_info->monitor_name; + } + + /* Depending on the monitor name is for a module or + a counter, process counters in the whole module or + individual counter. */ + if (monitor_info->monitor_type & MONITOR_MODULE) { + srv_mon_set_module_control( + static_cast(monitor_id), + set_option); + } else { + innodb_monitor_set_option(monitor_info, set_option); + } + } +exit: + /* Only if we are trying to turn on a monitor that already + been turned on, we will set err_monitor. Print related + information */ + if (err_monitor) { + sql_print_warning("InnoDB: Monitor %s is already enabled.", + srv_mon_get_name((monitor_id_t) err_monitor)); + } +} + +#ifdef UNIV_DEBUG +static char* srv_buffer_pool_evict; + +/****************************************************************//** +Evict all uncompressed pages of compressed tables from the buffer pool. +Keep the compressed pages in the buffer pool. +@return whether all uncompressed pages were evicted */ +static bool innodb_buffer_pool_evict_uncompressed() +{ + bool all_evicted = true; + + mysql_mutex_lock(&buf_pool.mutex); + + for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + block != NULL; ) { + buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); + ut_ad(block->page.in_file()); + ut_ad(block->page.belongs_to_unzip_LRU()); + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + + if (!buf_LRU_free_page(&block->page, false)) { + all_evicted = false; + block = prev_block; + } else { + /* Because buf_LRU_free_page() may release + and reacquire buf_pool.mutex, prev_block + may be invalid. */ + block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + return(all_evicted); +} + +/****************************************************************//** +Called on SET GLOBAL innodb_buffer_pool_evict=... +Handles some values specially, to evict pages from the buffer pool. +SET GLOBAL innodb_buffer_pool_evict='uncompressed' +evicts all uncompressed page frames of compressed tablespaces. */ +static +void +innodb_buffer_pool_evict_update(THD*, st_mysql_sys_var*, void*, + const void* save) +{ + if (const char* op = *static_cast(save)) { + if (!strcmp(op, "uncompressed")) { + mysql_mutex_unlock(&LOCK_global_system_variables); + for (uint tries = 0; tries < 10000; tries++) { + if (innodb_buffer_pool_evict_uncompressed()) { + mysql_mutex_lock( + &LOCK_global_system_variables); + return; + } + + std::this_thread::sleep_for( + std::chrono::milliseconds(10)); + } + + /* We failed to evict all uncompressed pages. */ + ut_ad(0); + } + } +} +#endif /* UNIV_DEBUG */ + +/****************************************************************//** +Update the system variable innodb_monitor_enable and enable +specified monitor counter. +This function is registered as a callback with MySQL. 
*/ +static +void +innodb_enable_monitor_update( +/*=========================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_ON); +} + +/****************************************************************//** +Update the system variable innodb_monitor_disable and turn +off specified monitor counter. */ +static +void +innodb_disable_monitor_update( +/*==========================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_OFF); +} + +/****************************************************************//** +Update the system variable innodb_monitor_reset and reset +specified monitor counter(s). +This function is registered as a callback with MySQL. */ +static +void +innodb_reset_monitor_update( +/*========================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_VALUE); +} + +/****************************************************************//** +Update the system variable innodb_monitor_reset_all and reset +all value related monitor counter. +This function is registered as a callback with MySQL. */ +static +void +innodb_reset_all_monitor_update( +/*============================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_ALL_VALUE); +} + +static +void +innodb_defragment_frequency_update(THD*, st_mysql_sys_var*, void*, + const void* save) +{ + srv_defragment_frequency = (*static_cast(save)); + srv_defragment_interval = 1000000000ULL / srv_defragment_frequency; +} + +static inline char *my_strtok_r(char *str, const char *delim, char **saveptr) +{ +#if defined _WIN32 + return strtok_s(str, delim, saveptr); +#else + return strtok_r(str, delim, saveptr); +#endif +} + +/****************************************************************//** +Parse and enable InnoDB monitor counters during server startup. +User can list the monitor counters/groups to be enable by specifying +"loose-innodb_monitor_enable=monitor_name1;monitor_name2..." +in server configuration file or at the command line. The string +separate could be ";", "," or empty space. */ +static +void +innodb_enable_monitor_at_startup( +/*=============================*/ + char* str) /*!< in/out: monitor counter enable list */ +{ + static const char* sep = " ;,"; + char* last; + + ut_a(str); + + /* Walk through the string, and separate each monitor counter + and/or counter group name, and calling innodb_monitor_update() + if successfully updated. Please note that the "str" would be + changed by strtok_r() as it walks through it. 
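+For example, starting the server with
+  --loose-innodb_monitor_enable="module_dml;lock_deadlocks"
+yields two iterations of the loop below, enabling the DML module
+counters and the lock_deadlocks counter.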
 */
+ for (char* option = my_strtok_r(str, sep, &last);
+ option;
+ option = my_strtok_r(NULL, sep, &last)) {
+ if (!innodb_monitor_valid_byname(option)) {
+ innodb_monitor_update(NULL, NULL, &option,
+ MONITOR_TURN_ON);
+ } else {
+ sql_print_warning("Invalid monitor counter"
+ " name: '%s'", option);
+ }
+ }
+}
+
+/****************************************************************//**
+Callback function for accessing the InnoDB variables from MySQL:
+SHOW VARIABLES. */
+static int show_innodb_vars(THD*, SHOW_VAR* var, void *,
+ struct system_status_var *status_var,
+ enum enum_var_type var_type)
+{
+ innodb_export_status();
+ var->type = SHOW_ARRAY;
+ var->value = (char*) &innodb_status_variables;
+ //var->scope = SHOW_SCOPE_GLOBAL;
+
+ return(0);
+}
+
+/****************************************************************//**
+This function checks each index name for a table against the reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client,
+and returns true.
+@return true if the index name matches the reserved name */
+bool
+innobase_index_name_is_reserved(
+/*============================*/
+ THD* thd, /*!< in/out: MySQL connection */
+ const KEY* key_info, /*!< in: Indexes to be created */
+ ulint num_of_keys) /*!< in: Number of indexes to
+ be created. */
+{
+ const KEY* key;
+ uint key_num; /* index number */
+
+ for (key_num = 0; key_num < num_of_keys; key_num++) {
+ key = &key_info[key_num];
+
+ if (innobase_strcasecmp(key->name.str,
+ innobase_index_reserve_name) == 0) {
+ /* Push warning to mysql */
+ push_warning_printf(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_NAME_FOR_INDEX,
+ "Cannot Create Index with name"
+ " '%s'. The name is reserved"
+ " for the system default primary"
+ " index.",
+ innobase_index_reserve_name);
+
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ innobase_index_reserve_name);
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Retrieve the FTS Relevance Ranking result for doc with doc_id
+of m_prebuilt->fts_doc_id
+@param[in,out] fts_hdl FTS handler
+@return the relevance ranking value */
+static
+float
+innobase_fts_retrieve_ranking(
+ FT_INFO* fts_hdl)
+{
+ fts_result_t* result;
+ row_prebuilt_t* ft_prebuilt;
+
+ result = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_result;
+
+ ft_prebuilt = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_prebuilt;
+
+ fts_ranking_t* ranking = rbt_value(fts_ranking_t, result->current);
+ ft_prebuilt->fts_doc_id= ranking->doc_id;
+
+ return(ranking->rank);
+}
+
+/** Free the memory for the FTS handler
+@param[in,out] fts_hdl FTS handler */
+static
+void
+innobase_fts_close_ranking(
+ FT_INFO* fts_hdl)
+{
+ fts_result_t* result;
+
+ result = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_result;
+
+ fts_query_free_result(result);
+
+ my_free((uchar*) fts_hdl);
+}
+
+/** Find and Retrieve the FTS Relevance Ranking result for doc with doc_id
+of m_prebuilt->fts_doc_id
+@param[in,out] fts_hdl FTS handler
+@return the relevance ranking value */
+static
+float
+innobase_fts_find_ranking(FT_INFO* fts_hdl, uchar*, uint)
+{
+ fts_result_t* result;
+ row_prebuilt_t* ft_prebuilt;
+
+ ft_prebuilt = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_prebuilt;
+ result = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_result;
+
+ /* Retrieve the ranking value for doc_id with value of
+ m_prebuilt->fts_doc_id */
+ return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id));
+}
+
+#ifdef UNIV_DEBUG
+static my_bool innodb_log_checkpoint_now = TRUE;
+static my_bool innodb_buf_flush_list_now = TRUE;
+static uint
+
+#ifdef UNIV_DEBUG
+static my_bool	innodb_log_checkpoint_now = TRUE;
+static my_bool	innodb_buf_flush_list_now = TRUE;
+static uint	innodb_merge_threshold_set_all_debug
+	= DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+
+/** Force an InnoDB log checkpoint. */
+static
+void
+checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void *save)
+{
+  if (!*static_cast<const my_bool*>(save))
+    return;
+  const auto size= log_sys.is_encrypted()
+    ? SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT;
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  lsn_t lsn;
+  while (log_sys.last_checkpoint_lsn.load(std::memory_order_acquire) + size <
+         (lsn= log_sys.get_lsn(std::memory_order_acquire)))
+    log_make_checkpoint();
+
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/****************************************************************//**
+Force a dirty pages flush now. */
+static
+void
+buf_flush_list_now_set(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+	if (*(my_bool*) save) {
+		mysql_mutex_unlock(&LOCK_global_system_variables);
+		buf_flush_sync();
+		mysql_mutex_lock(&LOCK_global_system_variables);
+	}
+}
+
+/** Override the current MERGE_THRESHOLD setting for all indexes in the
+dictionary cache now.
+@param[in]	save	immediate result from check function */
+static
+void
+innodb_merge_threshold_set_all_debug_update(THD*, st_mysql_sys_var*, void*,
+					    const void* save)
+{
+	innodb_merge_threshold_set_all_debug
+		= (*static_cast<const uint*>(save));
+	dict_set_merge_threshold_all_debug(
+		innodb_merge_threshold_set_all_debug);
+}
+#endif /* UNIV_DEBUG */
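+
+/* In UNIV_DEBUG builds the checkpoint and flush-list callbacks above are
+reachable through the innodb_log_checkpoint_now and
+innodb_buf_flush_list_now debug variables (declared further below), e.g.:
+
+  SET GLOBAL innodb_log_checkpoint_now = ON;
+  SET GLOBAL innodb_buf_flush_list_now = ON;
+*/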
+
+/** Find and Retrieve the FTS doc_id for the current result row
+@param[in,out]	fts_hdl	FTS handler
+@return the document ID */
+static
+ulonglong
+innobase_fts_retrieve_docid(
+	FT_INFO_EXT*	fts_hdl)
+{
+	fts_result_t*	result;
+	row_prebuilt_t*	ft_prebuilt;
+
+	ft_prebuilt = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_prebuilt;
+	result = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_result;
+
+	if (ft_prebuilt->read_just_key) {
+
+		fts_ranking_t*	ranking =
+			rbt_value(fts_ranking_t, result->current);
+
+		return(ranking->doc_id);
+	}
+
+	return(ft_prebuilt->fts_doc_id);
+}
+
+/* These variables are never read by InnoDB or changed. They are dummies
+needed by the MySQL infrastructure so that the user can invoke
+buffer_pool_dump_now(), buffer_pool_load_now() and buffer_pool_load_abort()
+by doing:
+ SET GLOBAL innodb_buffer_pool_dump_now=ON;
+ SET GLOBAL innodb_buffer_pool_load_now=ON;
+ SET GLOBAL innodb_buffer_pool_load_abort=ON;
+Their values are read by MySQL and displayed to the user when the variables
+are queried, e.g.:
+ SELECT @@innodb_buffer_pool_dump_now;
+ SELECT @@innodb_buffer_pool_load_now;
+ SELECT @@innodb_buffer_pool_load_abort; */
+static my_bool	innodb_buffer_pool_dump_now = FALSE;
+static my_bool	innodb_buffer_pool_load_now = FALSE;
+static my_bool	innodb_buffer_pool_load_abort = FALSE;
+
+/****************************************************************//**
+Trigger a dump of the buffer pool if innodb_buffer_pool_dump_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_dump_now(
+/*=================*/
+	THD*	thd	/*!< in: thread handle */
+	MY_ATTRIBUTE((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+	MY_ATTRIBUTE((unused)),
+	void*	var_ptr	/*!< out: where the formal
+			string goes */
+	MY_ATTRIBUTE((unused)),
+	const void*	save)	/*!< in: immediate result from
+				check function */
+{
+	if (*(my_bool*) save && !srv_read_only_mode) {
+		mysql_mutex_unlock(&LOCK_global_system_variables);
+		buf_dump_start();
+		mysql_mutex_lock(&LOCK_global_system_variables);
+	}
+}
+
+/****************************************************************//**
+Trigger a load of the buffer pool if innodb_buffer_pool_load_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_now(
+/*=================*/
+	THD*	thd	/*!< in: thread handle */
+	MY_ATTRIBUTE((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+	MY_ATTRIBUTE((unused)),
+	void*	var_ptr	/*!< out: where the formal
+			string goes */
+	MY_ATTRIBUTE((unused)),
+	const void*	save)	/*!< in: immediate result from
+				check function */
+{
+	if (*(my_bool*) save && !srv_read_only_mode) {
+		mysql_mutex_unlock(&LOCK_global_system_variables);
+		buf_load_start();
+		mysql_mutex_lock(&LOCK_global_system_variables);
+	}
+}
+
+/****************************************************************//**
+Abort a load of the buffer pool if innodb_buffer_pool_load_abort
+is set to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_abort(
+/*===================*/
+	THD*	thd	/*!< in: thread handle */
+	MY_ATTRIBUTE((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+	MY_ATTRIBUTE((unused)),
+	void*	var_ptr	/*!< out: where the formal
+			string goes */
+	MY_ATTRIBUTE((unused)),
+	const void*	save)	/*!< in: immediate result from
+				check function */
+{
+	if (*(my_bool*) save && !srv_read_only_mode) {
+		mysql_mutex_unlock(&LOCK_global_system_variables);
+		buf_load_abort();
+		mysql_mutex_lock(&LOCK_global_system_variables);
+	}
+}
+
+#if defined __linux__ || defined _WIN32
+static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*,
+                                             void *, const void *save)
+{
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  log_sys.set_buffered(*static_cast<const my_bool*>(save));
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+#endif
+
+static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*,
+                                        void *var, const void *save)
+{
+  ut_ad(var == &srv_log_file_size);
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+
+  if (high_level_read_only)
+    ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_READ_ONLY_MODE);
+  else if (!log_sys.is_pmem() &&
+           *static_cast<const ulonglong*>(save) < log_sys.buf_size)
+    my_printf_error(ER_WRONG_ARGUMENTS,
+                    "innodb_log_file_size must be at least"
+                    " innodb_log_buffer_size=%zu", MYF(0), log_sys.buf_size);
+  else
+  {
+    switch (log_sys.resize_start(*static_cast<const ulonglong*>(save))) {
+    case log_t::RESIZE_NO_CHANGE:
+      break;
+    case log_t::RESIZE_IN_PROGRESS:
+      my_printf_error(ER_WRONG_USAGE,
+                      "innodb_log_file_size change is already in progress",
+                      MYF(0));
+      break;
+    case log_t::RESIZE_FAILED:
+      ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_CANT_CREATE_HANDLER_FILE);
+      break;
+    case log_t::RESIZE_STARTED:
+      for (timespec abstime;;)
+      {
+        if (thd_kill_level(thd))
+        {
+          log_sys.resize_abort();
+          break;
+        }
+
+        set_timespec(abstime, 5);
+        mysql_mutex_lock(&buf_pool.flush_list_mutex);
+        const bool in_progress(buf_pool.get_oldest_modification(LSN_MAX) <
+                               log_sys.resize_in_progress());
+        if (in_progress)
+          my_cond_timedwait(&buf_pool.do_flush_list,
+                            &buf_pool.flush_list_mutex.m_mutex, &abstime);
+        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+        if (!log_sys.resize_in_progress())
+          break;
+      }
+    }
+  }
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
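+
+/* Illustrative online resize through the update callback above; per the
+code, the statement waits until the resize completes (or the client is
+killed, which aborts it), and on a non-PMEM log it fails if the requested
+size is below innodb_log_buffer_size:
+
+  SET GLOBAL innodb_log_file_size = 512*1024*1024;
+*/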
+
+/** Update innodb_status_output or innodb_status_output_locks,
+which control InnoDB "status monitor" output to the error log.
+@param[out]	var	current value
+@param[in]	save	to-be-assigned value */
+static
+void
+innodb_status_output_update(THD*, st_mysql_sys_var*, void* var,
+                            const void* save)
+{
+  if (srv_monitor_timer)
+  {
+    *static_cast<my_bool*>(var)= *static_cast<const my_bool*>(save);
+    mysql_mutex_unlock(&LOCK_global_system_variables);
+    /* Wake up the server monitor. */
+    srv_monitor_timer_schedule_now();
+    mysql_mutex_lock(&LOCK_global_system_variables);
+  }
+}
+
+/** Update the system variable innodb_encryption_threads.
+@param[in]	save	to-be-assigned value */
+static
+void
+innodb_encryption_threads_update(THD*, st_mysql_sys_var*, void*,
+                                 const void* save)
+{
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  fil_crypt_set_thread_cnt(*static_cast<const uint*>(save));
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/** Update the system variable innodb_encryption_rotate_key_age.
+@param[in]	save	to-be-assigned value */
+static
+void
+innodb_encryption_rotate_key_age_update(THD*, st_mysql_sys_var*, void*,
+					const void* save)
+{
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  fil_crypt_set_rotate_key_age(*static_cast<const uint*>(save));
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/** Update the system variable innodb_encryption_rotation_iops.
+@param[in]	save	to-be-assigned value */
+static
+void
+innodb_encryption_rotation_iops_update(THD*, st_mysql_sys_var*, void*,
+				       const void* save)
+{
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  fil_crypt_set_rotation_iops(*static_cast<const uint*>(save));
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/** Update the system variable innodb_encrypt_tables.
+@param[in]	save	to-be-assigned value */
+static
+void
+innodb_encrypt_tables_update(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  fil_crypt_set_encrypt_tables(*static_cast<const ulong*>(save));
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+static SHOW_VAR innodb_status_variables_export[]= {
+  SHOW_FUNC_ENTRY("Innodb", &show_innodb_vars),
+  {NullS, NullS, SHOW_LONG}
+};
+
+static struct st_mysql_storage_engine innobase_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+#ifdef WITH_WSREP
+/** Request a transaction to be killed that holds a conflicting lock.
+@param bf_trx brute force applier transaction +@param thd_id thd_get_thread_id(victim_trx->mysql_htd) +@param trx_id victim_trx->id */ +void lock_wait_wsrep_kill(trx_t *bf_trx, ulong thd_id, trx_id_t trx_id) +{ + THD *bf_thd= bf_trx->mysql_thd; + + if (THD *vthd= find_thread_by_id(thd_id)) + { + bool aborting= false; + wsrep_thd_LOCK(vthd); + trx_t *vtrx= thd_to_trx(vthd); + if (vtrx) + { + /* Do not bother with lock elision using transactional memory here; + this is rather complex code */ + LockMutexGuard g{SRW_LOCK_CALL}; + mysql_mutex_lock(&lock_sys.wait_mutex); + vtrx->mutex_lock(); + /* victim transaction is either active or prepared, if it has already + proceeded to replication phase */ + if (vtrx->id == trx_id) + { + switch (vtrx->state) { + default: + break; + case TRX_STATE_PREPARED: + if (!wsrep_is_wsrep_xid(&vtrx->xid)) + break; + /* fall through */ + case TRX_STATE_ACTIVE: + WSREP_LOG_CONFLICT(bf_thd, vthd, TRUE); + WSREP_DEBUG("Aborter BF trx_id: " TRX_ID_FMT " thread: %ld " + "seqno: %lld client_state: %s " + "client_mode: %s transaction_mode: %s query: %s", + bf_trx->id, + thd_get_thread_id(bf_thd), + wsrep_thd_trx_seqno(bf_thd), + wsrep_thd_client_state_str(bf_thd), + wsrep_thd_client_mode_str(bf_thd), + wsrep_thd_transaction_state_str(bf_thd), + wsrep_thd_query(bf_thd)); + WSREP_DEBUG("Victim %s trx_id: " TRX_ID_FMT " thread: %ld " + "seqno: %lld client_state: %s " + "client_mode: %s transaction_mode: %s query: %s", + wsrep_thd_is_BF(vthd, false) ? "BF" : "normal", + vtrx->id, + thd_get_thread_id(vthd), + wsrep_thd_trx_seqno(vthd), + wsrep_thd_client_state_str(vthd), + wsrep_thd_client_mode_str(vthd), + wsrep_thd_transaction_state_str(vthd), + wsrep_thd_query(vthd)); + aborting= true; + } + } + mysql_mutex_unlock(&lock_sys.wait_mutex); + vtrx->mutex_unlock(); + } + + DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort"); + if (aborting && wsrep_thd_bf_abort(bf_thd, vthd, true)) + { + /* Need to grab mutexes again to ensure that the trx is still in + right state. */ + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + vtrx->mutex_lock(); + + /* if victim is waiting for some other lock, we have to cancel + that waiting + */ + if (vtrx->id == trx_id) + { + switch (vtrx->state) { + default: + break; + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + lock_sys.cancel_lock_wait_for_wsrep_bf_abort(vtrx); + } + } + lock_sys.wr_unlock(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + vtrx->mutex_unlock(); + } + else + { + WSREP_DEBUG("wsrep_thd_bf_abort has failed, victim %lu will survive", + thd_get_thread_id(vthd)); + } + wsrep_thd_UNLOCK(vthd); + wsrep_thd_kill_UNLOCK(vthd); + } +} + +/** This function forces the victim transaction to abort. Aborting the + transaction does NOT end it, it still has to be rolled back. + + The caller must lock LOCK_thd_kill and LOCK_thd_data. 
+ + @param bf_thd brute force THD asking for the abort + @param victim_thd victim THD to be aborted +*/ +static void wsrep_abort_transaction(handlerton *, THD *bf_thd, THD *victim_thd, + my_bool signal) +{ + DBUG_ENTER("wsrep_abort_transaction"); + ut_ad(bf_thd); + ut_ad(victim_thd); + + trx_t *victim_trx= thd_to_trx(victim_thd); + + WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %s", + wsrep_thd_query(bf_thd), wsrep_thd_query(victim_thd), + wsrep_thd_transaction_state_str(victim_thd)); + + if (!victim_trx) + { + WSREP_DEBUG("abort transaction: victim did not exist"); + DBUG_VOID_RETURN; + } + + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + victim_trx->mutex_lock(); + + switch (victim_trx->state) { + default: + break; + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + /* Cancel lock wait if the victim is waiting for a lock in InnoDB. + The transaction which is blocked somewhere else (e.g. waiting + for next command or MDL) has been interrupted by THD::awake_no_mutex() + on server level before calling this function. */ + lock_sys.cancel_lock_wait_for_wsrep_bf_abort(victim_trx); + } + lock_sys.wr_unlock(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + victim_trx->mutex_unlock(); + + DBUG_VOID_RETURN; +} + +static +int +innobase_wsrep_set_checkpoint( +/*==========================*/ + handlerton* hton, + const XID* xid) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (wsrep_is_wsrep_xid(xid)) { + + trx_rseg_update_wsrep_checkpoint(xid); + log_buffer_flush_to_disk(srv_flush_log_at_trx_commit == 1); + return 0; + } else { + return 1; + } +} + +static +int +innobase_wsrep_get_checkpoint( +/*==========================*/ + handlerton* hton, + XID* xid) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + trx_rseg_read_wsrep_checkpoint(*xid); + return 0; +} +#endif /* WITH_WSREP */ + +/* plugin options */ + +static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm, + PLUGIN_VAR_RQCMDARG, + "The algorithm InnoDB uses for page checksumming. Possible values are" + " FULL_CRC32" + " for new files, always use CRC-32C; for old, see CRC32 below;" + " STRICT_FULL_CRC32" + " for new files, always use CRC-32C; for old, see STRICT_CRC32 below;" + " CRC32" + " write crc32, allow previously used algorithms to match when reading;" + " STRICT_CRC32" + " write crc32, do not allow other algorithms to match when reading;" + " New files created with full_crc32 are readable by MariaDB 10.4.3+", + NULL, NULL, SRV_CHECKSUM_ALGORITHM_FULL_CRC32, + &innodb_checksum_algorithm_typelib); + +static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir, + PLUGIN_VAR_READONLY, + "The common part for InnoDB table spaces.", + NULL, NULL, NULL); + +static MYSQL_SYSVAR_BOOL(doublewrite, srv_use_doublewrite_buf, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable InnoDB doublewrite buffer (enabled by default)." 
+ " Disable with --skip-innodb-doublewrite.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(use_atomic_writes, srv_use_atomic_writes, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable atomic writes, instead of using the doublewrite buffer, for files " + "on devices that supports atomic writes.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(stats_include_delete_marked, + srv_stats_include_delete_marked, + PLUGIN_VAR_OPCMDARG, + "Include delete marked records when calculating persistent statistics", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ENUM(instant_alter_column_allowed, + innodb_instant_alter_column_allowed, + PLUGIN_VAR_RQCMDARG, + "File format constraint for ALTER TABLE", NULL, NULL, 2/*add_drop_reorder*/, + &innodb_instant_alter_column_allowed_typelib); + +static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity, + PLUGIN_VAR_RQCMDARG, + "Number of IOPs the server can do. Tunes the background IO rate", + NULL, innodb_io_capacity_update, 200, 100, ~0UL, 0); + +static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity, + PLUGIN_VAR_RQCMDARG, + "Limit to which innodb_io_capacity can be inflated.", + NULL, innodb_io_capacity_max_update, + SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100, + SRV_MAX_IO_CAPACITY_LIMIT, 0); + +#ifdef UNIV_DEBUG +static MYSQL_SYSVAR_BOOL(log_checkpoint_now, innodb_log_checkpoint_now, + PLUGIN_VAR_OPCMDARG, + "Force checkpoint now", + NULL, checkpoint_now_set, FALSE); + +static MYSQL_SYSVAR_BOOL(buf_flush_list_now, innodb_buf_flush_list_now, + PLUGIN_VAR_OPCMDARG, + "Force dirty page flush now", + NULL, buf_flush_list_now_set, FALSE); + +static MYSQL_SYSVAR_UINT(merge_threshold_set_all_debug, + innodb_merge_threshold_set_all_debug, + PLUGIN_VAR_RQCMDARG, + "Override current MERGE_THRESHOLD setting for all indexes at dictionary" + " cache by the specified value dynamically, at the time.", + NULL, innodb_merge_threshold_set_all_debug_update, + DICT_INDEX_MERGE_THRESHOLD_DEFAULT, 1, 50, 0); +#endif /* UNIV_DEBUG */ + +static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size, + PLUGIN_VAR_OPCMDARG, + "Number of UNDO log pages to purge in one batch from the history list.", + NULL, NULL, + 1000, /* Default setting */ + 1, /* Minimum value */ + innodb_purge_batch_size_MAX, 0); + +extern void srv_update_purge_thread_count(uint n); + +static +void +innodb_purge_threads_update(THD*, struct st_mysql_sys_var*, void*, const void*save ) +{ + srv_update_purge_thread_count(*static_cast(save)); +} + +static MYSQL_SYSVAR_UINT(purge_threads, srv_n_purge_threads, + PLUGIN_VAR_OPCMDARG, + "Number of tasks for purging transaction history", + NULL, innodb_purge_threads_update, + 4, /* Default setting */ + 1, /* Minimum value */ + innodb_purge_threads_MAX, /* Maximum value */ + 0); + +static MYSQL_SYSVAR_UINT(fast_shutdown, srv_fast_shutdown, + PLUGIN_VAR_OPCMDARG, + "Speeds up the shutdown process of the InnoDB storage engine. 
Possible" + " values are 0, 1 (faster), 2 (crash-like), 3 (fastest clean).", + fast_shutdown_validate, NULL, 1, 0, 3, 0); + +static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table, + PLUGIN_VAR_NOCMDARG, + "Stores each InnoDB table to an .ibd file in the database dir.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_STR(ft_server_stopword_table, innobase_server_stopword_table, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC, + "The user supplied stopword table name.", + innodb_stopword_table_validate, + NULL, + NULL); + +static MYSQL_SYSVAR_UINT(flush_log_at_timeout, srv_flush_log_at_timeout, + PLUGIN_VAR_OPCMDARG, + "Write and flush logs every (n) second.", + NULL, NULL, 1, 0, 2700, 0); + +static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, + PLUGIN_VAR_OPCMDARG, + "Controls the durability/speed trade-off for commits." + " Set to 0 (write and flush redo log to disk only once per second)," + " 1 (flush to disk at each commit)," + " 2 (write to log at commit but flush to disk only once per second)" + " or 3 (flush to disk at prepare and at commit, slower and usually redundant)." + " 1 and 3 guarantees that after a crash, committed transactions will" + " not be lost and will be consistent with the binlog and other transactional" + " engines. 2 can get inconsistent and lose transactions if there is a" + " power failure or kernel crash but not if mysqld crashes. 0 has no" + " guarantees in case of crash. 0 and 2 can be faster than 1 or 3.", + NULL, NULL, 1, 0, 3, 0); + +static MYSQL_SYSVAR_ENUM(flush_method, srv_file_flush_method, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "With which method to flush data.", + NULL, NULL, IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT), + &innodb_flush_method_typelib); + +static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to ib_logfile0", NULL, NULL, NULL); + +static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct, srv_max_buf_pool_modified_pct, + PLUGIN_VAR_RQCMDARG, + "Percentage of dirty pages allowed in bufferpool.", + NULL, innodb_max_dirty_pages_pct_update, 90.0, 0, 99.999, 0); + +static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct_lwm, + srv_max_dirty_pages_pct_lwm, + PLUGIN_VAR_RQCMDARG, + "Percentage of dirty pages at which flushing kicks in. 
" + "The value 0 (default) means 'refer to innodb_max_dirty_pages_pct'.", + NULL, innodb_max_dirty_pages_pct_lwm_update, 0, 0, 99.999, 0); + +static MYSQL_SYSVAR_DOUBLE(adaptive_flushing_lwm, + srv_adaptive_flushing_lwm, + PLUGIN_VAR_RQCMDARG, + "Percentage of log capacity below which no adaptive flushing happens.", + NULL, NULL, 10.0, 0.0, 70.0, 0); + +static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing, + PLUGIN_VAR_NOCMDARG, + "Attempt flushing dirty pages to avoid IO bursts at checkpoints.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(flush_sync, srv_flush_sync, + PLUGIN_VAR_NOCMDARG, + "Allow IO bursts at the checkpoints ignoring io_capacity setting.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ULONG(flushing_avg_loops, + srv_flushing_avg_loops, + PLUGIN_VAR_RQCMDARG, + "Number of iterations over which the background flushing is averaged.", + NULL, NULL, 30, 1, 1000, 0); + +static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag, + PLUGIN_VAR_RQCMDARG, + "Desired maximum length of the purge queue (0 = no limit)", + NULL, NULL, 0, 0, ~0UL, 0); + +static MYSQL_SYSVAR_ULONG(max_purge_lag_delay, srv_max_purge_lag_delay, + PLUGIN_VAR_RQCMDARG, + "Maximum delay of user threads in micro-seconds", + NULL, NULL, + 0L, /* Default seting */ + 0L, /* Minimum value */ + 10000000UL, 0); /* Maximum value */ + +static MYSQL_SYSVAR_UINT(max_purge_lag_wait, innodb_max_purge_lag_wait, + PLUGIN_VAR_RQCMDARG, + "Wait until History list length is below the specified limit", + NULL, innodb_max_purge_lag_wait_update, UINT_MAX, 0, UINT_MAX, 0); + +static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Roll back the complete transaction on lock wait timeout, for 4.x compatibility (disabled by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(status_file, innobase_create_status_file, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_NOSYSVAR, + "Enable SHOW ENGINE INNODB STATUS output in the innodb_status. 
file", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata, + PLUGIN_VAR_OPCMDARG, + "Enable statistics gathering for metadata commands such as" + " SHOW TABLE STATUS for tables that use transient statistics (off by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages, + srv_stats_transient_sample_pages, + PLUGIN_VAR_RQCMDARG, + "The number of leaf index pages to sample when calculating transient" + " statistics (if persistent statistics are not used, default 8)", + NULL, NULL, 8, 1, ~0ULL, 0); + +static MYSQL_SYSVAR_BOOL(stats_persistent, srv_stats_persistent, + PLUGIN_VAR_OPCMDARG, + "InnoDB persistent statistics enabled for all tables unless overridden" + " at table level", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(stats_auto_recalc, srv_stats_auto_recalc, + PLUGIN_VAR_OPCMDARG, + "InnoDB automatic recalculation of persistent statistics enabled for all" + " tables unless overridden at table level (automatic recalculation is only" + " done when InnoDB decides that the table has changed too much and needs a" + " new statistics)", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ULONGLONG(stats_persistent_sample_pages, + srv_stats_persistent_sample_pages, + PLUGIN_VAR_RQCMDARG, + "The number of leaf index pages to sample when calculating persistent" + " statistics (by ANALYZE, default 20)", + NULL, NULL, 20, 1, ~0ULL, 0); + +static MYSQL_SYSVAR_ULONGLONG(stats_modified_counter, srv_stats_modified_counter, + PLUGIN_VAR_RQCMDARG, + "The number of rows modified before we calculate new statistics (default 0 = current limits)", + NULL, NULL, 0, 0, ~0ULL, 0); + +static MYSQL_SYSVAR_BOOL(stats_traditional, srv_stats_sample_traditional, + PLUGIN_VAR_RQCMDARG, + "Enable traditional statistic calculation based on number of configured pages (default true)", + NULL, NULL, TRUE); + +#ifdef BTR_CUR_HASH_ADAPT +static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled, + PLUGIN_VAR_OPCMDARG, + "Enable InnoDB adaptive hash index (disabled by default).", + NULL, innodb_adaptive_hash_index_update, false); + +/** Number of distinct partitions of AHI. +Each partition is protected by its own latch and so we have parts number +of latches protecting complete search system. */ +static MYSQL_SYSVAR_ULONG(adaptive_hash_index_parts, btr_ahi_parts, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Number of InnoDB Adaptive Hash Index Partitions (default 8)", + NULL, NULL, 8, 1, 512, 0); +#endif /* BTR_CUR_HASH_ADAPT */ + +static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, + PLUGIN_VAR_RQCMDARG, + "Compression level used for zlib compression. 0 is no compression" + ", 1 is fastest, 9 is best compression and default is 6.", + NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); + +static MYSQL_SYSVAR_UINT(autoextend_increment, + sys_tablespace_auto_extend_increment, + PLUGIN_VAR_RQCMDARG, + "Data file autoextend increment in megabytes", + NULL, NULL, 64, 1, 1000, 0); + +static MYSQL_SYSVAR_SIZE_T(buffer_pool_chunk_size, srv_buf_pool_chunk_unit, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Size of a single memory chunk" + " for resizing buffer pool. Online buffer pool resizing happens at this" + " granularity. 
0 means autosize this variable based on buffer pool size.", + NULL, NULL, + 0, 0, SIZE_T_MAX, 1024 * 1024); + +static MYSQL_SYSVAR_STR(buffer_pool_filename, srv_buf_dump_filename, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Filename to/from which to dump/load the InnoDB buffer pool", + NULL, NULL, SRV_BUF_DUMP_FILENAME_DEFAULT); + +static MYSQL_SYSVAR_BOOL(buffer_pool_dump_now, innodb_buffer_pool_dump_now, + PLUGIN_VAR_RQCMDARG, + "Trigger an immediate dump of the buffer pool into a file named @@innodb_buffer_pool_filename", + NULL, buffer_pool_dump_now, FALSE); + +static MYSQL_SYSVAR_BOOL(buffer_pool_dump_at_shutdown, srv_buffer_pool_dump_at_shutdown, + PLUGIN_VAR_RQCMDARG, + "Dump the buffer pool into a file named @@innodb_buffer_pool_filename", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ULONG(buffer_pool_dump_pct, srv_buf_pool_dump_pct, + PLUGIN_VAR_RQCMDARG, + "Dump only the hottest N% of each buffer pool, defaults to 25", + NULL, NULL, 25, 1, 100, 0); + +#ifdef UNIV_DEBUG +/* Added to test the innodb_buffer_pool_load_incomplete status variable. */ +static MYSQL_SYSVAR_ULONG(buffer_pool_load_pages_abort, srv_buf_pool_load_pages_abort, + PLUGIN_VAR_RQCMDARG, + "Number of pages during a buffer pool load to process before signaling innodb_buffer_pool_load_abort=1", + NULL, NULL, LONG_MAX, 1, LONG_MAX, 0); + +static MYSQL_SYSVAR_STR(buffer_pool_evict, srv_buffer_pool_evict, + PLUGIN_VAR_RQCMDARG, + "Evict pages from the buffer pool", + NULL, innodb_buffer_pool_evict_update, ""); +#endif /* UNIV_DEBUG */ + +static MYSQL_SYSVAR_BOOL(buffer_pool_load_now, innodb_buffer_pool_load_now, + PLUGIN_VAR_RQCMDARG, + "Trigger an immediate load of the buffer pool from a file named @@innodb_buffer_pool_filename", + NULL, buffer_pool_load_now, FALSE); + +static MYSQL_SYSVAR_BOOL(buffer_pool_load_abort, innodb_buffer_pool_load_abort, + PLUGIN_VAR_RQCMDARG, + "Abort a currently running load of the buffer pool", + NULL, buffer_pool_load_abort, FALSE); + +/* there is no point in changing this during runtime, thus readonly */ +static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_startup, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Load the buffer pool from a file named @@innodb_buffer_pool_filename", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(defragment, srv_defragment, + PLUGIN_VAR_RQCMDARG, + "Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing " + "defragmentation will be paused. And new defragmentation command will fail." + "Paused defragmentation commands will resume when this variable is set to " + "true again.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages, + PLUGIN_VAR_RQCMDARG, + "Number of pages considered at once when merging multiple pages to " + "defragment", + NULL, NULL, 7, 2, 32, 0); + +static MYSQL_SYSVAR_UINT(defragment_stats_accuracy, + srv_defragment_stats_accuracy, + PLUGIN_VAR_RQCMDARG, + "How many defragment stats changes there are before the stats " + "are written to persistent storage. Set to 0 meaning disable " + "defragment stats tracking.", + NULL, NULL, 0, 0, ~0U, 0); + +static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs, + srv_defragment_fill_factor_n_recs, + PLUGIN_VAR_RQCMDARG, + "How many records of space defragmentation should leave on the page. " + "This variable, together with innodb_defragment_fill_factor, is introduced " + "so defragmentation won't pack the page too full and cause page split on " + "the next insert on every page. 
The variable indicating more defragmentation" + " gain is the one effective.", + NULL, NULL, 20, 1, 100, 0); + +static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor, + PLUGIN_VAR_RQCMDARG, + "A number between [0.7, 1] that tells defragmentation how full it should " + "fill a page. Default is 0.9. Number below 0.7 won't make much sense." + "This variable, together with innodb_defragment_fill_factor_n_recs, is " + "introduced so defragmentation won't pack the page too full and cause " + "page split on the next insert on every page. The variable indicating more " + "defragmentation gain is the one effective.", + NULL, NULL, 0.9, 0.7, 1, 0); + +static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency, + PLUGIN_VAR_RQCMDARG, + "Do not defragment a single index more than this number of time per second." + "This controls the number of time defragmentation thread can request X_LOCK " + "on an index. Defragmentation thread will check whether " + "1/defragment_frequency (s) has passed since it worked on this index last " + "time, and put the index back to the queue if not enough time has passed. " + "The actual frequency can only be lower than this given number.", + NULL, innodb_defragment_frequency_update, + SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0); + + +static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth, + PLUGIN_VAR_RQCMDARG, + "How deep to scan LRU to keep it clean", + NULL, NULL, 1536, 100, ~0UL, 0); + +static MYSQL_SYSVAR_SIZE_T(lru_flush_size, innodb_lru_flush_size, + PLUGIN_VAR_RQCMDARG, + "How many pages to flush on LRU eviction", + NULL, NULL, 32, 1, SIZE_T_MAX, 0); + +static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors, + PLUGIN_VAR_OPCMDARG, + "Set to 0 (don't flush neighbors from buffer pool)," + " 1 (flush contiguous neighbors from buffer pool)" + " or 2 (flush neighbors from buffer pool)," + " when flushing a block", + NULL, NULL, 1, 0, 2, 0); + +static MYSQL_SYSVAR_BOOL(deadlock_detect, innodb_deadlock_detect, + PLUGIN_VAR_NOCMDARG, + "Enable/disable InnoDB deadlock detector (default ON)." 
+ " if set to OFF, deadlock detection is skipped," + " and we rely on innodb_lock_wait_timeout in case of deadlock.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ENUM(deadlock_report, innodb_deadlock_report, + PLUGIN_VAR_RQCMDARG, + "How to report deadlocks (if innodb_deadlock_detect=ON).", + NULL, NULL, Deadlock::REPORT_FULL, &innodb_deadlock_report_typelib); + +static MYSQL_SYSVAR_UINT(fill_factor, innobase_fill_factor, + PLUGIN_VAR_RQCMDARG, + "Percentage of B-tree page filled during bulk insert", + NULL, NULL, 100, 10, 100, 0); + +static MYSQL_SYSVAR_BOOL(ft_enable_diag_print, fts_enable_diag_print, + PLUGIN_VAR_OPCMDARG, + "Whether to enable additional FTS diagnostic printout ", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache, + PLUGIN_VAR_OPCMDARG, + "Whether to disable OS system file cache for sort I/O", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_STR(ft_aux_table, innodb_ft_aux_table, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, + "FTS internal auxiliary table to be checked", + innodb_ft_aux_table_validate, NULL, NULL); + +#if UNIV_WORD_SIZE == 4 + +static MYSQL_SYSVAR_SIZE_T(ft_cache_size, + *reinterpret_cast(&fts_max_cache_size), + PLUGIN_VAR_RQCMDARG, + "InnoDB Fulltext search cache size in bytes", + NULL, innodb_ft_cache_size_update, 8000000, 1600000, 1U << 29, 0); + +static MYSQL_SYSVAR_SIZE_T(ft_total_cache_size, + *reinterpret_cast(&fts_max_total_cache_size), + PLUGIN_VAR_RQCMDARG, + "Total memory allocated for InnoDB Fulltext Search cache", + NULL, innodb_ft_total_cache_size_update, 640000000, 32000000, 1600000000, 0); + +#else + +static MYSQL_SYSVAR_SIZE_T(ft_cache_size, + *reinterpret_cast(&fts_max_cache_size), + PLUGIN_VAR_RQCMDARG, + "InnoDB Fulltext search cache size in bytes", + NULL, innodb_ft_cache_size_update, 8000000, 1600000, 1ULL << 40, 0); + +static MYSQL_SYSVAR_SIZE_T(ft_total_cache_size, + *reinterpret_cast(&fts_max_total_cache_size), + PLUGIN_VAR_RQCMDARG, + "Total memory allocated for InnoDB Fulltext Search cache", + NULL, innodb_ft_total_cache_size_update, 640000000, 32000000, 1ULL << 40, 0); + +#endif + +static MYSQL_SYSVAR_SIZE_T(ft_result_cache_limit, fts_result_cache_limit, + PLUGIN_VAR_RQCMDARG, + "InnoDB Fulltext search query result cache limit in bytes", + NULL, NULL, 2000000000L, 1000000L, SIZE_T_MAX, 0); + +static MYSQL_SYSVAR_ULONG(ft_min_token_size, fts_min_token_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB Fulltext search minimum token size in characters", + NULL, NULL, 3, 0, 16, 0); + +static MYSQL_SYSVAR_ULONG(ft_max_token_size, fts_max_token_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB Fulltext search maximum token size in characters", + NULL, NULL, FTS_MAX_WORD_LEN_IN_CHAR, 10, FTS_MAX_WORD_LEN_IN_CHAR, 0); + +static MYSQL_SYSVAR_ULONG(ft_num_word_optimize, fts_num_word_optimize, + PLUGIN_VAR_OPCMDARG, + "InnoDB Fulltext search number of words to optimize for each optimize table call ", + NULL, NULL, 2000, 1000, 10000, 0); + +static MYSQL_SYSVAR_ULONG(ft_sort_pll_degree, fts_sort_pll_degree, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB Fulltext search parallel sort degree, will round up to nearest power of 2 number", + NULL, NULL, 2, 1, 16, 0); + +static MYSQL_SYSVAR_ULONG(sort_buffer_size, srv_sort_buf_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Memory buffer size for index creation", + NULL, NULL, 1048576, 65536, 64<<20, 0); + +static MYSQL_SYSVAR_ULONGLONG(online_alter_log_max_size, srv_online_max_size, + PLUGIN_VAR_RQCMDARG, + 
"Maximum modification log file size for online index creation", + NULL, NULL, 128<<20, 65536, ~0ULL, 0); + +static MYSQL_SYSVAR_BOOL(optimize_fulltext_only, innodb_optimize_fulltext_only, + PLUGIN_VAR_NOCMDARG, + "Only optimize the Fulltext index of the table", + NULL, NULL, FALSE); + +extern int os_aio_resize(ulint n_reader_threads, ulint n_writer_threads); +static void innodb_update_io_thread_count(THD *thd,ulint n_read, ulint n_write) +{ + int res = os_aio_resize(n_read, n_write); + if (res) + { +#ifndef __linux__ + ut_ad(0); +#else + ut_a(srv_use_native_aio); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_UNKNOWN_ERROR, + "Could not reserve max. number of concurrent ios." + "Increase the /proc/sys/fs/aio-max-nr to fix."); +#endif + } +} + +static void innodb_read_io_threads_update(THD* thd, struct st_mysql_sys_var*, void*, const void* save) +{ + srv_n_read_io_threads = *static_cast(save); + innodb_update_io_thread_count(thd, srv_n_read_io_threads, srv_n_write_io_threads); +} +static void innodb_write_io_threads_update(THD* thd, struct st_mysql_sys_var*, void*, const void* save) +{ + srv_n_write_io_threads = *static_cast(save); + innodb_update_io_thread_count(thd, srv_n_read_io_threads, srv_n_write_io_threads); +} + +static MYSQL_SYSVAR_UINT(read_io_threads, srv_n_read_io_threads, + PLUGIN_VAR_RQCMDARG, + "Number of background read I/O threads in InnoDB.", + NULL, innodb_read_io_threads_update , 4, 1, 64, 0); + +static MYSQL_SYSVAR_UINT(write_io_threads, srv_n_write_io_threads, + PLUGIN_VAR_RQCMDARG, + "Number of background write I/O threads in InnoDB.", + NULL, innodb_write_io_threads_update, 4, 2, 64, 0); + +static MYSQL_SYSVAR_ULONG(force_recovery, srv_force_recovery, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Helps to save your data in case the disk image of the database becomes corrupt. Value 5 can return bogus data, and 6 can permanently corrupt data.", + NULL, NULL, 0, 0, 6, 0); + +static MYSQL_SYSVAR_ULONG(page_size, srv_page_size, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Page size to use for all InnoDB tablespaces.", + NULL, NULL, UNIV_PAGE_SIZE_DEF, + UNIV_PAGE_SIZE_MIN, UNIV_PAGE_SIZE_MAX, 0); + +static MYSQL_SYSVAR_SIZE_T(log_buffer_size, log_sys.buf_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Redo log buffer size in bytes.", + NULL, NULL, 16U << 20, 2U << 20, SIZE_T_MAX, 4096); + +#if defined __linux__ || defined _WIN32 +static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered, + PLUGIN_VAR_OPCMDARG, + "Whether the file system cache for ib_logfile0 is enabled", + nullptr, innodb_log_file_buffering_update, FALSE); +#endif + +static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size, + PLUGIN_VAR_RQCMDARG, + "Redo log size in bytes.", + nullptr, innodb_log_file_size_update, + 96 << 20, 4 << 20, std::numeric_limits::max(), 4096); + +static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct, + PLUGIN_VAR_RQCMDARG, + "Percentage of the buffer pool to reserve for 'old' blocks.", + NULL, innodb_old_blocks_pct_update, 100 * 3 / 8, 5, 95, 0); + +static MYSQL_SYSVAR_UINT(old_blocks_time, buf_LRU_old_threshold_ms, + PLUGIN_VAR_RQCMDARG, + "Move blocks to the 'new' end of the buffer pool if the first access" + " was at least this many milliseconds ago." 
+ " The timeout is disabled if 0.", + NULL, NULL, 1000, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_ULONG(open_files, innobase_open_files, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "How many files at the maximum InnoDB keeps open at the same time.", + NULL, NULL, 0, 0, LONG_MAX, 0); + +static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds, + PLUGIN_VAR_RQCMDARG, + "Count of spin-loop rounds in InnoDB mutexes (30 by default)", + NULL, NULL, 30L, 0L, ~0UL, 0); + +static MYSQL_SYSVAR_UINT(spin_wait_delay, srv_spin_wait_delay, + PLUGIN_VAR_OPCMDARG, + "Maximum delay between polling for a spin lock (4 by default)", + NULL, NULL, 4, 0, 6000, 0); + +static my_bool innodb_prefix_index_cluster_optimization; + +static MYSQL_SYSVAR_BOOL(prefix_index_cluster_optimization, + innodb_prefix_index_cluster_optimization, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED, + "Deprecated parameter with no effect", + nullptr, nullptr, TRUE); + +static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to individual files and their sizes.", + NULL, NULL, "ibdata1:12M:autoextend"); + +static MYSQL_SYSVAR_STR(temp_data_file_path, innobase_temp_data_file_path, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to files and their sizes making temp-tablespace.", + NULL, NULL, "ibtmp1:12M:autoextend"); + +static MYSQL_SYSVAR_STR(undo_directory, srv_undo_dir, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Directory where undo tablespace files live, this path can be absolute.", + NULL, NULL, NULL); + +static MYSQL_SYSVAR_UINT(undo_tablespaces, srv_undo_tablespaces, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of undo tablespaces to use.", + NULL, NULL, + 0L, /* Default seting */ + 0L, /* Minimum value */ + TRX_SYS_MAX_UNDO_SPACES, 0); /* Maximum value */ + +static MYSQL_SYSVAR_ULONGLONG(max_undo_log_size, srv_max_undo_log_size, + PLUGIN_VAR_OPCMDARG, + "Desired maximum UNDO tablespace size in bytes", + NULL, NULL, + 10 << 20, 10 << 20, + 1ULL << (32 + UNIV_PAGE_SIZE_SHIFT_MAX), 0); + +static MYSQL_SYSVAR_ULONG(purge_rseg_truncate_frequency, + srv_purge_rseg_truncate_frequency, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED, + "Deprecated parameter with no effect", + NULL, NULL, 128, 1, 128, 0); + +static void innodb_undo_log_truncate_update(THD *thd, struct st_mysql_sys_var*, + void*, const void *save) +{ + if ((srv_undo_log_truncate= *static_cast(save))) + purge_sys.wake_if_not_active(); +} + +static MYSQL_SYSVAR_BOOL(undo_log_truncate, srv_undo_log_truncate, + PLUGIN_VAR_OPCMDARG, + "Enable or Disable Truncate of UNDO tablespace.", + NULL, innodb_undo_log_truncate_update, FALSE); + +static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The AUTOINC lock modes supported by InnoDB:" + " 0 => Old style AUTOINC locking (for backward compatibility);" + " 1 => New style AUTOINC locking;" + " 2 => No AUTOINC locking (unsafe for SBR)", + NULL, NULL, + AUTOINC_NEW_STYLE_LOCKING, /* Default setting */ + AUTOINC_OLD_STYLE_LOCKING, /* Minimum value */ + AUTOINC_NO_LOCKING, 0); /* Maximum value */ + +#ifdef HAVE_URING +# include +static utsname uname_for_io_uring; +#else +static +#endif +bool innodb_use_native_aio_default() +{ +#ifdef HAVE_URING + utsname &u= uname_for_io_uring; + if (!uname(&u) && u.release[0] == '5' && u.release[1] == '.' 
&& + u.release[2] == '1' && u.release[3] >= '1' && u.release[3] <= '5' && + u.release[4] == '.') + { + if (u.release[3] == '5') { + const char *s= strstr(u.version, "5.15."); + if (s || (s= strstr(u.release, "5.15."))) + if ((s[5] >= '3' || s[6] >= '0')) + return true; /* 5.15.3 and later should be fine */ + } + io_uring_may_be_unsafe= u.release; + return false; /* working around io_uring hangs (MDEV-26674) */ + } +#endif + return true; +} + +static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use native AIO if supported on this platform.", + NULL, NULL, innodb_use_native_aio_default()); + +#ifdef HAVE_LIBNUMA +static MYSQL_SYSVAR_BOOL(numa_interleave, srv_numa_interleave, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use NUMA interleave memory policy to allocate InnoDB buffer pool.", + NULL, NULL, FALSE); +#endif /* HAVE_LIBNUMA */ + +static void innodb_change_buffering_update(THD *thd, struct st_mysql_sys_var*, + void*, const void *save) +{ + ulong i= *static_cast(save); + if (i != IBUF_USE_NONE && !ibuf.index) + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE, + "InnoDB: The change buffer is corrupted."); + else + innodb_change_buffering= i; +} + +static MYSQL_SYSVAR_ENUM(change_buffering, innodb_change_buffering, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_DEPRECATED, + "Buffer changes to secondary indexes.", + nullptr, innodb_change_buffering_update, + IBUF_USE_NONE, &innodb_change_buffering_typelib); + +static MYSQL_SYSVAR_UINT(change_buffer_max_size, + srv_change_buffer_max_size, + PLUGIN_VAR_RQCMDARG, + "Maximum on-disk size of change buffer in terms of percentage" + " of the buffer pool.", + NULL, innodb_change_buffer_max_size_update, + CHANGE_BUFFER_DEFAULT_SIZE, 0, 50, 0); + +static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method, + PLUGIN_VAR_RQCMDARG, + "Specifies how InnoDB index statistics collection code should" + " treat NULLs. Possible values are NULLS_EQUAL (default)," + " NULLS_UNEQUAL and NULLS_IGNORED", + NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib); + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +static MYSQL_SYSVAR_BOOL(change_buffer_dump, ibuf_dump, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Dump the change buffer at startup.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug, + PLUGIN_VAR_RQCMDARG, + "Debug flags for InnoDB change buffering (0=none, 1=try to buffer)", + NULL, NULL, 0, 0, 1, 0); +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +static MYSQL_SYSVAR_ULONG(buf_dump_status_frequency, srv_buf_dump_status_frequency, + PLUGIN_VAR_RQCMDARG, + "A number between [0, 100] that tells how oftern buffer pool dump status " + "in percentages should be printed. E.g. 10 means that buffer pool dump " + "status is printed when every 10% of number of buffer pool pages are " + "dumped. 
Default is 0 (only start and end status is printed).", + NULL, NULL, 0, 0, 100, 0); + +static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead, + PLUGIN_VAR_NOCMDARG, + "Whether to use read ahead for random access within an extent.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold, + PLUGIN_VAR_RQCMDARG, + "Number of pages that must be accessed sequentially for InnoDB to" + " trigger a readahead.", + NULL, NULL, 56, 0, 64, 0); + +static MYSQL_SYSVAR_STR(monitor_enable, innobase_enable_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Turn on a monitor counter", + innodb_monitor_validate, + innodb_enable_monitor_update, NULL); + +static MYSQL_SYSVAR_STR(monitor_disable, innobase_disable_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Turn off a monitor counter", + innodb_monitor_validate, + innodb_disable_monitor_update, NULL); + +static MYSQL_SYSVAR_STR(monitor_reset, innobase_reset_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Reset a monitor counter", + innodb_monitor_validate, + innodb_reset_monitor_update, NULL); + +static MYSQL_SYSVAR_STR(monitor_reset_all, innobase_reset_all_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Reset all values for a monitor counter", + innodb_monitor_validate, + innodb_reset_all_monitor_update, NULL); + +static MYSQL_SYSVAR_BOOL(status_output, srv_print_innodb_monitor, + PLUGIN_VAR_OPCMDARG, "Enable InnoDB monitor output to the error log.", + NULL, innodb_status_output_update, FALSE); + +static MYSQL_SYSVAR_BOOL(status_output_locks, srv_print_innodb_lock_monitor, + PLUGIN_VAR_OPCMDARG, "Enable InnoDB lock monitor output to the error log." + " Requires innodb_status_output=ON.", + NULL, innodb_status_output_update, FALSE); + +static MYSQL_SYSVAR_BOOL(print_all_deadlocks, srv_print_all_deadlocks, + PLUGIN_VAR_OPCMDARG, + "Print all deadlocks to MariaDB error log (off by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(compression_failure_threshold_pct, + zip_failure_threshold_pct, PLUGIN_VAR_OPCMDARG, + "If the compression failure rate of a table is greater than this number" + " more padding is added to the pages to reduce the failures. A value of" + " zero implies no padding", + NULL, NULL, 5, 0, 100, 0); + +static MYSQL_SYSVAR_ULONG(compression_pad_pct_max, + zip_pad_max, PLUGIN_VAR_OPCMDARG, + "Percentage of empty space on a data page that can be reserved" + " to make the page compressible.", + NULL, NULL, 50, 0, 75, 0); + +static MYSQL_SYSVAR_BOOL(read_only, srv_read_only_mode, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Start InnoDB in read only mode (off by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(read_only_compressed, innodb_read_only_compressed, + PLUGIN_VAR_OPCMDARG, + "Make ROW_FORMAT=COMPRESSED tables read-only", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(cmp_per_index_enabled, srv_cmp_per_index_enabled, + PLUGIN_VAR_OPCMDARG, + "Enable INFORMATION_SCHEMA.innodb_cmp_per_index," + " may have negative impact on performance (off by default)", + NULL, innodb_cmp_per_index_update, FALSE); + +static MYSQL_SYSVAR_ENUM(default_row_format, innodb_default_row_format, + PLUGIN_VAR_RQCMDARG, + "The default ROW FORMAT for all innodb tables created without explicit" + " ROW_FORMAT. Possible values are REDUNDANT, COMPACT, and DYNAMIC." 
+ " The ROW_FORMAT value COMPRESSED is not allowed", + NULL, NULL, DEFAULT_ROW_FORMAT_DYNAMIC, + &innodb_default_row_format_typelib); + +#ifdef UNIV_DEBUG +static MYSQL_SYSVAR_UINT(trx_rseg_n_slots_debug, trx_rseg_n_slots_debug, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_NOCMDOPT, + "Debug flags for InnoDB to limit TRX_RSEG_N_SLOTS for trx_rsegf_undo_find_free()", + NULL, NULL, 0, 0, 1024, 0); + +static MYSQL_SYSVAR_UINT(limit_optimistic_insert_debug, + btr_cur_limit_optimistic_insert_debug, PLUGIN_VAR_RQCMDARG, + "Artificially limit the number of records per B-tree page (0=unlimited).", + NULL, NULL, 0, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug, + srv_purge_view_update_only_debug, PLUGIN_VAR_NOCMDOPT, + "Pause actual purging any delete-marked records, but merely update the purge view." + " It is to create artificially the situation the purge view have been updated" + " but the each purges were not done yet.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(evict_tables_on_commit_debug, + innodb_evict_tables_on_commit_debug, PLUGIN_VAR_OPCMDARG, + "On transaction commit, try to evict tables from the data dictionary cache.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_UINT(data_file_size_debug, + srv_sys_space_size_debug, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB system tablespace size to be set in recovery.", + NULL, NULL, 0, 0, 256U << 20, 0); + +static MYSQL_SYSVAR_UINT(fil_make_page_dirty_debug, + srv_fil_make_page_dirty_debug, PLUGIN_VAR_OPCMDARG, + "Make the first page of the given tablespace dirty.", + NULL, innodb_make_page_dirty, 0, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_UINT(saved_page_number_debug, + srv_saved_page_number_debug, PLUGIN_VAR_OPCMDARG, + "An InnoDB page number.", + NULL, NULL, 0, 0, UINT_MAX32, 0); +#endif /* UNIV_DEBUG */ + +static MYSQL_SYSVAR_BOOL(force_primary_key, + srv_force_primary_key, + PLUGIN_VAR_OPCMDARG, + "Do not allow creating a table without primary key (off by default)", + NULL, NULL, FALSE); + +const char *page_compression_algorithms[]= { "none", "zlib", "lz4", "lzo", "lzma", "bzip2", "snappy", 0 }; +static TYPELIB page_compression_algorithms_typelib= +{ + array_elements(page_compression_algorithms) - 1, 0, + page_compression_algorithms, 0 +}; +static MYSQL_SYSVAR_ENUM(compression_algorithm, innodb_compression_algorithm, + PLUGIN_VAR_OPCMDARG, + "Compression algorithm used on page compression. One of: none, zlib, lz4, lzo, lzma, bzip2, or snappy", + innodb_compression_algorithm_validate, NULL, + /* We use here the largest number of supported compression method to + enable all those methods that are available. Availability of compression + method is verified on innodb_compression_algorithm_validate function. */ + PAGE_ZLIB_ALGORITHM, + &page_compression_algorithms_typelib); + +static MYSQL_SYSVAR_ULONG(fatal_semaphore_wait_threshold, srv_fatal_semaphore_wait_threshold, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Maximum number of seconds that semaphore times out in InnoDB.", + NULL, NULL, + DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT, /* Default setting */ + 1, /* Minimum setting */ + UINT_MAX32, /* Maximum setting */ + 0); + +static const char* srv_encrypt_tables_names[] = { "OFF", "ON", "FORCE", 0 }; +static TYPELIB srv_encrypt_tables_typelib = { + array_elements(srv_encrypt_tables_names)-1, 0, srv_encrypt_tables_names, + NULL +}; +static MYSQL_SYSVAR_ENUM(encrypt_tables, srv_encrypt_tables, + PLUGIN_VAR_OPCMDARG, + "Enable encryption for tables. 
" + "Don't forget to enable --innodb-encrypt-log too", + innodb_encrypt_tables_validate, + innodb_encrypt_tables_update, + 0, + &srv_encrypt_tables_typelib); + +static MYSQL_SYSVAR_UINT(encryption_threads, srv_n_fil_crypt_threads, + PLUGIN_VAR_RQCMDARG, + "Number of threads performing background key rotation ", + NULL, + innodb_encryption_threads_update, + 0, 0, 255, 0); + +static MYSQL_SYSVAR_UINT(encryption_rotate_key_age, + srv_fil_crypt_rotate_key_age, + PLUGIN_VAR_RQCMDARG, + "Key rotation - re-encrypt in background " + "all pages that were encrypted with a key that " + "many (or more) versions behind. Value 0 indicates " + "that key rotation is disabled.", + NULL, + innodb_encryption_rotate_key_age_update, + 1, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_UINT(encryption_rotation_iops, srv_n_fil_crypt_iops, + PLUGIN_VAR_RQCMDARG, + "Use this many iops for background key rotation", + NULL, + innodb_encryption_rotation_iops_update, + 100, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_BOOL(encrypt_log, srv_encrypt_log, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Enable redo log encryption", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(immediate_scrub_data_uncompressed, + srv_immediate_scrub_data_uncompressed, + 0, + "Enable scrubbing of data", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(encrypt_temporary_tables, innodb_encrypt_temporary_tables, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Enrypt the temporary table data.", + NULL, NULL, false); + +static struct st_mysql_sys_var* innobase_system_variables[]= { + MYSQL_SYSVAR(autoextend_increment), + MYSQL_SYSVAR(buffer_pool_size), + MYSQL_SYSVAR(buffer_pool_chunk_size), + MYSQL_SYSVAR(buffer_pool_filename), + MYSQL_SYSVAR(buffer_pool_dump_now), + MYSQL_SYSVAR(buffer_pool_dump_at_shutdown), + MYSQL_SYSVAR(buffer_pool_dump_pct), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(buffer_pool_evict), +#endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(buffer_pool_load_now), + MYSQL_SYSVAR(buffer_pool_load_abort), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(buffer_pool_load_pages_abort), +#endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(buffer_pool_load_at_startup), + MYSQL_SYSVAR(defragment), + MYSQL_SYSVAR(defragment_n_pages), + MYSQL_SYSVAR(defragment_stats_accuracy), + MYSQL_SYSVAR(defragment_fill_factor), + MYSQL_SYSVAR(defragment_fill_factor_n_recs), + MYSQL_SYSVAR(defragment_frequency), + MYSQL_SYSVAR(lru_scan_depth), + MYSQL_SYSVAR(lru_flush_size), + MYSQL_SYSVAR(flush_neighbors), + MYSQL_SYSVAR(checksum_algorithm), + MYSQL_SYSVAR(compression_level), + MYSQL_SYSVAR(data_file_path), + MYSQL_SYSVAR(temp_data_file_path), + MYSQL_SYSVAR(data_home_dir), + MYSQL_SYSVAR(doublewrite), + MYSQL_SYSVAR(stats_include_delete_marked), + MYSQL_SYSVAR(use_atomic_writes), + MYSQL_SYSVAR(fast_shutdown), + MYSQL_SYSVAR(read_io_threads), + MYSQL_SYSVAR(write_io_threads), + MYSQL_SYSVAR(file_per_table), + MYSQL_SYSVAR(flush_log_at_timeout), + MYSQL_SYSVAR(flush_log_at_trx_commit), + MYSQL_SYSVAR(flush_method), + MYSQL_SYSVAR(force_recovery), + MYSQL_SYSVAR(fill_factor), + MYSQL_SYSVAR(ft_cache_size), + MYSQL_SYSVAR(ft_total_cache_size), + MYSQL_SYSVAR(ft_result_cache_limit), + MYSQL_SYSVAR(ft_enable_stopword), + MYSQL_SYSVAR(ft_max_token_size), + MYSQL_SYSVAR(ft_min_token_size), + MYSQL_SYSVAR(ft_num_word_optimize), + MYSQL_SYSVAR(ft_sort_pll_degree), + MYSQL_SYSVAR(lock_wait_timeout), + MYSQL_SYSVAR(deadlock_detect), + MYSQL_SYSVAR(deadlock_report), + MYSQL_SYSVAR(page_size), + MYSQL_SYSVAR(log_buffer_size), +#if defined __linux__ || defined _WIN32 + MYSQL_SYSVAR(log_file_buffering), +#endif 
+ MYSQL_SYSVAR(log_file_size), + MYSQL_SYSVAR(log_group_home_dir), + MYSQL_SYSVAR(max_dirty_pages_pct), + MYSQL_SYSVAR(max_dirty_pages_pct_lwm), + MYSQL_SYSVAR(adaptive_flushing_lwm), + MYSQL_SYSVAR(adaptive_flushing), + MYSQL_SYSVAR(flush_sync), + MYSQL_SYSVAR(flushing_avg_loops), + MYSQL_SYSVAR(max_purge_lag), + MYSQL_SYSVAR(max_purge_lag_delay), + MYSQL_SYSVAR(max_purge_lag_wait), + MYSQL_SYSVAR(old_blocks_pct), + MYSQL_SYSVAR(old_blocks_time), + MYSQL_SYSVAR(open_files), + MYSQL_SYSVAR(optimize_fulltext_only), + MYSQL_SYSVAR(rollback_on_timeout), + MYSQL_SYSVAR(ft_aux_table), + MYSQL_SYSVAR(ft_enable_diag_print), + MYSQL_SYSVAR(ft_server_stopword_table), + MYSQL_SYSVAR(ft_user_stopword_table), + MYSQL_SYSVAR(disable_sort_file_cache), + MYSQL_SYSVAR(stats_on_metadata), + MYSQL_SYSVAR(stats_transient_sample_pages), + MYSQL_SYSVAR(stats_persistent), + MYSQL_SYSVAR(stats_persistent_sample_pages), + MYSQL_SYSVAR(stats_auto_recalc), + MYSQL_SYSVAR(stats_modified_counter), + MYSQL_SYSVAR(stats_traditional), +#ifdef BTR_CUR_HASH_ADAPT + MYSQL_SYSVAR(adaptive_hash_index), + MYSQL_SYSVAR(adaptive_hash_index_parts), +#endif /* BTR_CUR_HASH_ADAPT */ + MYSQL_SYSVAR(stats_method), + MYSQL_SYSVAR(status_file), + MYSQL_SYSVAR(strict_mode), + MYSQL_SYSVAR(sort_buffer_size), + MYSQL_SYSVAR(online_alter_log_max_size), + MYSQL_SYSVAR(sync_spin_loops), + MYSQL_SYSVAR(spin_wait_delay), + MYSQL_SYSVAR(table_locks), + MYSQL_SYSVAR(prefix_index_cluster_optimization), + MYSQL_SYSVAR(tmpdir), + MYSQL_SYSVAR(autoinc_lock_mode), + MYSQL_SYSVAR(use_native_aio), +#ifdef HAVE_LIBNUMA + MYSQL_SYSVAR(numa_interleave), +#endif /* HAVE_LIBNUMA */ + MYSQL_SYSVAR(change_buffering), + MYSQL_SYSVAR(change_buffer_max_size), +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + MYSQL_SYSVAR(change_buffer_dump), + MYSQL_SYSVAR(change_buffering_debug), +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + MYSQL_SYSVAR(random_read_ahead), + MYSQL_SYSVAR(read_ahead_threshold), + MYSQL_SYSVAR(read_only), + MYSQL_SYSVAR(read_only_compressed), + MYSQL_SYSVAR(instant_alter_column_allowed), + MYSQL_SYSVAR(io_capacity), + MYSQL_SYSVAR(io_capacity_max), + MYSQL_SYSVAR(monitor_enable), + MYSQL_SYSVAR(monitor_disable), + MYSQL_SYSVAR(monitor_reset), + MYSQL_SYSVAR(monitor_reset_all), + MYSQL_SYSVAR(purge_threads), + MYSQL_SYSVAR(purge_batch_size), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(log_checkpoint_now), + MYSQL_SYSVAR(buf_flush_list_now), + MYSQL_SYSVAR(merge_threshold_set_all_debug), +#endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(status_output), + MYSQL_SYSVAR(status_output_locks), + MYSQL_SYSVAR(print_all_deadlocks), + MYSQL_SYSVAR(cmp_per_index_enabled), + MYSQL_SYSVAR(max_undo_log_size), + MYSQL_SYSVAR(purge_rseg_truncate_frequency), + MYSQL_SYSVAR(undo_log_truncate), + MYSQL_SYSVAR(undo_directory), + MYSQL_SYSVAR(undo_tablespaces), + MYSQL_SYSVAR(compression_failure_threshold_pct), + MYSQL_SYSVAR(compression_pad_pct_max), + MYSQL_SYSVAR(default_row_format), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(trx_rseg_n_slots_debug), + MYSQL_SYSVAR(limit_optimistic_insert_debug), + MYSQL_SYSVAR(trx_purge_view_update_only_debug), + MYSQL_SYSVAR(evict_tables_on_commit_debug), + MYSQL_SYSVAR(data_file_size_debug), + MYSQL_SYSVAR(fil_make_page_dirty_debug), + MYSQL_SYSVAR(saved_page_number_debug), +#endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(force_primary_key), + MYSQL_SYSVAR(fatal_semaphore_wait_threshold), + /* Table page compression feature */ + MYSQL_SYSVAR(compression_default), + MYSQL_SYSVAR(compression_algorithm), + /* Encryption feature */ + MYSQL_SYSVAR(encrypt_tables), 
+ MYSQL_SYSVAR(encryption_threads), + MYSQL_SYSVAR(encryption_rotate_key_age), + MYSQL_SYSVAR(encryption_rotation_iops), + MYSQL_SYSVAR(encrypt_log), + MYSQL_SYSVAR(default_encryption_key_id), + MYSQL_SYSVAR(immediate_scrub_data_uncompressed), + MYSQL_SYSVAR(buf_dump_status_frequency), + MYSQL_SYSVAR(background_thread), + MYSQL_SYSVAR(encrypt_temporary_tables), + + NULL +}; + +maria_declare_plugin(innobase) +{ + MYSQL_STORAGE_ENGINE_PLUGIN, + &innobase_storage_engine, + innobase_hton_name, + plugin_author, + "Supports transactions, row-level locking, foreign keys and encryption for tables", + PLUGIN_LICENSE_GPL, + innodb_init, /* Plugin Init */ + NULL, /* Plugin Deinit */ + MYSQL_VERSION_MAJOR << 8 | MYSQL_VERSION_MINOR, + innodb_status_variables_export,/* status variables */ + innobase_system_variables, /* system variables */ + PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE /* maturity */ +}, +i_s_innodb_trx, +i_s_innodb_locks, +i_s_innodb_lock_waits, +i_s_innodb_cmp, +i_s_innodb_cmp_reset, +i_s_innodb_cmpmem, +i_s_innodb_cmpmem_reset, +i_s_innodb_cmp_per_index, +i_s_innodb_cmp_per_index_reset, +i_s_innodb_buffer_page, +i_s_innodb_buffer_page_lru, +i_s_innodb_buffer_stats, +i_s_innodb_metrics, +i_s_innodb_ft_default_stopword, +i_s_innodb_ft_deleted, +i_s_innodb_ft_being_deleted, +i_s_innodb_ft_config, +i_s_innodb_ft_index_cache, +i_s_innodb_ft_index_table, +i_s_innodb_sys_tables, +i_s_innodb_sys_tablestats, +i_s_innodb_sys_indexes, +i_s_innodb_sys_columns, +i_s_innodb_sys_fields, +i_s_innodb_sys_foreign, +i_s_innodb_sys_foreign_cols, +i_s_innodb_sys_tablespaces, +i_s_innodb_sys_virtual, +i_s_innodb_tablespaces_encryption +maria_declare_plugin_end; + +/** @brief Adjust some InnoDB startup parameters based on file contents +or innodb_page_size. */ +static +void +innodb_params_adjust() +{ + MYSQL_SYSVAR_NAME(max_undo_log_size).max_val + = 1ULL << (32U + srv_page_size_shift); + MYSQL_SYSVAR_NAME(max_undo_log_size).min_val + = MYSQL_SYSVAR_NAME(max_undo_log_size).def_val + = ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) + << srv_page_size_shift; + MYSQL_SYSVAR_NAME(max_undo_log_size).max_val + = 1ULL << (32U + srv_page_size_shift); +} + +/**************************************************************************** + * DS-MRR implementation + ***************************************************************************/ + +/** +Multi Range Read interface, DS-MRR calls */ +int +ha_innobase::multi_range_read_init( + RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, + uint mode, + HANDLER_BUFFER* buf) +{ + return(m_ds_mrr.dsmrr_init(this, seq, seq_init_param, + n_ranges, mode, buf)); +} + +int +ha_innobase::multi_range_read_next( + range_id_t* range_info) +{ + return(m_ds_mrr.dsmrr_next(range_info)); +} + +ha_rows +ha_innobase::multi_range_read_info_const( + uint keyno, + RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, + uint* bufsz, + uint* flags, + Cost_estimate* cost) +{ + /* See comments in ha_myisam::multi_range_read_info_const */ + m_ds_mrr.init(this, table); + + if (m_prebuilt->select_lock_type != LOCK_NONE) { + *flags |= HA_MRR_USE_DEFAULT_IMPL; + } + + ha_rows res= m_ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges, + bufsz, flags, cost); + return res; +} + +ha_rows +ha_innobase::multi_range_read_info( + uint keyno, + uint n_ranges, + uint keys, + uint key_parts, + uint* bufsz, + uint* flags, + Cost_estimate* cost) +{ + m_ds_mrr.init(this, table); + ha_rows res= m_ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, + flags, cost); + return res; +} 
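+/* The four methods above simply delegate to the DS-MRR implementation object
+owned by the handler. A hypothetical driver loop on the server side (a
+minimal sketch for illustration; seq, seq_init_param, n_ranges and mode are
+assumed to come from the optimizer, and this is not code from this file):
+@code
+	HANDLER_BUFFER	buf;		// buffer space assigned by the server
+	range_id_t	range_info;
+
+	h->multi_range_read_init(seq, seq_init_param, n_ranges, mode, &buf);
+
+	while (h->multi_range_read_next(&range_info) == 0) {
+		// one matching row is now in h->table->record[0]
+	}
+@endcode */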
+
+int
+ha_innobase::multi_range_read_explain_info(
+    uint mrr_mode,
+    char *str,
+    size_t size)
+{
+    return m_ds_mrr.dsmrr_explain_info(mrr_mode, str, size);
+}
+
+/** Find or open a table handle for the virtual column template
+@param[in]	thd	thread handle
+@param[in,out]	table	InnoDB table whose virtual column template
+			is to be updated
+@return table handle
+@retval NULL if the table is dropped, inaccessible or corrupted
+for the purge thread */
+static TABLE* innodb_find_table_for_vc(THD* thd, dict_table_t* table)
+{
+    TABLE *mysql_table;
+    const bool bg_thread = THDVAR(thd, background_thread);
+
+    if (bg_thread) {
+        if ((mysql_table = get_purge_table(thd))) {
+            return mysql_table;
+        }
+    } else {
+        if (table->vc_templ->mysql_table_query_id
+            == thd_get_query_id(thd)) {
+            return table->vc_templ->mysql_table;
+        }
+    }
+
+    char db_buf[NAME_LEN + 1];
+    char tbl_buf[NAME_LEN + 1];
+    ulint db_buf_len, tbl_buf_len;
+
+    if (!table->parse_name(db_buf, tbl_buf, &db_buf_len, &tbl_buf_len)) {
+        return NULL;
+    }
+
+    if (bg_thread) {
+        return open_purge_table(thd, db_buf, db_buf_len,
+                                tbl_buf, tbl_buf_len);
+    }
+
+    mysql_table = find_fk_open_table(thd, db_buf, db_buf_len,
+                                     tbl_buf, tbl_buf_len);
+    table->vc_templ->mysql_table = mysql_table;
+    table->vc_templ->mysql_table_query_id = thd_get_query_id(thd);
+    return mysql_table;
+}
+
+/** Change dbname and table name in table->vc_templ.
+@param[in,out]	table	the table whose virtual column template
+dbname and tbname are to be renamed. */
+void
+innobase_rename_vc_templ(
+    dict_table_t*	table)
+{
+    char	dbname[MAX_DATABASE_NAME_LEN + 1];
+    char	tbname[MAX_DATABASE_NAME_LEN + 1];
+    char*	name = table->name.m_name;
+    ulint	dbnamelen = dict_get_db_name_len(name);
+    ulint	tbnamelen = strlen(name) - dbnamelen - 1;
+    char	t_dbname[MAX_DATABASE_NAME_LEN + 1];
+    char	t_tbname[MAX_TABLE_NAME_LEN + 1];
+
+    strncpy(dbname, name, dbnamelen);
+    dbname[dbnamelen] = 0;
+    strncpy(tbname, name + dbnamelen + 1, tbnamelen);
+    tbname[tbnamelen] = 0;
+
+    /* For a partitioned table, remove the partition name and use the
+    "main" table name to build the template */
+    char*	is_part = is_partition(tbname);
+
+    if (is_part != NULL) {
+        *is_part = '\0';
+        tbnamelen = ulint(is_part - tbname);
+    }
+
+    dbnamelen = filename_to_tablename(dbname, t_dbname,
+                                      MAX_DATABASE_NAME_LEN + 1);
+    tbnamelen = filename_to_tablename(tbname, t_tbname,
+                                      MAX_TABLE_NAME_LEN + 1);
+
+    table->vc_templ->db_name = t_dbname;
+    table->vc_templ->tb_name = t_tbname;
+}
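+/* Example (hypothetical names, for illustration only): for a partition
+"t1#P#p0" of table "test.t1", is_partition() locates the "#P#" separator,
+the partition suffix is cut off, and after filename_to_tablename() the
+template ends up with db_name="test" and tb_name="t1". */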
+
+
+/**
+  Allocate a heap and record for calculating virtual fields
+  Used mainly for virtual fields in indexes
+
+@param[in]	thd		MariaDB THD
+@param[in]	index		Index in use
+@param[out]	heap		Heap that holds temporary row
+@param[in,out]	table		MariaDB table
+@param[out]	record		Pointer to allocated MariaDB record
+@param[out]	storage		Internal storage for blobs etc
+
+@retval	true on success
+@retval	false on malloc failure or failure to open the MariaDB table
+	for the purge thread.
+*/
+
+bool innobase_allocate_row_for_vcol(THD *thd, const dict_index_t *index,
+                                    mem_heap_t **heap, TABLE **table,
+                                    VCOL_STORAGE *storage)
+{
+  TABLE *maria_table;
+  String *blob_value_storage;
+  if (!*table)
+    *table = innodb_find_table_for_vc(thd, index->table);
+
+  /* For the purge thread, there is a possibility that the table could
+  have been dropped, corrupted or become inaccessible. */
+  if (!*table)
+    return false;
+  maria_table = *table;
+  if (!*heap && !(*heap = mem_heap_create(srv_page_size)))
+    return false;
+
+  uchar *record = static_cast<uchar*>(mem_heap_alloc(*heap,
+                                      maria_table->s->reclength));
+
+  size_t len = maria_table->s->virtual_not_stored_blob_fields * sizeof(String);
+  blob_value_storage = static_cast<String*>(mem_heap_alloc(*heap, len));
+
+  if (!record || !blob_value_storage)
+    return false;
+
+  storage->maria_table = maria_table;
+  storage->innobase_record = record;
+  storage->maria_record = maria_table->field[0]->record_ptr();
+  storage->blob_value_storage = blob_value_storage;
+
+  maria_table->move_fields(maria_table->field, record, storage->maria_record);
+  maria_table->remember_blob_values(blob_value_storage);
+
+  return true;
+}
+
+
+/** Free memory allocated by innobase_allocate_row_for_vcol() */
+
+void innobase_free_row_for_vcol(VCOL_STORAGE *storage)
+{
+  TABLE *maria_table= storage->maria_table;
+  maria_table->move_fields(maria_table->field, storage->maria_record,
+                           storage->innobase_record);
+  maria_table->restore_blob_values(storage->blob_value_storage);
+}
+
+
+void innobase_report_computed_value_failed(dtuple_t *row)
+{
+  ib::error() << "Compute virtual column values failed for "
+              << rec_printer(row).str();
+}
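+/* Typical call pattern for the allocate/free pair above (a hypothetical
+caller, shown only for illustration):
+@code
+	VCOL_STORAGE	storage;
+	mem_heap_t*	heap = NULL;
+	TABLE*		mysql_table = NULL;
+
+	if (innobase_allocate_row_for_vcol(thd, index, &heap,
+					   &mysql_table, &storage)) {
+		// ... evaluate virtual columns into storage.innobase_record ...
+		innobase_free_row_for_vcol(&storage);
+	}
+
+	if (heap) {
+		mem_heap_free(heap);
+	}
+@endcode */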
+
+
+/** Get the computed value by supplying the base column values.
+@param[in,out]	row		the data row
+@param[in]	col		virtual column
+@param[in]	index		index
+@param[in,out]	local_heap	heap memory for processing large data etc.
+@param[in,out]	heap		memory heap that copies the actual index row
+@param[in]	ifield		index field
+@param[in]	thd		MySQL thread handle
+@param[in,out]	mysql_table	mysql table object
+@param[in,out]	mysql_rec	MariaDB record buffer
+@param[in]	old_table	during ALTER TABLE, this is the old table
+				or NULL.
+@param[in]	update		update vector for the row, if any
+@param[in]	ignore_warnings	whether to suppress warnings raised while
+				evaluating the virtual column
+@return the field filled with the computed value, or NULL if the
+computation fails */
+dfield_t*
+innobase_get_computed_value(
+    dtuple_t*		row,
+    const dict_v_col_t*	col,
+    const dict_index_t*	index,
+    mem_heap_t**	local_heap,
+    mem_heap_t*		heap,
+    const dict_field_t*	ifield,
+    THD*		thd,
+    TABLE*		mysql_table,
+    byte*		mysql_rec,
+    const dict_table_t*	old_table,
+    const upd_t*	update,
+    bool		ignore_warnings)
+{
+    byte		rec_buf2[REC_VERSION_56_MAX_INDEX_COL_LEN];
+    byte*		buf;
+    dfield_t*		field;
+    ulint		len;
+
+    const ulint zip_size = old_table
+        ? old_table->space->zip_size()
+        : dict_tf_get_zip_size(index->table->flags);
+
+    ulint	ret = 0;
+
+    dict_index_t *clust_index= dict_table_get_first_index(index->table);
+
+    ut_ad(index->table->vc_templ);
+    ut_ad(thd != NULL);
+    ut_ad(mysql_table);
+
+    DBUG_ENTER("innobase_get_computed_value");
+    const mysql_row_templ_t*
+        vctempl = index->table->vc_templ->vtempl[
+            index->table->vc_templ->n_col + col->v_pos];
+
+    if (!heap || index->table->vc_templ->rec_len
+                 >= REC_VERSION_56_MAX_INDEX_COL_LEN) {
+        if (*local_heap == NULL) {
+            *local_heap = mem_heap_create(srv_page_size);
+        }
+
+        buf = static_cast<byte*>(mem_heap_alloc(
+                *local_heap, index->table->vc_templ->rec_len));
+    } else {
+        buf = rec_buf2;
+    }
+
+    for (ulint i = 0; i < unsigned{col->num_base}; i++) {
+        dict_col_t*			base_col = col->base_col[i];
+        const dfield_t*			row_field = NULL;
+        ulint				col_no = base_col->ind;
+        const mysql_row_templ_t*	templ
+            = index->table->vc_templ->vtempl[col_no];
+        const byte*			data;
+
+        if (update) {
+            ulint clust_no = dict_col_get_clust_pos(base_col,
+                                                    clust_index);
+            ut_ad(clust_no != ULINT_UNDEFINED);
+            if (const upd_field_t *uf = upd_get_field_by_field_no(
+                    update, uint16_t(clust_no), false)) {
+                row_field = &uf->new_val;
+            }
+        }
+
+        if (!row_field) {
+            row_field = dtuple_get_nth_field(row, col_no);
+        }
+
+        data = static_cast<const byte*>(row_field->data);
+        len = row_field->len;
+
+        if (row_field->ext) {
+            if (*local_heap == NULL) {
+                *local_heap = mem_heap_create(srv_page_size);
+            }
+
+            data = btr_copy_externally_stored_field(
+                &len, data, zip_size,
+                dfield_get_len(row_field), *local_heap);
+        }
+
+        if (len == UNIV_SQL_NULL) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+            mysql_rec[templ->mysql_null_byte_offset]
+                |= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+            memcpy(mysql_rec + templ->mysql_col_offset,
+                   static_cast<const byte*>(
+                       index->table->vc_templ->default_rec
+                       + templ->mysql_col_offset),
+                   templ->mysql_col_len);
+        } else {
+
+            row_sel_field_store_in_mysql_format(
+                mysql_rec + templ->mysql_col_offset,
+                templ, index, templ->clust_rec_field_no,
+                (const byte*)data, len);
+
+            if (templ->mysql_null_bit_mask) {
+                /* It is a nullable column with a
+                non-NULL value */
+                mysql_rec[templ->mysql_null_byte_offset]
+                    &= static_cast<byte>(
+                        ~templ->mysql_null_bit_mask);
+            }
+        }
+    }
+
+    field = dtuple_get_nth_v_field(row, col->v_pos);
+
+    MY_BITMAP *old_write_set = dbug_tmp_use_all_columns(mysql_table, &mysql_table->write_set);
+    MY_BITMAP *old_read_set = dbug_tmp_use_all_columns(mysql_table, &mysql_table->read_set);
+    ret = mysql_table->update_virtual_field(
+        mysql_table->field[col->m_col.ind],
+        ignore_warnings);
+    dbug_tmp_restore_column_map(&mysql_table->read_set, old_read_set);
+    dbug_tmp_restore_column_map(&mysql_table->write_set, old_write_set);
+
+    if (ret != 0) {
+        DBUG_RETURN(NULL);
+    }
+
+    if (vctempl->mysql_null_bit_mask
+        && (mysql_rec[vctempl->mysql_null_byte_offset]
+            & vctempl->mysql_null_bit_mask)) {
+        dfield_set_null(field);
+        field->type.prtype |= DATA_VIRTUAL;
+        DBUG_RETURN(field);
+    }
+
+    row_mysql_store_col_in_innobase_format(
+        field, buf,
+        TRUE, mysql_rec + vctempl->mysql_col_offset,
+        vctempl->mysql_col_len, dict_table_is_comp(index->table));
+    field->type.prtype |= DATA_VIRTUAL;
+
+    ulint	max_prefix = col->m_col.max_prefix;
+
+    if (max_prefix && ifield
+        && (ifield->prefix_len == 0
+            || ifield->prefix_len > col->m_col.max_prefix)) {
+        max_prefix = ifield->prefix_len;
+    }
+
+    /* If this is a prefix index, we only need a portion of the field */
+    if (max_prefix) {
+        len = dtype_get_at_most_n_mbchars(
+            col->m_col.prtype,
+            col->m_col.mbminlen, col->m_col.mbmaxlen,
+            max_prefix,
+            field->len,
+            static_cast<char*>(dfield_get_data(field)));
+        dfield_set_len(field, len);
+    }
+
+    if (heap) {
+        dfield_dup(field, heap);
+    }
+
+    DBUG_RETURN(field);
+}
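+/* The null-bitmap handling used above, in isolation (a minimal sketch;
+rec, offset and mask stand for the record buffer and the
+templ->mysql_null_byte_offset / templ->mysql_null_bit_mask fields):
+@code
+	rec[offset] |= mask;				// set column to SQL NULL
+	rec[offset] &= static_cast<byte>(~mask);	// clear: non-NULL value
+	bool	is_null = (rec[offset] & mask) != 0;	// test the flag
+@endcode */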
+
+
+/** Attempt to push down an index condition.
+@param[in]	keyno		MySQL key number
+@param[in]	idx_cond	Index condition to be checked
+@return Part of idx_cond which the handler will not evaluate */
+
+class Item*
+ha_innobase::idx_cond_push(
+    uint		keyno,
+    class Item*		idx_cond)
+{
+    DBUG_ENTER("ha_innobase::idx_cond_push");
+    DBUG_ASSERT(keyno != MAX_KEY);
+    DBUG_ASSERT(idx_cond != NULL);
+
+    /* We can only evaluate the condition if all columns are stored. */
+    dict_index_t* idx = innobase_get_index(keyno);
+    if (idx && dict_index_has_virtual(idx)) {
+        DBUG_RETURN(idx_cond);
+    }
+
+    pushed_idx_cond = idx_cond;
+    pushed_idx_cond_keyno = keyno;
+    in_range_check_pushed_down = TRUE;
+    /* We will evaluate the condition entirely */
+    DBUG_RETURN(NULL);
+}
+
+
+/** Push a primary key filter.
+@param[in]	pk_filter	filter against which primary keys
+				are to be checked
+@retval false if pushed (always) */
+bool ha_innobase::rowid_filter_push(Rowid_filter* pk_filter)
+{
+    DBUG_ENTER("ha_innobase::rowid_filter_push");
+    DBUG_ASSERT(pk_filter != NULL);
+    pushed_rowid_filter= pk_filter;
+    DBUG_RETURN(false);
+}
+
+static bool is_part_of_a_key_prefix(const Field_longstr *field)
+{
+  const TABLE_SHARE *s= field->table->s;
+
+  for (uint i= 0; i < s->keys; i++)
+  {
+    const KEY &key= s->key_info[i];
+    for (uint j= 0; j < key.user_defined_key_parts; j++)
+    {
+      const KEY_PART_INFO &info= key.key_part[j];
+      // When a field is part of some key, the key part and the field have
+      // the same length. Their lengths differ when only some prefix of the
+      // field is used as a key part. That's what we're looking for here.
+      if (info.field->field_index == field->field_index &&
+          info.length != field->field_length)
+      {
+        DBUG_ASSERT(info.length < field->field_length);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+static bool
+is_part_of_a_primary_key(const Field* field)
+{
+  const TABLE_SHARE* s = field->table->s;
+
+  return s->primary_key != MAX_KEY
+         && field->part_of_key.is_set(s->primary_key);
+}
+
+bool ha_innobase::can_convert_string(const Field_string *field,
+                                     const Column_definition &new_type) const
+{
+  DBUG_ASSERT(!field->compression_method());
+  if (new_type.type_handler() != field->type_handler())
+    return false;
+
+  if (new_type.char_length != field->char_length())
+    return false;
+
+  const Charset field_cs(field->charset());
+
+  if (new_type.length != field->max_display_length() &&
+      (!m_prebuilt->table->not_redundant() ||
+       field_cs.mbminlen() == field_cs.mbmaxlen()))
+    return false;
+
+  if (new_type.charset != field->charset())
+  {
+    if (!field_cs.encoding_allows_reinterpret_as(new_type.charset))
+      return false;
+
+    if (!field_cs.eq_collation_specific_names(new_type.charset))
+      return !is_part_of_a_primary_key(field);
+
+    // Fully indexed case works instantly like
+    // Compare_keys::EqualButKeyPartLength. But prefix case isn't implemented.
+ if (is_part_of_a_key_prefix(field)) + return false; + + return true; + } + + return true; +} + +static bool +supports_enlarging(const dict_table_t* table, const Field_varstring* field, + const Column_definition& new_type) +{ + return field->field_length <= 127 || new_type.length <= 255 + || field->field_length > 255 || !table->not_redundant(); +} + +bool ha_innobase::can_convert_varstring( + const Field_varstring *field, const Column_definition &new_type) const +{ + if (new_type.length < field->field_length) + return false; + + if (new_type.char_length < field->char_length()) + return false; + + if (!new_type.compression_method() != !field->compression_method()) + return false; + + if (new_type.type_handler() != field->type_handler()) + return false; + + if (new_type.charset != field->charset()) + { + if (!supports_enlarging(m_prebuilt->table, field, new_type)) + return false; + + Charset field_cs(field->charset()); + if (!field_cs.encoding_allows_reinterpret_as(new_type.charset)) + return false; + + if (!field_cs.eq_collation_specific_names(new_type.charset)) + return !is_part_of_a_primary_key(field); + + // Fully indexed case works instantly like + // Compare_keys::EqualButKeyPartLength. But prefix case isn't implemented. + if (is_part_of_a_key_prefix(field)) + return false; + + return true; + } + + if (new_type.length != field->field_length) + { + if (!supports_enlarging(m_prebuilt->table, field, new_type)) + return false; + + return true; + } + + return true; +} + +static bool is_part_of_a_key(const Field_blob *field) +{ + const TABLE_SHARE *s= field->table->s; + + for (uint i= 0; i < s->keys; i++) + { + const KEY &key= s->key_info[i]; + for (uint j= 0; j < key.user_defined_key_parts; j++) + { + const KEY_PART_INFO &info= key.key_part[j]; + if (info.field->field_index == field->field_index) + return true; + } + } + + return false; +} + +bool ha_innobase::can_convert_blob(const Field_blob *field, + const Column_definition &new_type) const +{ + if (new_type.type_handler() != field->type_handler()) + return false; + + if (!new_type.compression_method() != !field->compression_method()) + return false; + + if (new_type.pack_length != field->pack_length()) + return false; + + if (new_type.charset != field->charset()) + { + Charset field_cs(field->charset()); + if (!field_cs.encoding_allows_reinterpret_as(new_type.charset)) + return false; + + if (!field_cs.eq_collation_specific_names(new_type.charset)) + return !is_part_of_a_key(field); + + // Fully indexed case works instantly like + // Compare_keys::EqualButKeyPartLength. But prefix case isn't implemented. 
+    if (is_part_of_a_key_prefix(field))
+      return false;
+
+    return true;
+  }
+
+  return true;
+}
+
+
+bool ha_innobase::can_convert_nocopy(const Field &field,
+                                     const Column_definition &new_type) const
+{
+  if (const Field_string *tf= dynamic_cast<const Field_string*>(&field))
+    return can_convert_string(tf, new_type);
+
+  if (const Field_varstring *tf= dynamic_cast<const Field_varstring*>(&field))
+    return can_convert_varstring(tf, new_type);
+
+  if (dynamic_cast<const Field_geom*>(&field))
+    return false;
+
+  if (const Field_blob *tf= dynamic_cast<const Field_blob*>(&field))
+    return can_convert_blob(tf, new_type);
+
+  return false;
+}
+
+
+Compare_keys ha_innobase::compare_key_parts(
+    const Field &old_field, const Column_definition &new_field,
+    const KEY_PART_INFO &old_part, const KEY_PART_INFO &new_part) const
+{
+  const bool is_equal= old_field.is_equal(new_field);
+  const CHARSET_INFO *old_cs= old_field.charset();
+  const CHARSET_INFO *new_cs= new_field.charset;
+
+  if (!is_equal)
+  {
+    if (!old_field.table->file->can_convert_nocopy(old_field, new_field))
+      return Compare_keys::NotEqual;
+
+    if (!Charset(old_cs).eq_collation_specific_names(new_cs))
+      return Compare_keys::NotEqual;
+  }
+
+  if (old_part.length / old_cs->mbmaxlen != new_part.length / new_cs->mbmaxlen)
+  {
+    if (old_part.length != old_field.field_length)
+      return Compare_keys::NotEqual;
+
+    if (old_part.length >= new_part.length)
+      return Compare_keys::NotEqual;
+
+    return Compare_keys::EqualButKeyPartLength;
+  }
+
+  return Compare_keys::Equal;
+}
+
+/******************************************************************//**
+Use this when the args are passed to the format string from
+errmsg-utf8.txt directly as is.
+
+Push a warning message to the client; it is a wrapper around:
+
+void push_warning_printf(
+    THD *thd, Sql_condition::enum_condition_level level,
+    uint code, const char *format, ...);
+*/
+void
+ib_senderrf(
+/*========*/
+    THD*		thd,	/*!< in/out: session */
+    ib_log_level_t	level,	/*!< in: warning level */
+    ib_uint32_t		code,	/*!< MySQL error code */
+    ...)			/*!< Args */
+{
+    va_list		args;
+    const char*		format = my_get_err_msg(code);
+
+    /* If the caller wants to push a message to the client then
+    the caller must pass a valid session handle. */
+
+    ut_a(thd != 0);
+
+    /* The error code must exist in the errmsg-utf8.txt file. */
+    ut_a(format != 0);
+
+    va_start(args, code);
+
+    myf	l;
+
+    switch (level) {
+    case IB_LOG_LEVEL_INFO:
+        l = ME_NOTE;
+        break;
+    case IB_LOG_LEVEL_WARN:
+        l = ME_WARNING;
+        break;
+    default:
+        l = 0;
+        break;
+    }
+
+    my_printv_error(code, format, MYF(l), args);
+
+    va_end(args);
+
+    if (level == IB_LOG_LEVEL_FATAL) {
+        ut_error;
+    }
+}
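+/* Example call (illustrative only; the format string for the given code
+comes from errmsg-utf8.txt, so the trailing arguments must match it):
+@code
+	ib_senderrf(thd, IB_LOG_LEVEL_WARN,
+		    ER_TABLESPACE_DISCARDED, table->name.m_name);
+@endcode */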
+
+/******************************************************************//**
+Use this when the args are first converted to a formatted string and then
+passed to the format string from errmsg-utf8.txt. The error message format
+must be: "Some string ... %s".
+
+Push a warning message to the client; it is a wrapper around:
+
+void push_warning_printf(
+    THD *thd, Sql_condition::enum_condition_level level,
+    uint code, const char *format, ...);
+*/
+void
+ib_errf(
+/*====*/
+    THD*		thd,	/*!< in/out: session */
+    ib_log_level_t	level,	/*!< in: warning level */
+    ib_uint32_t		code,	/*!< MySQL error code */
+    const char*		format,	/*!< printf format */
+    ...)			/*!< Args */
+{
+    char*	str = NULL;
+    va_list	args;
+
+    /* If the caller wants to push a message to the client then
+    the caller must pass a valid session handle. */
+
+    ut_a(thd != 0);
+    ut_a(format != 0);
+
+    va_start(args, format);
+
+#ifdef _WIN32
+    int	size = _vscprintf(format, args) + 1;
+    if (size > 0) {
+        str = static_cast<char*>(malloc(size));
+    }
+    if (str == NULL) {
+        va_end(args);
+        return;	/* Watch for Out-Of-Memory */
+    }
+    str[size - 1] = 0x0;
+    vsnprintf(str, size, format, args);
+#elif HAVE_VASPRINTF
+    if (vasprintf(&str, format, args) == -1) {
+        /* In case of failure use a fixed length string */
+        str = static_cast<char*>(malloc(BUFSIZ));
+        vsnprintf(str, BUFSIZ, format, args);
+    }
+#else
+    /* Use a fixed length string. */
+    str = static_cast<char*>(malloc(BUFSIZ));
+    if (str == NULL) {
+        va_end(args);
+        return;	/* Watch for Out-Of-Memory */
+    }
+    vsnprintf(str, BUFSIZ, format, args);
+#endif /* _WIN32 */
+
+    ib_senderrf(thd, level, code, str);
+
+    va_end(args);
+    free(str);
+}
+
+/* Keep the first 16 characters as-is, since the url is sometimes used
+as an offset from this. */
+const char*	TROUBLESHOOTING_MSG =
+	"Please refer to https://mariadb.com/kb/en/innodb-troubleshooting/"
+	" for how to resolve the issue.";
+
+const char*	TROUBLESHOOT_DATADICT_MSG =
+	"Please refer to https://mariadb.com/kb/en/innodb-data-dictionary-troubleshooting/"
+	" for how to resolve the issue.";
+
+const char*	BUG_REPORT_MSG =
+	"Submit a detailed bug report to https://jira.mariadb.org/";
+
+const char*	FORCE_RECOVERY_MSG =
+	"Please refer to "
+	"https://mariadb.com/kb/en/library/innodb-recovery-modes/"
+	" for information about forcing recovery.";
+
+const char*	OPERATING_SYSTEM_ERROR_MSG =
+	"Some operating system error numbers are described at"
+	" https://mariadb.com/kb/en/library/operating-system-error-codes/";
+
+const char*	FOREIGN_KEY_CONSTRAINTS_MSG =
+	"Please refer to https://mariadb.com/kb/en/library/foreign-keys/"
+	" for correct foreign key definition.";
+
+const char*	SET_TRANSACTION_MSG =
+	"Please refer to https://mariadb.com/kb/en/library/set-transaction/";
+
+const char*	INNODB_PARAMETERS_MSG =
+	"Please refer to https://mariadb.com/kb/en/library/innodb-system-variables/";
+
+/**********************************************************************
+Converts an identifier from the UTF-8 system charset to my_charset_filename.
+@return result string length, as returned by strconvert() */
+uint
+innobase_convert_to_filename_charset(
+/*=================================*/
+    char*		to,	/* out: converted identifier */
+    const char*		from,	/* in: identifier to convert */
+    ulint		len)	/* in: length of 'to', in bytes */
+{
+    uint		errors;
+    CHARSET_INFO*	cs_to = &my_charset_filename;
+    CHARSET_INFO*	cs_from = system_charset_info;
+
+    return(static_cast<uint>(strconvert(
+        cs_from, from, uint(strlen(from)),
+        cs_to, to, static_cast<uint>(len), &errors)));
+}
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to the UTF-8 system charset.
+@return result string length, as returned by strconvert() */
+uint
+innobase_convert_to_system_charset(
+/*===============================*/
+    char*		to,	/* out: converted identifier */
+    const char*		from,	/* in: identifier to convert */
+    ulint		len,	/* in: length of 'to', in bytes */
+    uint*		errors)	/* out: error return */
+{
+    CHARSET_INFO*	cs1 = &my_charset_filename;
+    CHARSET_INFO*	cs2 = system_charset_info;
+
+    return(static_cast<uint>(strconvert(
+        cs1, from, static_cast<uint>(strlen(from)),
+        cs2, to, static_cast<uint>(len), errors)));
+}
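+/* These conversions map between the filesystem-safe on-disk encoding and
+the UTF-8 system charset. A rough sketch (hypothetical buffers; a name
+"t@b" in the system charset is stored on disk as "t@0040b"):
+@code
+	char	displayed[FN_REFLEN];
+	char	on_disk[FN_REFLEN];
+	uint	errors;
+
+	innobase_convert_to_system_charset(displayed, "t@0040b",
+					   sizeof displayed, &errors);
+	innobase_convert_to_filename_charset(on_disk, displayed,
+					     sizeof on_disk);
+@endcode */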
+
+/** Validate the requested buffer pool size. Also, reserve the necessary
+memory needed for a buffer pool resize.
+@param[in]	thd	thread handle
+@param[out]	save	immediate result for update function
+@param[in]	value	incoming string
+@return 0 on success, 1 on failure.
+*/
+static
+int
+innodb_buffer_pool_size_validate(
+    THD*			thd,
+    st_mysql_sys_var*,
+    void*			save,
+    struct st_mysql_value*	value)
+{
+    longlong	intbuf;
+
+    value->val_int(value, &intbuf);
+
+    if (static_cast<ulonglong>(intbuf)
+        < MYSQL_SYSVAR_NAME(buffer_pool_size).min_val) {
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            ER_WRONG_ARGUMENTS,
+                            "innodb_buffer_pool_size must be at least"
+                            " %lld for innodb_page_size=%lu",
+                            MYSQL_SYSVAR_NAME(buffer_pool_size).min_val,
+                            srv_page_size);
+        return(1);
+    }
+
+    if (!srv_was_started) {
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            ER_WRONG_ARGUMENTS,
+                            "Cannot update innodb_buffer_pool_size,"
+                            " because InnoDB is not started.");
+        return(1);
+    }
+
+    mysql_mutex_lock(&buf_pool.mutex);
+
+    if (srv_buf_pool_old_size != srv_buf_pool_size) {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        my_printf_error(ER_WRONG_ARGUMENTS,
+                        "Another buffer pool resize is already in progress.", MYF(0));
+        return(1);
+    }
+
+    ulint	requested_buf_pool_size = buf_pool_size_align(ulint(intbuf));
+
+    *static_cast<ulonglong*>(save) = requested_buf_pool_size;
+
+    if (srv_buf_pool_size == ulint(intbuf)) {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        /* nothing to do */
+        return(0);
+    }
+
+    if (srv_buf_pool_size == requested_buf_pool_size) {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            ER_WRONG_ARGUMENTS,
+                            "innodb_buffer_pool_size must be at least"
+                            " innodb_buffer_pool_chunk_size=%zu",
+                            srv_buf_pool_chunk_unit);
+        /* nothing to do */
+        return(0);
+    }
+
+    srv_buf_pool_size = requested_buf_pool_size;
+    mysql_mutex_unlock(&buf_pool.mutex);
+
+    if (intbuf != static_cast<longlong>(requested_buf_pool_size)) {
+        char	buf[64];
+        int	len = 64;
+        value->val_str(value, buf, &len);
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            ER_TRUNCATED_WRONG_VALUE,
+                            "Truncated incorrect %-.32s value: '%-.128s'",
+                            mysql_sysvar_buffer_pool_size.name,
+                            value->val_str(value, buf, &len));
+    }
+
+    return(0);
+}
+
+/*************************************************************//**
+Check for a valid value of innodb_compression_algorithm.
+@return	0 for a valid innodb_compression_algorithm.
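+Valid values are the names in the innodb_compression_algorithm enumeration
+(none, zlib, lz4, lzo, lzma, bzip2, snappy); a name whose compression
+library was not built in or could not be loaded is rejected with a warning.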
+*/
+static
+int
+innodb_compression_algorithm_validate(
+/*==================================*/
+    THD*			thd,	/*!< in: thread handle */
+    struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+					variable */
+    void*			save,	/*!< out: immediate result
+					for update function */
+    struct st_mysql_value*	value)	/*!< in: incoming string */
+{
+    DBUG_ENTER("innodb_compression_algorithm_validate");
+
+    if (check_sysvar_enum(thd, var, save, value)) {
+        DBUG_RETURN(1);
+    }
+
+    if (compression_algorithm_is_not_loaded(*(ulong*)save, ME_WARNING))
+        DBUG_RETURN(1);
+    DBUG_RETURN(0);
+}
+
+static
+int
+innodb_encrypt_tables_validate(
+/*=================================*/
+    THD*			thd,	/*!< in: thread handle */
+    struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+					variable */
+    void*			save,	/*!< out: immediate result
+					for update function */
+    struct st_mysql_value*	value)	/*!< in: incoming string */
+{
+    if (check_sysvar_enum(thd, var, save, value)) {
+        return 1;
+    }
+
+    ulong encrypt_tables = *(ulong*)save;
+
+    if (encrypt_tables
+        && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) {
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            HA_ERR_UNSUPPORTED,
+                            "InnoDB: cannot enable encryption, "
+                            "encryption plugin is not available");
+        return 1;
+    }
+
+    return 0;
+}
+
+static void innodb_remember_check_sysvar_funcs()
+{
+    /* remember the built-in sysvar check functions */
+    ut_ad((MYSQL_SYSVAR_NAME(checksum_algorithm).flags & 0x1FF) == PLUGIN_VAR_ENUM);
+    check_sysvar_enum = MYSQL_SYSVAR_NAME(checksum_algorithm).check;
+
+    ut_ad((MYSQL_SYSVAR_NAME(flush_log_at_timeout).flags & 15) == PLUGIN_VAR_INT);
+    check_sysvar_int = MYSQL_SYSVAR_NAME(flush_log_at_timeout).check;
+}
+
+static const size_t MAX_BUF_SIZE = 4 * 1024;
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to the SQL layer. */
+void
+ib_push_warning(
+    trx_t*	trx,	/*!< in: trx */
+    dberr_t	error,	/*!< in: error code to push as warning */
+    const char	*format,/*!< in: warning message */
+    ...)
+{
+    if (trx && trx->mysql_thd) {
+        THD *thd = (THD *)trx->mysql_thd;
+        va_list args;
+        char *buf;
+
+        va_start(args, format);
+        buf = (char *)my_malloc(PSI_INSTRUMENT_ME, MAX_BUF_SIZE, MYF(MY_WME));
+        buf[MAX_BUF_SIZE - 1] = 0;
+        vsnprintf(buf, MAX_BUF_SIZE - 1, format, args);
+
+        push_warning_printf(
+            thd, Sql_condition::WARN_LEVEL_WARN,
+            uint(convert_error_code_to_mysql(error, 0, thd)), buf);
+        my_free(buf);
+        va_end(args);
+    }
+}
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to the SQL layer. */
+void
+ib_push_warning(
+    void*	ithd,	/*!< in: thd */
+    dberr_t	error,	/*!< in: error code to push as warning */
+    const char	*format,/*!< in: warning message */
+    ...)
+{
+    va_list args;
+    THD *thd = (THD *)ithd;
+    char *buf;
+
+    if (ithd == NULL) {
+        thd = current_thd;
+    }
+
+    if (thd) {
+        va_start(args, format);
+        buf = (char *)my_malloc(PSI_INSTRUMENT_ME, MAX_BUF_SIZE, MYF(MY_WME));
+        buf[MAX_BUF_SIZE - 1] = 0;
+        vsnprintf(buf, MAX_BUF_SIZE - 1, format, args);
+
+        push_warning_printf(
+            thd, Sql_condition::WARN_LEVEL_WARN,
+            uint(convert_error_code_to_mysql(error, 0, thd)), buf);
+        my_free(buf);
+        va_end(args);
+    }
+}
+
+/** Helper function to push warnings from InnoDB internals to the SQL layer.
+@param[in]	trx
+@param[in]	error		Error code to push as warning
+@param[in]	table_name	Table name
+@param[in]	format		Warning message
+@param[in]	...
Message arguments */ +void +ib_foreign_warn(trx_t* trx, /*!< in: trx */ + dberr_t error, /*!< in: error code to push as warning */ + const char* table_name, + const char* format, /*!< in: warning message */ + ...) +{ + va_list args; + char* buf; + static FILE* ef = dict_foreign_err_file; + static const size_t MAX_BUF_SIZE = 4 * 1024; + buf = (char*)my_malloc(PSI_INSTRUMENT_ME, MAX_BUF_SIZE, MYF(MY_WME)); + if (!buf) { + return; + } + + va_start(args, format); + vsprintf(buf, format, args); + va_end(args); + + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fprintf(ef, " Error in foreign key constraint of table %s:\n", + table_name); + fputs(buf, ef); + mysql_mutex_unlock(&dict_foreign_err_mutex); + + if (trx && trx->mysql_thd) { + THD* thd = (THD*)trx->mysql_thd; + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + uint(convert_error_code_to_mysql(error, 0, thd)), buf); + } + + my_free(buf); +} + +/********************************************************************//** +Helper function to push frm mismatch error to error log and +if needed to sql-layer. */ +void +ib_push_frm_error( + THD* thd, /*!< in: MySQL thd */ + dict_table_t* ib_table, /*!< in: InnoDB table */ + TABLE* table, /*!< in: MySQL table */ + ulint n_keys, /*!< in: InnoDB #keys */ + bool push_warning) /*!< in: print warning ? */ +{ + switch (ib_table->dict_frm_mismatch) { + case DICT_FRM_NO_PK: + sql_print_error("Table %s has a primary key in " + "InnoDB data dictionary, but not " + "in MariaDB!" + " Have you mixed up " + ".frm files from different " + "installations? See " + "https://mariadb.com/kb/en/innodb-troubleshooting/\n", + ib_table->name.m_name); + + if (push_warning) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_NO_SUCH_INDEX, + "InnoDB: Table %s has a " + "primary key in InnoDB data " + "dictionary, but not in " + "MariaDB!", ib_table->name.m_name); + } + break; + case DICT_NO_PK_FRM_HAS: + sql_print_error( + "Table %s has no primary key in InnoDB data " + "dictionary, but has one in MariaDB! If you " + "created the table with a MariaDB version < " + "3.23.54 and did not define a primary key, " + "but defined a unique key with all non-NULL " + "columns, then MariaDB internally treats that " + "key as the primary key. You can fix this " + "error by dump + DROP + CREATE + reimport " + "of the table.", ib_table->name.m_name); + + if (push_warning) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_NO_SUCH_INDEX, + "InnoDB: Table %s has no " + "primary key in InnoDB data " + "dictionary, but has one in " + "MariaDB!", + ib_table->name.m_name); + } + break; + + case DICT_FRM_INCONSISTENT_KEYS: + sql_print_error("InnoDB: Table %s contains " ULINTPF " " + "indexes inside InnoDB, which " + "is different from the number of " + "indexes %u defined in the .frm file. 
See " + "https://mariadb.com/kb/en/innodb-troubleshooting/\n", + ib_table->name.m_name, n_keys, + table->s->keys); + + if (push_warning) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_NO_SUCH_INDEX, + "InnoDB: Table %s contains " ULINTPF " " + "indexes inside InnoDB, which " + "is different from the number of " + "indexes %u defined in the MariaDB ", + ib_table->name.m_name, n_keys, + table->s->keys); + } + break; + + case DICT_FRM_CONSISTENT: + default: + sql_print_error("InnoDB: Table %s is consistent " + "on InnoDB data dictionary and MariaDB " + " FRM file.", + ib_table->name.m_name); + ut_error; + break; + } +} + +/** Writes 8 bytes to nth tuple field +@param[in] tuple where to write +@param[in] nth index in tuple +@param[in] data what to write +@param[in] buf field data buffer */ +static void set_tuple_col_8(dtuple_t *tuple, int col, uint64_t data, byte *buf) +{ + dfield_t *dfield= dtuple_get_nth_field(tuple, col); + ut_ad(dfield->type.len == 8); + if (dfield->len == UNIV_SQL_NULL) + { + dfield_set_data(dfield, buf, 8); + } + ut_ad(dfield->len == dfield->type.len && dfield->data); + mach_write_to_8(dfield->data, data); +} + +void ins_node_t::vers_update_end(row_prebuilt_t *prebuilt, bool history_row) +{ + ut_ad(prebuilt->ins_node == this); + trx_t *trx= prebuilt->trx; +#ifndef DBUG_OFF + ut_ad(table->vers_start != table->vers_end); + const mysql_row_templ_t *t= prebuilt->get_template_by_col(table->vers_end); + ut_ad(t); + ut_ad(t->mysql_col_len == 8); +#endif + + if (history_row) + { + set_tuple_col_8(row, table->vers_end, trx->id, vers_end_buf); + } + else /* ROW_INS_VERSIONED */ + { + set_tuple_col_8(row, table->vers_end, TRX_ID_MAX, vers_end_buf); +#ifndef DBUG_OFF + t= prebuilt->get_template_by_col(table->vers_start); + ut_ad(t); + ut_ad(t->mysql_col_len == 8); +#endif + set_tuple_col_8(row, table->vers_start, trx->id, vers_start_buf); + } + dict_index_t *clust_index= dict_table_get_first_index(table); + THD *thd= trx->mysql_thd; + TABLE *mysql_table= prebuilt->m_mysql_table; + mem_heap_t *local_heap= NULL; + for (ulint col_no= 0; col_no < dict_table_get_n_v_cols(table); col_no++) + { + const dict_v_col_t *v_col= dict_table_get_nth_v_col(table, col_no); + for (ulint i= 0; i < unsigned(v_col->num_base); i++) + if (v_col->base_col[i]->ind == table->vers_end) + innobase_get_computed_value(row, v_col, clust_index, &local_heap, + table->heap, NULL, thd, mysql_table, + mysql_table->record[0], NULL, NULL); + } + if (UNIV_LIKELY_NULL(local_heap)) + mem_heap_free(local_heap); +} + +/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit, +if needed. +@param[in] size size in bytes +@return aligned size */ +ulint +buf_pool_size_align( + ulint size) +{ + const size_t m = srv_buf_pool_chunk_unit; + size = ut_max(size, (size_t) MYSQL_SYSVAR_NAME(buffer_pool_size).min_val); + + if (size % m == 0) { + return(size); + } else { + return (size / m + 1) * m; + } +} diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h new file mode 100644 index 00000000..1f42bf18 --- /dev/null +++ b/storage/innobase/handler/ha_innodb.h @@ -0,0 +1,937 @@ +/***************************************************************************** + +Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+#ifdef WITH_WSREP
+#include "wsrep_api.h"
+#include <mysql/service_wsrep.h>
+#endif /* WITH_WSREP */
+
+#include "table.h"
+
+/* The InnoDB handler: the interface between MySQL and InnoDB. */
+
+/** "GEN_CLUST_INDEX" is the name reserved for the InnoDB default
+system clustered index when there is no primary key. */
+extern const char innobase_index_reserve_name[];
+
+/** Prebuilt structures in an InnoDB table handle used within MySQL */
+struct row_prebuilt_t;
+
+/** InnoDB transaction */
+struct trx_t;
+
+/** Engine specific table options are defined using this struct */
+struct ha_table_option_struct
+{
+    bool		page_compressed;	/*!< Table is using page compression
+						if this option is true. */
+    ulonglong	page_compression_level;	/*!< Table page compression level
+						0-9. */
+    uint		atomic_writes;		/*!< Use atomic writes for this
+						table if this option is ON, or
+						when it is DEFAULT and
+						innodb_use_atomic_writes is set.
+						Atomic writes are not used if
+						the value is OFF. */
+    uint		encryption;		/*!< DEFAULT, ON, OFF */
+    ulonglong	encryption_key_id;	/*!< encryption key id */
+};
+
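+/* Sketch of how these options surface at runtime (hypothetical check, for
+illustration only; the parsed values are attached to the TABLE_SHARE as
+option_struct):
+@code
+	const ha_table_option_struct*	opts = table_share->option_struct;
+
+	if (opts && opts->page_compressed) {
+		// CREATE TABLE ... PAGE_COMPRESSED=1 was given; the engine
+		// then honours opts->page_compression_level (0-9).
+	}
+@endcode */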
+/** The class defining a handle to an InnoDB table */
+class ha_innobase final : public handler
+{
+public:
+	ha_innobase(handlerton* hton, TABLE_SHARE* table_arg);
+	~ha_innobase() override;
+
+	/** @return the transaction that last modified the table definition
+	@see dict_table_t::def_trx_id */
+	ulonglong table_version() const override;
+
+	/** Get the row type from the storage engine. If this method returns
+	ROW_TYPE_NOT_USED, the information in HA_CREATE_INFO should be used. */
+	enum row_type get_row_type() const override;
+
+	const char* table_type() const override;
+
+	const char* index_type(uint key_number) override;
+
+	Table_flags table_flags() const override;
+
+	ulong index_flags(uint idx, uint part, bool all_parts) const override;
+
+	uint max_supported_keys() const override;
+
+	uint max_supported_key_length() const override;
+
+	uint max_supported_key_part_length() const override;
+
+	const key_map* keys_to_use_for_scanning() override;
+
+	void column_bitmaps_signal() override;
+
+	/** Opens dictionary table object using table name. For partition, we need to
+	try alternative lower/upper case names to support moving data files across
+	platforms.
+	@param[in]	table_name	name of the table/partition
+	@param[in]	norm_name	normalized name of the table/partition
+	@param[in]	is_partition	if this is a partition of a table
+	@param[in]	ignore_err	error to ignore for loading dictionary object
+	@return dictionary table object or NULL if not found */
+	static dict_table_t* open_dict_table(
+		const char*		table_name,
+		const char*		norm_name,
+		bool			is_partition,
+		dict_err_ignore_t	ignore_err);
+
+	int open(const char *name, int mode, uint test_if_locked) override;
+
+	handler* clone(const char *name, MEM_ROOT *mem_root) override;
+
+	int close(void) override;
+
+	double scan_time() override;
+
+	double read_time(uint index, uint ranges, ha_rows rows) override;
+
+	int write_row(const uchar * buf) override;
+
+	int update_row(const uchar * old_data, const uchar * new_data) override;
+
+	int delete_row(const uchar * buf) override;
+
+	bool was_semi_consistent_read() override;
+
+	void try_semi_consistent_read(bool yes) override;
+
+	void unlock_row() override;
+
+	int index_init(uint index, bool sorted) override;
+
+	int index_end() override;
+
+	int index_read(
+		uchar*			buf,
+		const uchar*		key,
+		uint			key_len,
+		ha_rkey_function	find_flag) override;
+
+	int index_read_last(uchar * buf, const uchar * key,
+			    uint key_len) override;
+
+	int index_next(uchar * buf) override;
+
+	int index_next_same(uchar * buf, const uchar * key,
+			    uint keylen) override;
+
+	int index_prev(uchar * buf) override;
+
+	int index_first(uchar * buf) override;
+
+	int index_last(uchar * buf) override;
+
+	/* Copy a cached MySQL row. If requested, also avoids
+	overwriting non-read columns. */
+	void copy_cached_row(uchar *to_rec, const uchar *from_rec,
+			     uint rec_length);
+	int rnd_init(bool scan) override;
+
+	int rnd_end() override;
+
+	int rnd_next(uchar *buf) override;
+
+	int rnd_pos(uchar * buf, uchar *pos) override;
+
+	int ft_init() override;
+	void ft_end() override { rnd_end(); }
+	FT_INFO *ft_init_ext(uint flags, uint inx, String* key) override;
+	int ft_read(uchar* buf) override;
+
+	void position(const uchar *record) override;
+
+	int info(uint) override;
+
+	int analyze(THD* thd, HA_CHECK_OPT* check_opt) override;
+
+	int optimize(THD* thd, HA_CHECK_OPT* check_opt) override;
+
+	int discard_or_import_tablespace(my_bool discard) override;
+
+	int extra(ha_extra_function operation) override;
+
+	int reset() override;
+
+	int external_lock(THD *thd, int lock_type) override;
+
+	int start_stmt(THD *thd, thr_lock_type lock_type) override;
+
+	ha_rows records_in_range(
+		uint			inx,
+		const key_range*	min_key,
+		const key_range*	max_key,
+		page_range*		pages) override;
+
+	ha_rows estimate_rows_upper_bound() override;
+
+	void update_create_info(HA_CREATE_INFO* create_info) override;
+
+	int create(
+		const char*		name,
+		TABLE*			form,
+		HA_CREATE_INFO*		create_info,
+		bool			file_per_table,
+		trx_t*			trx);
+
+	int create(
+		const char*		name,
+		TABLE*			form,
+		HA_CREATE_INFO*		create_info) override;
+
+	int truncate() override;
+
+	int delete_table(const char *name) override;
+
+	int rename_table(const char* from, const char* to) override;
+	inline int defragment_table();
+	int check(THD* thd, HA_CHECK_OPT* check_opt) override;
+
+	inline void reload_statistics();
+
+	char* get_foreign_key_create_info() override;
+
+	int get_foreign_key_list(THD *thd,
+				 List<FOREIGN_KEY_INFO> *f_key_list) override;
+
+	int get_parent_foreign_key_list(
+		THD*			thd,
+		List<FOREIGN_KEY_INFO>*	f_key_list) override;
+
+	bool can_switch_engines() override;
+
+	uint referenced_by_foreign_key() override;
+
+	void
free_foreign_key_create_info(char* str) override { my_free(str); } + + uint lock_count(void) const override; + + THR_LOCK_DATA** store_lock( + THD* thd, + THR_LOCK_DATA** to, + thr_lock_type lock_type) override; + + void init_table_handle_for_HANDLER() override; + + void get_auto_increment( + ulonglong offset, + ulonglong increment, + ulonglong nb_desired_values, + ulonglong* first_value, + ulonglong* nb_reserved_values) override; + + bool get_error_message(int error, String *buf) override; + + bool get_foreign_dup_key(char*, uint, char*, uint) override; + + uint8 table_cache_type() override; + + /** + Ask handler about permission to cache table during query registration + */ + my_bool register_query_cache_table( + THD* thd, + const char* table_key, + uint key_length, + qc_engine_callback* call_back, + ulonglong* engine_data) override; + + int cmp_ref(const uchar* ref1, const uchar* ref2) override; + + /** On-line ALTER TABLE interface @see handler0alter.cc @{ */ + + /** Check if InnoDB supports a particular alter table in-place + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + + @retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported + @retval HA_ALTER_INPLACE_INSTANT + MDL_EXCLUSIVE is needed for executing prepare_inplace_alter_table() + and commit_inplace_alter_table(). inplace_alter_table() + will not be called. + @retval HA_ALTER_INPLACE_COPY_NO_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=NONE for rebuilding the table in inplace_alter_table() + @retval HA_ALTER_INPLACE_COPY_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=SHARED for rebuilding the table in inplace_alter_table() + @retval HA_ALTER_INPLACE_NOCOPY_NO_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=NONE for inplace_alter_table() which will not rebuild the table + @retval HA_ALTER_INPLACE_NOCOPY_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=SHARED for inplace_alter_table() which will not rebuild + the table. */ + + enum_alter_inplace_result check_if_supported_inplace_alter( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) override; + + /** Allows InnoDB to update internal structures with concurrent + writes blocked (provided that check_if_supported_inplace_alter() + did not return HA_ALTER_INPLACE_NO_LOCK). + This will be invoked before inplace_alter_table(). + + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + + @retval true Failure + @retval false Success + */ + bool prepare_inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) override; + + /** Alter the table structure in-place with operations + specified using HA_ALTER_FLAGS and Alter_inplace_information. + The level of concurrency allowed during this operation depends + on the return value from check_if_supported_inplace_alter(). + + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. 
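+	(When check_if_supported_inplace_alter() allowed LOCK=NONE, this
+	phase may run concurrently with DML against the table.)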
+ + @retval true Failure + @retval false Success + */ + bool inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) override; + + /** Commit or rollback the changes made during + prepare_inplace_alter_table() and inplace_alter_table() inside + the storage engine. Note that the allowed level of concurrency + during this operation will be the same as for + inplace_alter_table() and thus might be higher than during + prepare_inplace_alter_table(). (E.g concurrent writes were + blocked during prepare, but might not be during commit). + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + @param commit true => Commit, false => Rollback. + @retval true Failure + @retval false Success + */ + bool commit_inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info, + bool commit) override; + /** @} */ + + bool check_if_incompatible_data( + HA_CREATE_INFO* info, + uint table_changes) override; + + /** @name Multi Range Read interface @{ */ + + /** Initialize multi range read @see DsMrr_impl::dsmrr_init + @param seq + @param seq_init_param + @param n_ranges + @param mode + @param buf */ + int multi_range_read_init( + RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, + uint mode, + HANDLER_BUFFER* buf) override; + + /** Process next multi range read @see DsMrr_impl::dsmrr_next + @param range_info */ + int multi_range_read_next(range_id_t *range_info) override; + + /** Initialize multi range read and get information. + @see ha_myisam::multi_range_read_info_const + @see DsMrr_impl::dsmrr_info_const + @param keyno + @param seq + @param seq_init_param + @param n_ranges + @param bufsz + @param flags + @param cost */ + ha_rows multi_range_read_info_const( + uint keyno, + RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, + uint* bufsz, + uint* flags, + Cost_estimate* cost) override; + + /** Initialize multi range read and get information. + @see DsMrr_impl::dsmrr_info + @param keyno + @param seq + @param seq_init_param + @param n_ranges + @param bufsz + @param flags + @param cost */ + ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys, + uint key_parts, uint* bufsz, uint* flags, + Cost_estimate* cost) override; + + int multi_range_read_explain_info(uint mrr_mode, + char *str, size_t size) override; + + /** Attempt to push down an index condition. + @param[in] keyno MySQL key number + @param[in] idx_cond Index condition to be checked + @return idx_cond if pushed; NULL if not pushed */ + Item* idx_cond_push(uint keyno, Item* idx_cond) override; + /* @} */ + + /** Check if InnoDB is not storing virtual column metadata for a table. 
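+	This is the case when the .frm predates support for virtual column
+	expressions (frm_version < FRM_VER_EXPRESSSIONS) although the table
+	has virtual (generated) columns.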
+	@param	s	table definition (based on .frm file)
+	@return whether InnoDB will omit virtual column metadata */
+	static bool omits_virtual_cols(const TABLE_SHARE& s)
+	{
+		return s.frm_version < FRM_VER_EXPRESSSIONS && s.virtual_fields;
+	}
+
+protected:
+	/** The multi range read session object */
+	DsMrr_impl	m_ds_mrr;
+
+	/** Save CPU time with prebuilt/cached data structures */
+	row_prebuilt_t*	m_prebuilt;
+
+	/** index of the primary key within the server's key_info{} array */
+	uint	m_primary_key;
+
+	/** this is set to 1 when we are starting a table scan but have
+	not yet fetched any row, else false */
+	bool	m_start_of_scan;
+
+	/*!< match mode of the latest search: ROW_SEL_EXACT,
+	ROW_SEL_EXACT_PREFIX, or undefined */
+	uint	m_last_match_mode;
+
+	/** If mysql has locked with external_lock() */
+	bool	m_mysql_has_locked;
+};
+
+
+/* Some accessor functions which the InnoDB plugin needs, but which
+can not be added to mysql/plugin.h as part of the public interface;
+the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */
+
+#ifndef INNODB_COMPATIBILITY_HOOKS
+#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
+#endif
+
+extern "C" {
+
+/** Check if a user thread is running a non-transactional update
+@param thd user thread
+@retval 0 the user thread is not running a non-transactional update
+@retval 1 the user thread is running a non-transactional update */
+int thd_non_transactional_update(const MYSQL_THD thd);
+
+/** Get the user thread's binary logging format
+@param thd user thread
+@return Value to be used as index into the binlog_format_names array */
+int thd_binlog_format(const MYSQL_THD thd);
+
+/** Check if binary logging is filtered for the thread's current db.
+@param thd Thread handle
+@retval 1 the query is not filtered, 0 otherwise. */
+bool thd_binlog_filter_ok(const MYSQL_THD thd);
+
+/** Check if the query may generate row changes which may end up in the
+binary log.
+@param thd Thread handle
+@retval 1 the query may generate row changes, 0 otherwise.
+*/
+bool thd_sqlcom_can_generate_row_events(const MYSQL_THD thd);
+
+/** Is strict sql_mode set.
+@param thd Thread object
+@return True if sql_mode has strict mode (all or trans), false otherwise. */
+bool thd_is_strict_mode(const MYSQL_THD thd);
+
+} /* extern "C" */
+
+/** Get the file name and position of the MySQL binlog corresponding to the
+current commit. */
+extern void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file);
+
+struct trx_t;
+
+extern const struct _ft_vft ft_vft_result;
+
+/** Structure returned by ha_innobase::ft_init_ext() */
+typedef struct new_ft_info
+{
+	struct _ft_vft		*please;
+	struct _ft_vft_ext	*could_you;
+	row_prebuilt_t*		ft_prebuilt;
+	fts_result_t*		ft_result;
+} NEW_FT_INFO;
+
+/**
+Allocates an InnoDB transaction for a MySQL handler object.
+@return InnoDB transaction handle */
+trx_t*
+innobase_trx_allocate(
+	MYSQL_THD	thd);	/*!< in: user thread handle */
+
+/*********************************************************************//**
+This function checks each index name for a table against the reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client,
+and returns true.
+@return true if the index name matches the reserved name */
+bool
+innobase_index_name_is_reserved(
+	THD*		thd,		/*!< in/out: MySQL connection */
+	const KEY*	key_info,	/*!< in: Indexes to be created */
+	ulint		num_of_keys)	/*!< in: Number of indexes to
+					be created. */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Parse hint for table and its indexes, and update the information
+in dictionary.
+@param[in] thd Connection thread +@param[in,out] table Target table +@param[in] table_share Table definition */ +void +innobase_parse_hint_from_comment( + THD* thd, + dict_table_t* table, + const TABLE_SHARE* table_share); + +/** Class for handling create table information. */ +class create_table_info_t +{ +public: + /** Constructor. + Used in two ways: + - all but file_per_table is used, when creating the table. + - all but name/path is used, when validating options and using flags. */ + create_table_info_t( + THD* thd, + const TABLE* form, + HA_CREATE_INFO* create_info, + char* table_name, + char* remote_path, + bool file_per_table, + trx_t* trx = NULL); + + /** Initialize the object. */ + int initialize(); + + /** Set m_tablespace_type. */ + void set_tablespace_type(bool table_being_altered_is_file_per_table); + + /** Create InnoDB foreign keys from MySQL alter_info. */ + dberr_t create_foreign_keys(); + + /** Create the internal innodb table. + @param create_fk whether to add FOREIGN KEY constraints */ + int create_table(bool create_fk = true); + + static void create_table_update_dict(dict_table_t* table, THD* thd, + const HA_CREATE_INFO& info, + const TABLE& t); + + /** Validates the create options. Checks that the options + KEY_BLOCK_SIZE, ROW_FORMAT, DATA DIRECTORY, TEMPORARY & TABLESPACE + are compatible with each other and other settings. + These CREATE OPTIONS are not validated here unless innodb_strict_mode + is on. With strict mode, this function will report each problem it + finds using a custom message with error code + ER_ILLEGAL_HA_CREATE_OPTION, not its built-in message. + @return NULL if valid, string name of bad option if not. */ + const char* create_options_are_invalid(); + + bool gcols_in_fulltext_or_spatial(); + + /** Validates engine specific table options not handled by + SQL-parser. + @return NULL if valid, string name of bad option if not. */ + const char* check_table_options(); + + /** Validate DATA DIRECTORY option. */ + bool create_option_data_directory_is_valid(); + + /** Validate TABLESPACE option. */ + bool create_option_tablespace_is_valid(); + + /** Prepare to create a table. */ + int prepare_create_table(const char* name, bool strict = true); + + void allocate_trx(); + + /** Checks that every index have sane size. Depends on strict mode */ + bool row_size_is_acceptable(const dict_table_t& table, + bool strict) const; + /** Checks that given index have sane size. Depends on strict mode */ + bool row_size_is_acceptable(const dict_index_t& index, + bool strict) const; + + /** Determines InnoDB table flags. + If strict_mode=OFF, this will adjust the flags to what should be assumed. + @retval true if successful, false if error */ + bool innobase_table_flags(); + + /** Set flags and append '/' to remote path if necessary. */ + void set_remote_path_flags(); + + /** Get table flags. */ + ulint flags() const + { return(m_flags); } + + /** Update table flags. */ + void flags_set(ulint flags) { m_flags |= flags; } + + /** Get table flags2. */ + ulint flags2() const + { return(m_flags2); } + + /** Get trx. */ + trx_t* trx() const + { return(m_trx); } + + /** @return table name */ + const char* table_name() const { return(m_table_name); } + + /** @return the created table */ + dict_table_t *table() const { return m_table; } + + THD* thd() const { return(m_thd); } + +private: + /** Parses the table name into normal name and either temp path or + remote path if needed.*/ + int + parse_table_name( + const char* name); + + /** Create the internal innodb table definition. 
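+	Builds the dict_table_t metadata for m_form according to
+	m_create_info.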
+ int create_table_def();
+
+ /** Connection thread handle. */
+ THD* m_thd;
+
+ /** InnoDB transaction handle. */
+ trx_t* m_trx;
+
+ /** Information on table columns and indexes. */
+ const TABLE* m_form;
+
+ /** Value of innodb_default_row_format */
+ const ulong m_default_row_format;
+
+ /** Create options. */
+ HA_CREATE_INFO* m_create_info;
+
+ /** Table name */
+ char* m_table_name;
+ /** Table */
+ dict_table_t* m_table;
+
+ /** Remote path (DATA DIRECTORY) or zero-length string */
+ char* m_remote_path;
+
+ /** Local copy of srv_file_per_table. */
+ bool m_innodb_file_per_table;
+
+ /** Allow file_per_table for this table either because:
+ 1) the setting innodb_file_per_table=on,
+ 2) it was explicitly requested by tablespace=innodb_file_per_table, or
+ 3) the table being altered is currently file_per_table. */
+ bool m_allow_file_per_table;
+
+ /** After all considerations, this shows whether we will actually
+ create a table and tablespace using file-per-table. */
+ bool m_use_file_per_table;
+
+ /** Using DATA DIRECTORY */
+ bool m_use_data_dir;
+
+ /** Table flags */
+ ulint m_flags;
+
+ /** Table flags2 */
+ ulint m_flags2;
+};
+
+/**
+Initialize the table FTS stopword list
+@return TRUE on success */
+ibool
+innobase_fts_load_stopword(
+/*=======================*/
+ dict_table_t* table, /*!< in: table with FTS */
+ trx_t* trx, /*!< in: transaction */
+ THD* thd) /*!< in: current thread */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Some defines for innobase_fts_check_doc_id_index() return value */
+enum fts_doc_id_index_enum {
+ FTS_INCORRECT_DOC_ID_INDEX,
+ FTS_EXIST_DOC_ID_INDEX,
+ FTS_NOT_EXIST_DOC_ID_INDEX
+};
+
+/**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column.
+@return the status of the FTS_DOC_ID index */
+fts_doc_id_index_enum
+innobase_fts_check_doc_id_index(
+ const dict_table_t* table, /*!< in: table definition */
+ const TABLE* altered_table, /*!< in: MySQL table
+ that is being altered */
+ ulint* fts_doc_col_no) /*!< out: The column number for
+ Doc ID */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column in MySQL create index definition.
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+fts_doc_id_index_enum
+innobase_fts_check_doc_id_index_in_def(
+ ulint n_key, /*!< in: Number of keys */
+ const KEY* key_info) /*!< in: Key definitions */
+ MY_ATTRIBUTE((warn_unused_result));
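+
+/* For illustration (based on the InnoDB full-text Doc ID convention, not
+part of this API declaration): the layout that the checks above report as
+FTS_EXIST_DOC_ID_INDEX is a dedicated unique index on the hidden Doc ID
+column, e.g.:
+   CREATE TABLE t (FTS_DOC_ID BIGINT UNSIGNED NOT NULL, ...,
+                   UNIQUE INDEX FTS_DOC_ID_INDEX(FTS_DOC_ID)) ENGINE=InnoDB;
+any other definition of FTS_DOC_ID_INDEX yields
+FTS_INCORRECT_DOC_ID_INDEX. */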
+
+/**
+Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object.
+Those flags are stored in the .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB, so we keep copies of them in the
+InnoDB table object. */
+void
+innobase_copy_frm_flags_from_table_share(
+ dict_table_t* innodb_table, /*!< in/out: InnoDB table */
+ const TABLE_SHARE* table_share); /*!< in: table share */
+
+/** Set up base columns for virtual column
+@param[in] table the InnoDB table
+@param[in] field MySQL field
+@param[in,out] v_col virtual column to be set up */
+void
+innodb_base_col_setup(
+ dict_table_t* table,
+ const Field* field,
+ dict_v_col_t* v_col);
+
+/** Set up base columns for stored column
+@param[in] table InnoDB table
+@param[in] field MySQL field
+@param[in,out] s_col stored column */
+void
+innodb_base_col_setup_for_stored(
+ const dict_table_t* table,
+ const Field* field,
+ dict_s_col_t* s_col);
+
+/** whether this is a stored generated column */
+#define innobase_is_s_fld(field) ((field)->vcol_info && (field)->stored_in_db())
+
+/** Converts a search mode flag understood by MySQL to a flag understood
+by InnoDB.
+@param[in] find_flag MySQL search mode flag.
+@return InnoDB search mode flag. */
+page_cur_mode_t
+convert_search_mode_to_innobase(
+ enum ha_rkey_function find_flag);
+
+/** Commits a transaction in an InnoDB database.
+@param[in] trx Transaction handle. */
+void
+innobase_commit_low(
+ trx_t* trx);
+
+extern my_bool innobase_stats_on_metadata;
+
+/** Calculate the records-per-key value.
+NULL values must be excluded if innodb_stats_method is set to "nulls_ignored".
+@param[in] index InnoDB index.
+@param[in] i The column for which we are calculating rec per key.
+@param[in] records Estimated total records.
+@return estimated record per key value */
+/* JAN: TODO: MySQL 5.7 */
+typedef float rec_per_key_t;
+rec_per_key_t
+innodb_rec_per_key(
+ dict_index_t* index,
+ ulint i,
+ ha_rows records);
+
+/** Build template for the virtual columns and their base columns
+@param[in] table MySQL TABLE
+@param[in] ib_table InnoDB dict_table_t
+@param[in,out] s_templ InnoDB template structure
+@param[in] add_v new virtual columns added along with
+ add index call
+@param[in] locked true if innobase_share_mutex is held */
+void
+innobase_build_v_templ(
+ const TABLE* table,
+ const dict_table_t* ib_table,
+ dict_vcol_templ_t* s_templ,
+ const dict_add_v_col_t* add_v,
+ bool locked);
+
+/** Callback used by the MySQL server layer to initialize
+the table virtual columns' template
+@param[in] table MySQL TABLE
+@param[in,out] ib_table InnoDB dict_table_t */
+void
+innobase_build_v_templ_callback(
+ const TABLE* table,
+ void* ib_table);
+
+/** Callback function definition, used by the MySQL server layer to
+initialize the table virtual columns' template */
+typedef void (*my_gcolumn_templatecallback_t)(const TABLE*, void*);
+
+/** Convert MySQL column number to dict_table_t::cols[] offset.
+@param[in] field non-virtual column
+@return column number relative to dict_table_t::cols[] */
+unsigned
+innodb_col_no(const Field* field)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Helper function to push an .frm mismatch error to the error log and,
+if needed, to the SQL layer. */
+void
+ib_push_frm_error(
+ THD* thd, /*!< in: MySQL thd */
+ dict_table_t* ib_table, /*!< in: InnoDB table */
+ TABLE* table, /*!< in: MySQL table */
+ ulint n_keys, /*!< in: InnoDB #keys */
+ bool push_warning); /*!< in: print warning? */
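+
+/* Illustration for too_big_key_part_length() below (example numbers, not
+normative): with a 3072-byte key part limit, a KEY over a utf8mb4
+VARCHAR(1024) column (up to 4096 bytes) exceeds the limit, while
+VARCHAR(768) (at most 3072 bytes) does not. */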
+
+/** Check whether any index key part length exceeds the maximum allowed limit
+@param[in] max_field_len maximum allowed key part length
+@param[in] key MariaDB key definition
+@return true if index column length exceeds limit */
+MY_ATTRIBUTE((warn_unused_result))
+bool too_big_key_part_length(size_t max_field_len, const KEY& key);
+
+/** This function is used to roll back one X/Open XA distributed transaction
+which is in the prepared state
+
+@param[in] hton InnoDB handlerton
+@param[in] xid X/Open XA transaction identification
+
+@return 0 or error number */
+int innobase_rollback_by_xid(handlerton* hton, XID* xid);
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
new file mode 100644
index 00000000..40370ac5
--- /dev/null
+++ b/storage/innobase/handler/handler0alter.cc
@@ -0,0 +1,11843 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/handler0alter.cc
+Smart ALTER TABLE
+*******************************************************/
+
+/* Include necessary SQL headers */
+#include "univ.i"
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Include necessary InnoDB headers */
+#include "btr0sea.h"
+#include "dict0crea.h"
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "log0log.h"
+#include "rem0types.h"
+#include "row0log.h"
+#include "row0merge.h"
+#include "row0ins.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "trx0trx.h"
+#include "trx0purge.h"
+#include "handler0alter.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "fts0priv.h"
+#include "fts0plugin.h"
+#include "pars0pars.h"
+#include "row0sel.h"
+#include "ha_innodb.h"
+#include "ut0stage.h"
+#include
+#include
+
+/** File format constraint for ALTER TABLE */
+extern ulong innodb_instant_alter_column_allowed;
+
+static const char *MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN=
+ "INPLACE ADD or DROP of virtual columns cannot be "
+ "combined with other ALTER TABLE actions";
+
+/** Operations for creating secondary indexes (no rebuild needed) */
+static const alter_table_operations INNOBASE_ONLINE_CREATE
+ = ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX
+ | ALTER_ADD_UNIQUE_INDEX;
+
+/** Operations that require filling in default values for columns */
+static const alter_table_operations INNOBASE_DEFAULTS
+ = ALTER_COLUMN_NOT_NULLABLE
+ | ALTER_ADD_STORED_BASE_COLUMN;
+
+
+/** Operations that require knowledge about row_start, row_end values */
+static const alter_table_operations INNOBASE_ALTER_VERSIONED_REBUILD
+ = ALTER_ADD_SYSTEM_VERSIONING
+ | ALTER_DROP_SYSTEM_VERSIONING;
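+
+/* These masks classify Alter_inplace_info::handler_flags: a request whose
+flags all fall within INNOBASE_INPLACE_IGNORE needs no storage engine work,
+while any flag in INNOBASE_ALTER_REBUILD forces a table rebuild unless the
+operation can be executed instantly. (A summary of how the constants are
+consumed later in this file, not an additional contract.) */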
+
+/** Operations for rebuilding a table in place */
+static const alter_table_operations INNOBASE_ALTER_REBUILD
+ = ALTER_ADD_PK_INDEX
+ | ALTER_DROP_PK_INDEX
+ | ALTER_OPTIONS
+ /* ALTER_OPTIONS needs to check alter_options_need_rebuild() */
+ | ALTER_COLUMN_NULLABLE
+ | INNOBASE_DEFAULTS
+ | ALTER_STORED_COLUMN_ORDER
+ | ALTER_DROP_STORED_COLUMN
+ | ALTER_RECREATE_TABLE
+ /*
+ | ALTER_STORED_COLUMN_TYPE
+ */
+ | INNOBASE_ALTER_VERSIONED_REBUILD
+ ;
+
+/** Operations that require changes to data */
+static const alter_table_operations INNOBASE_ALTER_DATA
+ = INNOBASE_ONLINE_CREATE | INNOBASE_ALTER_REBUILD;
+
+/** Operations for altering a table that InnoDB does not care about */
+static const alter_table_operations INNOBASE_INPLACE_IGNORE
+ = ALTER_COLUMN_DEFAULT
+ | ALTER_PARTITIONED
+ | ALTER_COLUMN_COLUMN_FORMAT
+ | ALTER_COLUMN_STORAGE_TYPE
+ | ALTER_CONVERT_TO
+ | ALTER_VIRTUAL_GCOL_EXPR
+ | ALTER_DROP_CHECK_CONSTRAINT
+ | ALTER_RENAME
+ | ALTER_INDEX_ORDER
+ | ALTER_COLUMN_INDEX_LENGTH
+ | ALTER_CHANGE_INDEX_COMMENT
+ | ALTER_INDEX_IGNORABILITY;
+
+/** Operations on foreign key definitions (changing the schema only) */
+static const alter_table_operations INNOBASE_FOREIGN_OPERATIONS
+ = ALTER_DROP_FOREIGN_KEY
+ | ALTER_ADD_FOREIGN_KEY;
+
+/** Operations that InnoDB cares about and can perform without creating data */
+static const alter_table_operations INNOBASE_ALTER_NOCREATE
+ = ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX
+ | ALTER_DROP_UNIQUE_INDEX;
+
+/** Operations that InnoDB cares about and can perform without validation */
+static const alter_table_operations INNOBASE_ALTER_NOVALIDATE
+ = INNOBASE_ALTER_NOCREATE
+ | ALTER_VIRTUAL_COLUMN_ORDER
+ | ALTER_COLUMN_NAME
+ | INNOBASE_FOREIGN_OPERATIONS
+ | ALTER_COLUMN_UNVERSIONED
+ | ALTER_DROP_VIRTUAL_COLUMN;
+
+/** Operations that InnoDB cares about and can perform without rebuild */
+static const alter_table_operations INNOBASE_ALTER_NOREBUILD
+ = INNOBASE_ONLINE_CREATE
+ | INNOBASE_ALTER_NOCREATE;
+
+/** Operations that can be performed instantly, without inplace_alter_table() */
+static const alter_table_operations INNOBASE_ALTER_INSTANT
+ = ALTER_VIRTUAL_COLUMN_ORDER
+ | ALTER_COLUMN_NAME
+ | ALTER_ADD_VIRTUAL_COLUMN
+ | INNOBASE_FOREIGN_OPERATIONS
+ | ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE
+ | ALTER_COLUMN_UNVERSIONED
+ | ALTER_RENAME_INDEX
+ | ALTER_DROP_VIRTUAL_COLUMN;
+
+/** Initialize instant->field_map.
+@param[in] table table definition to copy from */
+inline void dict_table_t::init_instant(const dict_table_t& table)
+{
+ const dict_index_t& oindex __attribute__((unused))= *table.indexes.start;
+ dict_index_t& index = *indexes.start;
+ const unsigned u = index.first_user_field();
+ DBUG_ASSERT(u == oindex.first_user_field());
+ DBUG_ASSERT(index.n_fields >= oindex.n_fields);
+
+ field_map_element_t* field_map_it = static_cast<field_map_element_t*>(
+ mem_heap_zalloc(heap, (index.n_fields - u)
+ * sizeof *field_map_it));
+ instant->field_map = field_map_it;
+
+ ut_d(unsigned n_drop = 0);
+ ut_d(unsigned n_nullable = 0);
+ for (unsigned i = u; i < index.n_fields; i++) {
+ auto& f = index.fields[i];
+ ut_d(n_nullable += f.col->is_nullable());
+
+ if (!f.col->is_dropped()) {
+ (*field_map_it++).set_ind(f.col->ind);
+ continue;
+ }
+
+ auto fixed_len = dict_col_get_fixed_size(
+ f.col, not_redundant());
+ field_map_it->set_dropped();
+ if (!f.col->is_nullable()) {
+ field_map_it->set_not_null();
+ }
+ field_map_it->set_ind(fixed_len
+ ? uint16_t(fixed_len + 1)
+ : DATA_BIG_COL(f.col));
+ field_map_it++;
+ ut_ad(f.col >= table.instant->dropped);
+ ut_ad(f.col < table.instant->dropped
+ + table.instant->n_dropped);
+ ut_d(n_drop++);
+ size_t d = f.col - table.instant->dropped;
+ ut_ad(f.col == &table.instant->dropped[d]);
+ ut_ad(d <= instant->n_dropped);
+ f.col = &instant->dropped[d];
+ }
+ ut_ad(n_drop == n_dropped());
+ ut_ad(field_map_it == &instant->field_map[index.n_fields - u]);
+ ut_ad(index.n_nullable == n_nullable);
+}
+
+/** Set is_instant() before instant_column().
+@param[in] old previous table definition
+@param[in] col_map map from old.cols[] and old.v_cols[] to this
+@param[out] first_alter_pos 0, or 1 + first changed column position */
+inline void dict_table_t::prepare_instant(const dict_table_t& old,
+ const ulint* col_map,
+ unsigned& first_alter_pos)
+{
+ DBUG_ASSERT(!is_instant());
+ DBUG_ASSERT(n_dropped() == 0);
+ DBUG_ASSERT(old.n_cols == old.n_def);
+ DBUG_ASSERT(n_cols == n_def);
+ DBUG_ASSERT(old.supports_instant());
+ DBUG_ASSERT(not_redundant() == old.not_redundant());
+ DBUG_ASSERT(DICT_TF_HAS_ATOMIC_BLOBS(flags)
+ == DICT_TF_HAS_ATOMIC_BLOBS(old.flags));
+ DBUG_ASSERT(!persistent_autoinc
+ || persistent_autoinc == old.persistent_autoinc);
+ /* supports_instant() does not necessarily hold here,
+ in case ROW_FORMAT=COMPRESSED according to the
+ MariaDB data dictionary, and ALTER_OPTIONS was not set.
+ If that is the case, the instant ALTER TABLE would keep
+ the InnoDB table in its current format. */
+
+ dict_index_t& oindex = *old.indexes.start;
+ dict_index_t& index = *indexes.start;
+ first_alter_pos = 0;
+
+ for (unsigned i = 0; i + DATA_N_SYS_COLS < old.n_cols; i++) {
+ if (col_map[i] != i) {
+ first_alter_pos = 1 + i;
+ goto add_metadata;
+ }
+ }
+
+ if (!old.instant) {
+ /* Columns were not dropped or reordered.
+ Therefore columns must have been added at the end,
+ or modified instantly in place. */
+ DBUG_ASSERT(index.n_fields >= oindex.n_fields);
+ DBUG_ASSERT(index.n_fields > oindex.n_fields
+ || !not_redundant());
+#ifdef UNIV_DEBUG
+ if (index.n_fields == oindex.n_fields) {
+ ut_ad(!not_redundant());
+ for (unsigned i = index.n_fields; i--; ) {
+ ut_ad(index.fields[i].col->same_format(
+ *oindex.fields[i].col));
+ }
+ }
+#endif
+set_core_fields:
+ index.n_core_fields = oindex.n_core_fields;
+ index.n_core_null_bytes = oindex.n_core_null_bytes;
+ } else {
+add_metadata:
+ const unsigned n_old_drop = old.n_dropped();
+ unsigned n_drop = n_old_drop;
+ for (unsigned i = old.n_cols; i--; ) {
+ if (col_map[i] == ULINT_UNDEFINED) {
+ DBUG_ASSERT(i + DATA_N_SYS_COLS
+ < uint(old.n_cols));
+ n_drop++;
+ }
+ }
+
+ instant = new (mem_heap_alloc(heap, sizeof(dict_instant_t)))
+ dict_instant_t();
+ instant->n_dropped = n_drop;
+ if (n_drop) {
+ instant->dropped
+ = static_cast<dict_col_t*>(
+ mem_heap_alloc(heap, n_drop
+ * sizeof(dict_col_t)));
+ if (n_old_drop) {
+ memcpy(instant->dropped, old.instant->dropped,
+ n_old_drop * sizeof(dict_col_t));
+ }
+ } else {
+ instant->dropped = NULL;
+ }
+
+ for (unsigned i = 0, d = n_old_drop; i < old.n_cols; i++) {
+ if (col_map[i] == ULINT_UNDEFINED) {
+ (new (&instant->dropped[d++])
+ dict_col_t(old.cols[i]))->set_dropped();
+ }
+ }
+#ifndef DBUG_OFF
+ for (unsigned i = 0; i < n_drop; i++) {
+ DBUG_ASSERT(instant->dropped[i].is_dropped());
+ }
+#endif
+ const unsigned n_fields = index.n_fields + n_dropped();
+
+ DBUG_ASSERT(n_fields >= oindex.n_fields);
+ dict_field_t* fields = static_cast<dict_field_t*>(
+ mem_heap_zalloc(heap, n_fields * sizeof *fields));
+ unsigned i = 0, j = 0, n_nullable = 0;
+ ut_d(uint core_null = 0);
+ for (; i < oindex.n_fields; i++) {
+ DBUG_ASSERT(j <= i);
+ dict_field_t& f = fields[i] = oindex.fields[i];
+ if (f.col->is_dropped()) {
+ /* The column has been instantly
+ dropped earlier. */
+ DBUG_ASSERT(f.col >= old.instant->dropped);
+ {
+ size_t d = f.col
+ - old.instant->dropped;
+ DBUG_ASSERT(d < n_old_drop);
+ DBUG_ASSERT(&old.instant->dropped[d]
+ == f.col);
+ DBUG_ASSERT(!f.name);
+ f.col = instant->dropped + d;
+ }
+ if (f.col->is_nullable()) {
+found_nullable:
+ n_nullable++;
+ ut_d(core_null
+ += i < oindex.n_core_fields);
+ }
+ continue;
+ }
+
+ const ulint col_ind = col_map[f.col->ind];
+ if (col_ind != ULINT_UNDEFINED) {
+ if (index.fields[j].col->ind != col_ind) {
+ /* The fields for instantly
+ added columns must be placed
+ last in the clustered index.
+ Keep pre-existing fields in
+ the same position. */
+ uint k;
+ for (k = j + 1; k < index.n_fields;
+ k++) {
+ if (index.fields[k].col->ind
+ == col_ind) {
+ goto found_j;
+ }
+ }
+ DBUG_ASSERT("no such col" == 0);
+found_j:
+ std::swap(index.fields[j],
+ index.fields[k]);
+ }
+ DBUG_ASSERT(index.fields[j].col->ind
+ == col_ind);
+ fields[i] = index.fields[j++];
+ DBUG_ASSERT(!fields[i].col->is_dropped());
+ DBUG_ASSERT(fields[i].name
+ == fields[i].col->name(*this));
+ if (fields[i].col->is_nullable()) {
+ goto found_nullable;
+ }
+ continue;
+ }
+
+ /* This column is being dropped. */
+ unsigned d = n_old_drop;
+ for (unsigned c = 0; c < f.col->ind; c++) {
+ d += col_map[c] == ULINT_UNDEFINED;
+ }
+ DBUG_ASSERT(d < n_drop);
+ f.col = &instant->dropped[d];
+ f.name = NULL;
+ if (f.col->is_nullable()) {
+ goto found_nullable;
+ }
+ }
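+
+ /* At this point (a restatement of the loop above, for
+ orientation): fields[0..i) mirrors the old clustered index,
+ with dropped columns remapped into instant->dropped[], while
+ index.fields[j..] holds only instantly added columns, which
+ are sorted by column number and appended below. */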
+
+ /* In case of discarded tablespace, InnoDB can't
+ read the root page. So assign the null bytes based
+ on nullable fields */
+ if (!oindex.table->space) {
+ oindex.n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(oindex.n_nullable)));
+ }
+
+ /* The n_core_null_bytes only matters for
+ ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. */
+ ut_ad(UT_BITS_IN_BYTES(core_null) == oindex.n_core_null_bytes
+ || !not_redundant());
+ DBUG_ASSERT(i >= oindex.n_core_fields);
+ DBUG_ASSERT(j <= i);
+ DBUG_ASSERT(n_fields - (i - j) == index.n_fields);
+ std::sort(index.fields + j, index.fields + index.n_fields,
+ [](const dict_field_t& a, const dict_field_t& b)
+ { return a.col->ind < b.col->ind; });
+ for (; i < n_fields; i++) {
+ fields[i] = index.fields[j++];
+ n_nullable += fields[i].col->is_nullable();
+ DBUG_ASSERT(!fields[i].col->is_dropped());
+ DBUG_ASSERT(fields[i].name
+ == fields[i].col->name(*this));
+ }
+ DBUG_ASSERT(j == index.n_fields);
+ index.n_fields = index.n_def = n_fields
+ & dict_index_t::MAX_N_FIELDS;
+ index.fields = fields;
+ DBUG_ASSERT(n_nullable >= index.n_nullable);
+ DBUG_ASSERT(n_nullable >= oindex.n_nullable);
+ index.n_nullable = n_nullable & dict_index_t::MAX_N_FIELDS;
+ goto set_core_fields;
+ }
+
+ DBUG_ASSERT(n_cols + n_dropped() >= old.n_cols + old.n_dropped());
+ DBUG_ASSERT(n_dropped() >= old.n_dropped());
+ DBUG_ASSERT(index.n_core_fields == oindex.n_core_fields);
+ DBUG_ASSERT(index.n_core_null_bytes == oindex.n_core_null_bytes);
+}
+
+/** Adjust index metadata for instant ADD/DROP/reorder COLUMN.
+@param[in] instant clustered index definition after instant ALTER TABLE */
+inline void dict_index_t::instant_add_field(const dict_index_t& instant)
+{
+ DBUG_ASSERT(is_primary());
+ DBUG_ASSERT(instant.is_primary());
+ DBUG_ASSERT(!has_virtual());
+ DBUG_ASSERT(!instant.has_virtual());
+ DBUG_ASSERT(instant.n_core_fields <= instant.n_fields);
+ DBUG_ASSERT(n_def == n_fields);
+ DBUG_ASSERT(instant.n_def == instant.n_fields);
+ DBUG_ASSERT(type == instant.type);
+ DBUG_ASSERT(trx_id_offset == instant.trx_id_offset);
+ DBUG_ASSERT(n_user_defined_cols == instant.n_user_defined_cols);
+ DBUG_ASSERT(n_uniq == instant.n_uniq);
+ DBUG_ASSERT(instant.n_fields >= n_fields);
+ DBUG_ASSERT(instant.n_nullable >= n_nullable);
+ DBUG_ASSERT(instant.n_core_fields == n_core_fields);
+ DBUG_ASSERT(instant.n_core_null_bytes == n_core_null_bytes);
+
+ /* instant will have all fields (including ones for columns
+ that have been or are being instantly dropped) in the same position
+ as this index. Fields for any added columns are appended at the end. */
+#ifndef DBUG_OFF
+ for (unsigned i = 0; i < n_fields; i++) {
+ DBUG_ASSERT(fields[i].same(instant.fields[i]));
+ DBUG_ASSERT(instant.fields[i].col->same_format(*fields[i]
+ .col));
+ /* Instant conversion from NULL to NOT NULL is not allowed. */
+ DBUG_ASSERT(!fields[i].col->is_nullable()
+ || instant.fields[i].col->is_nullable());
+ DBUG_ASSERT(fields[i].col->is_nullable()
+ == instant.fields[i].col->is_nullable()
+ || !table->not_redundant());
+ }
+#endif
+ n_fields = instant.n_fields;
+ n_def = instant.n_def;
+ n_nullable = instant.n_nullable;
+ fields = static_cast<dict_field_t*>(
+ mem_heap_dup(heap, instant.fields, n_fields * sizeof *fields));
+
+ ut_d(unsigned n_null = 0);
+ ut_d(unsigned n_dropped = 0);
+
+ for (unsigned i = 0; i < n_fields; i++) {
+ const dict_col_t* icol = instant.fields[i].col;
+ dict_field_t& f = fields[i];
+ ut_d(n_null += icol->is_nullable());
+ DBUG_ASSERT(!icol->is_virtual());
+ if (icol->is_dropped()) {
+ ut_d(n_dropped++);
+ f.col->set_dropped();
+ f.name = NULL;
+ } else {
+ f.col = &table->cols[icol - instant.table->cols];
+ f.name = f.col->name(*table);
+ }
+ }
+
+ ut_ad(n_null == n_nullable);
+ ut_ad(n_dropped == instant.table->n_dropped());
+}
+
+/** Adjust table metadata for instant ADD/DROP/reorder COLUMN.
+@param[in] table altered table (with dropped columns)
+@param[in] col_map mapping from cols[] and v_cols[] to table
+@return whether the metadata record must be updated */
+inline bool dict_table_t::instant_column(const dict_table_t& table,
+ const ulint* col_map)
+{
+ DBUG_ASSERT(!table.cached);
+ DBUG_ASSERT(table.n_def == table.n_cols);
+ DBUG_ASSERT(table.n_t_def == table.n_t_cols);
+ DBUG_ASSERT(n_def == n_cols);
+ DBUG_ASSERT(n_t_def == n_t_cols);
+ DBUG_ASSERT(n_v_def == n_v_cols);
+ DBUG_ASSERT(table.n_v_def == table.n_v_cols);
+ DBUG_ASSERT(table.n_cols + table.n_dropped() >= n_cols + n_dropped());
+ DBUG_ASSERT(!table.persistent_autoinc
+ || persistent_autoinc == table.persistent_autoinc);
+ ut_ad(dict_sys.locked());
+
+ {
+ const char* end = table.col_names;
+ for (unsigned i = table.n_cols; i--; ) end += strlen(end) + 1;
+
+ col_names = static_cast<char*>(
+ mem_heap_dup(heap, table.col_names,
+ ulint(end - table.col_names)));
+ }
+ const dict_col_t* const old_cols = cols;
+ cols = static_cast<dict_col_t*>(mem_heap_dup(heap, table.cols,
+ table.n_cols
+ * sizeof *cols));
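+
+ /* Reminder: col_map is indexed by the old column position and
+ holds the new position of that column, or ULINT_UNDEFINED for a
+ column that is being dropped (see prepare_instant()). */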
+
+ /* Preserve the default values of previously instantly added
+ columns, or copy the new default values to this->heap. */
+ for (uint16_t i = 0; i < table.n_cols; i++) {
+ dict_col_t& c = cols[i];
+
+ if (const dict_col_t* o = find(old_cols, col_map, n_cols, i)) {
+ c.def_val = o->def_val;
+ DBUG_ASSERT(!((c.prtype ^ o->prtype)
+ & ~(DATA_NOT_NULL | DATA_VERSIONED
+ | CHAR_COLL_MASK << 16
+ | DATA_LONG_TRUE_VARCHAR)));
+ DBUG_ASSERT(c.same_type(*o));
+ DBUG_ASSERT(c.len >= o->len);
+
+ if (o->vers_sys_start()) {
+ ut_ad(o->ind == vers_start);
+ vers_start = i & dict_index_t::MAX_N_FIELDS;
+ } else if (o->vers_sys_end()) {
+ ut_ad(o->ind == vers_end);
+ vers_end = i & dict_index_t::MAX_N_FIELDS;
+ }
+ continue;
+ }
+
+ DBUG_ASSERT(c.is_added());
+ if (c.def_val.len <= UNIV_PAGE_SIZE_MAX
+ && (!c.def_val.len
+ || !memcmp(c.def_val.data, field_ref_zero,
+ c.def_val.len))) {
+ c.def_val.data = field_ref_zero;
+ } else if (const void*& d = c.def_val.data) {
+ d = mem_heap_dup(heap, d, c.def_val.len);
+ } else {
+ DBUG_ASSERT(c.def_val.len == UNIV_SQL_NULL);
+ }
+ }
+
+ n_t_def = (n_t_def + (table.n_cols - n_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_t_cols = (n_t_cols + (table.n_cols - n_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_def = table.n_cols;
+
+ const dict_v_col_t* const old_v_cols = v_cols;
+
+ if (const char* end = table.v_col_names) {
+ for (unsigned i = table.n_v_cols; i--; ) {
+ end += strlen(end) + 1;
+ }
+
+ v_col_names = static_cast<char*>(
+ mem_heap_dup(heap, table.v_col_names,
+ ulint(end - table.v_col_names)));
+ v_cols = static_cast<dict_v_col_t*>(
+ mem_heap_alloc(heap, table.n_v_cols * sizeof(*v_cols)));
+ for (ulint i = table.n_v_cols; i--; ) {
+ new (&v_cols[i]) dict_v_col_t(table.v_cols[i]);
+ v_cols[i].v_indexes.clear();
+ }
+ } else {
+ ut_ad(table.n_v_cols == 0);
+ v_col_names = NULL;
+ v_cols = NULL;
+ }
+
+ n_t_def = (n_t_def + (table.n_v_cols - n_v_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_t_cols = (n_t_cols + (table.n_v_cols - n_v_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_v_def = table.n_v_cols;
+
+ for (unsigned i = 0; i < n_v_def; i++) {
+ dict_v_col_t& v = v_cols[i];
+ DBUG_ASSERT(v.v_indexes.empty());
+ v.base_col = static_cast<dict_col_t**>(
+ mem_heap_dup(heap, v.base_col,
+ v.num_base * sizeof *v.base_col));
+
+ for (ulint n = v.num_base; n--; ) {
+ dict_col_t*& base = v.base_col[n];
+ if (base->is_virtual()) {
+ } else if (base >= table.cols
+ && base < table.cols + table.n_cols) {
+ /* The base column was instantly added. */
+ size_t c = base - table.cols;
+ DBUG_ASSERT(base == &table.cols[c]);
+ base = &cols[c];
+ } else {
+ DBUG_ASSERT(base >= old_cols);
+ size_t c = base - old_cols;
+ DBUG_ASSERT(c + DATA_N_SYS_COLS < n_cols);
+ DBUG_ASSERT(base == &old_cols[c]);
+ DBUG_ASSERT(col_map[c] + DATA_N_SYS_COLS
+ < n_cols);
+ base = &cols[col_map[c]];
+ }
+ }
+ }
+
+ dict_index_t* index = dict_table_get_first_index(this);
+ bool metadata_changed;
+ {
+ const dict_index_t& i = *dict_table_get_first_index(&table);
+ metadata_changed = i.n_fields > index->n_fields;
+ ut_ad(i.n_fields >= index->n_fields);
+ index->instant_add_field(i);
+ }
+
+ if (instant || table.instant) {
+ const auto old_instant = instant;
+ /* FIXME: add instant->heap, and transfer ownership here */
+ if (!instant) {
+ instant = new (mem_heap_zalloc(heap, sizeof *instant))
+ dict_instant_t();
+ goto dup_dropped;
+ } else if (n_dropped() < table.n_dropped()) {
+dup_dropped:
+ instant->dropped = static_cast<dict_col_t*>(
+ mem_heap_dup(heap, table.instant->dropped,
+ table.instant->n_dropped
+ * sizeof *instant->dropped));
+ instant->n_dropped = table.instant->n_dropped;
+ } else if (table.instant->n_dropped) {
+ memcpy(instant->dropped, table.instant->dropped,
+ table.instant->n_dropped
+ * sizeof *instant->dropped);
+ }
+
+ const field_map_element_t* field_map = old_instant
+ ? old_instant->field_map : NULL;
+
+ init_instant(table);
+
+ if (!metadata_changed) {
+ metadata_changed = !field_map
+ || memcmp(field_map,
+ instant->field_map,
+ (index->n_fields
+ - index->first_user_field())
+ * sizeof *field_map);
+ }
+ }
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ if (index->to_be_dropped) {
+ continue;
+ }
+ for (unsigned i = 0; i < index->n_fields; i++) {
+ dict_field_t& f = index->fields[i];
+ if (f.col >= table.cols
+ && f.col < table.cols + table.n_cols) {
+ /* This is an instantly added column
+ in a newly added index. */
+ DBUG_ASSERT(!f.col->is_virtual());
+ size_t c = f.col - table.cols;
+ DBUG_ASSERT(f.col == &table.cols[c]);
+ f.col = &cols[c];
+ } else if (f.col >= &table.v_cols->m_col
+ && f.col < &table.v_cols[n_v_cols].m_col) {
+ /* This is an instantly added virtual column
+ in a newly added index. */
+ DBUG_ASSERT(f.col->is_virtual());
+ size_t c = reinterpret_cast<dict_v_col_t*>(
+ f.col) - table.v_cols;
+ DBUG_ASSERT(f.col == &table.v_cols[c].m_col);
+ f.col = &v_cols[c].m_col;
+ } else if (f.col < old_cols
+ || f.col >= old_cols + n_cols) {
+ DBUG_ASSERT(f.col->is_virtual());
+ f.col = &v_cols[col_map[
+ reinterpret_cast<dict_v_col_t*>(
+ f.col)
+ - old_v_cols + n_cols]].m_col;
+ } else {
+ f.col = &cols[col_map[f.col - old_cols]];
+ DBUG_ASSERT(!f.col->is_virtual());
+ }
+ f.name = f.col->name(*this);
+ if (f.col->is_virtual()) {
+ dict_v_col_t* v_col = reinterpret_cast
+ <dict_v_col_t*>(f.col);
+ v_col->v_indexes.push_front(
+ dict_v_idx_t(index, i));
+ }
+ }
+ }
+
+ n_cols = table.n_cols;
+ n_v_cols = table.n_v_cols;
+ return metadata_changed;
+}
+
+/** Find the old column number for the given new column position.
+@param[in] col_map column map from old column to new column
+@param[in] pos new column position
+@param[in] n number of columns present in the column map
+@return old column position for the given new column position. */
+static ulint find_old_col_no(const ulint* col_map, ulint pos, ulint n)
+{
+ do {
+ ut_ad(n);
+ } while (col_map[--n] != pos);
+ return n;
+}
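+
+/* Example: with col_map = {2, 0, 1} (old column 1 moved to position 0),
+find_old_col_no(col_map, 0, 3) returns 1. */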
+
+/** Roll back instant_column().
+@param[in] old_n_cols original n_cols
+@param[in] old_cols original cols
+@param[in] old_col_names original col_names
+@param[in] old_instant original instant structure
+@param[in] old_fields original fields
+@param[in] old_n_fields original number of fields
+@param[in] old_n_core_fields original number of core fields
+@param[in] old_n_v_cols original n_v_cols
+@param[in] old_v_cols original v_cols
+@param[in] old_v_col_names original v_col_names
+@param[in] col_map column map */
+inline void dict_table_t::rollback_instant(
+ unsigned old_n_cols,
+ dict_col_t* old_cols,
+ const char* old_col_names,
+ dict_instant_t* old_instant,
+ dict_field_t* old_fields,
+ unsigned old_n_fields,
+ unsigned old_n_core_fields,
+ unsigned old_n_v_cols,
+ dict_v_col_t* old_v_cols,
+ const char* old_v_col_names,
+ const ulint* col_map)
+{
+ ut_ad(dict_sys.locked());
+
+ if (cols == old_cols) {
+ /* The ALTER TABLE failed before the instant operation
+ took place, so there is nothing to roll back. */
+ return;
+ }
+
+ dict_index_t* index = indexes.start;
+ /* index->is_instant() does not necessarily hold here, because
+ the table may have been emptied */
+ DBUG_ASSERT(old_n_cols >= DATA_N_SYS_COLS);
+ DBUG_ASSERT(n_cols == n_def);
+ DBUG_ASSERT(index->n_def == index->n_fields);
+ DBUG_ASSERT(index->n_core_fields <= index->n_fields);
+ DBUG_ASSERT(old_n_core_fields <= old_n_fields);
+ DBUG_ASSERT(instant || !old_instant);
+
+ instant = old_instant;
+
+ index->n_nullable = 0;
+
+ for (unsigned i = old_n_fields; i--; ) {
+ if (old_fields[i].col->is_nullable()) {
+ index->n_nullable++;
+ }
+ }
+
+ for (unsigned i = n_v_cols; i--; ) {
+ v_cols[i].~dict_v_col_t();
+ }
+
+ index->n_core_fields = ((index->n_fields == index->n_core_fields)
+ ? old_n_fields
+ : old_n_core_fields)
+ & dict_index_t::MAX_N_FIELDS;
+ index->n_def = index->n_fields = old_n_fields
+ & dict_index_t::MAX_N_FIELDS;
+ index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(index->get_n_nullable(index->n_core_fields)));
+
+ const dict_col_t* const new_cols = cols;
+ const dict_col_t* const new_cols_end __attribute__((unused)) = cols + n_cols;
+ const dict_v_col_t* const new_v_cols = v_cols;
+ const dict_v_col_t* const new_v_cols_end __attribute__((unused))= v_cols + n_v_cols;
+
+ cols = old_cols;
+ col_names = old_col_names;
+ v_cols = old_v_cols;
+ v_col_names = old_v_col_names;
+ n_def = n_cols = old_n_cols & dict_index_t::MAX_N_FIELDS;
+ n_v_def = n_v_cols = old_n_v_cols & dict_index_t::MAX_N_FIELDS;
+ n_t_def = n_t_cols = (n_cols + n_v_cols) & dict_index_t::MAX_N_FIELDS;
+
+ if (versioned()) {
+ for (unsigned i = 0; i < n_cols; ++i) {
+ if (cols[i].vers_sys_start()) {
+ vers_start = i & dict_index_t::MAX_N_FIELDS;
+ } else if (cols[i].vers_sys_end()) {
+ vers_end = i & dict_index_t::MAX_N_FIELDS;
+ }
+ }
+ }
+
+ index->fields = old_fields;
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ if (index->to_be_dropped) {
+ /* instant_column() did not adjust these indexes. */
+ continue;
+ }
+
+ for (unsigned i = 0; i < index->n_fields; i++) {
+ dict_field_t& f = index->fields[i];
+ if (f.col->is_virtual()) {
+ DBUG_ASSERT(f.col >= &new_v_cols->m_col);
+ DBUG_ASSERT(f.col < &new_v_cols_end->m_col);
+ size_t n = size_t(
+ reinterpret_cast<dict_v_col_t*>(f.col)
+ - new_v_cols);
+ DBUG_ASSERT(n <= n_v_cols);
+
+ ulint old_col_no = find_old_col_no(
+ col_map + n_cols, n, n_v_cols);
+ DBUG_ASSERT(old_col_no <= n_v_cols);
+ f.col = &v_cols[old_col_no].m_col;
+ DBUG_ASSERT(f.col->is_virtual());
+ } else {
+ DBUG_ASSERT(f.col >= new_cols);
+ DBUG_ASSERT(f.col < new_cols_end);
+ size_t n = size_t(f.col - new_cols);
+ DBUG_ASSERT(n <= n_cols);
+
+ ulint old_col_no = find_old_col_no(col_map,
+ n, n_cols);
+ DBUG_ASSERT(old_col_no < n_cols);
+ f.col = &cols[old_col_no];
+ DBUG_ASSERT(!f.col->is_virtual());
+ }
+ f.name = f.col->name(*this);
+ }
+ }
+}
+
+/* Report an InnoDB error to the client by invoking my_error(). */
+static ATTRIBUTE_COLD __attribute__((nonnull))
+void
+my_error_innodb(
+/*============*/
+ dberr_t error, /*!< in: InnoDB error code */
+ const char* table, /*!< in: table name */
+ ulint flags) /*!< in: table flags */
+{
+ switch (error) {
+ case DB_MISSING_HISTORY:
+ my_error(ER_TABLE_DEF_CHANGED, MYF(0));
+ break;
+ case DB_RECORD_NOT_FOUND:
+ my_error(ER_KEY_NOT_FOUND, MYF(0), table);
+ break;
+ case DB_DEADLOCK:
+ my_error(ER_LOCK_DEADLOCK, MYF(0));
+ break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
+ break;
+ case DB_INTERRUPTED:
+ my_error(ER_QUERY_INTERRUPTED, MYF(0));
+ break;
+ case DB_OUT_OF_MEMORY:
+ my_error(ER_OUT_OF_RESOURCES, MYF(0));
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ my_error(ER_RECORD_FILE_FULL, MYF(0), table);
+ break;
+ case DB_TEMP_FILE_WRITE_FAIL:
+ my_error(ER_TEMP_FILE_WRITE_FAILURE, MYF(0));
+ break;
+ case DB_TOO_BIG_INDEX_COL:
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ (ulong) DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
+ break;
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0));
+ break;
+ case DB_LOCK_TABLE_FULL:
+ my_error(ER_LOCK_TABLE_FULL, MYF(0));
+ break;
+ case DB_UNDO_RECORD_TOO_BIG:
+ my_error(ER_UNDO_RECORD_TOO_BIG, MYF(0));
+ break;
+ case DB_CORRUPTION:
+ my_error(ER_NOT_KEYFILE, MYF(0), table);
+ break;
+ case DB_TOO_BIG_RECORD: {
+ /* Note that in page0zip.ic page_zip_rec_needs_ext() rec_size
+ is limited to COMPRESSED_REC_MAX_DATA_SIZE (16K) or
+ REDUNDANT_REC_MAX_DATA_SIZE (16K-1). */
+ bool comp = !!(flags & DICT_TF_COMPACT);
+ ulint free_space = page_get_free_space_of_empty(comp) / 2;
+
+ if (free_space >= ulint(comp ? COMPRESSED_REC_MAX_DATA_SIZE :
+ REDUNDANT_REC_MAX_DATA_SIZE)) {
+ free_space = (comp ? COMPRESSED_REC_MAX_DATA_SIZE :
+ REDUNDANT_REC_MAX_DATA_SIZE) - 1;
+ }
+
+ my_error(ER_TOO_BIG_ROWSIZE, MYF(0), free_space);
+ break;
+ }
+ case DB_INVALID_NULL:
+ /* TODO: report the row, as we do for DB_DUPLICATE_KEY */
+ my_error(ER_INVALID_USE_OF_NULL, MYF(0));
+ break;
+ case DB_CANT_CREATE_GEOMETRY_OBJECT:
+ my_error(ER_CANT_CREATE_GEOMETRY_OBJECT, MYF(0));
+ break;
+ case DB_TABLESPACE_EXISTS:
+ my_error(ER_TABLESPACE_EXISTS, MYF(0), table);
+ break;
+
+#ifdef UNIV_DEBUG
+ case DB_SUCCESS:
+ case DB_DUPLICATE_KEY:
+ case DB_ONLINE_LOG_TOO_BIG:
+ /* These codes should not be passed here. */
+ ut_error;
+#endif /* UNIV_DEBUG */
+ default:
+ my_error(ER_GET_ERRNO, MYF(0), error, "InnoDB");
+ break;
+ }
+}
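+
+/* DB_SUCCESS, DB_DUPLICATE_KEY and DB_ONLINE_LOG_TOO_BIG are not mapped
+above (debug builds assert on them); callers report them together with the
+name of the offending key, see ha_innobase_inplace_ctx::log_failure()
+below. */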
+
+/** Get the name of an erroneous key.
+@param[in] error_key_num InnoDB number of the erroneous key
+@param[in] ha_alter_info changes that were being performed
+@param[in] table InnoDB table
+@return the name of the erroneous key */
+static
+const char*
+get_error_key_name(
+ ulint error_key_num,
+ const Alter_inplace_info* ha_alter_info,
+ const dict_table_t* table)
+{
+ if (error_key_num == ULINT_UNDEFINED) {
+ return(FTS_DOC_ID_INDEX_NAME);
+ } else if (ha_alter_info->key_count == 0) {
+ return(dict_table_get_first_index(table)->name);
+ } else {
+ return(ha_alter_info->key_info_buffer[error_key_num].name.str);
+ }
+}
+
+/** Convert field type and length to InnoDB format */
+static void get_type(const Field &f, uint &prtype, uint8_t &mtype,
+ uint16_t &len)
+{
+ mtype= get_innobase_type_from_mysql_type(&prtype, &f);
+ len= static_cast<uint16_t>(f.pack_length());
+ prtype|= f.type();
+ if (f.type() == MYSQL_TYPE_VARCHAR)
+ {
+ auto l= static_cast<const Field_varstring&>(f).length_bytes;
+ len= static_cast<uint16_t>(len - l);
+ if (l == 2)
+ prtype|= DATA_LONG_TRUE_VARCHAR;
+ }
+ if (!f.real_maybe_null())
+ prtype |= DATA_NOT_NULL;
+ if (f.binary())
+ prtype |= DATA_BINARY_TYPE;
+ if (f.table->versioned())
+ {
+ if (&f == f.table->field[f.table->s->vers.start_fieldno])
+ prtype|= DATA_VERS_START;
+ else if (&f == f.table->field[f.table->s->vers.end_fieldno])
+ prtype|= DATA_VERS_END;
+ else if (!(f.flags & VERS_UPDATE_UNVERSIONED_FLAG))
+ prtype|= DATA_VERSIONED;
+ }
+
+ if (!f.stored_in_db())
+ prtype|= DATA_VIRTUAL;
+
+ if (dtype_is_string_type(mtype))
+ prtype|= f.charset()->number << 16;
+}
+
+struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
+{
+ /** Dummy query graph */
+ que_thr_t*const thr;
+ /** The prebuilt struct of the creating instance */
+ row_prebuilt_t*& prebuilt;
+ /** InnoDB indexes being created */
+ dict_index_t** add_index;
+ /** MySQL key numbers for the InnoDB indexes that are being created */
+ const ulint* add_key_numbers;
+ /** number of InnoDB indexes being created */
+ ulint num_to_add_index;
+ /** InnoDB indexes being dropped */
+ dict_index_t** drop_index;
+ /** number of InnoDB indexes being dropped */
+ const ulint num_to_drop_index;
+ /** InnoDB foreign key constraints being dropped */
+ dict_foreign_t** drop_fk;
+ /** number of InnoDB foreign key constraints being dropped */
+ const ulint num_to_drop_fk;
+ /** InnoDB foreign key constraints being added */
+ dict_foreign_t** add_fk;
+ /** number of InnoDB foreign key constraints being added */
+ const ulint num_to_add_fk;
+ /** whether to create the indexes online */
+ const bool online;
+ /** memory heap */
+ mem_heap_t* const heap;
+ /** dictionary transaction */
+ trx_t* trx;
+ /** original table (if rebuilt, differs from indexed_table) */
+ dict_table_t* old_table;
+ /** table where the indexes are being created or dropped */
+ dict_table_t* new_table;
+ /** table definition for instant ADD/DROP/reorder COLUMN */
+ dict_table_t* instant_table;
+ /** mapping of old column numbers to new ones, or NULL */
+ const ulint* col_map;
+ /** new column names, or NULL if nothing was renamed */
+ const char** col_names;
+ /** added AUTO_INCREMENT column position, or ULINT_UNDEFINED */
+ const ulint add_autoinc;
+ /** default values of ADD and CHANGE COLUMN, or NULL */
+ const dtuple_t* defaults;
+ /** autoinc sequence to use */
+ ib_sequence_t sequence;
+ /** temporary table name to use for old table when renaming tables */
+ const char* tmp_name;
+ /** whether the order of the clustered index is unchanged */
+ bool skip_pk_sort;
+ /** number of virtual columns to be added */
+ unsigned num_to_add_vcol;
+ /** virtual columns to be added */
+ dict_v_col_t* add_vcol;
+ const char** add_vcol_name;
+ /** number of virtual columns to be dropped */
+ unsigned num_to_drop_vcol;
+ /** virtual columns to be dropped */
+ dict_v_col_t* drop_vcol;
+ const char** drop_vcol_name;
+ /** ALTER TABLE stage progress recorder */
+ ut_stage_alter_t* m_stage;
+ /** original number of user columns in the table */
+ const unsigned old_n_cols;
+ /** original columns of the table */
+ dict_col_t* const old_cols;
+ /** original column names of the table */
+ const char* const old_col_names;
+ /** original instantly dropped or reordered columns */
+ dict_instant_t* const old_instant;
+ /** original index fields */
+ dict_field_t* const old_fields;
+ /** size of old_fields */
+ const unsigned old_n_fields;
+ /** original old_table->n_core_fields */
+ const unsigned old_n_core_fields;
+ /** original number of virtual columns in the table */
+ const unsigned old_n_v_cols;
+ /** original virtual columns of the table */
+ dict_v_col_t* const old_v_cols;
+ /** original virtual column names of the table */
+ const char* const old_v_col_names;
+ /** 0, or 1 + first column whose position changes in instant ALTER */
+ unsigned first_alter_pos;
+ /** Allow non-null conversion.
+ (1) ALTER IGNORE should allow the conversion
+ irrespective of the SQL mode.
+ (2) Don't allow the conversion in strict mode.
+ (3) Allow the conversion only in non-strict mode. */
+ const bool allow_not_null;
+
+ /** The page_compression_level attribute, or 0 */
+ const uint page_compression_level;
+
+ /** Indexed columns whose charset-collation is changing
+ in a way that does not require the table to be rebuilt */
+ col_collations change_col_collate;
+
+ ha_innobase_inplace_ctx(row_prebuilt_t*& prebuilt_arg,
+ dict_index_t** drop_arg,
+ ulint num_to_drop_arg,
+ dict_foreign_t** drop_fk_arg,
+ ulint num_to_drop_fk_arg,
+ dict_foreign_t** add_fk_arg,
+ ulint num_to_add_fk_arg,
+ bool online_arg,
+ mem_heap_t* heap_arg,
+ dict_table_t* new_table_arg,
+ const char** col_names_arg,
+ ulint add_autoinc_arg,
+ ulonglong autoinc_col_min_value_arg,
+ ulonglong autoinc_col_max_value_arg,
+ bool allow_not_null_flag,
+ bool page_compressed,
+ ulonglong page_compression_level_arg) :
+ inplace_alter_handler_ctx(),
+ thr (pars_complete_graph_for_exec(nullptr, prebuilt_arg->trx,
+ heap_arg, prebuilt_arg)),
+ prebuilt (prebuilt_arg),
+ add_index (0), add_key_numbers (0), num_to_add_index (0),
+ drop_index (drop_arg), num_to_drop_index (num_to_drop_arg),
+ drop_fk (drop_fk_arg), num_to_drop_fk (num_to_drop_fk_arg),
+ add_fk (add_fk_arg), num_to_add_fk (num_to_add_fk_arg),
+ online (online_arg), heap (heap_arg),
+ trx (innobase_trx_allocate(prebuilt_arg->trx->mysql_thd)),
+ old_table (prebuilt_arg->table),
+ new_table (new_table_arg), instant_table (0),
+ col_map (0), col_names (col_names_arg),
+ add_autoinc (add_autoinc_arg),
+ defaults (0),
+ sequence(prebuilt->trx->mysql_thd,
+ autoinc_col_min_value_arg, autoinc_col_max_value_arg),
+ tmp_name (0),
+ skip_pk_sort(false),
+ num_to_add_vcol(0),
+ add_vcol(0),
+ add_vcol_name(0),
+ num_to_drop_vcol(0),
+ drop_vcol(0),
+ drop_vcol_name(0),
+ m_stage(NULL),
+ old_n_cols(prebuilt_arg->table->n_cols),
+ old_cols(prebuilt_arg->table->cols),
+ old_col_names(prebuilt_arg->table->col_names),
+ old_instant(prebuilt_arg->table->instant),
+ old_fields(prebuilt_arg->table->indexes.start->fields),
+ old_n_fields(prebuilt_arg->table->indexes.start->n_fields),
+ old_n_core_fields(prebuilt_arg->table->indexes.start
+ ->n_core_fields),
+ old_n_v_cols(prebuilt_arg->table->n_v_cols),
+ old_v_cols(prebuilt_arg->table->v_cols),
+ old_v_col_names(prebuilt_arg->table->v_col_names),
+ first_alter_pos(0),
+ allow_not_null(allow_not_null_flag),
+ page_compression_level(page_compressed
+ ? (page_compression_level_arg
+ ? uint(page_compression_level_arg)
+ : page_zip_level)
+ : 0)
+ {
+ ut_ad(old_n_cols >= DATA_N_SYS_COLS);
+ ut_ad(page_compression_level <= 9);
+#ifdef UNIV_DEBUG
+ for (ulint i = 0; i < num_to_add_index; i++) {
+ ut_ad(!add_index[i]->to_be_dropped);
+ }
+ for (ulint i = 0; i < num_to_drop_index; i++) {
+ ut_ad(drop_index[i]->to_be_dropped);
+ }
+#endif /* UNIV_DEBUG */
+
+ trx_start_for_ddl(trx);
+ }
+
+ ~ha_innobase_inplace_ctx()
+ {
+ UT_DELETE(m_stage);
+ if (instant_table) {
+ ut_ad(!instant_table->id);
+ while (dict_index_t* index
+ = UT_LIST_GET_LAST(instant_table->indexes)) {
+ UT_LIST_REMOVE(instant_table->indexes, index);
+ index->lock.free();
+ dict_mem_index_free(index);
+ }
+ for (unsigned i = old_n_v_cols; i--; ) {
+ old_v_cols[i].~dict_v_col_t();
+ }
+ if (instant_table->fts) {
+ instant_table->fts->~fts_t();
+ instant_table->fts = nullptr;
+ }
+ dict_mem_table_free(instant_table);
+ }
+ mem_heap_free(heap);
+ }
+
+ /** Determine if the table will be rebuilt.
+ @return whether the table will be rebuilt */
+ bool need_rebuild () const { return(old_table != new_table); }
+
+ /** Convert table-rebuilding ALTER to instant ALTER. */
+ void prepare_instant()
+ {
+ DBUG_ASSERT(need_rebuild());
+ DBUG_ASSERT(!is_instant());
+ DBUG_ASSERT(old_table->n_cols == old_n_cols);
+
+ instant_table = new_table;
+ new_table = old_table;
+ export_vars.innodb_instant_alter_column++;
+
+ instant_table->prepare_instant(*old_table, col_map,
+ first_alter_pos);
+ }
+
+ /** Adjust table metadata for instant ADD/DROP/reorder COLUMN.
+ @return whether the metadata record must be updated */
+ bool instant_column()
+ {
+ DBUG_ASSERT(is_instant());
+ DBUG_ASSERT(old_n_fields
+ == old_table->indexes.start->n_fields);
+ return old_table->instant_column(*instant_table, col_map);
+ }
+
+ /** Revert prepare_instant() if the transaction is rolled back. */
+ void rollback_instant()
+ {
+ if (!is_instant()) return;
+ old_table->rollback_instant(old_n_cols,
+ old_cols, old_col_names,
+ old_instant,
+ old_fields, old_n_fields,
+ old_n_core_fields,
+ old_n_v_cols, old_v_cols,
+ old_v_col_names,
+ col_map);
+ }
+
+ /** @return whether this is instant ALTER TABLE */
+ bool is_instant() const
+ {
+ DBUG_ASSERT(!instant_table || !instant_table->can_be_evicted);
+ return instant_table;
+ }
+
+ /** Create an index table where indexes are ordered as follows:
+
+ IF a new primary key is defined for the table THEN
+
+ 1) New primary key
+ 2) The remaining keys in key_info
+
+ ELSE
+
+ 1) All new indexes in the order they arrive from MySQL
+
+ ENDIF
+
+ @return key definitions */
+ MY_ATTRIBUTE((nonnull, warn_unused_result, malloc))
+ inline index_def_t*
+ create_key_defs(
+ const Alter_inplace_info* ha_alter_info,
+ /*!< in: alter operation */
+ const TABLE* altered_table,
+ /*!< in: MySQL table that is being altered */
+ ulint& n_fts_add,
+ /*!< out: number of FTS indexes to be created */
+ ulint& fts_doc_id_col,
+ /*!< in: The column number for Doc ID */
+ bool& add_fts_doc_id,
+ /*!< in: whether we need to add new DOC ID
+ column for FTS index */
+ bool& add_fts_doc_idx,
+ /*!< in: whether we need to add new DOC ID
+ index for FTS index */
+ const TABLE* table);
+ /*!< in: MySQL table that is being altered */
+
+ /** Share context between partitions.
+ @param[in] ctx context from another partition of the table */
+ void set_shared_data(const inplace_alter_handler_ctx& ctx)
+ {
+ if (add_autoinc != ULINT_UNDEFINED) {
+ const ha_innobase_inplace_ctx& ha_ctx =
+ static_cast<const ha_innobase_inplace_ctx&>
+ (ctx);
+ /* When adding an AUTO_INCREMENT column to a
+ partitioned InnoDB table, we must share the
+ sequence for all partitions. */
+ ut_ad(ha_ctx.add_autoinc == add_autoinc);
+ ut_ad(ha_ctx.sequence.last());
+ sequence = ha_ctx.sequence;
+ }
+ }
+
+ /** @return whether the given column is being added */
+ bool is_new_vcol(const dict_v_col_t &v_col) const
+ {
+ for (ulint i= 0; i < num_to_add_vcol; i++)
+ if (&add_vcol[i] == &v_col)
+ return true;
+ return false;
+ }
+
+ /** During rollback, make newly added indexes point to
+ newly added virtual columns. */
+ void clean_new_vcol_index()
+ {
+ ut_ad(old_table == new_table);
+ const dict_index_t *index= dict_table_get_first_index(old_table);
+ while ((index= dict_table_get_next_index(index)) != NULL)
+ {
+ if (!index->has_virtual() || index->is_committed())
+ continue;
+ ulint n_drop_new_vcol= index->get_new_n_vcol();
+ for (ulint i= 0; n_drop_new_vcol && i < index->n_fields; i++)
+ {
+ dict_col_t *col= index->fields[i].col;
+ /* Skip the non-virtual and old virtual columns */
+ if (!col->is_virtual())
+ continue;
+ dict_v_col_t *vcol= reinterpret_cast<dict_v_col_t*>(col);
+ if (!is_new_vcol(*vcol))
+ continue;
+
+ index->fields[i].col= &index->new_vcol_info->
+ add_drop_v_col(index->heap, vcol, --n_drop_new_vcol)->m_col;
+ }
+ }
+ }
+
+ /** @return whether a FULLTEXT INDEX is being added */
+ bool adding_fulltext_index() const
+ {
+ for (ulint a= 0; a < num_to_add_index; a++)
+ if (add_index[a]->type & DICT_FTS)
+ return true;
+ return false;
+ }
+
+ /** Handle the apply log failure for online DDL operation.
+ @param ha_alter_info handler alter inplace info
+ @param altered_table MySQL table that is being altered
+ @param error error code
+ @retval false if the error value is DB_SUCCESS
+ @retval true in case of error */
+ bool log_failure(Alter_inplace_info *ha_alter_info,
+ TABLE *altered_table, dberr_t error)
+ {
+ ulint err_key= thr_get_trx(thr)->error_key_num;
+ switch (error) {
+ KEY *dup_key;
+ case DB_SUCCESS:
+ return false;
+ case DB_DUPLICATE_KEY:
+ if (err_key == ULINT_UNDEFINED)
+ /* This should be the hidden index on FTS_DOC_ID */
+ dup_key= nullptr;
+ else
+ {
+ DBUG_ASSERT(err_key < ha_alter_info->key_count);
+ dup_key= &ha_alter_info->key_info_buffer[err_key];
+ }
+ print_keydup_error(altered_table, dup_key, MYF(0));
+ break;
+ case DB_ONLINE_LOG_TOO_BIG:
+ my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0),
+ get_error_key_name(err_key, ha_alter_info, new_table));
+ break;
+ case DB_INDEX_CORRUPT:
+ my_error(ER_INDEX_CORRUPT, MYF(0),
+ get_error_key_name(err_key, ha_alter_info, new_table));
+ break;
+ default:
+ my_error_innodb(error, old_table->name.m_name, old_table->flags);
+ }
+ return true;
+ }
+
+ /** Check whether the column has any change in collation type.
+ If so, store the column information in the heap.
+ @param index index being added (or rebuilt)
+ @param altered_table altered table definition */
+ void change_col_collation(dict_index_t *index, const TABLE &altered_table)
+ {
+ ut_ad(!need_rebuild());
+ ut_ad(!index->is_primary());
+ ut_ad(!index->is_committed());
+
+ unsigned n_cols= 0;
+ for (unsigned i= 0; i < index->n_fields; i++)
+ {
+ const char *field_name= index->fields[i].name();
+ if (!field_name || !dtype_is_string_type(index->fields[i].col->mtype) ||
+ index->fields[i].col->is_virtual())
+ continue;
+ for (uint j= 0; j < altered_table.s->fields; j++)
+ {
+ const Field *altered_field= altered_table.field[j];
+
+ if (my_strcasecmp(system_charset_info, field_name,
+ altered_field->field_name.str))
+ continue;
+
+ unsigned prtype;
+ uint8_t mtype;
+ uint16_t len;
+ get_type(*altered_field, prtype, mtype, len);
+
+ if (prtype == index->fields[i].col->prtype)
+ continue;
+ auto it= change_col_collate.find(index->fields[i].col->ind);
+ if (it != change_col_collate.end())
+ {
+ n_cols++;
+ index->fields[i].col= it->second;
+ continue;
+ }
+
+ const CHARSET_INFO *cs= altered_field->charset();
+
+ dict_col_t *col=
+ static_cast<dict_col_t*>(mem_heap_alloc(heap, sizeof *col));
+ *col= *index->fields[i].col;
+ col->prtype= prtype;
+ col->mtype= mtype;
+ col->mbminlen= cs->mbminlen & 7;
+ col->mbmaxlen= cs->mbmaxlen & 7;
+ col->len= len;
+ index->fields[i].col= col;
+ n_cols++;
+ change_col_collate[col->ind]= col;
+ }
+ }
+
+ index->init_change_cols(n_cols);
+ }
+
+ void cleanup_col_collation()
+ {
+ ut_ad(old_table == new_table);
+ if (change_col_collate.empty())
+ return;
+ const dict_index_t *index= dict_table_get_first_index(old_table);
+ while ((index= dict_table_get_next_index(index)) != nullptr)
+ {
+ if (index->is_committed())
+ continue;
+ auto collate_end= change_col_collate.end();
+ for (unsigned i= 0, j= 0; i < index->n_fields; i++)
+ {
+ const dict_col_t *col= index->fields[i].col;
+ auto it= change_col_collate.find(col->ind);
+ if (it != collate_end)
+ {
+ ut_ad(it->second == col);
+ index->fields[i].col=
+ index->change_col_info->add(index->heap, *col, j++);
+ }
+ }
+ }
+ }
+};
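+
+/* Lifecycle sketch (per the generic in-place ALTER handler API, not a
+contract specific to this file): the context above is created in
+prepare_inplace_alter_table() and stored in ha_alter_info->handler_ctx;
+the server hands it back to inplace_alter_table() and
+commit_inplace_alter_table(), and for partitioned tables set_shared_data()
+propagates shared state such as the AUTO_INCREMENT sequence between the
+per-partition contexts. */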
+
+/********************************************************************//**
+Get the upper limit of the MySQL integral and floating-point type.
+@return maximum allowed value for the field */
+ulonglong innobase_get_int_col_max_value(const Field *field);
+
+/** Determine if fulltext indexes exist in a given table.
+@param table MySQL table
+@return number of fulltext indexes */
+static uint innobase_fulltext_exist(const TABLE* table)
+{
+ uint count = 0;
+
+ for (uint i = 0; i < table->s->keys; i++) {
+ if (table->key_info[i].flags & HA_FULLTEXT) {
+ count++;
+ }
+ }
+
+ return count;
+}
+
+/** Determine whether indexed virtual columns exist in a table.
+@param[in] table table definition
+@return whether indexes exist on virtual columns */
+static bool innobase_indexed_virtual_exist(const TABLE* table)
+{
+ const KEY* const end = &table->key_info[table->s->keys];
+
+ for (const KEY* key = table->key_info; key < end; key++) {
+ const KEY_PART_INFO* const key_part_end = key->key_part
+ + key->user_defined_key_parts;
+ for (const KEY_PART_INFO* key_part = key->key_part;
+ key_part < key_part_end; key_part++) {
+ if (!key_part->field->stored_in_db())
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/** Determine if spatial indexes exist in a given table.
+@param table MySQL table
+@return whether spatial indexes exist on the table */
+static
+bool
+innobase_spatial_exist(
+/*===================*/
+ const TABLE* table)
+{
+ for (uint i = 0; i < table->s->keys; i++) {
+ if (table->key_info[i].flags & HA_SPATIAL) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Determine if ALTER_OPTIONS requires rebuilding the table.
+@param[in] ha_alter_info the ALTER TABLE operation
+@param[in] table metadata before ALTER TABLE
+@return whether it is mandatory to rebuild the table */
+static bool alter_options_need_rebuild(
+ const Alter_inplace_info* ha_alter_info,
+ const TABLE* table)
+{
+ DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_OPTIONS);
+
+ if (ha_alter_info->create_info->used_fields
+ & (HA_CREATE_USED_ROW_FORMAT
+ | HA_CREATE_USED_KEY_BLOCK_SIZE)) {
+ /* Specifying ROW_FORMAT or KEY_BLOCK_SIZE requires
+ rebuilding the table. (These attributes in the .frm
+ file may disagree with the InnoDB data dictionary, and
+ the interpretation of these attributes depends on
+ InnoDB parameters. That is why we for now always
+ require a rebuild when these attributes are specified.) */
+ return true;
+ }
+
+ const ha_table_option_struct& alt_opt=
+ *ha_alter_info->create_info->option_struct;
+ const ha_table_option_struct& opt= *table->s->option_struct;
+
+ /* Allow an instant change to enable page_compressed,
+ and any change of page_compression_level. */
+ if ((!alt_opt.page_compressed && opt.page_compressed)
+ || alt_opt.encryption != opt.encryption
+ || alt_opt.encryption_key_id != opt.encryption_key_id) {
+ return(true);
+ }
+
+ return false;
+}
+
+/** Determine if ALTER TABLE needs to rebuild the table
+(or perform instant operation).
+@param[in] ha_alter_info the ALTER TABLE operation
+@param[in] table metadata before ALTER TABLE
+@return whether it is necessary to rebuild the table or to alter columns */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_need_rebuild(
+ const Alter_inplace_info* ha_alter_info,
+ const TABLE* table)
+{
+ if ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE
+ | INNOBASE_ALTER_NOREBUILD
+ | INNOBASE_ALTER_INSTANT))
+ == ALTER_OPTIONS) {
+ return alter_options_need_rebuild(ha_alter_info, table);
+ }
+
+ return !!(ha_alter_info->handler_flags & INNOBASE_ALTER_REBUILD);
+}
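+
+/* Example for the check below: if the old table has virtual columns
+(v1, v2) and they remain in that relative order in the new table, the
+function returns true even when ALTER_VIRTUAL_COLUMN_ORDER was set by an
+unrelated DROP COLUMN; a genuine reorder such as (v2, v1) returns false. */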
+
+/** Check if the virtual columns in the old and new table are in the same
+order, excluding dropped columns. This is needed because when we drop a
+virtual column, ALTER_VIRTUAL_COLUMN_ORDER is also turned on, so we cannot
+decide if this is a real ORDER change or just DROP COLUMN
+@param[in] table old TABLE
+@param[in] altered_table new TABLE
+@param[in] ha_alter_info Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+@return true if all columns are in order, false otherwise. */
+static
+bool
+check_v_col_in_order(
+ const TABLE* table,
+ const TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info)
+{
+ ulint j = 0;
+
+ /* We do not support adding a new virtual column before an
+ existing virtual column. */
+ if (ha_alter_info->handler_flags
+ & ALTER_ADD_VIRTUAL_COLUMN) {
+ bool has_new = false;
+
+ for (const Create_field& new_field :
+ ha_alter_info->alter_info->create_list) {
+ if (new_field.stored_in_db()) {
+ continue;
+ }
+
+ /* Found a new added virtual column. */
+ if (!new_field.field) {
+ has_new = true;
+ continue;
+ }
+
+ /* If there's any old virtual column
+ after the new added virtual column,
+ order must be changed. */
+ if (has_new) {
+ return(false);
+ }
+ }
+ }
+
+ /* Directly return true if ALTER_VIRTUAL_COLUMN_ORDER is not on */
+ if (!(ha_alter_info->handler_flags
+ & ALTER_VIRTUAL_COLUMN_ORDER)) {
+ return(true);
+ }
+
+ for (ulint i = 0; i < table->s->fields; i++) {
+ Field* field = table->field[i];
+
+ if (field->stored_in_db()) {
+ continue;
+ }
+
+ if (field->flags & FIELD_IS_DROPPED) {
+ continue;
+ }
+
+ /* Now check if the next virtual column in altered table
+ matches this column */
+ while (j < altered_table->s->fields) {
+ Field* new_field = altered_table->s->field[j];
+
+ if (new_field->stored_in_db()) {
+ j++;
+ continue;
+ }
+
+ if (my_strcasecmp(system_charset_info,
+ field->field_name.str,
+ new_field->field_name.str) != 0) {
+ /* different column */
+ return(false);
+ } else {
+ j++;
+ break;
+ }
+ }
+
+ if (j > altered_table->s->fields) {
+ /* There should not be fewer columns in the new
+ table unless they are in the drop list. */
+ ut_ad(0);
+ return(false);
+ }
+ }
+
+ return(true);
+}
+
+/** Determine if an instant operation is possible for altering columns.
+@param[in] ib_table InnoDB table definition
+@param[in] ha_alter_info the ALTER TABLE operation
+@param[in] table table definition before ALTER TABLE
+@param[in] altered_table table definition after ALTER TABLE
+@param[in] strict whether to ensure that user records fit */
+static
+bool
+instant_alter_column_possible(
+ const dict_table_t& ib_table,
+ const Alter_inplace_info* ha_alter_info,
+ const TABLE* table,
+ const TABLE* altered_table,
+ bool strict)
+{
+ const dict_index_t* const pk = ib_table.indexes.start;
+ ut_ad(pk->is_primary());
+ ut_ad(!pk->has_virtual());
+
+ if (ha_alter_info->handler_flags
+ & (ALTER_STORED_COLUMN_ORDER | ALTER_DROP_STORED_COLUMN
+ | ALTER_ADD_STORED_BASE_COLUMN)) {
+#if 1 // MDEV-17459: adjust fts_fetch_doc_from_rec() and friends; remove this
+ if (ib_table.fts || innobase_fulltext_exist(altered_table))
+ return false;
+#endif
+#if 1 // MDEV-17468: fix bugs with indexed virtual columns & remove this
+ for (const dict_index_t* index = ib_table.indexes.start;
+ index; index = index->indexes.next) {
+ if (index->has_virtual()) {
+ ut_ad(ib_table.n_v_cols
+ || index->is_corrupted());
+ return false;
+ }
+ }
+#endif
+ uint n_add = 0, n_nullable = 0, lenlen = 0;
+ const uint blob_prefix = dict_table_has_atomic_blobs(&ib_table)
+ ? 0
+ : REC_ANTELOPE_MAX_INDEX_COL_LEN;
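+ /* Rationale for min_local_len, as implied by the constants
+ used: without atomic BLOBs, an externally stored column keeps
+ a REC_ANTELOPE_MAX_INDEX_COL_LEN (768-byte) prefix plus a
+ FIELD_REF_SIZE (20-byte) reference in the record; with atomic
+ BLOBs only the reference is stored locally, and a value is
+ moved off-page once its local length would reach
+ 2 * FIELD_REF_SIZE. */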
+			? blob_prefix + FIELD_REF_SIZE
+			: 2 * FIELD_REF_SIZE;
+		size_t min_size = 0, max_size = 0;
+		Field** af = altered_table->field;
+		Field** const end = altered_table->field
+			+ altered_table->s->fields;
+		List_iterator_fast<Create_field> cf_it(
+			ha_alter_info->alter_info->create_list);
+
+		for (; af < end; af++) {
+			const Create_field* cf = cf_it++;
+			if (!(*af)->stored_in_db() || cf->field) {
+				/* Virtual or pre-existing column */
+				continue;
+			}
+			const bool nullable = (*af)->real_maybe_null();
+			const bool is_null = (*af)->is_real_null();
+			ut_ad(!is_null || nullable);
+			n_nullable += nullable;
+			n_add++;
+			uint l;
+			switch ((*af)->type()) {
+			case MYSQL_TYPE_VARCHAR:
+				l = reinterpret_cast<const Field_varstring*>
+					(*af)->get_length();
+variable_length:
+				if (l >= min_local_len) {
+					max_size += blob_prefix
+						+ FIELD_REF_SIZE;
+					if (!is_null) {
+						min_size += blob_prefix
+							+ FIELD_REF_SIZE;
+					}
+					lenlen += 2;
+				} else {
+					if (!is_null) {
+						min_size += l;
+					}
+					l = (*af)->pack_length();
+					max_size += l;
+					lenlen += l > 255 ? 2 : 1;
+				}
+				break;
+			case MYSQL_TYPE_GEOMETRY:
+			case MYSQL_TYPE_TINY_BLOB:
+			case MYSQL_TYPE_MEDIUM_BLOB:
+			case MYSQL_TYPE_BLOB:
+			case MYSQL_TYPE_LONG_BLOB:
+				l = reinterpret_cast<const Field_blob*>
+					((*af))->get_length();
+				goto variable_length;
+			default:
+				l = (*af)->pack_length();
+				if (l > 255 && ib_table.not_redundant()) {
+					goto variable_length;
+				}
+				max_size += l;
+				if (!is_null) {
+					min_size += l;
+				}
+			}
+		}
+
+		ulint n_fields = pk->n_fields + n_add;
+
+		if (n_fields >= REC_MAX_N_USER_FIELDS + DATA_N_SYS_COLS) {
+			return false;
+		}
+
+		if (pk->is_gen_clust()) {
+			min_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+				+ DATA_ROW_ID_LEN;
+			max_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+				+ DATA_ROW_ID_LEN;
+		} else {
+			min_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+			max_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+		}
+
+		uint i = pk->n_fields;
+		while (i-- > pk->n_core_fields) {
+			const dict_field_t& f = pk->fields[i];
+			if (f.col->is_nullable()) {
+				n_nullable++;
+				if (!f.col->is_dropped()
+				    && f.col->def_val.data) {
+					goto instantly_added_column;
+				}
+			} else if (f.fixed_len
+				   && (f.fixed_len <= 255
+				       || !ib_table.not_redundant())) {
+				if (ib_table.not_redundant()
+				    || !f.col->is_dropped()) {
+					min_size += f.fixed_len;
+					max_size += f.fixed_len;
+				}
+			} else if (f.col->is_dropped() || !f.col->is_added()) {
+				lenlen++;
+				goto set_max_size;
+			} else {
+instantly_added_column:
+				ut_ad(f.col->is_added());
+				if (f.col->def_val.len >= min_local_len) {
+					min_size += blob_prefix
+						+ FIELD_REF_SIZE;
+					lenlen += 2;
+				} else {
+					min_size += f.col->def_val.len;
+					lenlen += f.col->def_val.len
+						> 255 ? 2 : 1;
+				}
+set_max_size:
+				if (f.fixed_len
+				    && (f.fixed_len <= 255
+					|| !ib_table.not_redundant())) {
+					max_size += f.fixed_len;
+				} else if (f.col->len >= min_local_len) {
+					max_size += blob_prefix
+						+ FIELD_REF_SIZE;
+				} else {
+					max_size += f.col->len;
+				}
+			}
+		}
+
+		do {
+			const dict_field_t& f = pk->fields[i];
+			if (f.col->is_nullable()) {
+				n_nullable++;
+			} else if (f.fixed_len) {
+				min_size += f.fixed_len;
+			} else {
+				lenlen++;
+			}
+		} while (i--);
+
+		if (ib_table.instant
+		    || (ha_alter_info->handler_flags
+			& (ALTER_STORED_COLUMN_ORDER
+			   | ALTER_DROP_STORED_COLUMN))) {
+			n_fields++;
+			lenlen += 2;
+			min_size += FIELD_REF_SIZE;
+		}
+
+		if (ib_table.not_redundant()) {
+			min_size += REC_N_NEW_EXTRA_BYTES
+				+ UT_BITS_IN_BYTES(n_nullable)
+				+ lenlen;
+		} else {
+			min_size += (n_fields > 255 || min_size > 255)
+				? n_fields * 2 : n_fields;
+			min_size += REC_N_OLD_EXTRA_BYTES;
+		}
+
+		if (page_zip_rec_needs_ext(min_size, ib_table.not_redundant(),
+					   0, 0)) {
+			return false;
+		}
+
+		if (strict && page_zip_rec_needs_ext(max_size,
+						     ib_table.not_redundant(),
+						     0, 0)) {
+			return false;
+		}
+	}
+	// Making table system-versioned instantly is not implemented yet.
+	if (ha_alter_info->handler_flags & ALTER_ADD_SYSTEM_VERSIONING) {
+		return false;
+	}
+
+	static constexpr alter_table_operations avoid_rebuild
+		= ALTER_ADD_STORED_BASE_COLUMN
+		| ALTER_DROP_STORED_COLUMN
+		| ALTER_STORED_COLUMN_ORDER
+		| ALTER_COLUMN_NULLABLE;
+
+	if (!(ha_alter_info->handler_flags & avoid_rebuild)) {
+		alter_table_operations flags = ha_alter_info->handler_flags
+			& ~avoid_rebuild;
+		/* None of the flags are set that we can handle
+		specially to avoid rebuild. In this case, we can
+		allow ALGORITHM=INSTANT, except if some requested
+		operation requires that the table be rebuilt. */
+		if (flags & INNOBASE_ALTER_REBUILD) {
+			return false;
+		}
+		if ((flags & ALTER_OPTIONS)
+		    && alter_options_need_rebuild(ha_alter_info, table)) {
+			return false;
+		}
+	} else if (!ib_table.supports_instant()) {
+		return false;
+	}
+
+	/* At the moment, we disallow ADD [UNIQUE] INDEX together with
+	instant ADD COLUMN.
+
+	The main reason is that the work of instant ADD must be done
+	in commit_inplace_alter_table(). For the rollback_instant()
+	to work, we must add the columns to dict_table_t beforehand,
+	and roll back those changes in case the transaction is rolled
+	back.
+
+	If we added the columns to the dictionary cache already in the
+	prepare_inplace_alter_table(), we would have to deal with
+	column number mismatch in ha_innobase::open(), write_row() and
+	other functions. */
+
+	/* FIXME: allow instant ADD COLUMN together with
+	INNOBASE_ONLINE_CREATE (ADD [UNIQUE] INDEX) on pre-existing
+	columns. */
+	if (ha_alter_info->handler_flags
+	    & ((INNOBASE_ALTER_REBUILD | INNOBASE_ONLINE_CREATE)
+	       & ~ALTER_DROP_STORED_COLUMN
+	       & ~ALTER_STORED_COLUMN_ORDER
+	       & ~ALTER_ADD_STORED_BASE_COLUMN
+	       & ~ALTER_COLUMN_NULLABLE
+	       & ~ALTER_OPTIONS)) {
+		return false;
+	}
+
+	if ((ha_alter_info->handler_flags & ALTER_OPTIONS)
+	    && alter_options_need_rebuild(ha_alter_info, table)) {
+		return false;
+	}
+
+	if (ha_alter_info->handler_flags & ALTER_COLUMN_NULLABLE) {
+		if (ib_table.not_redundant()) {
+			/* Instantaneous removal of NOT NULL is
+			only supported for ROW_FORMAT=REDUNDANT. */
+			return false;
+		}
+		if (ib_table.fts_doc_id_index
+		    && !innobase_fulltext_exist(altered_table)) {
+			/* Removing hidden FTS_DOC_ID_INDEX(FTS_DOC_ID)
+			requires that the table be rebuilt. */
+			return false;
+		}
+
+		Field** af = altered_table->field;
+		Field** const end = altered_table->field
+			+ altered_table->s->fields;
+		List_iterator_fast<Create_field> cf_it(
+			ha_alter_info->alter_info->create_list);
+		for (unsigned c = 0; af < end; af++) {
+			const Create_field* cf = cf_it++;
+			if (!cf->field || !(*af)->stored_in_db()) {
+				/* Ignore virtual or newly created
+				column */
+				continue;
+			}
+
+			const dict_col_t* col = dict_table_get_nth_col(
+				&ib_table, c++);
+
+			if (!col->ord_part || col->is_nullable()
+			    || !(*af)->real_maybe_null()) {
+				continue;
+			}
+
+			/* The column would be changed from NOT NULL.
+			Ensure that it is not a clustered index key. */
+			for (auto i = pk->n_uniq; i--; ) {
+				if (pk->fields[i].col == col) {
+					return false;
+				}
+			}
+		}
+	}
+
+	return true;
+}
+
+/** Check whether the field has a non-constant DEFAULT expression.
+@param[in]	field	field which could be added or changed
+@return true if a non-constant DEFAULT is present. */
+static bool is_non_const_value(Field* field)
+{
+	return field->default_value
+		&& field->default_value->flags
+		& uint(~(VCOL_SESSION_FUNC | VCOL_TIME_FUNC));
+}
+
+/** Set the default value for the field.
+@param[in]	field	field which could be added or changed
+@return true if the default value is set. */
+static bool set_default_value(Field* field)
+{
+	/* The added/changed NOT NULL column lacks a DEFAULT value,
+	or the DEFAULT is the same for all rows.
+	(Time functions, such as CURRENT_TIMESTAMP(),
+	are evaluated from a timestamp that is assigned
+	at the start of the statement. Session
+	functions, such as USER(), always evaluate the
+	same within a statement.) */
+
+	ut_ad(!is_non_const_value(field));
+
+	/* Compute the DEFAULT values of non-constant columns
+	(VCOL_SESSION_FUNC | VCOL_TIME_FUNC). */
+	switch (field->set_default()) {
+	case 0: /* OK */
+	case 3: /* DATETIME to TIME or DATE conversion */
+		return true;
+	case -1: /* OOM, or GEOMETRY type mismatch */
+	case 1: /* A number adjusted to the min/max value */
+	case 2: /* String truncation, or conversion problem */
+		break;
+	}
+
+	return false;
+}
+
+/** Check whether the table has the FTS_DOC_ID column
+@param[in]	table		InnoDB table with fulltext index
+@param[in]	altered_table	MySQL table with fulltext index
+@param[out]	fts_doc_col_no	The column number for Doc ID,
+				or ULINT_UNDEFINED if it is of wrong type
+@param[out]	num_v		Number of virtual columns
+@param[in]	check_only	check only whether an fts doc id exists
+@return whether there exists an FTS_DOC_ID column */
+static
+bool
+innobase_fts_check_doc_id_col(
+	const dict_table_t*	table,
+	const TABLE*		altered_table,
+	ulint*			fts_doc_col_no,
+	ulint*			num_v,
+	bool			check_only=false)
+{
+	*fts_doc_col_no = ULINT_UNDEFINED;
+
+	const uint n_cols = altered_table->s->fields;
+	ulint	i;
+	int	err = 0;
+	*num_v = 0;
+
+	for (i = 0; i < n_cols; i++) {
+		const Field* field = altered_table->field[i];
+
+		if (!field->stored_in_db()) {
+			(*num_v)++;
+		}
+
+		if (my_strcasecmp(system_charset_info,
+				  field->field_name.str,
+				  FTS_DOC_ID_COL_NAME)) {
+			continue;
+		}
+
+		if (strcmp(field->field_name.str, FTS_DOC_ID_COL_NAME)) {
+			err = ER_WRONG_COLUMN_NAME;
+		} else if (field->type() != MYSQL_TYPE_LONGLONG
+			   || field->pack_length() != 8
+			   || field->real_maybe_null()
+			   || !(field->flags & UNSIGNED_FLAG)
+			   || !field->stored_in_db()) {
+			err = ER_INNODB_FT_WRONG_DOCID_COLUMN;
+		} else {
+			*fts_doc_col_no = i - *num_v;
+		}
+
+		if (err && !check_only) {
+			my_error(err, MYF(0), field->field_name.str);
+		}
+
+		return(true);
+	}
+
+	if (!table) {
+		return(false);
+	}
+
+	/* Not to count the virtual columns */
+	i -= *num_v;
+
+	for (; i + DATA_N_SYS_COLS < (uint) table->n_cols; i++) {
+		const char*	name = dict_table_get_col_name(table, i);
+
+		if (strcmp(name, FTS_DOC_ID_COL_NAME) == 0) {
+#ifdef UNIV_DEBUG
+			const dict_col_t*	col;
+
+			col = dict_table_get_nth_col(table, i);
+
+			/* Because the FTS_DOC_ID does not exist in
+			the .frm file or TABLE_SHARE, this must be the
+			internally created FTS_DOC_ID column.
*/ + ut_ad(col->mtype == DATA_INT); + ut_ad(col->len == 8); + ut_ad(col->prtype & DATA_NOT_NULL); + ut_ad(col->prtype & DATA_UNSIGNED); +#endif /* UNIV_DEBUG */ + *fts_doc_col_no = i; + return(true); + } + } + + return(false); +} + +/** Check whether the table is empty. +@param[in] table table to be checked +@param[in] ignore_delete_marked Ignore the delete marked + flag record +@return true if table is empty */ +static bool innobase_table_is_empty(const dict_table_t *table, + bool ignore_delete_marked=true) +{ + if (!table->space) + return false; + dict_index_t *clust_index= dict_table_get_first_index(table); + mtr_t mtr; + btr_pcur_t pcur; + buf_block_t *block; + page_cur_t *cur; + rec_t *rec; + bool next_page= false; + + mtr.start(); + if (pcur.open_leaf(true, clust_index, BTR_SEARCH_LEAF, &mtr) != DB_SUCCESS) + { +non_empty: + mtr.commit(); + return false; + } + rec= page_rec_get_next(btr_pcur_get_rec(&pcur)); + if (UNIV_UNLIKELY(!rec)) + goto non_empty; + if (rec_is_metadata(rec, *clust_index)) + btr_pcur_get_page_cur(&pcur)->rec= rec; +scan_leaf: + cur= btr_pcur_get_page_cur(&pcur); + if (UNIV_UNLIKELY(!page_cur_move_to_next(cur))) + goto non_empty; +next_page: + if (next_page) + { + uint32_t next_page_no= btr_page_get_next(page_cur_get_page(cur)); + if (next_page_no == FIL_NULL) + { + mtr.commit(); + return true; + } + + next_page= false; + block= btr_block_get(*clust_index, next_page_no, RW_S_LATCH, false, &mtr); + if (!block) + goto non_empty; + page_cur_set_before_first(block, cur); + if (UNIV_UNLIKELY(!page_cur_move_to_next(cur))) + goto non_empty; + const auto s= mtr.get_savepoint(); + mtr.rollback_to_savepoint(s - 2, s - 1); + } + + rec= page_cur_get_rec(cur); + if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) + { + if (ignore_delete_marked) + goto scan_leaf; + goto non_empty; + } + else if (!page_rec_is_supremum(rec)) + goto non_empty; + else + { + next_page= true; + goto next_page; + } + goto scan_leaf; +} + +/** Check if InnoDB supports a particular alter table in-place +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. + +@retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported +@retval HA_ALTER_INPLACE_INSTANT +MDL_EXCLUSIVE is needed for executing prepare_inplace_alter_table() +and commit_inplace_alter_table(). inplace_alter_table() will not be called. 
+@retval HA_ALTER_INPLACE_COPY_NO_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=NONE for rebuilding the table in inplace_alter_table() +@retval HA_ALTER_INPLACE_COPY_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=SHARED for rebuilding the table in inplace_alter_table() +@retval HA_ALTER_INPLACE_NOCOPY_NO_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=NONE for inplace_alter_table() which will not rebuild the table +@retval HA_ALTER_INPLACE_NOCOPY_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=SHARED for inplace_alter_table() which will not rebuild the table +*/ + +enum_alter_inplace_result +ha_innobase::check_if_supported_inplace_alter( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) +{ + DBUG_ENTER("check_if_supported_inplace_alter"); + + if ((ha_alter_info->handler_flags + & INNOBASE_ALTER_VERSIONED_REBUILD) + && altered_table->versioned(VERS_TIMESTAMP)) { + ha_alter_info->unsupported_reason = + "Not implemented for system-versioned timestamp tables"; + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + /* Before 10.2.2 information about virtual columns was not stored in + system tables. We need to do a full alter to rebuild proper 10.2.2+ + metadata with the information about virtual columns */ + if (omits_virtual_cols(*table_share)) { + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (altered_table->s->fields > REC_MAX_N_USER_FIELDS) { + /* Deny the inplace ALTER TABLE. MySQL will try to + re-create the table and ha_innobase::create() will + return an error too. This is how we effectively + deny adding too many columns to a table. */ + ha_alter_info->unsupported_reason = + my_get_err_msg(ER_TOO_MANY_FIELDS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + update_thd(); + + if (!m_prebuilt->table->space) { + ib_senderrf(m_user_thd, IB_LOG_LEVEL_WARN, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + } + + if (is_read_only(!high_level_read_only + && (ha_alter_info->handler_flags & ALTER_OPTIONS) + && ha_alter_info->create_info->key_block_size == 0 + && ha_alter_info->create_info->row_type + != ROW_TYPE_COMPRESSED)) { + ha_alter_info->unsupported_reason = + my_get_err_msg(ER_READ_ONLY_MODE); + + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (ha_alter_info->handler_flags + & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_INSTANT + | INNOBASE_ALTER_NOREBUILD + | INNOBASE_ALTER_REBUILD + | ALTER_INDEX_IGNORABILITY)) { + + if (ha_alter_info->handler_flags + & ALTER_STORED_COLUMN_TYPE) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_COLUMN_TYPE); + } + + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + ut_ad(dict_sys.sys_tables_exist()); + + /* Only support online add foreign key constraint when + check_foreigns is turned off */ + if ((ha_alter_info->handler_flags & ALTER_ADD_FOREIGN_KEY) + && m_prebuilt->trx->check_foreigns) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FK_CHECK); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + const char* reason_rebuild = NULL; + + switch (innodb_instant_alter_column_allowed) { + case 0: /* never */ + if ((ha_alter_info->handler_flags + & (ALTER_ADD_STORED_BASE_COLUMN + | ALTER_STORED_COLUMN_ORDER + | ALTER_DROP_STORED_COLUMN)) + || m_prebuilt->table->is_instant()) { + reason_rebuild = + "innodb_instant_alter_column_allowed=never"; 
+innodb_instant_alter_column_allowed_reason:
+			if (ha_alter_info->handler_flags
+			    & ALTER_RECREATE_TABLE) {
+				reason_rebuild = NULL;
+			} else {
+				ha_alter_info->handler_flags
+					|= ALTER_RECREATE_TABLE;
+				ha_alter_info->unsupported_reason
+					= reason_rebuild;
+			}
+		}
+		break;
+	case 1: /* add_last */
+		if ((ha_alter_info->handler_flags
+		     & (ALTER_STORED_COLUMN_ORDER | ALTER_DROP_STORED_COLUMN))
+		    || m_prebuilt->table->instant) {
+			reason_rebuild = "innodb_instant_alter_column_allowed="
+				"add_last";
+			goto innodb_instant_alter_column_allowed_reason;
+		}
+	}
+
+	switch (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) {
+	case ALTER_OPTIONS:
+		if (alter_options_need_rebuild(ha_alter_info, table)) {
+			reason_rebuild = my_get_err_msg(
+				ER_ALTER_OPERATION_TABLE_OPTIONS_NEED_REBUILD);
+			ha_alter_info->unsupported_reason = reason_rebuild;
+			break;
+		}
+		/* fall through */
+	case 0:
+		DBUG_RETURN(HA_ALTER_INPLACE_INSTANT);
+	}
+
+	/* InnoDB cannot IGNORE when creating unique indexes. IGNORE
+	should silently delete some duplicate rows. Our inplace_alter
+	code will not delete anything from existing indexes. */
+	if (ha_alter_info->ignore
+	    && (ha_alter_info->handler_flags
+		& (ALTER_ADD_PK_INDEX | ALTER_ADD_UNIQUE_INDEX))) {
+		ha_alter_info->unsupported_reason = my_get_err_msg(
+			ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_IGNORE);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	/* DROP PRIMARY KEY is only allowed in combination with ADD
+	PRIMARY KEY. */
+	if ((ha_alter_info->handler_flags
+	     & (ALTER_ADD_PK_INDEX | ALTER_DROP_PK_INDEX))
+	    == ALTER_DROP_PK_INDEX) {
+		ha_alter_info->unsupported_reason = my_get_err_msg(
+			ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOPK);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	if (ha_alter_info->handler_flags & ALTER_COLUMN_NULLABLE) {
+		/* If a NOT NULL attribute is going to be removed and
+		a UNIQUE INDEX on the column had been promoted to an
+		implicit PRIMARY KEY, the table should be rebuilt by
+		ALGORITHM=COPY. (Theoretically, we could support
+		rebuilding by ALGORITHM=INPLACE if a PRIMARY KEY is
+		going to be added, either explicitly or by promoting
+		another UNIQUE KEY.) */
+		const uint my_primary_key = altered_table->s->primary_key;
+
+		if (UNIV_UNLIKELY(my_primary_key >= MAX_KEY)
+		    && !dict_index_is_auto_gen_clust(
+			    dict_table_get_first_index(m_prebuilt->table))) {
+			ha_alter_info->unsupported_reason = my_get_err_msg(
+				ER_PRIMARY_CANT_HAVE_NULL);
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+	}
+
+	/*
+	InnoDB in different MariaDB versions was generating different mtype
+	codes for certain types. In some cases the signed/unsigned bit was
+	generated differently too.
+
+	Inplace ALTER would change the mtype/unsigned_flag (to what the
+	current code generates) without changing the underlying data
+	representation, and it might result in data corruption.
+
+	Don't do inplace ALTER if mtype/unsigned_flag are wrong.
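+
+	As an illustrative sketch (not part of this function), the
+	per-column consistency test performed by the loop below looks
+	like this for a single stored column "field" that is mapped to
+	the InnoDB dictionary column "col":
+
+		unsigned	unsigned_flag;
+		auto	mtype = get_innobase_type_from_mysql_type(
+			&unsigned_flag, field);
+		if (col->mtype != mtype
+		    || (col->prtype & DATA_UNSIGNED) != unsigned_flag) {
+			// Stale type code from an older version:
+			// refuse inplace ALTER and fall back to copy.
+		}
+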
+	*/
+	for (ulint i = 0, icol= 0; i < table->s->fields; i++) {
+		const Field*		field = table->field[i];
+		const dict_col_t*	col = dict_table_get_nth_col(
+			m_prebuilt->table, icol);
+		unsigned		unsigned_flag;
+
+		if (!field->stored_in_db()) {
+			continue;
+		}
+
+		icol++;
+
+		if (col->mtype != get_innobase_type_from_mysql_type(
+			    &unsigned_flag, field)) {
+
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+
+		if ((col->prtype & DATA_UNSIGNED) != unsigned_flag) {
+
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+	}
+
+	ulint n_indexes = UT_LIST_GET_LEN((m_prebuilt->table)->indexes);
+
+	/* If the InnoDB dictionary and the MySQL frm file are not
+	consistent, use the "Copy" method. */
+	if (m_prebuilt->table->dict_frm_mismatch) {
+
+		ha_alter_info->unsupported_reason = my_get_err_msg(
+			ER_NO_SUCH_INDEX);
+		ib_push_frm_error(m_user_thd, m_prebuilt->table, altered_table,
+				  n_indexes, true);
+
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	/* The '0000-00-00' value is not allowed for the datetime datatype
+	of a newly added column when the table is not empty */
+	if (ha_alter_info->error_if_not_empty
+	    && m_prebuilt->table->space
+	    && !innobase_table_is_empty(m_prebuilt->table)) {
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	const bool add_drop_v_cols = !!(ha_alter_info->handler_flags
+					& (ALTER_ADD_VIRTUAL_COLUMN
+					   | ALTER_DROP_VIRTUAL_COLUMN
+					   | ALTER_VIRTUAL_COLUMN_ORDER));
+
+	/* We should be able to do the operation in-place.
+	See if we can do it online (LOCK=NONE) or without rebuild. */
+	bool online = true, need_rebuild = false;
+	const uint fulltext_indexes = innobase_fulltext_exist(altered_table);
+
+	/* Fix the key parts. */
+	for (KEY* new_key = ha_alter_info->key_info_buffer;
+	     new_key < ha_alter_info->key_info_buffer
+		     + ha_alter_info->key_count;
+	     new_key++) {
+
+		/* Do not support adding/dropping a virtual column while
+		there is a table rebuild caused by adding a new FTS_DOC_ID */
+		if ((new_key->flags & HA_FULLTEXT) && add_drop_v_cols
+		    && !DICT_TF2_FLAG_IS_SET(m_prebuilt->table,
+					     DICT_TF2_FTS_HAS_DOC_ID)) {
+			ha_alter_info->unsupported_reason =
+				MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN;
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+
+		for (KEY_PART_INFO* key_part = new_key->key_part;
+		     key_part < (new_key->key_part
+				 + new_key->user_defined_key_parts);
+		     key_part++) {
+			DBUG_ASSERT(key_part->fieldnr
+				    < altered_table->s->fields);
+
+			const Create_field* new_field
+				= ha_alter_info->alter_info->create_list.elem(
+					key_part->fieldnr);
+
+			DBUG_ASSERT(new_field);
+
+			key_part->field = altered_table->field[
+				key_part->fieldnr];
+
+			/* In some special cases InnoDB emits "false"
+			duplicate key errors with NULL key values. Let
+			us play safe and ensure that we can correctly
+			print key values even in such cases. */
+			key_part->null_offset = key_part->field->null_offset();
+			key_part->null_bit = key_part->field->null_bit;
+
+			if (new_field->field) {
+				/* This is an existing column. */
+				continue;
+			}
+
+			/* This is an added column. */
+			DBUG_ASSERT(ha_alter_info->handler_flags
+				    & ALTER_ADD_COLUMN);
+
+			/* We cannot replace a hidden FTS_DOC_ID
+			with a user-visible FTS_DOC_ID.
*/ + if (fulltext_indexes && m_prebuilt->table->fts + && !my_strcasecmp( + system_charset_info, + key_part->field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_HIDDEN_FTS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + DBUG_ASSERT((key_part->field->unireg_check + == Field::NEXT_NUMBER) + == !!(key_part->field->flags + & AUTO_INCREMENT_FLAG)); + + if (key_part->field->flags & AUTO_INCREMENT_FLAG) { + /* We cannot assign AUTO_INCREMENT values + during online or instant ALTER. */ + DBUG_ASSERT(key_part->field == altered_table + -> found_next_number_field); + + if (ha_alter_info->online) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_AUTOINC); + } + + online = false; + need_rebuild = true; + } + + if (!key_part->field->stored_in_db()) { + /* Do not support adding index on newly added + virtual column, while there is also a drop + virtual column in the same clause */ + if (ha_alter_info->handler_flags + & ALTER_DROP_VIRTUAL_COLUMN) { + ha_alter_info->unsupported_reason = + MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN; + + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (ha_alter_info->online + && !ha_alter_info->unsupported_reason) { + ha_alter_info->unsupported_reason = + MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN; + } + + online = false; + } + } + } + + DBUG_ASSERT(!m_prebuilt->table->fts + || (m_prebuilt->table->fts->doc_col <= table->s->fields)); + + DBUG_ASSERT(!m_prebuilt->table->fts + || (m_prebuilt->table->fts->doc_col + < dict_table_get_n_user_cols(m_prebuilt->table))); + + if (fulltext_indexes && m_prebuilt->table->fts) { + /* FTS index of versioned table has row_end, need rebuild */ + if (table->versioned() != altered_table->versioned()) { + need_rebuild= true; + } + + /* FULLTEXT indexes are supposed to remain. */ + /* Disallow DROP INDEX FTS_DOC_ID_INDEX */ + + for (uint i = 0; i < ha_alter_info->index_drop_count; i++) { + if (!my_strcasecmp( + system_charset_info, + ha_alter_info->index_drop_buffer[i]->name.str, + FTS_DOC_ID_INDEX_NAME)) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + + /* InnoDB can have a hidden FTS_DOC_ID_INDEX on a + visible FTS_DOC_ID column as well. Prevent dropping or + renaming the FTS_DOC_ID. */ + + for (Field** fp = table->field; *fp; fp++) { + if (!((*fp)->flags + & (FIELD_IS_RENAMED | FIELD_IS_DROPPED))) { + continue; + } + + if (!my_strcasecmp( + system_charset_info, + (*fp)->field_name.str, + FTS_DOC_ID_COL_NAME)) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + } + + m_prebuilt->trx->will_lock = true; + + /* When changing a NULL column to NOT NULL and specifying a + DEFAULT value, ensure that the DEFAULT expression is a constant. + Also, in ADD COLUMN, for now we only support a + constant DEFAULT expression. 
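+
+			For illustration (a sketch, not a server API):
+			the classification used by is_non_const_value()
+			above treats a DEFAULT as acceptable when it is
+			absent or carries only per-statement-constant
+			flags:
+
+				static bool sketch_default_is_acceptable(
+					const Field* f)
+				{
+					if (!f->default_value) {
+						return true; // constant
+					}
+					// CURRENT_TIMESTAMP and session
+					// functions such as USER() evaluate
+					// once per statement, so they are
+					// allowed; anything else is not.
+					return !(f->default_value->flags
+						 & ~(VCOL_SESSION_FUNC
+						     | VCOL_TIME_FUNC));
+				}
+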
*/ + Field **af = altered_table->field; + bool fts_need_rebuild = false; + need_rebuild = need_rebuild + || innobase_need_rebuild(ha_alter_info, table); + + for (Create_field& cf : ha_alter_info->alter_info->create_list) { + DBUG_ASSERT(cf.field + || (ha_alter_info->handler_flags + & ALTER_ADD_COLUMN)); + + if (const Field* f = cf.field) { + /* An AUTO_INCREMENT attribute can only + be added to an existing column by ALGORITHM=COPY, + but we can remove the attribute. */ + ut_ad((*af)->unireg_check != Field::NEXT_NUMBER + || f->unireg_check == Field::NEXT_NUMBER); + if (!f->real_maybe_null() || (*af)->real_maybe_null()) + goto next_column; + /* We are changing an existing column + from NULL to NOT NULL. */ + DBUG_ASSERT(ha_alter_info->handler_flags + & ALTER_COLUMN_NOT_NULLABLE); + /* Virtual columns are never NOT NULL. */ + DBUG_ASSERT(f->stored_in_db()); + switch ((*af)->type()) { + case MYSQL_TYPE_TIMESTAMP: + case MYSQL_TYPE_TIMESTAMP2: + /* Inserting NULL into a TIMESTAMP column + would cause the DEFAULT value to be + replaced. Ensure that the DEFAULT + expression is not changing during + ALTER TABLE. */ + if (!(*af)->default_value + && (*af)->is_real_null()) { + /* No DEFAULT value is + specified. We can report + errors for any NULL values for + the TIMESTAMP. */ + goto next_column; + } + break; + default: + /* For any other data type, NULL + values are not converted. */ + goto next_column; + } + + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOT_NULL); + } else if (!is_non_const_value(*af) + && set_default_value(*af)) { + if (fulltext_indexes > 1 + && !my_strcasecmp(system_charset_info, + (*af)->field_name.str, + FTS_DOC_ID_COL_NAME)) { + /* If a hidden FTS_DOC_ID column exists + (because of FULLTEXT INDEX), it cannot + be replaced with a user-created one + except when using ALGORITHM=COPY. 
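+
+			For reference (an illustrative summary mirroring
+			the checks in innobase_fts_check_doc_id_col()
+			above), a user-supplied FTS_DOC_ID must be
+			declared exactly like the hidden column would be:
+
+				field->type() == MYSQL_TYPE_LONGLONG // BIGINT
+				&& field->pack_length() == 8
+				&& !field->real_maybe_null()	// NOT NULL
+				&& (field->flags & UNSIGNED_FLAG) // UNSIGNED
+				&& field->stored_in_db()	// not virtual
+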
*/ + ha_alter_info->unsupported_reason = + my_get_err_msg(ER_INNODB_FT_LIMIT); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + goto next_column; + } + + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + +next_column: + af++; + } + + const bool supports_instant = instant_alter_column_possible( + *m_prebuilt->table, ha_alter_info, table, altered_table, + is_innodb_strict_mode()); + if (add_drop_v_cols) { + ulonglong flags = ha_alter_info->handler_flags; + + /* TODO: uncomment the flags below, once we start to + support them */ + + flags &= ~(ALTER_ADD_VIRTUAL_COLUMN + | ALTER_DROP_VIRTUAL_COLUMN + | ALTER_VIRTUAL_COLUMN_ORDER + | ALTER_VIRTUAL_GCOL_EXPR + | ALTER_COLUMN_VCOL + /* + | ALTER_ADD_STORED_BASE_COLUMN + | ALTER_DROP_STORED_COLUMN + | ALTER_STORED_COLUMN_ORDER + | ALTER_ADD_UNIQUE_INDEX + */ + | ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX + | ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX + | ALTER_INDEX_ORDER); + if (supports_instant) { + flags &= ~(ALTER_DROP_STORED_COLUMN +#if 0 /* MDEV-17468: remove check_v_col_in_order() and fix the code */ + | ALTER_ADD_STORED_BASE_COLUMN +#endif + | ALTER_STORED_COLUMN_ORDER); + } + if (flags != 0 + || IF_PARTITIONING((altered_table->s->partition_info_str + && altered_table->s->partition_info_str_len), 0) + || (!check_v_col_in_order( + this->table, altered_table, ha_alter_info))) { + ha_alter_info->unsupported_reason = + MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN; + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + + if (supports_instant && !(ha_alter_info->handler_flags + & INNOBASE_ALTER_NOREBUILD)) { + DBUG_RETURN(HA_ALTER_INPLACE_INSTANT); + } + + if (need_rebuild + && (fulltext_indexes + || innobase_spatial_exist(altered_table) + || innobase_indexed_virtual_exist(altered_table))) { + /* If the table already contains fulltext indexes, + refuse to rebuild the table natively altogether. */ + if (fulltext_indexes > 1) { +cannot_create_many_fulltext_index: + ha_alter_info->unsupported_reason = + my_get_err_msg(ER_INNODB_FT_LIMIT); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (!online || !ha_alter_info->online + || ha_alter_info->unsupported_reason != reason_rebuild) { + /* Either LOCK=NONE was not requested, or we already + gave specific reason to refuse it. */ + } else if (fulltext_indexes) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); + } else if (innobase_spatial_exist(altered_table)) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_GIS); + } else { + /* MDEV-14341 FIXME: Remove this limitation. */ + ha_alter_info->unsupported_reason = + "online rebuild with indexed virtual columns"; + } + + online = false; + } + + if (ha_alter_info->handler_flags + & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { + /* ADD FULLTEXT|SPATIAL INDEX requires a lock. + + We could do ADD FULLTEXT INDEX without a lock if the + table already contains an FTS_DOC_ID column, but in + that case we would have to apply the modification log + to the full-text indexes. + + We could also do ADD SPATIAL INDEX by implementing + row_log_apply() for it. 
*/ + bool add_fulltext = false; + + for (uint i = 0; i < ha_alter_info->index_add_count; i++) { + const KEY* key = + &ha_alter_info->key_info_buffer[ + ha_alter_info->index_add_buffer[i]]; + if (key->flags & HA_FULLTEXT) { + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_GENERATED_KEY + | HA_BINARY_PACK_KEY))); + if (add_fulltext) { + goto cannot_create_many_fulltext_index; + } + + add_fulltext = true; + if (ha_alter_info->online + && !ha_alter_info->unsupported_reason) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); + } + + online = false; + + /* Full text search index exists, check + whether the table already has DOC ID column. + If not, InnoDB have to rebuild the table to + add a Doc ID hidden column and change + primary index. */ + ulint fts_doc_col_no; + ulint num_v = 0; + + fts_need_rebuild = + !innobase_fts_check_doc_id_col( + m_prebuilt->table, + altered_table, + &fts_doc_col_no, &num_v, true); + } + + if (online && (key->flags & HA_SPATIAL)) { + + if (ha_alter_info->online) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_GIS); + } + + online = false; + } + } + } + + // FIXME: implement Online DDL for system-versioned operations + if (ha_alter_info->handler_flags & INNOBASE_ALTER_VERSIONED_REBUILD) { + + if (ha_alter_info->online) { + ha_alter_info->unsupported_reason = + "Not implemented for system-versioned operations"; + } + + online = false; + } + + if ((need_rebuild && !supports_instant) || fts_need_rebuild) { + ha_alter_info->handler_flags |= ALTER_RECREATE_TABLE; + DBUG_RETURN(online + ? HA_ALTER_INPLACE_COPY_NO_LOCK + : HA_ALTER_INPLACE_COPY_LOCK); + } + + if (ha_alter_info->unsupported_reason) { + } else if (ha_alter_info->handler_flags & INNOBASE_ONLINE_CREATE) { + ha_alter_info->unsupported_reason = "ADD INDEX"; + } else { + ha_alter_info->unsupported_reason = "DROP INDEX"; + } + + DBUG_RETURN(online + ? HA_ALTER_INPLACE_NOCOPY_NO_LOCK + : HA_ALTER_INPLACE_NOCOPY_LOCK); +} + +/*************************************************************//** +Initialize the dict_foreign_t structure with supplied info +@return true if added, false if duplicate foreign->id */ +static MY_ATTRIBUTE((nonnull(1,3,5,7))) +bool +innobase_init_foreign( +/*==================*/ + dict_foreign_t* foreign, /*!< in/out: structure to + initialize */ + const char* constraint_name, /*!< in/out: constraint name if + exists */ + dict_table_t* table, /*!< in: foreign table */ + dict_index_t* index, /*!< in: foreign key index */ + const char** column_names, /*!< in: foreign key column + names */ + ulint num_field, /*!< in: number of columns */ + const char* referenced_table_name, /*!< in: referenced table + name */ + dict_table_t* referenced_table, /*!< in: referenced table */ + dict_index_t* referenced_index, /*!< in: referenced index */ + const char** referenced_column_names,/*!< in: referenced column + names */ + ulint referenced_num_field) /*!< in: number of referenced + columns */ +{ + ut_ad(dict_sys.locked()); + + if (constraint_name) { + ulint db_len; + + /* Catenate 'databasename/' to the constraint name specified + by the user: we conceive the constraint as belonging to the + same MySQL 'database' as the table itself. We store the name + to foreign->id. 
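+
+	For example (an illustrative sketch only): with a table named
+	"test/child" and a user-supplied constraint name "fk1", the code
+	below produces the identifier "test/fk1":
+
+		db_len = 4;			// length of "test"
+		memcpy(id, "test/child", 4);	// id = "test"
+		id[4] = '/';			// id = "test/"
+		strcpy(id + 5, "fk1");		// id = "test/fk1"
+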
+	*/
+
+		db_len = dict_get_db_name_len(table->name.m_name);
+
+		foreign->id = static_cast<char*>(mem_heap_alloc(
+			foreign->heap, db_len + strlen(constraint_name) + 2));
+
+		memcpy(foreign->id, table->name.m_name, db_len);
+		foreign->id[db_len] = '/';
+		strcpy(foreign->id + db_len + 1, constraint_name);
+
+		/* Check if any existing foreign key has the same id;
+		this is needed only if the user supplies the constraint name */
+
+		if (table->foreign_set.find(foreign)
+		    != table->foreign_set.end()) {
+			return(false);
+		}
+	}
+
+	foreign->foreign_table = table;
+	foreign->foreign_table_name = mem_heap_strdup(
+		foreign->heap, table->name.m_name);
+	dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+
+	foreign->foreign_index = index;
+	foreign->n_fields = static_cast<unsigned>(num_field)
+		& dict_index_t::MAX_N_FIELDS;
+
+	foreign->foreign_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap, num_field * sizeof(void*)));
+
+	for (ulint i = 0; i < foreign->n_fields; i++) {
+		foreign->foreign_col_names[i] = mem_heap_strdup(
+			foreign->heap, column_names[i]);
+	}
+
+	foreign->referenced_index = referenced_index;
+	foreign->referenced_table = referenced_table;
+
+	foreign->referenced_table_name = mem_heap_strdup(
+		foreign->heap, referenced_table_name);
+	dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+	foreign->referenced_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap,
+			       referenced_num_field * sizeof(void*)));
+
+	for (ulint i = 0; i < foreign->n_fields; i++) {
+		foreign->referenced_col_names[i]
+			= mem_heap_strdup(foreign->heap,
+					  referenced_column_names[i]);
+	}
+
+	return(true);
+}
+
+/*************************************************************//**
+Check whether the foreign key options are legitimate
+@return true if they are */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_check_fk_option(
+/*=====================*/
+	const dict_foreign_t*	foreign)	/*!< in: foreign key */
+{
+	if (!foreign->foreign_index) {
+		return(true);
+	}
+
+	if (foreign->type & (DICT_FOREIGN_ON_UPDATE_SET_NULL
+			     | DICT_FOREIGN_ON_DELETE_SET_NULL)) {
+
+		for (ulint j = 0; j < foreign->n_fields; j++) {
+			if ((dict_index_get_nth_col(
+				     foreign->foreign_index, j)->prtype)
+			    & DATA_NOT_NULL) {
+
+				/* It is not sensible to define
+				SET NULL if the column is not
+				allowed to be NULL!
+				*/
+				return(false);
+			}
+		}
+	}
+
+	return(true);
+}
+
+/*************************************************************//**
+Set foreign key options
+@return true if successfully set */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_set_foreign_key_option(
+/*============================*/
+	dict_foreign_t*	foreign,	/*!< in: InnoDB foreign key */
+	Foreign_key*	fk_key)		/*!< in: Foreign key info from
+					MySQL */
+{
+	ut_ad(!foreign->type);
+
+	switch (fk_key->delete_opt) {
+	case FK_OPTION_NO_ACTION:
+	case FK_OPTION_RESTRICT:
+	case FK_OPTION_SET_DEFAULT:
+		foreign->type = DICT_FOREIGN_ON_DELETE_NO_ACTION;
+		break;
+	case FK_OPTION_CASCADE:
+		foreign->type = DICT_FOREIGN_ON_DELETE_CASCADE;
+		break;
+	case FK_OPTION_SET_NULL:
+		foreign->type = DICT_FOREIGN_ON_DELETE_SET_NULL;
+		break;
+	case FK_OPTION_UNDEF:
+		break;
+	}
+
+	switch (fk_key->update_opt) {
+	case FK_OPTION_NO_ACTION:
+	case FK_OPTION_RESTRICT:
+	case FK_OPTION_SET_DEFAULT:
+		foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION;
+		break;
+	case FK_OPTION_CASCADE:
+		foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE;
+		break;
+	case FK_OPTION_SET_NULL:
+		foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL;
+		break;
+	case FK_OPTION_UNDEF:
+		break;
+	}
+
+	return(innobase_check_fk_option(foreign));
+}
+
+/*******************************************************************//**
+Check if a foreign key constraint can make use of an index
+that is being created.
+@param[in]	col_names	column names
+@param[in]	n_cols		number of columns
+@param[in]	keys		index information
+@param[in]	add		indexes being created
+@return usable index, or NULL if none found */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const KEY*
+innobase_find_equiv_index(
+	const char*const*	col_names,
+	uint			n_cols,
+	const KEY*		keys,
+	span<uint>		add)
+{
+	for (span<uint>::iterator it = add.begin(), end = add.end();
+	     it != end; ++it) {
+		const KEY*	key = &keys[*it];
+
+		if (key->user_defined_key_parts < n_cols
+		    || key->flags & HA_SPATIAL) {
+no_match:
+			continue;
+		}
+
+		for (uint j = 0; j < n_cols; j++) {
+			const KEY_PART_INFO&	key_part = key->key_part[j];
+			uint32			col_len
+				= key_part.field->pack_length();
+
+			/* Any index on virtual columns cannot be used
+			for a reference constraint */
+			if (!key_part.field->stored_in_db()) {
+				goto no_match;
+			}
+
+			/* The MySQL pack length contains 1 or 2 bytes
+			length field for a true VARCHAR. */
+
+			if (key_part.field->type() == MYSQL_TYPE_VARCHAR) {
+				col_len -= static_cast<const Field_varstring*>(
+					key_part.field)->length_bytes;
+			}
+
+			if (key_part.length < col_len) {
+
+				/* Column prefix indexes cannot be
+				used for FOREIGN KEY constraints.
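+
+				A worked example (for illustration only):
+				for a VARCHAR(100) column in a single-byte
+				character set, pack_length() is 101 and
+				length_bytes is 1, so the comparable data
+				length is 100 bytes; a prefix index such
+				as KEY(col(10)) has key_part.length == 10:
+
+					uint32 col_len = 101; // pack_length()
+					col_len -= 1;	// VARCHAR length header
+					// 10 < 100: a prefix index, so this
+					// index cannot back the constraint.
+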
+				*/
+				goto no_match;
+			}
+
+			if (innobase_strcasecmp(col_names[j],
+						key_part.field->field_name.str)) {
+				/* Name mismatch */
+				goto no_match;
+			}
+		}
+
+		return(key);
+	}
+
+	return(NULL);
+}
+
+/*************************************************************//**
+Find an index whose first fields are the columns in the array
+in the same order and is not marked for deletion
+@return matching index, NULL if not found */
+static MY_ATTRIBUTE((nonnull(1,4), warn_unused_result))
+dict_index_t*
+innobase_find_fk_index(
+/*===================*/
+	dict_table_t*		table,	/*!< in: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	span<dict_index_t*>	drop_index,
+					/*!< in: indexes to be dropped */
+	const char**		columns,/*!< in: array of column names */
+	ulint			n_cols)	/*!< in: number of columns */
+{
+	dict_index_t*	index;
+
+	index = dict_table_get_first_index(table);
+
+	while (index != NULL) {
+		if (dict_foreign_qualify_index(table, col_names, columns,
+					       n_cols, index, NULL, true, 0,
+					       NULL, NULL, NULL)
+		    && std::find(drop_index.begin(), drop_index.end(), index)
+		    == drop_index.end()) {
+			return index;
+		}
+
+		index = dict_table_get_next_index(index);
+	}
+
+	return(NULL);
+}
+
+/** Check whether a given column is a base of a stored column.
+@param[in]	col_name	column name
+@param[in]	table		table
+@param[in]	s_cols		list of stored columns
+@return true if the given column is a base of a stored column, else false. */
+static
+bool
+innobase_col_check_fk(
+	const char*		col_name,
+	const dict_table_t*	table,
+	dict_s_col_list*	s_cols)
+{
+	dict_s_col_list::const_iterator	it;
+
+	for (it = s_cols->begin(); it != s_cols->end(); ++it) {
+		for (ulint j = it->num_base; j--; ) {
+			if (!strcmp(col_name, dict_table_get_col_name(
+					    table, it->base_col[j]->ind))) {
+				return(true);
+			}
+		}
+	}
+
+	return(false);
+}
+
+/** Check whether the foreign key constraint is on the base of any stored
+columns.
+@param[in]	foreign	Foreign key constraint information
+@param[in]	table	table to which the foreign key objects
+to be added
+@param[in]	s_cols	list of stored column information in the table.
+@return true if yes, otherwise false.
+*/
+static
+bool
+innobase_check_fk_stored(
+	const dict_foreign_t*	foreign,
+	const dict_table_t*	table,
+	dict_s_col_list*	s_cols)
+{
+	ulint	type = foreign->type;
+
+	type &= ~(DICT_FOREIGN_ON_DELETE_NO_ACTION
+		  | DICT_FOREIGN_ON_UPDATE_NO_ACTION);
+
+	if (type == 0 || s_cols == NULL) {
+		return(false);
+	}
+
+	for (ulint i = 0; i < foreign->n_fields; i++) {
+		if (innobase_col_check_fk(
+			    foreign->foreign_col_names[i], table, s_cols)) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
+
+/** Create InnoDB foreign key structure from MySQL alter_info
+@param[in]	ha_alter_info	alter table info
+@param[in]	table_share	TABLE_SHARE
+@param[in]	table		table object
+@param[in]	col_names	column names, or NULL to use
+table->col_names
+@param[in]	drop_index	indexes to be dropped
+@param[in]	n_drop_index	size of drop_index
+@param[out]	add_fk		foreign constraint added
+@param[out]	n_add_fk	number of foreign constraints
+added
+@param[in]	trx		user transaction
+@param[in]	s_cols		list of stored column information
+@retval true if successful
+@retval false on error (will call my_error()) */
+static MY_ATTRIBUTE((nonnull(1,2,3,7,8), warn_unused_result))
+bool
+innobase_get_foreign_key_info(
+	Alter_inplace_info*
+			ha_alter_info,
+	const TABLE_SHARE*
+			table_share,
+	dict_table_t*	table,
+	const char**	col_names,
+	dict_index_t**	drop_index,
+	ulint		n_drop_index,
+	dict_foreign_t**add_fk,
+	ulint*		n_add_fk,
+	const trx_t*	trx,
+	dict_s_col_list*s_cols)
+{
+	dict_table_t*	referenced_table = NULL;
+	char*		referenced_table_name = NULL;
+	ulint		num_fk = 0;
+	Alter_info*	alter_info = ha_alter_info->alter_info;
+	const CHARSET_INFO*	cs = thd_charset(trx->mysql_thd);
+
+	DBUG_ENTER("innobase_get_foreign_key_info");
+
+	*n_add_fk = 0;
+
+	for (Key& key : alter_info->key_list) {
+		if (key.type != Key::FOREIGN_KEY || key.old) {
+			continue;
+		}
+
+		const char*	column_names[MAX_NUM_FK_COLUMNS];
+		dict_index_t*	index = NULL;
+		const char*	referenced_column_names[MAX_NUM_FK_COLUMNS];
+		dict_index_t*	referenced_index = NULL;
+		ulint		num_col = 0;
+		ulint		referenced_num_col = 0;
+		bool		correct_option;
+
+		Foreign_key* fk_key = static_cast<Foreign_key*>(&key);
+
+		if (fk_key->columns.elements > 0) {
+			ulint	i = 0;
+
+			/* Get all the foreign key column info for the
+			current table */
+			for (const Key_part_spec& column : fk_key->columns) {
+				column_names[i] = column.field_name.str;
+				ut_ad(i < MAX_NUM_FK_COLUMNS);
+				i++;
+			}
+
+			index = innobase_find_fk_index(
+				table, col_names,
+				span<dict_index_t*>(drop_index, n_drop_index),
+				column_names, i);
+
+			/* MySQL would add an index in the creation
+			list if there is no such index for the foreign
+			table, so we have to use DBUG_EXECUTE_IF to
+			simulate the scenario */
+			DBUG_EXECUTE_IF("innodb_test_no_foreign_idx",
+					index = NULL;);
+
+			/* Check whether such an index exists in the
+			index create clause */
+			if (!index && !innobase_find_equiv_index(
+				    column_names, static_cast<uint>(i),
+				    ha_alter_info->key_info_buffer,
+				    span<uint>(ha_alter_info->index_add_buffer,
+					       ha_alter_info->index_add_count))) {
+				my_error(
+					ER_FK_NO_INDEX_CHILD,
+					MYF(0),
+					fk_key->name.str
+					? fk_key->name.str : "",
+					table_share->table_name.str);
+				goto err_exit;
+			}
+
+			num_col = i;
+		}
+
+		add_fk[num_fk] = dict_mem_foreign_create();
+
+		dict_sys.lock(SRW_LOCK_CALL);
+
+		referenced_table_name = dict_get_referenced_table(
+			table->name.m_name,
+			LEX_STRING_WITH_LEN(fk_key->ref_db),
+			LEX_STRING_WITH_LEN(fk_key->ref_table),
+			&referenced_table,
+			add_fk[num_fk]->heap, cs);
+
+		/* Test the case when referenced_table failed to
+		open; if trx->check_foreigns is not set, we should
+		still be able to add the foreign key */
+		DBUG_EXECUTE_IF("innodb_test_open_ref_fail",
+				referenced_table = NULL;);
+
+		if (!referenced_table && trx->check_foreigns) {
+			my_error(ER_FK_CANNOT_OPEN_PARENT,
+				 MYF(0), fk_key->ref_table.str);
+			goto err_exit_unlock;
+		}
+
+		if (fk_key->ref_columns.elements > 0) {
+			ulint	i = 0;
+
+			for (Key_part_spec &column : fk_key->ref_columns) {
+				referenced_column_names[i] =
+					column.field_name.str;
+				ut_ad(i < MAX_NUM_FK_COLUMNS);
+				i++;
+			}
+
+			if (referenced_table) {
+				referenced_index =
+					dict_foreign_find_index(
+						referenced_table, 0,
+						referenced_column_names,
+						i, index,
+						TRUE, FALSE,
+						NULL, NULL, NULL);
+
+				DBUG_EXECUTE_IF(
+					"innodb_test_no_reference_idx",
+					referenced_index = NULL;);
+
+				/* Check whether such an index exists
+				in the index create clause */
+				if (!referenced_index) {
+					my_error(ER_FK_NO_INDEX_PARENT, MYF(0),
+						 fk_key->name.str
+						 ? fk_key->name.str : "",
+						 fk_key->ref_table.str);
+					goto err_exit_unlock;
+				}
+			} else {
+				ut_a(!trx->check_foreigns);
+			}
+
+			referenced_num_col = i;
+		} else {
+			/* Not possible to add a foreign key without a
+			referenced column */
+			my_error(ER_CANNOT_ADD_FOREIGN, MYF(0),
+				 fk_key->ref_table.str);
+			goto err_exit_unlock;
+		}
+
+		if (!innobase_init_foreign(
+			    add_fk[num_fk], fk_key->name.str,
+			    table, index, column_names,
+			    num_col, referenced_table_name,
+			    referenced_table, referenced_index,
+			    referenced_column_names, referenced_num_col)) {
+			my_error(
+				ER_DUP_CONSTRAINT_NAME,
+				MYF(0),
+				"FOREIGN KEY", add_fk[num_fk]->id);
+			goto err_exit_unlock;
+		}
+
+		dict_sys.unlock();
+
+		correct_option = innobase_set_foreign_key_option(
+			add_fk[num_fk], fk_key);
+
+		DBUG_EXECUTE_IF("innodb_test_wrong_fk_option",
+				correct_option = false;);
+
+		if (!correct_option) {
+			my_error(ER_FK_INCORRECT_OPTION,
+				 MYF(0),
+				 table_share->table_name.str,
+				 add_fk[num_fk]->id);
+			goto err_exit;
+		}
+
+		if (innobase_check_fk_stored(
+			    add_fk[num_fk], table, s_cols)) {
+			my_printf_error(
+				HA_ERR_UNSUPPORTED,
+				"Cannot add foreign key on the base column "
+				"of stored column", MYF(0));
+			goto err_exit;
+		}
+
+		num_fk++;
+	}
+
+	*n_add_fk = num_fk;
+
+	DBUG_RETURN(true);
+err_exit_unlock:
+	dict_sys.unlock();
+err_exit:
+	for (ulint i = 0; i <= num_fk; i++) {
+		if (add_fk[i]) {
+			dict_foreign_free(add_fk[i]);
+		}
+	}
+
+	DBUG_RETURN(false);
+}
+
+/*************************************************************//**
+Copies an InnoDB column to a MySQL field. This function is
+adapted from row_sel_field_store_in_mysql_format().
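+
+For illustration only, the DATA_INT branch below behaves like this
+standalone sketch: InnoDB stores integers big-endian with the sign bit
+flipped (so that signed values compare correctly as unsigned byte
+strings), and the copy reverses both transformations:
+
+	static void sketch_int_to_mysql(uchar* dest, const uchar* data,
+					ulint len, bool is_unsigned)
+	{
+		for (uchar* ptr = dest + len; ptr != dest; ) {
+			*--ptr = *data++;	// reverse the byte order
+		}
+		if (!is_unsigned) {
+			dest[len - 1] ^= 0x80;	// restore the sign bit
+		}
+	}
+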
*/ +static +void +innobase_col_to_mysql( +/*==================*/ + const dict_col_t* col, /*!< in: InnoDB column */ + const uchar* data, /*!< in: InnoDB column data */ + ulint len, /*!< in: length of data, in bytes */ + Field* field) /*!< in/out: MySQL field */ +{ + uchar* ptr; + uchar* dest = field->ptr; + ulint flen = field->pack_length(); + + switch (col->mtype) { + case DATA_INT: + ut_ad(len == flen); + + /* Convert integer data from Innobase to little-endian + format, sign bit restored to normal */ + + for (ptr = dest + len; ptr != dest; ) { + *--ptr = *data++; + } + + if (!(col->prtype & DATA_UNSIGNED)) { + ((byte*) dest)[len - 1] ^= 0x80; + } + + break; + + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_BINARY: + field->reset(); + + if (field->type() == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR. Store the + length of the data to the first byte or the first + two bytes of dest. */ + + dest = row_mysql_store_true_var_len( + dest, len, flen - field->key_length()); + } + + /* Copy the actual data */ + memcpy(dest, data, len); + break; + + case DATA_GEOMETRY: + case DATA_BLOB: + /* Skip MySQL BLOBs when reporting an erroneous row + during index creation or table rebuild. */ + field->set_null(); + break; + +#ifdef UNIV_DEBUG + case DATA_MYSQL: + ut_ad(flen >= len); + ut_ad(col->mbmaxlen >= col->mbminlen); + memcpy(dest, data, len); + break; + + default: + case DATA_SYS_CHILD: + case DATA_SYS: + /* These column types should never be shipped to MySQL. */ + ut_ad(0); + /* fall through */ + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + /* Above are the valid column types for MySQL data. */ + ut_ad(flen == len); + /* fall through */ + case DATA_FIXBINARY: + case DATA_CHAR: + /* We may have flen > len when there is a shorter + prefix on the CHAR and BINARY column. */ + ut_ad(flen >= len); +#else /* UNIV_DEBUG */ + default: +#endif /* UNIV_DEBUG */ + memcpy(dest, data, len); + } +} + +/*************************************************************//** +Copies an InnoDB record to table->record[0]. */ +void +innobase_rec_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets)/*!< in: rec_get_offsets( + rec, index, ...) */ +{ + uint n_fields = table->s->fields; + + ut_ad(n_fields == dict_table_get_n_user_cols(index->table) + - !!(DICT_TF2_FLAG_IS_SET(index->table, + DICT_TF2_FTS_HAS_DOC_ID))); + + for (uint i = 0; i < n_fields; i++) { + Field* field = table->field[i]; + ulint ipos; + ulint ilen; + const uchar* ifield; + ulint prefix_col; + + field->reset(); + + ipos = dict_index_get_nth_col_or_prefix_pos( + index, i, true, false, &prefix_col); + + if (ipos == ULINT_UNDEFINED + || rec_offs_nth_extern(offsets, ipos)) { +null_field: + field->set_null(); + continue; + } + + ifield = rec_get_nth_cfield(rec, index, offsets, ipos, &ilen); + + /* Assign the NULL flag */ + if (ilen == UNIV_SQL_NULL) { + ut_ad(field->real_maybe_null()); + goto null_field; + } + + field->set_notnull(); + + innobase_col_to_mysql( + dict_field_get_col( + dict_index_get_nth_field(index, ipos)), + ifield, ilen, field); + } +} + +/*************************************************************//** +Copies an InnoDB index entry to table->record[0]. 
+This is used in preparation for print_keydup_error() from
+inline add index */
+void
+innobase_fields_to_mysql(
+/*=====================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_index_t*	index,	/*!< in: InnoDB index */
+	const dfield_t*		fields)	/*!< in: InnoDB index fields */
+{
+	uint	n_fields	= table->s->fields;
+	ulint	num_v		= 0;
+
+	ut_ad(n_fields == dict_table_get_n_user_cols(index->table)
+	      + dict_table_get_n_v_cols(index->table)
+	      - !!(DICT_TF2_FLAG_IS_SET(index->table,
+					DICT_TF2_FTS_HAS_DOC_ID)));
+
+	for (uint i = 0; i < n_fields; i++) {
+		Field*		field	= table->field[i];
+		ulint		ipos;
+		ulint		prefix_col;
+
+		field->reset();
+
+		const bool is_v = !field->stored_in_db();
+		const ulint col_n = is_v ? num_v++ : i - num_v;
+
+		ipos = dict_index_get_nth_col_or_prefix_pos(
+			index, col_n, true, is_v, &prefix_col);
+
+		if (ipos == ULINT_UNDEFINED
+		    || dfield_is_ext(&fields[ipos])
+		    || dfield_is_null(&fields[ipos])) {
+
+			field->set_null();
+		} else {
+			field->set_notnull();
+
+			const dfield_t*	df	= &fields[ipos];
+
+			innobase_col_to_mysql(
+				dict_field_get_col(
+					dict_index_get_nth_field(index, ipos)),
+				static_cast<const uchar*>(dfield_get_data(df)),
+				dfield_get_len(df), field);
+		}
+	}
+}
+
+/*************************************************************//**
+Copies an InnoDB row to table->record[0].
+This is used in preparation for print_keydup_error() from
+row_log_table_apply() */
+void
+innobase_row_to_mysql(
+/*==================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_table_t*	itab,	/*!< in: InnoDB table */
+	const dtuple_t*		row)	/*!< in: InnoDB row */
+{
+	uint	n_fields = table->s->fields;
+	ulint	num_v = 0;
+
+	/* The InnoDB row may contain an extra FTS_DOC_ID column at the end. */
+	ut_ad(row->n_fields == dict_table_get_n_cols(itab));
+	ut_ad(n_fields == row->n_fields - DATA_N_SYS_COLS
+	      + dict_table_get_n_v_cols(itab)
+	      - !!(DICT_TF2_FLAG_IS_SET(itab, DICT_TF2_FTS_HAS_DOC_ID)));
+
+	for (uint i = 0; i < n_fields; i++) {
+		Field*	field = table->field[i];
+
+		field->reset();
+
+		if (!field->stored_in_db()) {
+			/* Virtual columns are not stored in the InnoDB
+			table, so skip them */
+			num_v++;
+			continue;
+		}
+
+		const dfield_t*	df	= dtuple_get_nth_field(row, i - num_v);
+
+		if (dfield_is_ext(df) || dfield_is_null(df)) {
+			field->set_null();
+		} else {
+			field->set_notnull();
+
+			innobase_col_to_mysql(
+				dict_table_get_nth_col(itab, i - num_v),
+				static_cast<const uchar*>(dfield_get_data(df)),
+				dfield_get_len(df), field);
+		}
+	}
+	if (table->vfield) {
+		MY_BITMAP*	old_read_set = tmp_use_all_columns(table, &table->read_set);
+		table->update_virtual_fields(table->file, VCOL_UPDATE_FOR_READ);
+		tmp_restore_column_map(&table->read_set, old_read_set);
+	}
+}
+
+/*******************************************************************//**
+This function checks that index keys are sensible.
+@return 0 or error number */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+int
+innobase_check_index_keys(
+/*======================*/
+	const Alter_inplace_info*	info,
+				/*!< in: indexes to be created or dropped */
+	const dict_table_t*		innodb_table)
+				/*!< in: Existing indexes */
+{
+	for (uint key_num = 0; key_num < info->index_add_count;
+	     key_num++) {
+		const KEY&	key = info->key_info_buffer[
+			info->index_add_buffer[key_num]];
+
+		/* Check that the same index name does not appear
+		twice in indexes to be created.
*/ + + for (ulint i = 0; i < key_num; i++) { + const KEY& key2 = info->key_info_buffer[ + info->index_add_buffer[i]]; + + if (0 == strcmp(key.name.str, key2.name.str)) { + my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), + key.name.str); + + return(ER_WRONG_NAME_FOR_INDEX); + } + } + + /* Check that the same index name does not already exist. */ + + const dict_index_t* index; + + for (index = dict_table_get_first_index(innodb_table); + index; index = dict_table_get_next_index(index)) { + + if (index->is_committed() + && !strcmp(key.name.str, index->name)) { + break; + } + } + + /* Now we are in a situation where we have "ADD INDEX x" + and an index by the same name already exists. We have 4 + possible cases: + 1. No further clauses for an index x are given. Should reject + the operation. + 2. "DROP INDEX x" is given. Should allow the operation. + 3. "RENAME INDEX x TO y" is given. Should allow the operation. + 4. "DROP INDEX x, RENAME INDEX x TO y" is given. Should allow + the operation, since no name clash occurs. In this particular + case MySQL cancels the operation without calling InnoDB + methods. */ + + if (index) { + /* If a key by the same name is being created and + dropped, the name clash is OK. */ + for (uint i = 0; i < info->index_drop_count; + i++) { + const KEY* drop_key + = info->index_drop_buffer[i]; + + if (0 == strcmp(key.name.str, + drop_key->name.str)) { + goto name_ok; + } + } + + for (const Alter_inplace_info::Rename_key_pair& pair : + info->rename_keys) { + if (0 == strcmp(key.name.str, + pair.old_key->name.str)) { + goto name_ok; + } + } + + my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), + key.name.str); + return(ER_WRONG_NAME_FOR_INDEX); + } + +name_ok: + for (ulint i = 0; i < key.user_defined_key_parts; i++) { + const KEY_PART_INFO& key_part1 + = key.key_part[i]; + const Field* field + = key_part1.field; + unsigned is_unsigned; + + switch (get_innobase_type_from_mysql_type( + &is_unsigned, field)) { + default: + break; + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + /* Check that MySQL does not try to + create a column prefix index field on + an inappropriate data type. */ + + if (field->type() == MYSQL_TYPE_VARCHAR) { + if (key_part1.length + >= field->pack_length() + - ((Field_varstring*) field) + ->length_bytes) { + break; + } + } else { + if (key_part1.length + >= field->pack_length()) { + break; + } + } + + my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB", + field->field_name.str); + return(ER_WRONG_KEY_COLUMN); + } + + /* Check that the same column does not appear + twice in the index. */ + + for (ulint j = 0; j < i; j++) { + const KEY_PART_INFO& key_part2 + = key.key_part[j]; + + if (key_part1.fieldnr != key_part2.fieldnr) { + continue; + } + + my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB", + field->field_name.str); + return(ER_WRONG_KEY_COLUMN); + } + } + } + + return(0); +} + +/** Create index field definition for key part +@param[in] new_clustered true if alter is generating a new clustered +index +@param[in] altered_table MySQL table that is being altered +@param[in] key_part MySQL key definition +@param[out] index_field index field definition for key_part */ +static MY_ATTRIBUTE((nonnull)) +void +innobase_create_index_field_def( + bool new_clustered, + const TABLE* altered_table, + const KEY_PART_INFO* key_part, + index_field_t* index_field) +{ + const Field* field; + unsigned is_unsigned; + unsigned num_v = 0; + + DBUG_ENTER("innobase_create_index_field_def"); + + field = new_clustered + ? 
altered_table->field[key_part->fieldnr] + : key_part->field; + + for (ulint i = 0; i < key_part->fieldnr; i++) { + if (!altered_table->field[i]->stored_in_db()) { + num_v++; + } + } + + auto col_type = get_innobase_type_from_mysql_type( + &is_unsigned, field); + + if ((index_field->is_v_col = !field->stored_in_db())) { + index_field->col_no = num_v; + } else { + index_field->col_no = key_part->fieldnr - num_v; + } + + index_field->descending= !!(key_part->key_part_flag & HA_REVERSE_SORT); + + if (DATA_LARGE_MTYPE(col_type) + || (key_part->length < field->pack_length() + && field->type() != MYSQL_TYPE_VARCHAR) + || (field->type() == MYSQL_TYPE_VARCHAR + && key_part->length < field->pack_length() + - ((Field_varstring*) field)->length_bytes)) { + + index_field->prefix_len = key_part->length; + } else { + index_field->prefix_len = 0; + } + + DBUG_VOID_RETURN; +} + +/** Create index definition for key +@param[in] altered_table MySQL table that is being altered +@param[in] keys key definitions +@param[in] key_number MySQL key number +@param[in] new_clustered true if generating a new clustered +index on the table +@param[in] key_clustered true if this is the new clustered index +@param[out] index index definition +@param[in] heap heap where memory is allocated */ +static MY_ATTRIBUTE((nonnull)) +void +innobase_create_index_def( + const TABLE* altered_table, + const KEY* keys, + ulint key_number, + bool new_clustered, + bool key_clustered, + index_def_t* index, + mem_heap_t* heap) +{ + const KEY* key = &keys[key_number]; + ulint i; + ulint n_fields = key->user_defined_key_parts; + + DBUG_ENTER("innobase_create_index_def"); + DBUG_ASSERT(!key_clustered || new_clustered); + + index->fields = static_cast( + mem_heap_alloc(heap, n_fields * sizeof *index->fields)); + + index->parser = NULL; + index->key_number = key_number; + index->n_fields = n_fields; + index->name = mem_heap_strdup(heap, key->name.str); + index->rebuild = new_clustered; + + if (key_clustered) { + DBUG_ASSERT(!(key->flags & (HA_FULLTEXT | HA_SPATIAL))); + DBUG_ASSERT(key->flags & HA_NOSAME); + index->ind_type = DICT_CLUSTERED | DICT_UNIQUE; + } else if (key->flags & HA_FULLTEXT) { + DBUG_ASSERT(!(key->flags & (HA_SPATIAL | HA_NOSAME))); + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_BINARY_PACK_KEY))); + index->ind_type = DICT_FTS; + + /* Note: key->parser is only parser name, + we need to get parser from altered_table instead */ + + if (key->flags & HA_USES_PARSER) { + for (ulint j = 0; j < altered_table->s->keys; j++) { + if (!strcmp(altered_table->key_info[j].name.str, + key->name.str)) { + ut_ad(altered_table->key_info[j].flags + & HA_USES_PARSER); + + plugin_ref parser = + altered_table->key_info[j].parser; + index->parser = + static_cast( + plugin_decl(parser)->info); + + break; + } + } + + DBUG_EXECUTE_IF("fts_instrument_use_default_parser", + index->parser = &fts_default_parser;); + ut_ad(index->parser); + } + } else if (key->flags & HA_SPATIAL) { + DBUG_ASSERT(!(key->flags & HA_NOSAME)); + index->ind_type = DICT_SPATIAL; + ut_ad(n_fields == 1); + ulint num_v = 0; + + /* Need to count the virtual fields before this spatial + indexed field */ + for (ulint i = 0; i < key->key_part->fieldnr; i++) { + num_v += !altered_table->field[i]->stored_in_db(); + } + index->fields[0].col_no = key->key_part[0].fieldnr - num_v; + index->fields[0].prefix_len = 0; + index->fields[0].is_v_col = false; + index->fields[0].descending = false; + + /* Currently, the spatial index cannot be created + on virtual 
columns. It is blocked in the SQL layer. */ + DBUG_ASSERT(key->key_part[0].field->stored_in_db()); + } else { + index->ind_type = (key->flags & HA_NOSAME) ? DICT_UNIQUE : 0; + } + + if (!(key->flags & HA_SPATIAL)) { + for (i = 0; i < n_fields; i++) { + innobase_create_index_field_def( + new_clustered, altered_table, + &key->key_part[i], &index->fields[i]); + + if (index->fields[i].is_v_col) { + index->ind_type |= DICT_VIRTUAL; + } + } + } + + DBUG_VOID_RETURN; +} + +/*******************************************************************//** +Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME +on the Doc ID column. +@return the status of the FTS_DOC_ID index */ +enum fts_doc_id_index_enum +innobase_fts_check_doc_id_index( +/*============================*/ + const dict_table_t* table, /*!< in: table definition */ + const TABLE* altered_table, /*!< in: MySQL table + that is being altered */ + ulint* fts_doc_col_no) /*!< out: The column number for + Doc ID, or ULINT_UNDEFINED + if it is being created in + ha_alter_info */ +{ + const dict_index_t* index; + const dict_field_t* field; + + if (altered_table) { + /* Check if a unique index with the name of + FTS_DOC_ID_INDEX_NAME is being created. */ + + const ulint fts_n_uniq= altered_table->versioned() ? 2 : 1; + + for (uint i = 0; i < altered_table->s->keys; i++) { + const KEY& key = altered_table->key_info[i]; + + if (innobase_strcasecmp( + key.name.str, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + if ((key.flags & HA_NOSAME) + && key.user_defined_key_parts == fts_n_uniq + && !(key.key_part[0].key_part_flag + & HA_REVERSE_SORT) + && !strcmp(key.name.str, FTS_DOC_ID_INDEX_NAME) + && !strcmp(key.key_part[0].field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + if (fts_doc_col_no) { + *fts_doc_col_no = ULINT_UNDEFINED; + } + return(FTS_EXIST_DOC_ID_INDEX); + } else { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + } + } + + if (!table) { + return(FTS_NOT_EXIST_DOC_ID_INDEX); + } + + for (index = dict_table_get_first_index(table); + index; index = dict_table_get_next_index(index)) { + + + /* Check if there exists a unique index with the name of + FTS_DOC_ID_INDEX_NAME and ignore the corrupted index */ + if (index->type & DICT_CORRUPT + || innobase_strcasecmp(index->name, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + if (!dict_index_is_unique(index) + || dict_index_get_n_unique(index) != table->fts_n_uniq() + || strcmp(index->name, FTS_DOC_ID_INDEX_NAME)) { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + + /* Check whether the index has FTS_DOC_ID as its + first column */ + field = dict_index_get_nth_field(index, 0); + + /* The column would be of a BIGINT data type */ + if (strcmp(field->name, FTS_DOC_ID_COL_NAME) == 0 + && !field->descending + && field->col->mtype == DATA_INT + && field->col->len == 8 + && field->col->prtype & DATA_NOT_NULL + && !field->col->is_virtual()) { + if (fts_doc_col_no) { + *fts_doc_col_no = dict_col_get_no(field->col); + } + return(FTS_EXIST_DOC_ID_INDEX); + } else { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + } + + + /* Not found */ + return(FTS_NOT_EXIST_DOC_ID_INDEX); +} +/*******************************************************************//** +Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME +on the Doc ID column in MySQL create index definition. 
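+The accepted shape is, illustratively (for system-versioned tables the
+row_end column is required as a second key part, and the first key part
+must be ascending):
+
+	CREATE UNIQUE INDEX FTS_DOC_ID_INDEX ON t(FTS_DOC_ID);
+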
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index, +FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */ +enum fts_doc_id_index_enum +innobase_fts_check_doc_id_index_in_def( +/*===================================*/ + ulint n_key, /*!< in: Number of keys */ + const KEY* key_info) /*!< in: Key definition */ +{ + /* Check whether there is a "FTS_DOC_ID_INDEX" in the to be built index + list */ + const uint fts_n_uniq= key_info->table->versioned() ? 2 : 1; + for (ulint j = 0; j < n_key; j++) { + const KEY* key = &key_info[j]; + + if (innobase_strcasecmp(key->name.str, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + /* Do a check on FTS DOC ID_INDEX, it must be unique, + named as "FTS_DOC_ID_INDEX" and on column "FTS_DOC_ID" */ + if (!(key->flags & HA_NOSAME) + || key->user_defined_key_parts != fts_n_uniq + || (key->key_part[0].key_part_flag & HA_REVERSE_SORT) + || strcmp(key->name.str, FTS_DOC_ID_INDEX_NAME) + || strcmp(key->key_part[0].field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + + return(FTS_EXIST_DOC_ID_INDEX); + } + + return(FTS_NOT_EXIST_DOC_ID_INDEX); +} + +/** Create an index table where indexes are ordered as follows: + +IF a new primary key is defined for the table THEN + + 1) New primary key + 2) The remaining keys in key_info + +ELSE + + 1) All new indexes in the order they arrive from MySQL + +ENDIF + +@return key definitions */ +MY_ATTRIBUTE((nonnull, warn_unused_result, malloc)) +inline index_def_t* +ha_innobase_inplace_ctx::create_key_defs( + const Alter_inplace_info* ha_alter_info, + /*!< in: alter operation */ + const TABLE* altered_table, + /*!< in: MySQL table that is being altered */ + ulint& n_fts_add, + /*!< out: number of FTS indexes to be created */ + ulint& fts_doc_id_col, + /*!< in: The column number for Doc ID */ + bool& add_fts_doc_id, + /*!< in: whether we need to add new DOC ID + column for FTS index */ + bool& add_fts_doc_idx, + /*!< in: whether we need to add new DOC ID + index for FTS index */ + const TABLE* table) + /*!< in: MySQL table that is being altered */ +{ + ulint& n_add = num_to_add_index; + const bool got_default_clust = new_table->indexes.start->is_gen_clust(); + + index_def_t* indexdef; + index_def_t* indexdefs; + bool new_primary; + const uint*const add + = ha_alter_info->index_add_buffer; + const KEY*const key_info + = ha_alter_info->key_info_buffer; + + DBUG_ENTER("ha_innobase_inplace_ctx::create_key_defs"); + DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_idx); + DBUG_ASSERT(ha_alter_info->index_add_count == n_add); + + /* If there is a primary key, it is always the first index + defined for the innodb_table. */ + + new_primary = n_add > 0 + && !my_strcasecmp(system_charset_info, + key_info[*add].name.str, "PRIMARY"); + n_fts_add = 0; + + /* If there is a UNIQUE INDEX consisting entirely of NOT NULL + columns and if the index does not contain column prefix(es) + (only prefix/part of the column is indexed), MySQL will treat the + index as a PRIMARY KEY unless the table already has one. 
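+	For example (illustrative case only): if the table has no
+	PRIMARY KEY and column a is NOT NULL, then
+	ALTER TABLE t ADD UNIQUE INDEX(a) promotes the new index to
+	the clustered index, and altered_table->s->primary_key
+	becomes 0 instead of MAX_KEY.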
*/ + + ut_ad(altered_table->s->primary_key == 0 + || altered_table->s->primary_key == MAX_KEY); + + if (got_default_clust && !new_primary) { + new_primary = (altered_table->s->primary_key != MAX_KEY); + } + + const bool rebuild = new_primary || add_fts_doc_id + || innobase_need_rebuild(ha_alter_info, table); + + /* Reserve one more space if new_primary is true, and we might + need to add the FTS_DOC_ID_INDEX */ + indexdef = indexdefs = static_cast( + mem_heap_alloc( + heap, sizeof *indexdef + * (ha_alter_info->key_count + + rebuild + + got_default_clust))); + + if (rebuild) { + ulint primary_key_number; + + if (new_primary) { + DBUG_ASSERT(n_add || got_default_clust); + DBUG_ASSERT(n_add || !altered_table->s->primary_key); + primary_key_number = altered_table->s->primary_key; + } else if (got_default_clust) { + /* Create the GEN_CLUST_INDEX */ + index_def_t* index = indexdef++; + + index->fields = NULL; + index->n_fields = 0; + index->ind_type = DICT_CLUSTERED; + index->name = innobase_index_reserve_name; + index->rebuild = true; + index->key_number = ~0U; + primary_key_number = ULINT_UNDEFINED; + goto created_clustered; + } else { + primary_key_number = 0; + } + + /* Create the PRIMARY key index definition */ + innobase_create_index_def( + altered_table, key_info, primary_key_number, + true, true, indexdef++, heap); + +created_clustered: + n_add = 1; + + for (ulint i = 0; i < ha_alter_info->key_count; i++) { + if (i == primary_key_number) { + continue; + } + /* Copy the index definitions. */ + innobase_create_index_def( + altered_table, key_info, i, true, + false, indexdef, heap); + + if (indexdef->ind_type & DICT_FTS) { + n_fts_add++; + } + + indexdef++; + n_add++; + } + + if (n_fts_add > 0) { + ulint num_v = 0; + + if (!add_fts_doc_id + && !innobase_fts_check_doc_id_col( + NULL, altered_table, + &fts_doc_id_col, &num_v)) { + fts_doc_id_col = altered_table->s->fields - num_v; + add_fts_doc_id = true; + } + + if (!add_fts_doc_idx) { + fts_doc_id_index_enum ret; + ulint doc_col_no; + + ret = innobase_fts_check_doc_id_index( + NULL, altered_table, &doc_col_no); + + /* This should have been checked before */ + ut_ad(ret != FTS_INCORRECT_DOC_ID_INDEX); + + if (ret == FTS_NOT_EXIST_DOC_ID_INDEX) { + add_fts_doc_idx = true; + } else { + ut_ad(ret == FTS_EXIST_DOC_ID_INDEX); + ut_ad(doc_col_no == ULINT_UNDEFINED + || doc_col_no == fts_doc_id_col); + } + } + } + } else { + /* Create definitions for added secondary indexes. 
+		*/
+
+		for (ulint i = 0; i < n_add; i++) {
+			innobase_create_index_def(
+				altered_table, key_info, add[i],
+				false, false, indexdef, heap);
+
+			if (indexdef->ind_type & DICT_FTS) {
+				n_fts_add++;
+			}
+
+			indexdef++;
+		}
+	}
+
+	DBUG_ASSERT(indexdefs + n_add == indexdef);
+
+	if (add_fts_doc_idx) {
+		index_def_t*	index = indexdef++;
+		uint nfields = 1;
+
+		if (altered_table->versioned())
+			++nfields;
+		index->fields = static_cast<index_field_t*>(
+			mem_heap_alloc(heap, sizeof(*index->fields) * nfields));
+		index->n_fields = nfields;
+		index->fields[0].col_no = fts_doc_id_col;
+		index->fields[0].prefix_len = 0;
+		index->fields[0].descending = false;
+		index->fields[0].is_v_col = false;
+		if (nfields == 2) {
+			index->fields[1].col_no
+				= altered_table->s->vers.end_fieldno;
+			index->fields[1].prefix_len = 0;
+			index->fields[1].descending = false;
+			index->fields[1].is_v_col = false;
+		}
+		index->ind_type = DICT_UNIQUE;
+		ut_ad(!rebuild
+		      || !add_fts_doc_id
+		      || fts_doc_id_col <= altered_table->s->fields);
+
+		index->name = FTS_DOC_ID_INDEX_NAME;
+		index->rebuild = rebuild;
+
+		/* TODO: assign a real MySQL key number for this */
+		index->key_number = ULINT_UNDEFINED;
+		n_add++;
+	}
+
+	DBUG_ASSERT(indexdef > indexdefs);
+	DBUG_ASSERT((ulint) (indexdef - indexdefs)
+		    <= ha_alter_info->key_count
+		    + add_fts_doc_idx + got_default_clust);
+	DBUG_ASSERT(ha_alter_info->index_add_count <= n_add);
+	DBUG_RETURN(indexdefs);
+}
+
+MY_ATTRIBUTE((warn_unused_result))
+bool too_big_key_part_length(size_t max_field_len, const KEY& key)
+{
+	for (ulint i = 0; i < key.user_defined_key_parts; i++) {
+		if (key.key_part[i].length > max_field_len) {
+			return true;
+		}
+	}
+	return false;
+}
+
+/********************************************************************//**
+Drop any indexes that we were not able to free previously due to
+open table handles. */
+static
+void
+online_retry_drop_indexes_low(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	trx_t*		trx)	/*!< in/out: transaction */
+{
+	ut_ad(dict_sys.locked());
+	ut_ad(trx->dict_operation_lock_mode);
+	ut_ad(trx->dict_operation);
+
+	/* We can have table->n_ref_count > 1, because other threads
+	may have prebuilt->table pointing to the table. However, these
+	other threads should be between statements, waiting for the
+	next statement to execute, or for a meta-data lock. */
+	ut_ad(table->get_ref_count() >= 1);
+
+	if (table->drop_aborted) {
+		row_merge_drop_indexes(trx, table, true);
+	}
+}
+
+/** After commit, unlock the data dictionary and close any deleted files.
+@param deleted	handles of deleted files
+@param trx	committed transaction */
+static void unlock_and_close_files(const std::vector<pfs_os_file_t> &deleted,
+                                   trx_t *trx)
+{
+  row_mysql_unlock_data_dictionary(trx);
+  for (pfs_os_file_t d : deleted)
+    os_file_close(d);
+  log_write_up_to(trx->commit_lsn, true);
+}
+
+/** Commit a DDL transaction and unlink any deleted files. */
+static void commit_unlock_and_unlink(trx_t *trx)
+{
+  std::vector<pfs_os_file_t> deleted;
+  trx->commit(deleted);
+  unlock_and_close_files(deleted, trx);
+}
+
+/**
+Drop any indexes that we were not able to free previously due to
+open table handles.
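+(A previous online ADD INDEX that was rolled back while other handles
+kept the table open leaves dict_table_t::drop_aborted set; this function
+retries the deferred cleanup via row_merge_drop_indexes().)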
+@param table InnoDB table +@param thd connection handle +*/ +static void online_retry_drop_indexes(dict_table_t *table, THD *thd) +{ + if (table->drop_aborted) + { + trx_t *trx= innobase_trx_allocate(thd); + + trx_start_for_ddl(trx); + if (lock_sys_tables(trx) == DB_SUCCESS) + { + row_mysql_lock_data_dictionary(trx); + online_retry_drop_indexes_low(table, trx); + commit_unlock_and_unlink(trx); + } + else + trx->commit(); + trx->free(); + } + + ut_d(dict_sys.freeze(SRW_LOCK_CALL)); + ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE)); + ut_d(dict_sys.unfreeze()); + ut_ad(!table->drop_aborted); +} + +/** Determines if InnoDB is dropping a foreign key constraint. +@param foreign the constraint +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@return whether the constraint is being dropped */ +MY_ATTRIBUTE((pure, nonnull(1), warn_unused_result)) +inline +bool +innobase_dropping_foreign( + const dict_foreign_t* foreign, + dict_foreign_t** drop_fk, + ulint n_drop_fk) +{ + while (n_drop_fk--) { + if (*drop_fk++ == foreign) { + return(true); + } + } + + return(false); +} + +/** Determines if an InnoDB FOREIGN KEY constraint depends on a +column that is being dropped or modified to NOT NULL. +@param user_table InnoDB table as it is before the ALTER operation +@param col_name Name of the column being altered +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@param drop true=drop column, false=set NOT NULL +@retval true Not allowed (will call my_error()) +@retval false Allowed +*/ +MY_ATTRIBUTE((pure, nonnull(1,4), warn_unused_result)) +static +bool +innobase_check_foreigns_low( + const dict_table_t* user_table, + dict_foreign_t** drop_fk, + ulint n_drop_fk, + const char* col_name, + bool drop) +{ + dict_foreign_t* foreign; + ut_ad(dict_sys.locked()); + + /* Check if any FOREIGN KEY constraints are defined on this + column. */ + + for (dict_foreign_set::const_iterator it = user_table->foreign_set.begin(); + it != user_table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (!drop && !(foreign->type + & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + continue; + } + + if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) { + continue; + } + + for (unsigned f = 0; f < foreign->n_fields; f++) { + if (!strcmp(foreign->foreign_col_names[f], + col_name)) { + my_error(drop + ? ER_FK_COLUMN_CANNOT_DROP + : ER_FK_COLUMN_NOT_NULL, MYF(0), + col_name, foreign->id); + return(true); + } + } + } + + if (!drop) { + /* SET NULL clauses on foreign key constraints of + child tables affect the child tables, not the parent table. + The column can be NOT NULL in the parent table. */ + return(false); + } + + /* Check if any FOREIGN KEY constraints in other tables are + referring to the column that is being dropped. 
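+	For example (illustrative schema, not from this patch):
+
+		CREATE TABLE parent(pk INT PRIMARY KEY) ENGINE=InnoDB;
+		CREATE TABLE child(a INT, FOREIGN KEY(a)
+				   REFERENCES parent(pk)) ENGINE=InnoDB;
+
+	Here ALTER TABLE parent DROP COLUMN pk must fail with
+	ER_FK_COLUMN_CANNOT_DROP_CHILD, naming table child.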
*/ + for (dict_foreign_set::const_iterator it + = user_table->referenced_set.begin(); + it != user_table->referenced_set.end(); + ++it) { + + foreign = *it; + + if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) { + continue; + } + + for (unsigned f = 0; f < foreign->n_fields; f++) { + char display_name[FN_REFLEN]; + + if (strcmp(foreign->referenced_col_names[f], + col_name)) { + continue; + } + + char* buf_end = innobase_convert_name( + display_name, (sizeof display_name) - 1, + foreign->foreign_table_name, + strlen(foreign->foreign_table_name), + NULL); + *buf_end = '\0'; + my_error(ER_FK_COLUMN_CANNOT_DROP_CHILD, + MYF(0), col_name, foreign->id, + display_name); + + return(true); + } + } + + return(false); +} + +/** Determines if an InnoDB FOREIGN KEY constraint depends on a +column that is being dropped or modified to NOT NULL. +@param ha_alter_info Data used during in-place alter +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@param user_table InnoDB table as it is before the ALTER operation +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@retval true Not allowed (will call my_error()) +@retval false Allowed +*/ +MY_ATTRIBUTE((pure, nonnull(1,2,3), warn_unused_result)) +static +bool +innobase_check_foreigns( + Alter_inplace_info* ha_alter_info, + const TABLE* old_table, + const dict_table_t* user_table, + dict_foreign_t** drop_fk, + ulint n_drop_fk) +{ + for (Field** fp = old_table->field; *fp; fp++) { + ut_ad(!(*fp)->real_maybe_null() + == !!((*fp)->flags & NOT_NULL_FLAG)); + + auto end = ha_alter_info->alter_info->create_list.end(); + auto it = std::find_if( + ha_alter_info->alter_info->create_list.begin(), end, + [fp](const Create_field& field) { + return field.field == *fp; + }); + + if (it == end || (it->flags & NOT_NULL_FLAG)) { + if (innobase_check_foreigns_low( + user_table, drop_fk, n_drop_fk, + (*fp)->field_name.str, it == end)) { + return(true); + } + } + } + + return(false); +} + +/** Convert a default value for ADD COLUMN. +@param[in,out] heap Memory heap where allocated +@param[out] dfield InnoDB data field to copy to +@param[in] field MySQL value for the column +@param[in] old_field Old column if altering; NULL for ADD COLUMN +@param[in] comp nonzero if in compact format. */ +static void innobase_build_col_map_add( + mem_heap_t* heap, + dfield_t* dfield, + const Field* field, + const Field* old_field, + ulint comp) +{ + if (old_field && old_field->real_maybe_null() + && field->real_maybe_null()) { + return; + } + + if (field->is_real_null()) { + dfield_set_null(dfield); + return; + } + + const Field& from = old_field ? *old_field : *field; + ulint size = from.pack_length(); + + byte* buf = static_cast(mem_heap_alloc(heap, size)); + + row_mysql_store_col_in_innobase_format( + dfield, buf, true, from.ptr, size, comp); +} + +/** Construct the translation table for reordering, dropping or +adding columns. 
+
+@param ha_alter_info	Data used during in-place alter
+@param altered_table	MySQL table that is being altered
+@param table		MySQL table as it is before the ALTER operation
+@param new_table	InnoDB table corresponding to MySQL altered_table
+@param old_table	InnoDB table corresponding to MySQL table
+@param defaults		Default values for ADD COLUMN, or NULL if no ADD COLUMN
+@param heap		Memory heap where allocated
+@return array of integers, mapping column numbers in the table
+to column numbers in altered_table */
+static MY_ATTRIBUTE((nonnull(1,2,3,4,5,7), warn_unused_result))
+const ulint*
+innobase_build_col_map(
+/*===================*/
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const TABLE*		table,
+	dict_table_t*		new_table,
+	const dict_table_t*	old_table,
+	dtuple_t*		defaults,
+	mem_heap_t*		heap)
+{
+	DBUG_ENTER("innobase_build_col_map");
+	DBUG_ASSERT(altered_table != table);
+	DBUG_ASSERT(new_table != old_table);
+	DBUG_ASSERT(dict_table_get_n_cols(new_table)
+		    + dict_table_get_n_v_cols(new_table)
+		    >= altered_table->s->fields + DATA_N_SYS_COLS);
+	DBUG_ASSERT(dict_table_get_n_cols(old_table)
+		    + dict_table_get_n_v_cols(old_table)
+		    >= table->s->fields + DATA_N_SYS_COLS
+		    || ha_innobase::omits_virtual_cols(*table->s));
+	DBUG_ASSERT(!!defaults == !!(ha_alter_info->handler_flags
+				     & INNOBASE_DEFAULTS));
+	DBUG_ASSERT(!defaults || dtuple_get_n_fields(defaults)
+		    == dict_table_get_n_cols(new_table));
+
+	const uint old_n_v_cols = uint(table->s->fields
+				       - table->s->stored_fields);
+	DBUG_ASSERT(old_n_v_cols == old_table->n_v_cols
+		    || table->s->frm_version < FRM_VER_EXPRESSSIONS);
+	DBUG_ASSERT(!old_n_v_cols || table->s->virtual_fields);
+
+	ulint*	col_map = static_cast<ulint*>(
+		mem_heap_alloc(
+			heap, (size_t(old_table->n_cols) + old_n_v_cols)
+			* sizeof *col_map));
+
+	uint	i = 0;
+	uint	num_v = 0;
+
+	/* Any dropped columns will map to ULINT_UNDEFINED.
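+	For example (hypothetical mapping): if the old table stores
+	columns (a, b, c) and the ALTER drops b, the first entries
+	become col_map = {0, ULINT_UNDEFINED, 1}; only a and c have
+	counterparts in the rebuilt table.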
*/ + for (uint old_i = 0; old_i + DATA_N_SYS_COLS < old_table->n_cols; + old_i++) { + col_map[old_i] = ULINT_UNDEFINED; + } + + for (uint old_i = 0; old_i < old_n_v_cols; old_i++) { + col_map[old_i + old_table->n_cols] = ULINT_UNDEFINED; + } + + const bool omits_virtual = ha_innobase::omits_virtual_cols(*table->s); + + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + bool is_v = !new_field.stored_in_db(); + ulint num_old_v = 0; + + for (uint old_i = 0; table->field[old_i]; old_i++) { + const Field* field = table->field[old_i]; + if (!field->stored_in_db()) { + if (is_v && new_field.field == field) { + if (!omits_virtual) { + col_map[old_table->n_cols + + num_v] + = num_old_v; + } + num_old_v++; + goto found_col; + } + num_old_v++; + continue; + } + + if (new_field.field == field) { + + const Field* altered_field = + altered_table->field[i + num_v]; + + if (defaults) { + innobase_build_col_map_add( + heap, + dtuple_get_nth_field( + defaults, i), + altered_field, + field, + dict_table_is_comp( + new_table)); + } + + col_map[old_i - num_old_v] = i; + if (!old_table->versioned() + || !altered_table->versioned()) { + } else if (old_i == old_table->vers_start) { + new_table->vers_start = (i + num_v) + & dict_index_t::MAX_N_FIELDS; + } else if (old_i == old_table->vers_end) { + new_table->vers_end = (i + num_v) + & dict_index_t::MAX_N_FIELDS; + } + goto found_col; + } + } + + if (!is_v) { + innobase_build_col_map_add( + heap, dtuple_get_nth_field(defaults, i), + altered_table->field[i + num_v], + NULL, + dict_table_is_comp(new_table)); + } +found_col: + if (is_v) { + num_v++; + } else { + i++; + } + } + + DBUG_ASSERT(i == altered_table->s->fields - num_v); + + i = table->s->fields - old_n_v_cols; + + /* Add the InnoDB hidden FTS_DOC_ID column, if any. */ + if (i + DATA_N_SYS_COLS < old_table->n_cols) { + /* There should be exactly one extra field, + the FTS_DOC_ID. 
+		*/
+		DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(old_table,
+						 DICT_TF2_FTS_HAS_DOC_ID));
+		DBUG_ASSERT(i + DATA_N_SYS_COLS + 1 == old_table->n_cols);
+		DBUG_ASSERT(!strcmp(dict_table_get_col_name(
+					    old_table, i),
+				    FTS_DOC_ID_COL_NAME));
+		if (altered_table->s->fields + DATA_N_SYS_COLS
+		    - new_table->n_v_cols
+		    < new_table->n_cols) {
+			DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(
+					    new_table,
+					    DICT_TF2_FTS_HAS_DOC_ID));
+			DBUG_ASSERT(altered_table->s->fields
+				    + DATA_N_SYS_COLS + 1
+				    == static_cast<ulint>(
+					    new_table->n_cols
+					    + new_table->n_v_cols));
+			col_map[i] = altered_table->s->fields
+				     - new_table->n_v_cols;
+		} else {
+			DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+					    new_table,
+					    DICT_TF2_FTS_HAS_DOC_ID));
+			col_map[i] = ULINT_UNDEFINED;
+		}
+
+		i++;
+	} else {
+		DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+				    old_table,
+				    DICT_TF2_FTS_HAS_DOC_ID));
+	}
+
+	for (; i < old_table->n_cols; i++) {
+		col_map[i] = i + new_table->n_cols - old_table->n_cols;
+	}
+
+	DBUG_RETURN(col_map);
+}
+
+/** Get the new non-virtual column names if any columns were renamed
+@param ha_alter_info	Data used during in-place alter
+@param altered_table	MySQL table that is being altered
+@param table		MySQL table as it is before the ALTER operation
+@param user_table	InnoDB table as it is before the ALTER operation
+@param heap		Memory heap for the allocation
+@return array of new column names in rebuilt_table, or NULL if not renamed */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const char**
+innobase_get_col_names(
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const TABLE*		table,
+	const dict_table_t*	user_table,
+	mem_heap_t*		heap)
+{
+	const char**		cols;
+	uint			i;
+
+	DBUG_ENTER("innobase_get_col_names");
+	DBUG_ASSERT(user_table->n_t_def > table->s->fields);
+	DBUG_ASSERT(ha_alter_info->handler_flags
+		    & ALTER_COLUMN_NAME);
+
+	cols = static_cast<const char**>(
+		mem_heap_zalloc(heap, user_table->n_def * sizeof *cols));
+
+	i = 0;
+	for (const Create_field& new_field :
+	     ha_alter_info->alter_info->create_list) {
+		ulint	num_v = 0;
+		DBUG_ASSERT(i < altered_table->s->fields);
+
+		if (!new_field.stored_in_db()) {
+			continue;
+		}
+
+		for (uint old_i = 0; table->field[old_i]; old_i++) {
+			num_v += !table->field[old_i]->stored_in_db();
+
+			if (new_field.field == table->field[old_i]) {
+				cols[old_i - num_v] = new_field.field_name.str;
+				break;
+			}
+		}
+
+		i++;
+	}
+
+	/* Copy the internal column names. */
+	i = table->s->fields - user_table->n_v_def;
+	cols[i] = dict_table_get_col_name(user_table, i);
+
+	while (++i < user_table->n_def) {
+		cols[i] = cols[i - 1] + strlen(cols[i - 1]) + 1;
+	}
+
+	DBUG_RETURN(cols);
+}
+
+/** Check whether the column prefix is increased, decreased, or unchanged.
+@param[in]	new_prefix_len	new prefix length
+@param[in]	old_prefix_len	old prefix length
+@retval	1	prefix is increased
+@retval	0	prefix is unchanged
+@retval	-1	prefix is decreased */
+static inline
+lint
+innobase_pk_col_prefix_compare(
+	ulint	new_prefix_len,
+	ulint	old_prefix_len)
+{
+	ut_ad(new_prefix_len < COMPRESSED_REC_MAX_DATA_SIZE);
+	ut_ad(old_prefix_len < COMPRESSED_REC_MAX_DATA_SIZE);
+
+	if (new_prefix_len == old_prefix_len) {
+		return(0);
+	}
+
+	if (new_prefix_len == 0) {
+		new_prefix_len = ULINT_MAX;
+	}
+
+	if (old_prefix_len == 0) {
+		old_prefix_len = ULINT_MAX;
+	}
+
+	if (new_prefix_len > old_prefix_len) {
+		return(1);
+	} else {
+		return(-1);
+	}
+}
+
+/** Check whether a column exists in the old table.
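+(This is a plain linear scan of col_map; an equivalent formulation,
+shown only for clarity, would be
+	std::find(col_map, col_map + col_map_size, new_col_no)
+		!= col_map + col_map_size
+with <algorithm> included.)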
+@param[in] new_col_no new column no +@param[in] col_map mapping of old column numbers to new ones +@param[in] col_map_size the column map size +@return true if the column is existing, otherwise false. */ +static inline +bool +innobase_pk_col_is_existing( + const ulint new_col_no, + const ulint* col_map, + const ulint col_map_size) +{ + for (ulint i = 0; i < col_map_size; i++) { + if (col_map[i] == new_col_no) { + return(true); + } + } + + return(false); +} + +/** Determine whether both the indexes have same set of primary key +fields arranged in the same order. + +Rules when we cannot skip sorting: +(1) Removing existing PK columns somewhere else than at the end of the PK; +(2) Adding existing columns to the PK, except at the end of the PK when no +columns are removed from the PK; +(3) Changing the order of existing PK columns; +(4) Decreasing the prefix length just like removing existing PK columns +follows rule(1), Increasing the prefix length just like adding existing +PK columns follows rule(2); +(5) Changing the ASC/DESC attribute of the existing PK columns. +@param[in] col_map mapping of old column numbers to new ones +@param[in] ha_alter_info Data used during in-place alter +@param[in] old_clust_index index to be compared +@param[in] new_clust_index index to be compared +@retval true if both indexes have same order. +@retval false. */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +innobase_pk_order_preserved( + const ulint* col_map, + const dict_index_t* old_clust_index, + const dict_index_t* new_clust_index) +{ + ulint old_n_uniq + = dict_index_get_n_ordering_defined_by_user( + old_clust_index); + ulint new_n_uniq + = dict_index_get_n_ordering_defined_by_user( + new_clust_index); + + ut_ad(dict_index_is_clust(old_clust_index)); + ut_ad(dict_index_is_clust(new_clust_index)); + ut_ad(old_clust_index->table != new_clust_index->table); + ut_ad(col_map != NULL); + + if (old_n_uniq == 0) { + /* There was no PRIMARY KEY in the table. + If there is no PRIMARY KEY after the ALTER either, + no sorting is needed. */ + return(new_n_uniq == old_n_uniq); + } + + /* DROP PRIMARY KEY is only allowed in combination with + ADD PRIMARY KEY. */ + ut_ad(new_n_uniq > 0); + + /* The order of the last processed new_clust_index key field, + not counting ADD COLUMN, which are constant. */ + lint last_field_order = -1; + ulint existing_field_count = 0; + ulint old_n_cols = dict_table_get_n_cols(old_clust_index->table); + for (ulint new_field = 0; new_field < new_n_uniq; new_field++) { + ulint new_col_no = + new_clust_index->fields[new_field].col->ind; + + /* Check if there is a match in old primary key. */ + ulint old_field = 0; + while (old_field < old_n_uniq) { + ulint old_col_no = + old_clust_index->fields[old_field].col->ind; + + if (col_map[old_col_no] == new_col_no) { + break; + } + + old_field++; + } + + /* The order of key field in the new primary key. + 1. old PK column: idx in old primary key + 2. existing column: old_n_uniq + sequence no + 3. newly added column: no order */ + lint new_field_order; + const bool old_pk_column = old_field < old_n_uniq; + + if (old_pk_column) { + new_field_order = lint(old_field); + } else if (innobase_pk_col_is_existing(new_col_no, col_map, + old_n_cols) + || new_clust_index->table->persistent_autoinc + == new_field + 1) { + /* Adding an existing column or an AUTO_INCREMENT + column may change the existing ordering. */ + new_field_order = lint(old_n_uniq + + existing_field_count++); + } else { + /* Skip newly added column. 
*/ + continue; + } + + if (last_field_order + 1 != new_field_order) { + /* Old PK order is not kept, or existing column + is not added at the end of old PK. */ + return(false); + } + + last_field_order = new_field_order; + + if (!old_pk_column) { + continue; + } + + const dict_field_t &of = old_clust_index->fields[old_field]; + const dict_field_t &nf = new_clust_index->fields[new_field]; + + if (of.descending != nf.descending) { + return false; + } + + /* Check prefix length change. */ + const lint prefix_change = innobase_pk_col_prefix_compare( + nf.prefix_len, of.prefix_len); + + if (prefix_change < 0) { + /* If a column's prefix length is decreased, it should + be the last old PK column in new PK. + Note: we set last_field_order to -2, so that if there + are any old PK colmns or existing columns after it in + new PK, the comparison to new_field_order will fail in + the next round.*/ + last_field_order = -2; + } else if (prefix_change > 0) { + /* If a column's prefix length is increased, it should + be the last PK column in old PK. */ + if (old_field != old_n_uniq - 1) { + return(false); + } + } + } + + return(true); +} + +/** Update the mtype from DATA_BLOB to DATA_GEOMETRY for a specified +GIS column of a table. This is used when we want to create spatial index +on legacy GIS columns coming from 5.6, where we store GIS data as DATA_BLOB +in innodb layer. +@param[in] table_id table id +@param[in] col_name column name +@param[in] trx data dictionary transaction +@retval true Failure +@retval false Success */ +static +bool +innobase_update_gis_column_type( + table_id_t table_id, + const char* col_name, + trx_t* trx) +{ + pars_info_t* info; + dberr_t error; + + DBUG_ENTER("innobase_update_gis_column_type"); + + DBUG_ASSERT(trx->dict_operation); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(dict_sys.locked()); + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "tableid", table_id); + pars_info_add_str_literal(info, "name", col_name); + pars_info_add_int4_literal(info, "mtype", DATA_GEOMETRY); + + trx->op_info = "update column type to DATA_GEOMETRY"; + + error = que_eval_sql( + info, + "PROCEDURE UPDATE_SYS_COLUMNS_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS SET MTYPE=:mtype\n" + "WHERE TABLE_ID=:tableid AND NAME=:name;\n" + "END;\n", trx); + + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + + DBUG_RETURN(error != DB_SUCCESS); +} + +/** Check if we are creating spatial indexes on GIS columns, which are +legacy columns from earlier MySQL, such as 5.6. If so, we have to update +the mtypes of the old GIS columns to DATA_GEOMETRY. +In 5.6, we store GIS columns as DATA_BLOB in InnoDB layer, it will introduce +confusion when we run latest server on older data. That's why we need to +do the upgrade. 
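+For example (hypothetical upgrade scenario): a POINT column created by
+MySQL 5.6 is recorded in SYS_COLUMNS with MTYPE=DATA_BLOB; before a
+spatial index is built on it, the row must be rewritten with
+MTYPE=DATA_GEOMETRY by innobase_update_gis_column_type() below.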
+@param[in]	ha_alter_info	Data used during in-place alter
+@param[in]	table		Table on which we want to add indexes
+@param[in]	trx		Transaction
+@return DB_SUCCESS if the update succeeded or no columns needed updating,
+otherwise DB_ERROR, which means the mtype could not be updated for some
+column, and creating a spatial index on it would be dangerous */
+static
+dberr_t
+innobase_check_gis_columns(
+	Alter_inplace_info*	ha_alter_info,
+	dict_table_t*		table,
+	trx_t*			trx)
+{
+	DBUG_ENTER("innobase_check_gis_columns");
+
+	for (uint key_num = 0;
+	     key_num < ha_alter_info->index_add_count;
+	     key_num++) {
+
+		const KEY&	key = ha_alter_info->key_info_buffer[
+			ha_alter_info->index_add_buffer[key_num]];
+
+		if (!(key.flags & HA_SPATIAL)) {
+			continue;
+		}
+
+		ut_ad(key.user_defined_key_parts == 1);
+		const KEY_PART_INFO&	key_part = key.key_part[0];
+
+		/* Spatial indexes on virtual columns are not supported. */
+		if (!key_part.field->stored_in_db()) {
+			DBUG_RETURN(DB_UNSUPPORTED);
+		}
+
+		ulint col_nr = dict_table_has_column(
+			table,
+			key_part.field->field_name.str,
+			key_part.fieldnr);
+		ut_ad(col_nr != table->n_def);
+		dict_col_t*	col = &table->cols[col_nr];
+
+		if (col->mtype != DATA_BLOB) {
+			ut_ad(DATA_GEOMETRY_MTYPE(col->mtype));
+			continue;
+		}
+
+		const char* col_name = dict_table_get_col_name(
+			table, col_nr);
+
+		if (innobase_update_gis_column_type(
+			    table->id, col_name, trx)) {
+
+			DBUG_RETURN(DB_ERROR);
+		} else {
+			col->mtype = DATA_GEOMETRY;
+
+			ib::info() << "Updated mtype of column " << col_name
+				<< " in table " << table->name
+				<< ", whose id is " << table->id
+				<< " to DATA_GEOMETRY";
+		}
+	}
+
+	DBUG_RETURN(DB_SUCCESS);
+}
+
+/** Collect virtual column info for its addition
+@param[in] ha_alter_info	Data used during in-place alter
+@param[in] altered_table	MySQL table that is being altered
+@param[in] table		MySQL table as it is before the ALTER operation
+@retval true Failure
+@retval false Success */
+static
+bool
+prepare_inplace_add_virtual(
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const TABLE*		table)
+{
+	ha_innobase_inplace_ctx*	ctx;
+	uint16_t i = 0;
+
+	ctx = static_cast<ha_innobase_inplace_ctx*>
+		(ha_alter_info->handler_ctx);
+
+	unsigned j = altered_table->s->virtual_fields + ctx->num_to_drop_vcol;
+
+	ctx->add_vcol = static_cast<dict_v_col_t*>(
+		mem_heap_zalloc(ctx->heap, j * sizeof *ctx->add_vcol));
+	ctx->add_vcol_name = static_cast<const char**>(
+		mem_heap_alloc(ctx->heap, j * sizeof *ctx->add_vcol_name));
+
+	j = 0;
+
+	for (const Create_field& new_field :
+	     ha_alter_info->alter_info->create_list) {
+		const Field* field = altered_table->field[i++];
+
+		if (new_field.field || field->stored_in_db()) {
+			continue;
+		}
+
+		unsigned is_unsigned;
+		auto col_type = get_innobase_type_from_mysql_type(
+			&is_unsigned, field);
+
+		auto col_len = field->pack_length();
+		unsigned field_type = field->type() | is_unsigned;
+
+		if (!field->real_maybe_null()) {
+			field_type |= DATA_NOT_NULL;
+		}
+
+		if (field->binary()) {
+			field_type |= DATA_BINARY_TYPE;
+		}
+
+		unsigned charset_no;
+
+		if (dtype_is_string_type(col_type)) {
+			charset_no = field->charset()->number;
+
+			DBUG_EXECUTE_IF(
+				"ib_alter_add_virtual_fail",
+				charset_no += MAX_CHAR_COLL_NUM;);
+
+			if (charset_no > MAX_CHAR_COLL_NUM) {
+				my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB",
+					 field->field_name.str);
+				return(true);
+			}
+		} else {
+			charset_no = 0;
+		}
+
+		if (field->type() == MYSQL_TYPE_VARCHAR) {
+			uint32	length_bytes
+				= static_cast<const Field_varstring*>(
+					field)->length_bytes;
+
+			col_len -= length_bytes;
+
+			if (length_bytes == 2) {
+				field_type |=
+					DATA_LONG_TRUE_VARCHAR;
+			}
+		}
+
+		new (&ctx->add_vcol[j]) dict_v_col_t();
+		ctx->add_vcol[j].m_col.prtype = dtype_form_prtype(
+						field_type, charset_no);
+
+		ctx->add_vcol[j].m_col.prtype |= DATA_VIRTUAL;
+
+		ctx->add_vcol[j].m_col.mtype = col_type;
+
+		ctx->add_vcol[j].m_col.len = static_cast<uint16_t>(col_len);
+
+		ctx->add_vcol[j].m_col.ind = (i - 1)
+			& dict_index_t::MAX_N_FIELDS;
+		ctx->add_vcol[j].num_base = 0;
+		ctx->add_vcol_name[j] = field->field_name.str;
+		ctx->add_vcol[j].base_col = NULL;
+		ctx->add_vcol[j].v_pos = (ctx->old_table->n_v_cols
+					  - ctx->num_to_drop_vcol + j)
+			& dict_index_t::MAX_N_FIELDS;
+
+		/* MDEV-17468: Do this on ctx->instant_table later */
+		innodb_base_col_setup(ctx->old_table, field, &ctx->add_vcol[j]);
+		j++;
+	}
+
+	ctx->num_to_add_vcol = j;
+	return(false);
+}
+
+/** Collect virtual column info for its removal
+@param[in] ha_alter_info	Data used during in-place alter
+@param[in] table		MySQL table as it is before the ALTER operation
+@retval true Failure
+@retval false Success */
+static
+bool
+prepare_inplace_drop_virtual(
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		table)
+{
+	ha_innobase_inplace_ctx*	ctx;
+	unsigned i = 0, j = 0;
+
+	ctx = static_cast<ha_innobase_inplace_ctx*>
+		(ha_alter_info->handler_ctx);
+
+	ctx->num_to_drop_vcol = 0;
+	for (i = 0; table->field[i]; i++) {
+		const Field* field = table->field[i];
+		if (field->flags & FIELD_IS_DROPPED && !field->stored_in_db()) {
+			ctx->num_to_drop_vcol++;
+		}
+	}
+
+	ctx->drop_vcol = static_cast<dict_v_col_t*>(
+		mem_heap_alloc(ctx->heap, ctx->num_to_drop_vcol
+			       * sizeof *ctx->drop_vcol));
+	ctx->drop_vcol_name = static_cast<const char**>(
+		mem_heap_alloc(ctx->heap, ctx->num_to_drop_vcol
+			       * sizeof *ctx->drop_vcol_name));
+
+	for (i = 0; table->field[i]; i++) {
+		Field *field = table->field[i];
+		if (!(field->flags & FIELD_IS_DROPPED) || field->stored_in_db()) {
+			continue;
+		}
+
+		unsigned is_unsigned;
+
+		auto col_type = get_innobase_type_from_mysql_type(
+			&is_unsigned, field);
+
+		auto col_len = field->pack_length();
+		unsigned field_type = field->type() | is_unsigned;
+
+		if (!field->real_maybe_null()) {
+			field_type |= DATA_NOT_NULL;
+		}
+
+		if (field->binary()) {
+			field_type |= DATA_BINARY_TYPE;
+		}
+
+		unsigned charset_no = 0;
+
+		if (dtype_is_string_type(col_type)) {
+			charset_no = field->charset()->number;
+
+			DBUG_EXECUTE_IF(
+				"ib_alter_add_virtual_fail",
+				charset_no += MAX_CHAR_COLL_NUM;);
+
+			if (charset_no > MAX_CHAR_COLL_NUM) {
+				my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB",
+					 field->field_name.str);
+				return(true);
+			}
+		} else {
+			charset_no = 0;
+		}
+
+		if (field->type() == MYSQL_TYPE_VARCHAR) {
+			uint32	length_bytes
+				= static_cast<const Field_varstring*>(
+					field)->length_bytes;
+
+			col_len -= length_bytes;
+
+			if (length_bytes == 2) {
+				field_type |= DATA_LONG_TRUE_VARCHAR;
+			}
+		}
+
+		ctx->drop_vcol[j].m_col.prtype = dtype_form_prtype(
+						field_type, charset_no);
+
+		ctx->drop_vcol[j].m_col.prtype |= DATA_VIRTUAL;
+
+		ctx->drop_vcol[j].m_col.mtype = col_type;
+
+		ctx->drop_vcol[j].m_col.len = static_cast<uint16_t>(col_len);
+
+		ctx->drop_vcol[j].m_col.ind = i & dict_index_t::MAX_N_FIELDS;
+
+		ctx->drop_vcol_name[j] = field->field_name.str;
+
+		dict_v_col_t*	v_col = dict_table_get_nth_v_col_mysql(
+					ctx->old_table, i);
+		ctx->drop_vcol[j].v_pos = v_col->v_pos;
+		j++;
+	}
+
+	return(false);
+}
+
+/** Insert a new record into SYS_VIRTUAL
+@param[in]	table		InnoDB table
+@param[in]	pos		virtual column number
+@param[in]	base_pos	base column position
+@param[in]	trx		transaction
+@retval	false	on success
+@retval	true	on failure (my_error() will have been
called) */ +static bool innobase_insert_sys_virtual( + const dict_table_t* table, + ulint pos, + ulint base_pos, + trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", table->id); + pars_info_add_int4_literal(info, "pos", pos); + pars_info_add_int4_literal(info, "base_pos", base_pos); + + if (DB_SUCCESS != que_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_VIRTUAL VALUES (:id, :pos, :base_pos);\n" + "END;\n", trx)) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: ADD COLUMN...VIRTUAL"); + return true; + } + + return false; +} + +/** Insert a record to the SYS_COLUMNS dictionary table. +@param[in] table_id table id +@param[in] pos position of the column +@param[in] field_name field name +@param[in] mtype main type +@param[in] prtype precise type +@param[in] len fixed length in bytes, or 0 +@param[in] n_base number of base columns of virtual columns, or 0 +@param[in] update whether to update instead of inserting +@retval false on success +@retval true on failure (my_error() will have been called) */ +static bool innodb_insert_sys_columns( + table_id_t table_id, + ulint pos, + const char* field_name, + ulint mtype, + ulint prtype, + ulint len, + ulint n_base, + trx_t* trx, + bool update = false) +{ + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", table_id); + pars_info_add_int4_literal(info, "pos", pos); + pars_info_add_str_literal(info, "name", field_name); + pars_info_add_int4_literal(info, "mtype", mtype); + pars_info_add_int4_literal(info, "prtype", prtype); + pars_info_add_int4_literal(info, "len", len); + pars_info_add_int4_literal(info, "base", n_base); + + if (update) { + if (DB_SUCCESS != que_eval_sql( + info, + "PROCEDURE UPD_COL () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS SET\n" + "NAME=:name, MTYPE=:mtype, PRTYPE=:prtype, " + "LEN=:len, PREC=:base\n" + "WHERE TABLE_ID=:id AND POS=:pos;\n" + "END;\n", trx)) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Updating SYS_COLUMNS failed"); + return true; + } + + return false; + } + + if (DB_SUCCESS != que_eval_sql( + info, + "PROCEDURE ADD_COL () IS\n" + "BEGIN\n" + "INSERT INTO SYS_COLUMNS VALUES" + "(:id,:pos,:name,:mtype,:prtype,:len,:base);\n" + "END;\n", trx)) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Insert into SYS_COLUMNS failed"); + return true; + } + + return false; +} + +/** Update INNODB SYS_COLUMNS on new virtual columns +@param[in] table InnoDB table +@param[in] col_name column name +@param[in] vcol virtual column +@param[in] trx transaction +@retval false on success +@retval true on failure (my_error() will have been called) */ +static bool innobase_add_one_virtual( + const dict_table_t* table, + const char* col_name, + dict_v_col_t* vcol, + trx_t* trx) +{ + ulint pos = dict_create_v_col_pos(vcol->v_pos, + vcol->m_col.ind); + + if (innodb_insert_sys_columns(table->id, pos, col_name, + vcol->m_col.mtype, vcol->m_col.prtype, + vcol->m_col.len, vcol->num_base, trx)) { + return true; + } + + for (unsigned i = 0; i < vcol->num_base; i++) { + if (innobase_insert_sys_virtual( + table, pos, vcol->base_col[i]->ind, trx)) { + return true; + } + } + + return false; +} + +/** Update SYS_TABLES.N_COLS in the data dictionary. 
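+(N_COLS is an encoded value rather than a plain count: judging from the
+dict_table_encode_n_col() call in innobase_instant_try(), the low 16
+bits hold the number of stored user columns, bits 16..30 the number of
+virtual columns, and bit 31 is set for COMPACT or newer row formats.)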
+@param[in] user_table InnoDB table +@param[in] n the new value of SYS_TABLES.N_COLS +@param[in] trx transaction +@return whether the operation failed */ +static bool innodb_update_cols(const dict_table_t* table, ulint n, trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_int4_literal(info, "n", n); + pars_info_add_ull_literal(info, "id", table->id); + + if (DB_SUCCESS != que_eval_sql(info, + "PROCEDURE UPDATE_N_COLS () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET N_COLS = :n" + " WHERE ID = :id;\n" + "END;\n", trx)) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Updating SYS_TABLES.N_COLS failed"); + return true; + } + + return false; +} + +/** Update system table for adding virtual column(s) +@param[in] ha_alter_info Data used during in-place alter +@param[in] user_table InnoDB table +@param[in] trx transaction +@retval true Failure +@retval false Success */ +static +bool +innobase_add_virtual_try( + const Alter_inplace_info* ha_alter_info, + const dict_table_t* user_table, + trx_t* trx) +{ + ha_innobase_inplace_ctx* ctx = static_cast( + ha_alter_info->handler_ctx); + + for (ulint i = 0; i < ctx->num_to_add_vcol; i++) { + if (innobase_add_one_virtual( + user_table, ctx->add_vcol_name[i], + &ctx->add_vcol[i], trx)) { + return true; + } + } + + return false; +} + +/** Delete metadata from SYS_COLUMNS and SYS_VIRTUAL. +@param[in] id table id +@param[in] pos first SYS_COLUMNS.POS +@param[in,out] trx data dictionary transaction +@retval true Failure +@retval false Success. */ +static bool innobase_instant_drop_cols(table_id_t id, ulint pos, trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", id); + pars_info_add_int4_literal(info, "pos", pos); + + dberr_t err = que_eval_sql( + info, + "PROCEDURE DELETE_COL () IS\n" + "BEGIN\n" + "DELETE FROM SYS_COLUMNS WHERE\n" + "TABLE_ID = :id AND POS >= :pos;\n" + "DELETE FROM SYS_VIRTUAL WHERE TABLE_ID = :id;\n" + "END;\n", trx); + if (err != DB_SUCCESS) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: DELETE from SYS_COLUMNS/SYS_VIRTUAL failed"); + return true; + } + + return false; +} + +/** Update INNODB SYS_COLUMNS on new virtual column's position +@param[in] table InnoDB table +@param[in] old_pos old position +@param[in] new_pos new position +@param[in] trx transaction +@return DB_SUCCESS if successful, otherwise error code */ +static +dberr_t +innobase_update_v_pos_sys_columns( + const dict_table_t* table, + ulint old_pos, + ulint new_pos, + trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_int4_literal(info, "pos", old_pos); + pars_info_add_int4_literal(info, "val", new_pos); + pars_info_add_ull_literal(info, "id", table->id); + + dberr_t error = que_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS\n" + "SET POS = :val\n" + "WHERE POS = :pos\n" + "AND TABLE_ID = :id;\n" + "END;\n", trx); + + return(error); +} + +/** Update INNODB SYS_VIRTUAL table with new virtual column position +@param[in] table InnoDB table +@param[in] old_pos old position +@param[in] new_pos new position +@param[in] trx transaction +@return DB_SUCCESS if successful, otherwise error code */ +static +dberr_t +innobase_update_v_pos_sys_virtual( + const dict_table_t* table, + ulint old_pos, + ulint new_pos, + trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_int4_literal(info, "pos", old_pos); + pars_info_add_int4_literal(info, "val", new_pos); + pars_info_add_ull_literal(info, "id", table->id); + + dberr_t error = que_eval_sql( + 
info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "UPDATE SYS_VIRTUAL\n" + "SET POS = :val\n" + "WHERE POS = :pos\n" + "AND TABLE_ID = :id;\n" + "END;\n", trx); + + return(error); +} + +/** Update InnoDB system tables on dropping a virtual column +@param[in] table InnoDB table +@param[in] col_name column name of the dropping column +@param[in] drop_col col information for the dropping column +@param[in] n_prev_dropped number of previously dropped columns in the + same alter clause +@param[in] trx transaction +@return DB_SUCCESS if successful, otherwise error code */ +static +dberr_t +innobase_drop_one_virtual_sys_columns( + const dict_table_t* table, + const char* col_name, + dict_col_t* drop_col, + ulint n_prev_dropped, + trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", table->id); + + pars_info_add_str_literal(info, "name", col_name); + + dberr_t error = que_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "DELETE FROM SYS_COLUMNS\n" + "WHERE TABLE_ID = :id\n" + "AND NAME = :name;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + return(error); + } + + dict_v_col_t* v_col = dict_table_get_nth_v_col_mysql( + table, drop_col->ind); + + /* Adjust column positions for all subsequent columns */ + for (ulint i = v_col->v_pos + 1; i < table->n_v_cols; i++) { + dict_v_col_t* t_col = dict_table_get_nth_v_col(table, i); + ulint old_p = dict_create_v_col_pos( + t_col->v_pos - n_prev_dropped, + t_col->m_col.ind - n_prev_dropped); + ulint new_p = dict_create_v_col_pos( + t_col->v_pos - 1 - n_prev_dropped, + ulint(t_col->m_col.ind) - 1 - n_prev_dropped); + + error = innobase_update_v_pos_sys_columns( + table, old_p, new_p, trx); + if (error != DB_SUCCESS) { + return(error); + } + error = innobase_update_v_pos_sys_virtual( + table, old_p, new_p, trx); + if (error != DB_SUCCESS) { + return(error); + } + } + + return(error); +} + +/** Delete virtual column's info from INNODB SYS_VIRTUAL +@param[in] table InnoDB table +@param[in] pos position of the virtual column to be deleted +@param[in] trx transaction +@return DB_SUCCESS if successful, otherwise error code */ +static +dberr_t +innobase_drop_one_virtual_sys_virtual( + const dict_table_t* table, + ulint pos, + trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", table->id); + + pars_info_add_int4_literal(info, "pos", pos); + + dberr_t error = que_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "DELETE FROM SYS_VIRTUAL\n" + "WHERE TABLE_ID = :id\n" + "AND POS = :pos;\n" + "END;\n", trx); + + return(error); +} + +/** Update system table for dropping virtual column(s) +@param[in] ha_alter_info Data used during in-place alter +@param[in] user_table InnoDB table +@param[in] trx transaction +@retval true Failure +@retval false Success */ +static +bool +innobase_drop_virtual_try( + const Alter_inplace_info* ha_alter_info, + const dict_table_t* user_table, + trx_t* trx) +{ + ha_innobase_inplace_ctx* ctx; + dberr_t err = DB_SUCCESS; + + ctx = static_cast + (ha_alter_info->handler_ctx); + + for (unsigned i = 0; i < ctx->num_to_drop_vcol; i++) { + + ulint pos = dict_create_v_col_pos( + ctx->drop_vcol[i].v_pos - i, + ctx->drop_vcol[i].m_col.ind - i); + err = innobase_drop_one_virtual_sys_virtual( + user_table, pos, trx); + + if (err != DB_SUCCESS) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: DROP COLUMN...VIRTUAL"); + return(true); + } + + err = innobase_drop_one_virtual_sys_columns( + user_table, ctx->drop_vcol_name[i], + &(ctx->drop_vcol[i].m_col), i, 
trx); + + if (err != DB_SUCCESS) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: DROP COLUMN...VIRTUAL"); + return(true); + } + } + + return false; +} + +/** Serialise metadata of dropped or reordered columns. +@param[in,out] heap memory heap for allocation +@param[out] field data field with the metadata */ +inline +void dict_table_t::serialise_columns(mem_heap_t* heap, dfield_t* field) const +{ + DBUG_ASSERT(instant); + const dict_index_t& index = *UT_LIST_GET_FIRST(indexes); + unsigned n_fixed = index.first_user_field(); + unsigned num_non_pk_fields = index.n_fields - n_fixed; + + ulint len = 4 + num_non_pk_fields * 2; + + byte* data = static_cast(mem_heap_alloc(heap, len)); + + dfield_set_data(field, data, len); + + mach_write_to_4(data, num_non_pk_fields); + + data += 4; + + for (ulint i = n_fixed; i < index.n_fields; i++) { + mach_write_to_2(data, instant->field_map[i - n_fixed]); + data += 2; + } +} + +/** Construct the metadata record for instant ALTER TABLE. +@param[in] row dummy or default values for existing columns +@param[in,out] heap memory heap for allocations +@return metadata record */ +inline +dtuple_t* +dict_index_t::instant_metadata(const dtuple_t& row, mem_heap_t* heap) const +{ + ut_ad(is_primary()); + dtuple_t* entry; + + if (!table->instant) { + entry = row_build_index_entry(&row, NULL, this, heap); + entry->info_bits = REC_INFO_METADATA_ADD; + return entry; + } + + entry = dtuple_create(heap, n_fields + 1); + entry->n_fields_cmp = n_uniq; + entry->info_bits = REC_INFO_METADATA_ALTER; + + const dict_field_t* field = fields; + + for (uint i = 0; i <= n_fields; i++, field++) { + dfield_t* dfield = dtuple_get_nth_field(entry, i); + + if (i == first_user_field()) { + table->serialise_columns(heap, dfield); + dfield->type.metadata_blob_init(); + field--; + continue; + } + + ut_ad(!field->col->is_virtual()); + + if (field->col->is_dropped()) { + dict_col_copy_type(field->col, &dfield->type); + if (field->col->is_nullable()) { + dfield_set_null(dfield); + } else { + dfield_set_data(dfield, field_ref_zero, + field->fixed_len); + } + continue; + } + + const dfield_t* s = dtuple_get_nth_field(&row, field->col->ind); + ut_ad(dict_col_type_assert_equal(field->col, &s->type)); + *dfield = *s; + + if (dfield_is_null(dfield)) { + continue; + } + + if (dfield_is_ext(dfield)) { + ut_ad(i > first_user_field()); + ut_ad(!field->prefix_len); + ut_ad(dfield->len >= FIELD_REF_SIZE); + dfield_set_len(dfield, dfield->len - FIELD_REF_SIZE); + } + + if (!field->prefix_len) { + continue; + } + + ut_ad(field->col->ord_part); + ut_ad(i < n_uniq); + + ulint len = dtype_get_at_most_n_mbchars( + field->col->prtype, + field->col->mbminlen, field->col->mbmaxlen, + field->prefix_len, dfield->len, + static_cast(dfield_get_data(dfield))); + dfield_set_len(dfield, len); + } + + return entry; +} + +/** Insert or update SYS_COLUMNS and the hidden metadata record +for instant ALTER TABLE. 
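+(The hidden metadata record is the REC_INFO_METADATA_ADD or
+REC_INFO_METADATA_ALTER record built by dict_index_t::instant_metadata()
+above; it carries the defaults of instantly added columns and, for
+REC_INFO_METADATA_ALTER, the serialised map of dropped or reordered
+columns.)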
+@param[in] ha_alter_info ALTER TABLE context +@param[in,out] ctx ALTER TABLE context for the current partition +@param[in] altered_table MySQL table that is being altered +@param[in] table MySQL table as it is before the ALTER operation +@param[in,out] trx dictionary transaction +@retval true failure +@retval false success */ +static bool innobase_instant_try( + const Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx* ctx, + const TABLE* altered_table, + const TABLE* table, + trx_t* trx) +{ + DBUG_ASSERT(!ctx->need_rebuild()); + DBUG_ASSERT(ctx->is_instant()); + + dict_table_t* user_table = ctx->old_table; + + dict_index_t* index = dict_table_get_first_index(user_table); + const unsigned n_old_fields = index->n_fields; + const dict_col_t* old_cols = user_table->cols; + DBUG_ASSERT(user_table->n_cols == ctx->old_n_cols); + +#ifdef BTR_CUR_HASH_ADAPT + /* Acquire the ahi latch to avoid a race condition + between ahi access and instant alter table */ + srw_spin_lock* ahi_latch = btr_search_sys.get_latch(*index); + ahi_latch->wr_lock(SRW_LOCK_CALL); +#endif /* BTR_CUR_HASH_ADAPT */ + const bool metadata_changed = ctx->instant_column(); +#ifdef BTR_CUR_HASH_ADAPT + ahi_latch->wr_unlock(); +#endif /* BTR_CUR_HASH_ADAPT */ + + DBUG_ASSERT(index->n_fields >= n_old_fields); + /* The table may have been emptied and may have lost its + 'instantness' during this ALTER TABLE. */ + + /* Construct a table row of default values for the stored columns. */ + dtuple_t* row = dtuple_create(ctx->heap, user_table->n_cols); + dict_table_copy_types(row, user_table); + Field** af = altered_table->field; + Field** const end = altered_table->field + altered_table->s->fields; + ut_d(List_iterator_fast cf_it( + ha_alter_info->alter_info->create_list)); + if (ctx->first_alter_pos + && innobase_instant_drop_cols(user_table->id, + ctx->first_alter_pos - 1, trx)) { + return true; + } + for (uint i = 0; af < end; af++) { + if (!(*af)->stored_in_db()) { + ut_d(cf_it++); + continue; + } + + const dict_col_t* old = dict_table_t::find(old_cols, + ctx->col_map, + ctx->old_n_cols, i); + DBUG_ASSERT(!old || i >= ctx->old_n_cols - DATA_N_SYS_COLS + || old->ind == i + || (ctx->first_alter_pos + && old->ind >= ctx->first_alter_pos - 1)); + + dfield_t* d = dtuple_get_nth_field(row, i); + const dict_col_t* col = dict_table_get_nth_col(user_table, i); + DBUG_ASSERT(!col->is_virtual()); + DBUG_ASSERT(!col->is_dropped()); + DBUG_ASSERT(col->mtype != DATA_SYS); + DBUG_ASSERT(!strcmp((*af)->field_name.str, + dict_table_get_col_name(user_table, i))); + DBUG_ASSERT(old || col->is_added()); + + ut_d(const Create_field* new_field = cf_it++); + /* new_field->field would point to an existing column. + If it is NULL, the column was added by this ALTER TABLE. */ + ut_ad(!new_field->field == !old); + + if (col->is_added()) { + dfield_set_data(d, col->def_val.data, + col->def_val.len); + } else if ((*af)->real_maybe_null()) { + /* Store NULL for nullable 'core' columns. */ + dfield_set_null(d); + } else { + switch ((*af)->type()) { + case MYSQL_TYPE_VARCHAR: + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + variable_length: + /* Store the empty string for 'core' + variable-length NOT NULL columns. 
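+				An empty string is valid for any such
+				column and needs no type-specific
+				conversion, so field_ref_zero with
+				length 0 can stand in for all of them.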
*/ + dfield_set_data(d, field_ref_zero, 0); + break; + case MYSQL_TYPE_STRING: + if (col->mbminlen != col->mbmaxlen + && user_table->not_redundant()) { + goto variable_length; + } + /* fall through */ + default: + /* For fixed-length NOT NULL 'core' columns, + get a dummy default value from SQL. Note that + we will preserve the old values of these + columns when updating the metadata + record, to avoid unnecessary updates. */ + ulint len = (*af)->pack_length(); + DBUG_ASSERT(d->type.mtype != DATA_INT + || len <= 8); + row_mysql_store_col_in_innobase_format( + d, d->type.mtype == DATA_INT + ? static_cast<byte*>( + mem_heap_alloc(ctx->heap, len)) + : NULL, true, (*af)->ptr, len, + dict_table_is_comp(user_table)); + ut_ad(new_field->field->pack_length() == len); + } + } + + bool update = old && (!ctx->first_alter_pos + || i < ctx->first_alter_pos - 1); + DBUG_ASSERT(!old || col->same_format(*old)); + if (update + && old->prtype == d->type.prtype) { + /* The record is already present in SYS_COLUMNS. */ + } else if (innodb_insert_sys_columns(user_table->id, i, + (*af)->field_name.str, + d->type.mtype, + d->type.prtype, + d->type.len, 0, trx, + update)) { + return true; + } + + i++; + } + + if (innodb_update_cols(user_table, dict_table_encode_n_col( + unsigned(user_table->n_cols) + - DATA_N_SYS_COLS, + user_table->n_v_cols) + | (user_table->flags & DICT_TF_COMPACT) << 31, + trx)) { + return true; + } + + if (ctx->first_alter_pos) { +add_all_virtual: + for (uint i = 0; i < user_table->n_v_cols; i++) { + if (innobase_add_one_virtual( + user_table, + dict_table_get_v_col_name(user_table, i), + &user_table->v_cols[i], trx)) { + return true; + } + } + } else if (ha_alter_info->handler_flags & ALTER_DROP_VIRTUAL_COLUMN) { + if (innobase_instant_drop_cols(user_table->id, 65536, trx)) { + return true; + } + goto add_all_virtual; + } else if ((ha_alter_info->handler_flags & ALTER_ADD_VIRTUAL_COLUMN) + && innobase_add_virtual_try(ha_alter_info, user_table, + trx)) { + return true; + } + + if (!user_table->space) { + /* In case of ALTER TABLE...DISCARD TABLESPACE, + update only the metadata and transform the dictionary + cache entry to the canonical format. */ + index->clear_instant_alter(); + return false; + } + + unsigned i = unsigned(user_table->n_cols) - DATA_N_SYS_COLS; + DBUG_ASSERT(i >= altered_table->s->stored_fields); + DBUG_ASSERT(i <= altered_table->s->stored_fields + 1); + if (i > altered_table->s->fields) { + const dict_col_t& fts_doc_id = user_table->cols[i - 1]; + DBUG_ASSERT(!strcmp(fts_doc_id.name(*user_table), + FTS_DOC_ID_COL_NAME)); + DBUG_ASSERT(!fts_doc_id.is_nullable()); + DBUG_ASSERT(fts_doc_id.len == 8); + dfield_set_data(dtuple_get_nth_field(row, i - 1), + field_ref_zero, fts_doc_id.len); + } + byte trx_id[DATA_TRX_ID_LEN], roll_ptr[DATA_ROLL_PTR_LEN]; + dfield_set_data(dtuple_get_nth_field(row, i++), field_ref_zero, + DATA_ROW_ID_LEN); + dfield_set_data(dtuple_get_nth_field(row, i++), trx_id, sizeof trx_id); + dfield_set_data(dtuple_get_nth_field(row, i), roll_ptr, sizeof roll_ptr); + DBUG_ASSERT(i + 1 == user_table->n_cols); + + trx_write_trx_id(trx_id, trx->id); + /* The DB_ROLL_PTR will be assigned later, when allocating undo log. + Silence a Valgrind warning in dtuple_validate() when + row_ins_clust_index_entry_low() searches for the insert position.
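+(The DB_TRX_ID buffer was just filled above with trx_write_trx_id(trx_id, trx->id), so only the roll pointer of the metadata record remains unset at this point.)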
*/ + memset(roll_ptr, 0, sizeof roll_ptr); + + dtuple_t* entry = index->instant_metadata(*row, ctx->heap); + mtr_t mtr; + mtr.start(); + index->set_modified(mtr); + btr_pcur_t pcur; + dberr_t err= pcur.open_leaf(true, index, BTR_MODIFY_TREE, &mtr); + if (err != DB_SUCCESS) { +func_exit: + mtr.commit(); + + if (err != DB_SUCCESS) { + my_error_innodb(err, table->s->table_name.str, + user_table->flags); + return true; + } + return false; + } + ut_ad(btr_pcur_is_before_first_on_page(&pcur)); + + buf_block_t* block = btr_pcur_get_block(&pcur); + ut_ad(page_is_leaf(block->page.frame)); + ut_ad(!page_has_prev(block->page.frame)); + ut_ad(!buf_block_get_page_zip(block)); + const rec_t* rec = btr_pcur_move_to_next_on_page(&pcur); + if (UNIV_UNLIKELY(!rec)) { + err = DB_CORRUPTION; + goto func_exit; + } + + que_thr_t* thr = pars_complete_graph_for_exec( + NULL, trx, ctx->heap, NULL); + page_id_t id{block->page.id()}; + const bool is_root = id.page_no() == index->page; + + if (rec_is_metadata(rec, *index)) { + ut_ad(page_rec_is_user_rec(rec)); + if (is_root + && !rec_is_alter_metadata(rec, *index) + && !index->table->instant + && !page_has_next(block->page.frame) + && page_rec_is_last(rec, block->page.frame)) { + goto empty_table; + } + + if (!metadata_changed) { + goto func_exit; + } + + /* Ensure that the root page is in the correct format. */ + id.set_page_no(index->page); + buf_block_t* root = mtr.get_already_latched( + id, MTR_MEMO_PAGE_SX_FIX); + + if (UNIV_UNLIKELY(!root)) { + err = DB_CORRUPTION; + goto func_exit; + } + + if (fil_page_get_type(root->page.frame) + != FIL_PAGE_TYPE_INSTANT) { + DBUG_ASSERT("wrong page type" == 0); + err = DB_CORRUPTION; + goto func_exit; + } + + btr_set_instant(root, *index, &mtr); + + /* Extend the record with any added columns. */ + uint n = uint(index->n_fields) - n_old_fields; + /* Reserve room for DB_TRX_ID,DB_ROLL_PTR and any + non-updated off-page columns in case they are moved off + page as a result of the update. */ + const uint16_t f = user_table->instant != NULL; + upd_t* update = upd_create(index->n_fields + f, ctx->heap); + update->n_fields = n + f; + update->info_bits = f + ? 
REC_INFO_METADATA_ALTER + : REC_INFO_METADATA_ADD; + if (f) { + upd_field_t* uf = upd_get_nth_field(update, 0); + uf->field_no = index->first_user_field(); + uf->new_val = entry->fields[uf->field_no]; + DBUG_ASSERT(!dfield_is_ext(&uf->new_val)); + DBUG_ASSERT(!dfield_is_null(&uf->new_val)); + } + + /* Add the default values for instantly added columns */ + unsigned j = f; + + for (unsigned k = n_old_fields; k < index->n_fields; k++) { + upd_field_t* uf = upd_get_nth_field(update, j++); + uf->field_no = static_cast<uint16_t>(k + f); + uf->new_val = entry->fields[k + f]; + + ut_ad(j <= n + f); + } + + ut_ad(j == n + f); + + rec_offs* offsets = NULL; + mem_heap_t* offsets_heap = NULL; + big_rec_t* big_rec; + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, + btr_pcur_get_btr_cur(&pcur), + &offsets, &offsets_heap, ctx->heap, + &big_rec, update, UPD_NODE_NO_ORD_CHANGE, + thr, trx->id, &mtr); + if (err == DB_SUCCESS) { + offsets = rec_get_offsets( + btr_pcur_get_rec(&pcur), index, offsets, + index->n_core_fields, ULINT_UNDEFINED, + &offsets_heap); + } + + if (big_rec) { + if (err == DB_SUCCESS) { + err = btr_store_big_rec_extern_fields( + &pcur, offsets, big_rec, &mtr, + BTR_STORE_UPDATE); + } + + dtuple_big_rec_free(big_rec); + } + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + ut_free(pcur.old_rec_buf); + goto func_exit; + } else if (is_root && page_rec_is_supremum(rec) + && !index->table->instant) { +empty_table: + /* The table is empty. */ + ut_ad(fil_page_index_page_check(block->page.frame)); + ut_ad(!page_has_siblings(block->page.frame)); + ut_ad(block->page.id().page_no() == index->page); + /* MDEV-17383: free metadata BLOBs! */ + btr_page_empty(block, NULL, index, 0, &mtr); + if (index->is_instant()) { + index->clear_instant_add(); + } + goto func_exit; + } else if (!user_table->is_instant()) { + ut_ad(!user_table->not_redundant()); + goto func_exit; + } + + /* Convert the table to the instant ALTER TABLE format. */ + mtr.commit(); + mtr.start(); + index->set_modified(mtr); + if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr, + &err)) { + if (fil_page_get_type(root->page.frame) != FIL_PAGE_INDEX) { + DBUG_ASSERT("wrong page type" == 0); + err = DB_CORRUPTION; + goto func_exit; + } + + btr_set_instant(root, *index, &mtr); + mtr.commit(); + mtr.start(); + index->set_modified(mtr); + err = row_ins_clust_index_entry_low( + BTR_NO_LOCKING_FLAG, BTR_MODIFY_TREE, index, + index->n_uniq, entry, 0, thr); + } + + goto func_exit; +} + +/** Adjust the create index column number from the "New table" to +the "old InnoDB table" while we are dropping virtual columns, since we +do not create a separate new table for dropping/adding virtual columns. +To correctly find the indexed column, we will need to find its col_no +in the "Old Table", not the "New table".
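+For example (hypothetical schema): if the old table has virtual columns (v1, v2, v3), this ALTER TABLE drops v2, and an index is being created on v3, then v3 has virtual position 1 in the new column list but position 2 in the old table, so its col_no must be adjusted from 1 to 2.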
+@param[in] ha_alter_info Data used during in-place alter +@param[in] old_table MySQL table as it is before the ALTER operation +@param[in] num_v_dropped number of virtual columns dropped +@param[in,out] index_def index definition */ +static +void +innodb_v_adjust_idx_col( + const Alter_inplace_info* ha_alter_info, + const TABLE* old_table, + ulint num_v_dropped, + index_def_t* index_def) +{ + for (ulint i = 0; i < index_def->n_fields; i++) { +#ifdef UNIV_DEBUG + bool col_found = false; +#endif /* UNIV_DEBUG */ + ulint num_v = 0; + + index_field_t* index_field = &index_def->fields[i]; + + /* Only adjust virtual column col_no, since a non-virtual + column position (in the non-vcol list) won't change unless + the table is rebuilt */ + if (!index_field->is_v_col) { + continue; + } + + const Field* field = NULL; + + /* Found the field in the new table */ + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + if (new_field.stored_in_db()) { + continue; + } + + field = new_field.field; + + if (num_v == index_field->col_no) { + break; + } + num_v++; + } + + if (!field) { + /* This means the field is a newly added field; + this should have been blocked when we drop a + virtual column at the same time */ + ut_ad(num_v_dropped > 0); + ut_a(0); + } + + ut_ad(!field->stored_in_db()); + + num_v = 0; + + /* Look for its position in old table */ + for (uint old_i = 0; old_table->field[old_i]; old_i++) { + if (old_table->field[old_i] == field) { + /* Found it, adjust its col_no to its position + in old table */ + index_def->fields[i].col_no = num_v; + ut_d(col_found = true); + break; + } + + num_v += !old_table->field[old_i]->stored_in_db(); + } + + ut_ad(col_found); + } +} + +/** Create index metadata in the data dictionary. +@param[in,out] trx dictionary transaction +@param[in,out] index index being created +@param[in] mode encryption mode (for creating a table) +@param[in] key_id encryption key identifier (for creating a table) +@param[in] add_v virtual columns that are being added, or NULL +@return the created index */ +MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)) +static +dict_index_t* +create_index_dict( + trx_t* trx, + dict_index_t* index, + fil_encryption_t mode, + uint32_t key_id, + const dict_add_v_col_t* add_v) +{ + DBUG_ENTER("create_index_dict"); + + mem_heap_t* heap = mem_heap_create(512); + ind_node_t* node = ind_create_graph_create( + index, index->table->name.m_name, heap, mode, key_id, add_v); + que_thr_t* thr = pars_complete_graph_for_exec(node, trx, heap, NULL); + + que_fork_start_command( + static_cast<que_fork_t*>(que_node_get_parent(thr))); + + que_run_threads(thr); + + DBUG_ASSERT(trx->error_state != DB_SUCCESS || index != node->index); + DBUG_ASSERT(trx->error_state != DB_SUCCESS || node->index); + index = node->index; + + que_graph_free((que_t*) que_node_get_parent(thr)); + + DBUG_RETURN(index); +} + +/** Update internal structures with concurrent writes blocked, +while preparing ALTER TABLE. + +@param ha_alter_info Data used during in-place alter +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@param table_name Table name in MySQL +@param flags Table and tablespace flags +@param flags2 Additional table flags +@param fts_doc_id_col The column number of FTS_DOC_ID +@param add_fts_doc_id Flag: add column FTS_DOC_ID? +@param add_fts_doc_id_idx Flag: add index FTS_DOC_ID_INDEX (FTS_DOC_ID)?
+ +@retval true Failure +@retval false Success +*/ +static MY_ATTRIBUTE((warn_unused_result, nonnull(1,2,3,4))) +bool +prepare_inplace_alter_table_dict( +/*=============================*/ + Alter_inplace_info* ha_alter_info, + const TABLE* altered_table, + const TABLE* old_table, + const char* table_name, + ulint flags, + ulint flags2, + ulint fts_doc_id_col, + bool add_fts_doc_id, + bool add_fts_doc_id_idx) +{ + bool dict_locked = false; + ulint* add_key_nums; /* MySQL key numbers */ + index_def_t* index_defs; /* index definitions */ + dict_table_t* user_table; + dict_index_t* fts_index = NULL; + bool new_clustered = false; + dberr_t error = DB_SUCCESS; + ulint num_fts_index; + dict_add_v_col_t* add_v = NULL; + ha_innobase_inplace_ctx*ctx; + + DBUG_ENTER("prepare_inplace_alter_table_dict"); + + ctx = static_cast<ha_innobase_inplace_ctx*> + (ha_alter_info->handler_ctx); + + DBUG_ASSERT((ctx->add_autoinc != ULINT_UNDEFINED) + == (ctx->sequence.max_value() > 0)); + DBUG_ASSERT(!ctx->num_to_drop_index == !ctx->drop_index); + DBUG_ASSERT(!ctx->num_to_drop_fk == !ctx->drop_fk); + DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_id_idx); + DBUG_ASSERT(!add_fts_doc_id_idx + || innobase_fulltext_exist(altered_table)); + DBUG_ASSERT(!ctx->defaults); + DBUG_ASSERT(!ctx->add_index); + DBUG_ASSERT(!ctx->add_key_numbers); + DBUG_ASSERT(!ctx->num_to_add_index); + + user_table = ctx->new_table; + + switch (ha_alter_info->inplace_supported) { + default: break; + case HA_ALTER_INPLACE_INSTANT: + case HA_ALTER_INPLACE_NOCOPY_LOCK: + case HA_ALTER_INPLACE_NOCOPY_NO_LOCK: + /* If we promised ALGORITHM=NOCOPY or ALGORITHM=INSTANT, + we must retain the original ROW_FORMAT of the table. */ + flags = (user_table->flags & (DICT_TF_MASK_COMPACT + | DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS)) + | (flags & ~(DICT_TF_MASK_COMPACT + | DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS)); + } + + trx_start_if_not_started_xa(ctx->prebuilt->trx, true); + + if (ha_alter_info->handler_flags + & ALTER_DROP_VIRTUAL_COLUMN) { + if (prepare_inplace_drop_virtual(ha_alter_info, old_table)) { + DBUG_RETURN(true); + } + } + + if (ha_alter_info->handler_flags + & ALTER_ADD_VIRTUAL_COLUMN) { + if (prepare_inplace_add_virtual( + ha_alter_info, altered_table, old_table)) { + DBUG_RETURN(true); + } + + /* Need information for newly added virtual columns + for create index */ + + if (ha_alter_info->handler_flags + & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { + for (ulint i = 0; i < ctx->num_to_add_vcol; i++) { + /* Set mbminmax for newly added column */ + dict_col_t& col = ctx->add_vcol[i].m_col; + unsigned mbminlen, mbmaxlen; + dtype_get_mblen(col.mtype, col.prtype, + &mbminlen, &mbmaxlen); + col.mbminlen = mbminlen & 7; + col.mbmaxlen = mbmaxlen & 7; + } + add_v = static_cast<dict_add_v_col_t*>( + mem_heap_alloc(ctx->heap, sizeof *add_v)); + add_v->n_v_col = ctx->num_to_add_vcol; + add_v->v_col = ctx->add_vcol; + add_v->v_col_name = ctx->add_vcol_name; + } + } + + /* There should be no order change for virtual columns coming in + here */ + ut_ad(check_v_col_in_order(old_table, altered_table, ha_alter_info)); + + /* Create table containing all indexes to be built in this + ALTER TABLE ADD INDEX so that they are in the correct order + in the table.
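+(create_key_defs() below returns the definitions in that order; whenever the table must be rebuilt, index_defs[0] is the clustered index definition, which is what the DICT_CLUSTERED check on index_defs[0].ind_type relies on.)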
*/ + + ctx->num_to_add_index = ha_alter_info->index_add_count; + + ut_ad(ctx->prebuilt->trx->mysql_thd != NULL); + const char* path = thd_innodb_tmpdir( + ctx->prebuilt->trx->mysql_thd); + + index_defs = ctx->create_key_defs( + ha_alter_info, altered_table, + num_fts_index, + fts_doc_id_col, add_fts_doc_id, add_fts_doc_id_idx, + old_table); + + new_clustered = (DICT_CLUSTERED & index_defs[0].ind_type) != 0; + + create_table_info_t info(ctx->prebuilt->trx->mysql_thd, altered_table, + ha_alter_info->create_info, NULL, NULL, + srv_file_per_table); + + /* The primary index would be rebuilt if a FTS Doc ID + column is to be added, and the primary index definition + is just copied from old table and stored in index_defs[0] */ + DBUG_ASSERT(!add_fts_doc_id || new_clustered); + DBUG_ASSERT(!!new_clustered == + (innobase_need_rebuild(ha_alter_info, old_table) + || add_fts_doc_id)); + + /* Allocate memory for dictionary index definitions */ + + ctx->add_index = static_cast<dict_index_t**>( + mem_heap_zalloc(ctx->heap, ctx->num_to_add_index + * sizeof *ctx->add_index)); + ctx->add_key_numbers = add_key_nums = static_cast<ulint*>( + mem_heap_alloc(ctx->heap, ctx->num_to_add_index + * sizeof *ctx->add_key_numbers)); + + const bool fts_exist = ctx->new_table->flags2 + & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + /* Acquire a lock on the table before creating any indexes. */ + bool table_lock_failed = false; + + if (!ctx->online) { +acquire_lock: + ctx->prebuilt->trx->op_info = "acquiring table lock"; + error = lock_table_for_trx(user_table, ctx->trx, LOCK_S); + } else if (add_key_nums) { + /* FIXME: trx_resurrect_table_locks() will not resurrect + MDL for any recovered transactions that may hold locks on + the table. We will prevent race conditions by "unnecessarily" + acquiring an InnoDB table lock even for online operation, + to ensure that the rollback of recovered transactions will + not run concurrently with online ADD INDEX. */ + user_table->lock_mutex_lock(); + for (lock_t *lock = UT_LIST_GET_FIRST(user_table->locks); + lock; + lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) { + if (lock->trx->is_recovered) { + user_table->lock_mutex_unlock(); + goto acquire_lock; + } + } + user_table->lock_mutex_unlock(); + } + + if (fts_exist) { + purge_sys.stop_FTS(*ctx->new_table); + if (error == DB_SUCCESS) { + error = fts_lock_tables(ctx->trx, *ctx->new_table); + } + } + + if (error == DB_SUCCESS) { + error = lock_sys_tables(ctx->trx); + } + + if (error != DB_SUCCESS) { + table_lock_failed = true; + goto error_handling; + } + + /* Latch the InnoDB data dictionary exclusively so that no deadlocks + or lock waits can happen in it during an index create operation. */ + + row_mysql_lock_data_dictionary(ctx->trx); + dict_locked = true; + online_retry_drop_indexes_low(ctx->new_table, ctx->trx); + + ut_d(dict_table_check_for_dup_indexes( + ctx->new_table, CHECK_ABORTED_OK)); + + DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter", + error = DB_OUT_OF_MEMORY; + goto error_handling;); + + /* If a new clustered index is defined for the table we need + to rebuild the table with a temporary name.
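+(The temporary name is derived below from altered_table->s->path with the mysql_data_home prefix stripped and any partition suffix appended.)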
*/ + + if (new_clustered) { + if (innobase_check_foreigns( + ha_alter_info, old_table, + user_table, ctx->drop_fk, ctx->num_to_drop_fk)) { +new_clustered_failed: + DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx); + ctx->trx->rollback(); + + ut_ad(user_table->get_ref_count() == 1); + + if (user_table->drop_aborted) { + row_mysql_unlock_data_dictionary(ctx->trx); + trx_start_for_ddl(ctx->trx); + if (lock_sys_tables(ctx->trx) == DB_SUCCESS) { + row_mysql_lock_data_dictionary( + ctx->trx); + online_retry_drop_indexes_low( + user_table, ctx->trx); + commit_unlock_and_unlink(ctx->trx); + } else { + ctx->trx->commit(); + } + row_mysql_lock_data_dictionary(ctx->trx); + } + + if (ctx->need_rebuild()) { + if (ctx->new_table) { + ut_ad(!ctx->new_table->cached); + dict_mem_table_free(ctx->new_table); + } + ctx->new_table = ctx->old_table; + } + + while (ctx->num_to_add_index--) { + if (dict_index_t*& i = ctx->add_index[ + ctx->num_to_add_index]) { + dict_mem_index_free(i); + i = NULL; + } + } + + goto err_exit; + } + + size_t prefixlen= strlen(mysql_data_home); + if (mysql_data_home[prefixlen-1] != FN_LIBCHAR) + prefixlen++; + size_t tablen = altered_table->s->path.length - prefixlen; + const char* part = ctx->old_table->name.part(); + size_t partlen = part ? strlen(part) : 0; + char* new_table_name = static_cast<char*>( + mem_heap_alloc(ctx->heap, tablen + partlen + 1)); + memcpy(new_table_name, + altered_table->s->path.str + prefixlen, tablen); +#ifdef _WIN32 + { + char *sep= strchr(new_table_name, FN_LIBCHAR); + sep[0]= '/'; + } +#endif + memcpy(new_table_name + tablen, part ? part : "", partlen + 1); + ulint n_cols = 0; + ulint n_v_cols = 0; + dtuple_t* defaults; + ulint z = 0; + + for (uint i = 0; i < altered_table->s->fields; i++) { + const Field* field = altered_table->field[i]; + + if (!field->stored_in_db()) { + n_v_cols++; + } else { + n_cols++; + } + } + + ut_ad(n_cols + n_v_cols == altered_table->s->fields); + + if (add_fts_doc_id) { + n_cols++; + DBUG_ASSERT(flags2 & DICT_TF2_FTS); + DBUG_ASSERT(add_fts_doc_id_idx); + flags2 |= DICT_TF2_FTS_ADD_DOC_ID + | DICT_TF2_FTS_HAS_DOC_ID + | DICT_TF2_FTS; + } + + DBUG_ASSERT(!add_fts_doc_id_idx || (flags2 & DICT_TF2_FTS)); + + ctx->new_table = dict_table_t::create( + {new_table_name, tablen + partlen}, nullptr, + n_cols + n_v_cols, n_v_cols, flags, flags2); + + /* The rebuilt indexed_table will use the renamed + column names.
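+(Hence ctx->col_names is reset to NULL below; the new dict_table_t is populated directly from altered_table's fields.)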
*/ + ctx->col_names = NULL; + + if (DICT_TF_HAS_DATA_DIR(flags)) { + ctx->new_table->data_dir_path = + mem_heap_strdup(ctx->new_table->heap, + user_table->data_dir_path); + } + + for (uint i = 0; i < altered_table->s->fields; i++) { + const Field* field = altered_table->field[i]; + unsigned is_unsigned; + auto col_type = get_innobase_type_from_mysql_type( + &is_unsigned, field); + unsigned field_type = field->type() | is_unsigned; + const bool is_virtual = !field->stored_in_db(); + + /* we assume in dtype_form_prtype() that this + fits in two bytes */ + ut_a(field_type <= MAX_CHAR_COLL_NUM); + + if (!field->real_maybe_null()) { + field_type |= DATA_NOT_NULL; + } + + if (field->binary()) { + field_type |= DATA_BINARY_TYPE; + } + + if (altered_table->versioned()) { + if (i == altered_table->s->vers.start_fieldno) { + field_type |= DATA_VERS_START; + } else if (i == + altered_table->s->vers.end_fieldno) { + field_type |= DATA_VERS_END; + } else if (!(field->flags + & VERS_UPDATE_UNVERSIONED_FLAG)) { + field_type |= DATA_VERSIONED; + } + } + + unsigned charset_no; + + if (dtype_is_string_type(col_type)) { + charset_no = field->charset()->number; + + if (charset_no > MAX_CHAR_COLL_NUM) { + my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB", + field->field_name.str); + goto new_clustered_failed; + } + } else { + charset_no = 0; + } + + auto col_len = field->pack_length(); + + /* The MySQL pack length contains 1 or 2 bytes + length field for a true VARCHAR. Let us + subtract that, so that the InnoDB column + length in the InnoDB data dictionary is the + real maximum byte length of the actual data. */ + + if (field->type() == MYSQL_TYPE_VARCHAR) { + uint32 length_bytes + = static_cast<const Field_varstring*>( + field)->length_bytes; + + col_len -= length_bytes; + + if (length_bytes == 2) { + field_type |= DATA_LONG_TRUE_VARCHAR; + } + + } + + if (dict_col_name_is_reserved(field->field_name.str)) { +wrong_column_name: + dict_mem_table_free(ctx->new_table); + ctx->new_table = ctx->old_table; + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + field->field_name.str); + goto new_clustered_failed; + } + + /** Note the FTS_DOC_ID name is case sensitive due + to internal query parser.
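+(The checks below accept the name case-insensitively via innobase_strcasecmp() and then reject any spelling other than the exact uppercase FTS_DOC_ID with strcmp().)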
+ FTS_DOC_ID column must be of BIGINT NOT NULL type + and it should be in all capitalized characters */ + if (!innobase_strcasecmp(field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + if (col_type != DATA_INT + || field->real_maybe_null() + || col_len != sizeof(doc_id_t) + || strcmp(field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + goto wrong_column_name; + } + } + + if (is_virtual) { + dict_mem_table_add_v_col( + ctx->new_table, ctx->heap, + field->field_name.str, + col_type, + dtype_form_prtype( + field_type, charset_no) + | DATA_VIRTUAL, + col_len, i, 0); + } else { + dict_mem_table_add_col( + ctx->new_table, ctx->heap, + field->field_name.str, + col_type, + dtype_form_prtype( + field_type, charset_no), + col_len); + } + } + + if (n_v_cols) { + for (uint i = 0; i < altered_table->s->fields; i++) { + dict_v_col_t* v_col; + const Field* field = altered_table->field[i]; + + if (!!field->stored_in_db()) { + continue; + } + v_col = dict_table_get_nth_v_col( + ctx->new_table, z); + z++; + innodb_base_col_setup( + ctx->new_table, field, v_col); + } + } + + if (add_fts_doc_id) { + fts_add_doc_id_column(ctx->new_table, ctx->heap); + ctx->new_table->fts->doc_col = fts_doc_id_col; + ut_ad(fts_doc_id_col + == altered_table->s->fields - n_v_cols); + } else if (ctx->new_table->fts) { + ctx->new_table->fts->doc_col = fts_doc_id_col; + } + + dict_table_add_system_columns(ctx->new_table, ctx->heap); + + if (ha_alter_info->handler_flags & INNOBASE_DEFAULTS) { + defaults = dtuple_create_with_vcol( + ctx->heap, + dict_table_get_n_cols(ctx->new_table), + dict_table_get_n_v_cols(ctx->new_table)); + + dict_table_copy_types(defaults, ctx->new_table); + } else { + defaults = NULL; + } + + ctx->col_map = innobase_build_col_map( + ha_alter_info, altered_table, old_table, + ctx->new_table, user_table, defaults, ctx->heap); + ctx->defaults = defaults; + } else { + DBUG_ASSERT(!innobase_need_rebuild(ha_alter_info, old_table)); + DBUG_ASSERT(old_table->s->primary_key + == altered_table->s->primary_key); + + for (dict_index_t* index + = dict_table_get_first_index(user_table); + index != NULL; + index = dict_table_get_next_index(index)) { + if (!index->to_be_dropped && index->is_corrupted()) { + my_error(ER_CHECK_NO_SUCH_TABLE, MYF(0)); + goto error_handled; + } + } + + for (dict_index_t* index + = dict_table_get_first_index(user_table); + index != NULL; + index = dict_table_get_next_index(index)) { + if (!index->to_be_dropped && index->is_corrupted()) { + my_error(ER_CHECK_NO_SUCH_TABLE, MYF(0)); + goto error_handled; + } + } + + if (!ctx->new_table->fts + && innobase_fulltext_exist(altered_table)) { + ctx->new_table->fts = fts_create( + ctx->new_table); + ctx->new_table->fts->doc_col = fts_doc_id_col; + } + + /* Check if we need to update mtypes of legacy GIS columns. + This check is only needed when we don't have to rebuild + the table, since rebuild would update all mtypes for GIS + columns */ + error = innobase_check_gis_columns( + ha_alter_info, ctx->new_table, ctx->trx); + if (error != DB_SUCCESS) { + ut_ad(error == DB_ERROR); + my_error(ER_TABLE_CANT_HANDLE_SPKEYS, MYF(0), "SYS_COLUMNS"); + goto error_handled; + } + } + + ut_ad(new_clustered == ctx->need_rebuild()); + + /* Create the index metadata. 
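+ Each definition in index_defs[] is materialised below by row_merge_create_index(); definitions on virtual columns may first need their column numbers remapped by innodb_v_adjust_idx_col() when virtual columns are being dropped without a table rebuild.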
*/ + for (ulint a = 0; a < ctx->num_to_add_index; a++) { + if (index_defs[a].ind_type & DICT_VIRTUAL + && ctx->num_to_drop_vcol > 0 && !new_clustered) { + innodb_v_adjust_idx_col(ha_alter_info, old_table, + ctx->num_to_drop_vcol, + &index_defs[a]); + } + + ctx->add_index[a] = row_merge_create_index( + ctx->new_table, &index_defs[a], add_v); + + add_key_nums[a] = index_defs[a].key_number; + + DBUG_ASSERT(ctx->add_index[a]->is_committed() + == !!new_clustered); + } + + DBUG_ASSERT(!ctx->need_rebuild() + || !ctx->new_table->persistent_autoinc); + + if (ctx->need_rebuild() && instant_alter_column_possible( + *user_table, ha_alter_info, old_table, altered_table, + ha_innobase::is_innodb_strict_mode(ctx->trx->mysql_thd))) { + for (uint a = 0; a < ctx->num_to_add_index; a++) { + ctx->add_index[a]->table = ctx->new_table; + error = dict_index_add_to_cache( + ctx->add_index[a], FIL_NULL, add_v); + ut_a(error == DB_SUCCESS); + } + + DBUG_ASSERT(ha_alter_info->key_count + /* hidden GEN_CLUST_INDEX in InnoDB */ + + dict_index_is_auto_gen_clust( + dict_table_get_first_index(ctx->new_table)) + /* hidden FTS_DOC_ID_INDEX in InnoDB */ + + (ctx->old_table->fts_doc_id_index + && innobase_fts_check_doc_id_index_in_def( + altered_table->s->keys, + altered_table->key_info) + != FTS_EXIST_DOC_ID_INDEX) + == ctx->num_to_add_index); + + ctx->num_to_add_index = 0; + ctx->add_index = NULL; + + uint i = 0; // index of stored columns ctx->new_table->cols[] + Field **af = altered_table->field; + + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + DBUG_ASSERT(!new_field.field + || std::find(old_table->field, + old_table->field + + old_table->s->fields, + new_field.field) != + old_table->field + old_table->s->fields); + DBUG_ASSERT(new_field.field + || !strcmp(new_field.field_name.str, + (*af)->field_name.str)); + + if (!(*af)->stored_in_db()) { + af++; + continue; + } + + dict_col_t* col = dict_table_get_nth_col( + ctx->new_table, i); + DBUG_ASSERT(!strcmp((*af)->field_name.str, + dict_table_get_col_name(ctx->new_table, + i))); + DBUG_ASSERT(!col->is_added()); + + if (new_field.field) { + /* This is a pre-existing column, + possibly at a different position. */ + } else if ((*af)->is_real_null()) { + /* DEFAULT NULL */ + col->def_val.len = UNIV_SQL_NULL; + } else { + switch ((*af)->type()) { + case MYSQL_TYPE_VARCHAR: + col->def_val.len = reinterpret_cast + <const Field_varstring*> + ((*af))->get_length(); + col->def_val.data = reinterpret_cast + <const Field_varstring*> + ((*af))->get_data(); + break; + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + col->def_val.len = reinterpret_cast + <const Field_blob*> + ((*af))->get_length(); + col->def_val.data = reinterpret_cast + <const Field_blob*> + ((*af))->get_ptr(); + break; + default: + dfield_t d; + dict_col_copy_type(col, &d.type); + ulint len = (*af)->pack_length(); + DBUG_ASSERT(len <= 8 + || d.type.mtype + != DATA_INT); + row_mysql_store_col_in_innobase_format( + &d, + d.type.mtype == DATA_INT + ? static_cast<byte*>( + mem_heap_alloc( + ctx->heap, + len)) + : NULL, + true, (*af)->ptr, len, + dict_table_is_comp( + user_table)); + col->def_val.len = d.len; + col->def_val.data = d.data; + } + } + + i++; + af++; + } + + DBUG_ASSERT(af == altered_table->field + + altered_table->s->fields); + /* There might exist a hidden FTS_DOC_ID column for + FULLTEXT INDEX. If it exists, the columns should have + been implicitly added by ADD FULLTEXT INDEX together + with instant ADD COLUMN.
(If a hidden FTS_DOC_ID pre-existed, + then the ctx->col_map[] check should have prevented + adding visible user columns after that.) */ + DBUG_ASSERT(DATA_N_SYS_COLS + i == ctx->new_table->n_cols + || (1 + DATA_N_SYS_COLS + i + == ctx->new_table->n_cols + && !strcmp(dict_table_get_col_name( + ctx->new_table, i), + FTS_DOC_ID_COL_NAME))); + + if (altered_table->found_next_number_field) { + ctx->new_table->persistent_autoinc + = ctx->old_table->persistent_autoinc; + } + + ctx->prepare_instant(); + } + + if (ctx->need_rebuild()) { + DBUG_ASSERT(ctx->need_rebuild()); + DBUG_ASSERT(!ctx->is_instant()); + DBUG_ASSERT(num_fts_index <= 1); + DBUG_ASSERT(!ctx->online || num_fts_index == 0); + DBUG_ASSERT(!ctx->online + || !ha_alter_info->mdl_exclusive_after_prepare + || ctx->add_autoinc == ULINT_UNDEFINED); + DBUG_ASSERT(!ctx->online + || !innobase_need_rebuild(ha_alter_info, old_table) + || !innobase_fulltext_exist(altered_table)); + + uint32_t key_id = FIL_DEFAULT_ENCRYPTION_KEY; + fil_encryption_t mode = FIL_ENCRYPTION_DEFAULT; + + if (fil_space_t* s = user_table->space) { + if (const fil_space_crypt_t* c = s->crypt_data) { + key_id = c->key_id; + mode = c->encryption; + } + } + + if (ha_alter_info->handler_flags & ALTER_OPTIONS) { + const ha_table_option_struct& alt_opt= + *ha_alter_info->create_info->option_struct; + const ha_table_option_struct& opt= + *old_table->s->option_struct; + if (alt_opt.encryption != opt.encryption + || alt_opt.encryption_key_id + != opt.encryption_key_id) { + key_id = uint32_t(alt_opt.encryption_key_id); + mode = fil_encryption_t(alt_opt.encryption); + } + } + + if (dict_sys.find_table( + {ctx->new_table->name.m_name, + strlen(ctx->new_table->name.m_name)})) { + my_error(ER_TABLE_EXISTS_ERROR, MYF(0), + ctx->new_table->name.m_name); + goto new_clustered_failed; + } + + /* Create the table. */ + ctx->trx->dict_operation = true; + + error = row_create_table_for_mysql(ctx->new_table, ctx->trx); + + switch (error) { + case DB_SUCCESS: + DBUG_ASSERT(ctx->new_table->get_ref_count() == 0); + DBUG_ASSERT(ctx->new_table->id != 0); + break; + case DB_DUPLICATE_KEY: + my_error(HA_ERR_TABLE_EXIST, MYF(0), + altered_table->s->table_name.str); + goto new_table_failed; + case DB_UNSUPPORTED: + my_error(ER_UNSUPPORTED_EXTENSION, MYF(0), + altered_table->s->table_name.str); + goto new_table_failed; + default: + my_error_innodb(error, table_name, flags); +new_table_failed: + DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx); + ctx->new_table = NULL; + goto new_clustered_failed; + } + + for (ulint a = 0; a < ctx->num_to_add_index; a++) { + dict_index_t* index = ctx->add_index[a]; + const ulint n_v_col = index->get_new_n_vcol(); + index = create_index_dict(ctx->trx, index, + mode, key_id, add_v); + error = ctx->trx->error_state; + if (error != DB_SUCCESS) { + if (index) { + dict_mem_index_free(index); + } +error_handling_drop_uncached_1: + while (++a < ctx->num_to_add_index) { + dict_mem_index_free(ctx->add_index[a]); + } + goto error_handling; + } else { + DBUG_ASSERT(index != ctx->add_index[a]); + } + + ctx->add_index[a] = index; + /* For ALTER TABLE...FORCE or OPTIMIZE TABLE, + we may only issue warnings, because there will + be no schema change from the user perspective. 
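+ That is why row_size_is_acceptable() below is invoked in strict mode only when handler_flags contain a change beyond INNOBASE_INPLACE_IGNORE, INNOBASE_ALTER_NOVALIDATE and ALTER_RECREATE_TABLE.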
*/ + if (!info.row_size_is_acceptable( + *index, + !!(ha_alter_info->handler_flags + & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_NOVALIDATE + | ALTER_RECREATE_TABLE)))) { + error = DB_TOO_BIG_RECORD; + goto error_handling_drop_uncached_1; + } + index->parser = index_defs[a].parser; + if (n_v_col) { + index->assign_new_v_col(n_v_col); + } + /* Note the id of the transaction that created this + index, we use it to restrict readers from accessing + this index, to ensure read consistency. */ + ut_ad(index->trx_id == ctx->trx->id); + + if (index->type & DICT_FTS) { + DBUG_ASSERT(num_fts_index == 1); + DBUG_ASSERT(!fts_index); + DBUG_ASSERT(index->type == DICT_FTS); + fts_index = ctx->add_index[a]; + } + } + + dict_index_t* clust_index = dict_table_get_first_index( + user_table); + dict_index_t* new_clust_index = dict_table_get_first_index( + ctx->new_table); + ut_ad(!new_clust_index->is_instant()); + /* row_merge_build_index() depends on the correct value */ + ut_ad(new_clust_index->n_core_null_bytes + == UT_BITS_IN_BYTES(new_clust_index->n_nullable)); + + if (const Field* ai = altered_table->found_next_number_field) { + const unsigned col_no = innodb_col_no(ai); + + ctx->new_table->persistent_autoinc = + (dict_table_get_nth_col_pos( + ctx->new_table, col_no, NULL) + 1) + & dict_index_t::MAX_N_FIELDS; + + /* Initialize the AUTO_INCREMENT sequence + to the rebuilt table from the old one. */ + if (!old_table->found_next_number_field + || !user_table->space) { + } else if (ib_uint64_t autoinc + = btr_read_autoinc(clust_index)) { + btr_write_autoinc(new_clust_index, autoinc); + } + } + + ctx->skip_pk_sort = innobase_pk_order_preserved( + ctx->col_map, clust_index, new_clust_index); + + DBUG_EXECUTE_IF("innodb_alter_table_pk_assert_no_sort", + DBUG_ASSERT(ctx->skip_pk_sort);); + + if (ctx->online) { + /* Allocate a log for online table rebuild. */ + clust_index->lock.x_lock(SRW_LOCK_CALL); + bool ok = row_log_allocate( + ctx->prebuilt->trx, + clust_index, ctx->new_table, + !(ha_alter_info->handler_flags + & ALTER_ADD_PK_INDEX), + ctx->defaults, ctx->col_map, path, + old_table, + ctx->allow_not_null); + clust_index->lock.x_unlock(); + + if (!ok) { + error = DB_OUT_OF_MEMORY; + goto error_handling; + } + } + } else if (ctx->num_to_add_index) { + ut_ad(!ctx->is_instant()); + + for (ulint a = 0; a < ctx->num_to_add_index; a++) { + dict_index_t* index = ctx->add_index[a]; + const ulint n_v_col = index->get_new_n_vcol(); + DBUG_EXECUTE_IF( + "create_index_metadata_fail", + if (a + 1 == ctx->num_to_add_index) { + ctx->trx->error_state = + DB_OUT_OF_FILE_SPACE; + goto index_created; + }); + index = create_index_dict(ctx->trx, index, + FIL_ENCRYPTION_DEFAULT, + FIL_DEFAULT_ENCRYPTION_KEY, + add_v); +#ifndef DBUG_OFF +index_created: +#endif + error = ctx->trx->error_state; + if (error != DB_SUCCESS) { + if (index) { + dict_mem_index_free(index); + } +error_handling_drop_uncached: + while (++a < ctx->num_to_add_index) { + dict_mem_index_free(ctx->add_index[a]); + } + goto error_handling; + } else { + DBUG_ASSERT(index != ctx->add_index[a]); + } + ctx->add_index[a]= index; + if (!info.row_size_is_acceptable(*index, true)) { + error = DB_TOO_BIG_RECORD; + goto error_handling_drop_uncached; + } + + index->parser = index_defs[a].parser; + if (n_v_col) { + index->assign_new_v_col(n_v_col); + } + + ctx->change_col_collation(index, *altered_table); + /* Note the id of the transaction that created this + index, we use it to restrict readers from accessing + this index, to ensure read consistency. 
*/ + ut_ad(index->trx_id == ctx->trx->id); + + /* If ADD INDEX with LOCK=NONE has been + requested, allocate a modification log. */ + if (index->type & DICT_FTS) { + DBUG_ASSERT(num_fts_index == 1); + DBUG_ASSERT(!fts_index); + DBUG_ASSERT(index->type == DICT_FTS); + fts_index = ctx->add_index[a]; + /* Fulltext indexes are not covered + by a modification log. */ + } else if (!ctx->online + || !user_table->is_readable() + || !user_table->space) { + /* No need to allocate a modification log. */ + DBUG_ASSERT(!index->online_log); + } else { + index->lock.x_lock(SRW_LOCK_CALL); + + bool ok = row_log_allocate( + ctx->prebuilt->trx, + index, + NULL, true, NULL, NULL, + path, old_table, + ctx->allow_not_null); + + index->lock.x_unlock(); + + DBUG_EXECUTE_IF( + "innodb_OOM_prepare_add_index", + if (ok && a == 1) { + row_log_free( + index->online_log); + index->online_log = NULL; + ctx->old_table->indexes.start + ->online_log = nullptr; + ok = false; + }); + + if (!ok) { + error = DB_OUT_OF_MEMORY; + goto error_handling_drop_uncached; + } + } + } + } else if (ctx->is_instant() + && !info.row_size_is_acceptable(*user_table, true)) { + error = DB_TOO_BIG_RECORD; + goto error_handling; + } + + if (ctx->online && ctx->num_to_add_index) { + /* Assign a consistent read view for + row_merge_read_clustered_index(). */ + ctx->prebuilt->trx->read_view.open(ctx->prebuilt->trx); + } + + if (fts_index) { + ut_ad(ctx->trx->dict_operation); + ut_ad(ctx->trx->dict_operation_lock_mode); + ut_ad(dict_sys.locked()); + + DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS); + if (ctx->need_rebuild()) { + /* For !ctx->need_rebuild(), this will be set at + commit_cache_norebuild(). */ + ctx->new_table->fts_doc_id_index + = dict_table_get_index_on_name( + ctx->new_table, FTS_DOC_ID_INDEX_NAME); + DBUG_ASSERT(ctx->new_table->fts_doc_id_index != NULL); + } + + error = fts_create_index_tables(ctx->trx, fts_index, + ctx->new_table->id); + + DBUG_EXECUTE_IF("innodb_test_fail_after_fts_index_table", + error = DB_LOCK_WAIT_TIMEOUT; + goto error_handling;); + + if (error != DB_SUCCESS) { + goto error_handling; + } + + if (!ctx->new_table->fts + || ib_vector_size(ctx->new_table->fts->indexes) == 0) { + error = fts_create_common_tables( + ctx->trx, ctx->new_table, true); + + DBUG_EXECUTE_IF( + "innodb_test_fail_after_fts_common_table", + error = DB_LOCK_WAIT_TIMEOUT;); + + if (error != DB_SUCCESS) { + goto error_handling; + } + + error = innobase_fts_load_stopword( + ctx->new_table, ctx->trx, + ctx->prebuilt->trx->mysql_thd) + ? DB_SUCCESS : DB_ERROR; + + if (error != DB_SUCCESS) { + goto error_handling; + } + } + } + + DBUG_ASSERT(error == DB_SUCCESS); + + { + /* Commit the data dictionary transaction in order to release + the table locks on the system tables. This means that if + MariaDB is killed while rebuilding the table inside + row_merge_build_indexes(), ctx->new_table will not be dropped + by trx_rollback_active(). */ + ut_d(dict_table_check_for_dup_indexes(user_table, + CHECK_PARTIAL_OK)); + if (ctx->need_rebuild()) { + ctx->new_table->acquire(); + } + + /* fts_create_common_tables() may drop old common tables, + whose files would be deleted here. */ + commit_unlock_and_unlink(ctx->trx); + if (fts_exist) { + purge_sys.resume_FTS(); + } + + trx_start_for_ddl(ctx->trx); + ctx->prebuilt->trx_id = ctx->trx->id; + } + + if (ctx->old_table->fts) { + fts_sync_during_ddl(ctx->old_table); + } + + DBUG_RETURN(false); + +error_handling: + /* After an error, remove all those index definitions from the + dictionary which were defined. 
*/ + + switch (error) { + case DB_TABLESPACE_EXISTS: + my_error(ER_TABLESPACE_EXISTS, MYF(0), "(unknown)"); + break; + case DB_DUPLICATE_KEY: + my_error(ER_DUP_KEY, MYF(0), "SYS_INDEXES"); + break; + default: + my_error_innodb(error, table_name, user_table->flags); + } + + ctx->trx->rollback(); + + ut_ad(!ctx->need_rebuild() + || !user_table->indexes.start->online_log); + + ctx->prebuilt->trx->error_info = NULL; + ctx->trx->error_state = DB_SUCCESS; + + if (false) { +error_handled: + ut_ad(!table_lock_failed); + ut_ad(ctx->trx->state == TRX_STATE_ACTIVE); + ut_ad(!ctx->trx->undo_no); + ut_ad(dict_locked); + } else if (table_lock_failed) { + if (!dict_locked) { + row_mysql_lock_data_dictionary(ctx->trx); + } + goto err_exit; + } else { + ut_ad(ctx->trx->state == TRX_STATE_NOT_STARTED); + if (new_clustered && !user_table->drop_aborted) { + goto err_exit; + } + if (dict_locked) { + row_mysql_unlock_data_dictionary(ctx->trx); + } + trx_start_for_ddl(ctx->trx); + dberr_t err= lock_sys_tables(ctx->trx); + row_mysql_lock_data_dictionary(ctx->trx); + if (err != DB_SUCCESS) { + goto err_exit; + } + } + + /* n_ref_count must be 1, because background threads cannot + be executing on this very table as we are + holding MDL_EXCLUSIVE. */ + ut_ad(ctx->online || user_table->get_ref_count() == 1); + + if (new_clustered) { + online_retry_drop_indexes_low(user_table, ctx->trx); + commit_unlock_and_unlink(ctx->trx); + row_mysql_lock_data_dictionary(ctx->trx); + } else { + row_merge_drop_indexes(ctx->trx, user_table, true); + ctx->trx->commit(); + } + + ut_d(dict_table_check_for_dup_indexes(user_table, CHECK_ALL_COMPLETE)); + ut_ad(!user_table->drop_aborted); + +err_exit: + /* Clear the to_be_dropped flag in the data dictionary cache. */ + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + DBUG_ASSERT(ctx->drop_index[i]->is_committed()); + DBUG_ASSERT(ctx->drop_index[i]->to_be_dropped); + ctx->drop_index[i]->to_be_dropped = 0; + } + + if (ctx->trx) { + row_mysql_unlock_data_dictionary(ctx->trx); + ctx->trx->rollback(); + ctx->trx->free(); + } + trx_commit_for_mysql(ctx->prebuilt->trx); + if (fts_exist) { + purge_sys.resume_FTS(); + } + + for (uint i = 0; i < ctx->num_to_add_fk; i++) { + if (ctx->add_fk[i]) { + dict_foreign_free(ctx->add_fk[i]); + } + } + + delete ctx; + ha_alter_info->handler_ctx = NULL; + + DBUG_RETURN(true); +} + +/* Check whether an index is needed for the foreign key constraint. +If so, and the index is to be dropped, check whether an equivalent +index can take over its role. +@return true if the index is needed and can't be dropped */ +static MY_ATTRIBUTE((nonnull(1,2,3,5), warn_unused_result)) +bool +innobase_check_foreign_key_index( +/*=============================*/ + Alter_inplace_info* ha_alter_info, /*!< in: Structure describing + changes to be done by ALTER + TABLE */ + dict_index_t* index, /*!< in: index to check */ + dict_table_t* indexed_table, /*!< in: table that owns the + foreign keys */ + const char** col_names, /*!< in: column names, or NULL + for indexed_table->col_names */ + trx_t* trx, /*!< in/out: transaction */ + dict_foreign_t** drop_fk, /*!< in: Foreign key constraints + to drop */ + ulint n_drop_fk) /*!< in: Number of foreign keys + to drop */ +{ + const dict_foreign_set* fks = &indexed_table->referenced_set; + + /* Check for all FK references from other tables to the index.
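+A referencing constraint blocks the drop only if neither another existing index (dict_foreign_find_index()) nor one of the indexes being added (innobase_find_equiv_index()) can serve it.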
*/ + for (dict_foreign_set::const_iterator it = fks->begin(); + it != fks->end(); ++it) { + + dict_foreign_t* foreign = *it; + if (foreign->referenced_index != index) { + continue; + } + ut_ad(indexed_table == foreign->referenced_table); + + if (NULL == dict_foreign_find_index( + indexed_table, col_names, + foreign->referenced_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, + /*check_null=*/FALSE, + NULL, NULL, NULL) + && NULL == innobase_find_equiv_index( + foreign->referenced_col_names, + foreign->n_fields, + ha_alter_info->key_info_buffer, + span<uint>(ha_alter_info->index_add_buffer, + ha_alter_info->index_add_count))) { + + /* Index cannot be dropped. */ + trx->error_info = index; + return(true); + } + } + + fks = &indexed_table->foreign_set; + + /* Check for all FK references in current table using the index. */ + for (dict_foreign_set::const_iterator it = fks->begin(); + it != fks->end(); ++it) { + + dict_foreign_t* foreign = *it; + if (foreign->foreign_index != index) { + continue; + } + + ut_ad(indexed_table == foreign->foreign_table); + + if (!innobase_dropping_foreign( + foreign, drop_fk, n_drop_fk) + && NULL == dict_foreign_find_index( + indexed_table, col_names, + foreign->foreign_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, + /*check_null=*/FALSE, + NULL, NULL, NULL) + && NULL == innobase_find_equiv_index( + foreign->foreign_col_names, + foreign->n_fields, + ha_alter_info->key_info_buffer, + span<uint>(ha_alter_info->index_add_buffer, + ha_alter_info->index_add_count))) { + + /* Index cannot be dropped. */ + trx->error_info = index; + return(true); + } + } + + return(false); +} + +/** +Rename a given index in the InnoDB data dictionary. + +@param index index to rename +@param new_name new name of the index +@param[in,out] trx dict transaction to use, not going to be committed here + +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +rename_index_try( + const dict_index_t* index, + const char* new_name, + trx_t* trx) +{ + DBUG_ENTER("rename_index_try"); + ut_ad(dict_sys.locked()); + ut_ad(trx->dict_operation_lock_mode); + + pars_info_t* pinfo; + dberr_t err; + + pinfo = pars_info_create(); + + pars_info_add_ull_literal(pinfo, "table_id", index->table->id); + pars_info_add_ull_literal(pinfo, "index_id", index->id); + pars_info_add_str_literal(pinfo, "new_name", new_name); + + trx->op_info = "Renaming an index in SYS_INDEXES"; + + DBUG_EXECUTE_IF( + "ib_rename_index_fail1", + DBUG_SET("+d,innodb_report_deadlock"); + ); + + err = que_eval_sql( + pinfo, + "PROCEDURE RENAME_INDEX_IN_SYS_INDEXES () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES SET\n" + "NAME = :new_name\n" + "WHERE\n" + "ID = :index_id AND\n" + "TABLE_ID = :table_id;\n" + "END;\n", trx); /* pinfo is freed by que_eval_sql() */ + + DBUG_EXECUTE_IF( + "ib_rename_index_fail1", + DBUG_SET("-d,innodb_report_deadlock"); + ); + + trx->op_info = ""; + + if (err != DB_SUCCESS) { + my_error_innodb(err, index->table->name.m_name, 0); + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + + +/** +Rename a given index in the InnoDB data dictionary cache.
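+Only the in-memory name is changed here; the persistent SYS_INDEXES record is updated separately by rename_index_try().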
+ +@param[in,out] index index to rename +@param new_name new index name +*/ +static +void +innobase_rename_index_cache(dict_index_t* index, const char* new_name) +{ + DBUG_ENTER("innobase_rename_index_cache"); + ut_ad(dict_sys.locked()); + + size_t old_name_len = strlen(index->name); + size_t new_name_len = strlen(new_name); + + if (old_name_len < new_name_len) { + index->name = static_cast<char*>( + mem_heap_alloc(index->heap, new_name_len + 1)); + } + + memcpy(const_cast<char*>(index->name()), new_name, new_name_len + 1); + + DBUG_VOID_RETURN; +} + + +/** Rename the index name in cache. +@param[in] ctx alter context +@param[in] ha_alter_info Data used during inplace alter. */ +static void +innobase_rename_indexes_cache(const ha_innobase_inplace_ctx *ctx, + const Alter_inplace_info *ha_alter_info) +{ + DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_RENAME_INDEX); + + std::vector<std::pair<dict_index_t*, const char*>> rename_info; + rename_info.reserve(ha_alter_info->rename_keys.size()); + + for (const Alter_inplace_info::Rename_key_pair &pair : + ha_alter_info->rename_keys) + { + dict_index_t *index= + dict_table_get_index_on_name(ctx->old_table, pair.old_key->name.str); + ut_ad(index); + + rename_info.emplace_back(index, pair.new_key->name.str); + } + + for (const auto &pair : rename_info) + innobase_rename_index_cache(pair.first, pair.second); +} + +/** Fill the stored column information in s_cols list. +@param[in] altered_table mysql table object +@param[in] table innodb table object +@param[out] s_cols list of stored column +@param[out] s_heap heap for storing stored +column information. */ +static +void +alter_fill_stored_column( + const TABLE* altered_table, + dict_table_t* table, + dict_s_col_list** s_cols, + mem_heap_t** s_heap) +{ + ulint n_cols = altered_table->s->fields; + ulint stored_col_no = 0; + + for (ulint i = 0; i < n_cols; i++) { + Field* field = altered_table->field[i]; + dict_s_col_t s_col; + + if (field->stored_in_db()) { + stored_col_no++; + } + + if (!innobase_is_s_fld(field)) { + continue; + } + + ulint num_base = 0; + dict_col_t* col = dict_table_get_nth_col(table, + stored_col_no); + + s_col.m_col = col; + s_col.s_pos = i; + + if (*s_cols == NULL) { + *s_cols = UT_NEW_NOKEY(dict_s_col_list()); + *s_heap = mem_heap_create(1000); + } + + if (num_base != 0) { + s_col.base_col = static_cast<dict_col_t**>(mem_heap_zalloc( + *s_heap, num_base * sizeof(dict_col_t*))); + } else { + s_col.base_col = NULL; + } + + s_col.num_base = num_base; + innodb_base_col_setup_for_stored(table, field, &s_col); + (*s_cols)->push_front(s_col); + } +} + +static bool alter_templ_needs_rebuild(const TABLE* altered_table, + const Alter_inplace_info* ha_alter_info, + const dict_table_t* table); + +/** Check whether the column is present in table foreign key +relations.
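+Both directions are covered: constraints owned by the table (foreign_set, excluding those being dropped), constraints being added by this ALTER TABLE, and constraints of other tables that reference this one (referenced_set).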
+@param table table which has the foreign key relation +@param col column to be checked +@param col_name column name to be displayed in the error message +@param drop_fk foreign key constraints being dropped +@param add_fk foreign key constraints being added */ +static +bool check_col_is_in_fk_indexes( + const dict_table_t *table, const dict_col_t *col, + const char* col_name, + span<const dict_foreign_t*> drop_fk, + span<const dict_foreign_t*> add_fk) +{ + char *fk_id= nullptr; + + for (const auto &f : table->foreign_set) + { + if (!f->foreign_index || + std::find(drop_fk.begin(), drop_fk.end(), f) != drop_fk.end()) + continue; + for (ulint i= 0; i < f->n_fields; i++) + if (f->foreign_index->fields[i].col == col) + { + fk_id= f->id; + goto err_exit; + } + } + + for (const auto &a : add_fk) + { + for (ulint i= 0; i < a->n_fields; i++) + { + if (a->foreign_index->fields[i].col == col) + { + fk_id= a->id; + goto err_exit; + } + } + } + + for (const auto &f : table->referenced_set) + { + if (!f->referenced_index) continue; + for (ulint i= 0; i < f->n_fields; i++) + { + if (f->referenced_index->fields[i].col == col) + { + my_error(ER_FK_COLUMN_CANNOT_CHANGE_CHILD, MYF(0), + col_name, f->id, f->foreign_table_name); + return true; + } + } + } + return false; +err_exit: + my_error(ER_FK_COLUMN_CANNOT_CHANGE, MYF(0), col_name, + fk_id ? fk_id : + (std::string(table->name.m_name) + "_ibfk_0").c_str()); + return true; +} + +/** Allows InnoDB to update internal structures with concurrent +writes blocked (provided that check_if_supported_inplace_alter() +did not return HA_ALTER_INPLACE_NO_LOCK). +This will be invoked before inplace_alter_table(). + +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter.
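+ +Before any dictionary change is made, this function validates the requested changes: reserved index names, column renames, index key sanity, column prefix length limits, and the FOREIGN KEY and FULLTEXT constraints involved.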
+ +@retval true Failure +@retval false Success +*/ + +bool +ha_innobase::prepare_inplace_alter_table( +/*=====================================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) +{ + dict_index_t** drop_index; /*!< Index to be dropped */ + ulint n_drop_index; /*!< Number of indexes to drop */ + dict_foreign_t**drop_fk; /*!< Foreign key constraints to drop */ + ulint n_drop_fk; /*!< Number of foreign keys to drop */ + dict_foreign_t**add_fk = NULL; /*!< Foreign key constraints to add */ + ulint n_add_fk= 0; /*!< Number of foreign keys to add */ + dict_table_t* indexed_table; /*!< Table where indexes are created */ + mem_heap_t* heap; + const char** col_names; + int error; + ulint add_autoinc_col_no = ULINT_UNDEFINED; + ulonglong autoinc_col_max_value = 0; + ulint fts_doc_col_no = ULINT_UNDEFINED; + bool add_fts_doc_id = false; + bool add_fts_doc_id_idx = false; + bool add_fts_idx = false; + dict_s_col_list*s_cols = NULL; + mem_heap_t* s_heap = NULL; + + DBUG_ENTER("prepare_inplace_alter_table"); + DBUG_ASSERT(!ha_alter_info->handler_ctx); + DBUG_ASSERT(ha_alter_info->create_info); + DBUG_ASSERT(!srv_read_only_mode); + + /* Init online ddl status variables */ + onlineddl_rowlog_rows = 0; + onlineddl_rowlog_pct_used = 0; + onlineddl_pct_progress = 0; + + MONITOR_ATOMIC_INC(MONITOR_PENDING_ALTER_TABLE); + +#ifdef UNIV_DEBUG + for (dict_index_t* index = dict_table_get_first_index(m_prebuilt->table); + index; + index = dict_table_get_next_index(index)) { + ut_ad(!index->to_be_dropped); + } +#endif /* UNIV_DEBUG */ + + ut_d(dict_sys.freeze(SRW_LOCK_CALL)); + ut_d(dict_table_check_for_dup_indexes( + m_prebuilt->table, CHECK_ABORTED_OK)); + ut_d(dict_sys.unfreeze()); + + if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) { + /* Nothing to do */ + DBUG_ASSERT(!m_prebuilt->trx->dict_operation_lock_mode); + m_prebuilt->trx_id = 0; + DBUG_RETURN(false); + } + +#ifdef WITH_PARTITION_STORAGE_ENGINE + if (table->part_info == NULL) { +#endif + /* Ignore the MDL downgrade when table is empty. + This optimization is disabled for partition table. */ + ha_alter_info->mdl_exclusive_after_prepare = + innobase_table_is_empty(m_prebuilt->table, false); + if (ha_alter_info->online + && ha_alter_info->mdl_exclusive_after_prepare) { + ha_alter_info->online = false; + } +#ifdef WITH_PARTITION_STORAGE_ENGINE + } +#endif + indexed_table = m_prebuilt->table; + + /* ALTER TABLE will not implicitly move a table from a single-table + tablespace to the system tablespace when innodb_file_per_table=OFF. + But it will implicitly move a table from the system tablespace to a + single-table tablespace if innodb_file_per_table = ON. */ + + create_table_info_t info(m_user_thd, + altered_table, + ha_alter_info->create_info, + NULL, + NULL, + srv_file_per_table); + + info.set_tablespace_type(indexed_table->space != fil_system.sys_space); + + if (ha_alter_info->handler_flags & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { + if (info.gcols_in_fulltext_or_spatial()) { + goto err_exit_no_heap; + } + } + + if (indexed_table->is_readable()) { + } else { + if (indexed_table->corrupted) { + /* Handled below */ + } else { + if (const fil_space_t* space = indexed_table->space) { + String str; + const char* engine= table_type(); + + push_warning_printf( + m_user_thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_DECRYPTION_FAILED, + "Table %s in file %s is encrypted but encryption service or" + " used key_id is not available. "
" + " Can't continue reading table.", + table_share->table_name.str, + space->chain.start->name); + + my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, str.c_ptr(), engine); + DBUG_RETURN(true); + } + } + } + + if (indexed_table->corrupted + || dict_table_get_first_index(indexed_table) == NULL + || dict_table_get_first_index(indexed_table)->is_corrupted()) { + /* The clustered index is corrupted. */ + my_error(ER_CHECK_NO_SUCH_TABLE, MYF(0)); + DBUG_RETURN(true); + } else { + const char* invalid_opt = info.create_options_are_invalid(); + + /* Check engine specific table options */ + if (const char* invalid_tbopt = info.check_table_options()) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_tbopt); + goto err_exit_no_heap; + } + + if (invalid_opt) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_opt); + goto err_exit_no_heap; + } + } + + /* Check if any index name is reserved. */ + if (innobase_index_name_is_reserved( + m_user_thd, + ha_alter_info->key_info_buffer, + ha_alter_info->key_count)) { +err_exit_no_heap: + DBUG_ASSERT(!m_prebuilt->trx->dict_operation_lock_mode); + online_retry_drop_indexes(m_prebuilt->table, m_user_thd); + DBUG_RETURN(true); + } + + indexed_table = m_prebuilt->table; + + /* Check that index keys are sensible */ + error = innobase_check_index_keys(ha_alter_info, indexed_table); + + if (error) { + goto err_exit_no_heap; + } + + /* Prohibit renaming a column to something that the table + already contains. */ + if (ha_alter_info->handler_flags + & ALTER_COLUMN_NAME) { + for (Field** fp = table->field; *fp; fp++) { + if (!((*fp)->flags & FIELD_IS_RENAMED)) { + continue; + } + + const char* name = 0; + + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field == *fp) { + name = cf.field_name.str; + goto check_if_ok_to_rename; + } + } + + ut_error; +check_if_ok_to_rename: + /* Prohibit renaming a column from FTS_DOC_ID + if full-text indexes exist. */ + if (!my_strcasecmp(system_charset_info, + (*fp)->field_name.str, + FTS_DOC_ID_COL_NAME) + && innobase_fulltext_exist(altered_table)) { + my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN, + MYF(0), name); + goto err_exit_no_heap; + } + + /* Prohibit renaming a column to an internal column. */ + const char* s = m_prebuilt->table->col_names; + unsigned j; + /* Skip user columns. + MySQL should have checked these already. + We want to allow renaming of c1 to c2, c2 to c1. */ + for (j = 0; j < table->s->fields; j++) { + if (table->field[j]->stored_in_db()) { + s += strlen(s) + 1; + } + } + + for (; j < m_prebuilt->table->n_def; j++) { + if (!my_strcasecmp( + system_charset_info, name, s)) { + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + s); + goto err_exit_no_heap; + } + + s += strlen(s) + 1; + } + } + } + + if (!info.innobase_table_flags()) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), "PAGE_COMPRESSED"); + goto err_exit_no_heap; + } + + if (info.flags2() & DICT_TF2_USE_FILE_PER_TABLE) { + /* Preserve the DATA DIRECTORY attribute, because it + currently cannot be changed during ALTER TABLE. */ + info.flags_set(m_prebuilt->table->flags + & 1U << DICT_TF_POS_DATA_DIR); + } + + + /* ALGORITHM=INPLACE without rebuild (10.3+ ALGORITHM=NOCOPY) + must use the current ROW_FORMAT of the table. */ + const ulint max_col_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG( + innobase_need_rebuild(ha_alter_info, this->table) + ? 
info.flags() + : m_prebuilt->table->flags); + + /* Check each index's column length to make sure they do not + exceed limit */ + for (ulint i = 0; i < ha_alter_info->key_count; i++) { + const KEY* key = &ha_alter_info->key_info_buffer[i]; + + if (key->flags & HA_FULLTEXT) { + /* The column length does not matter for + fulltext search indexes. But, UNIQUE + fulltext indexes are not supported. */ + DBUG_ASSERT(!(key->flags & HA_NOSAME)); + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_BINARY_PACK_KEY))); + add_fts_idx = true; + continue; + } + + if (too_big_key_part_length(max_col_len, *key)) { + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + max_col_len); + goto err_exit_no_heap; + } + } + + /* We won't be allowed to add fts index to a table with + fts indexes already but without AUX_HEX_NAME set. + This means the aux tables of the table failed to + rename to hex format but new created aux tables + shall be in hex format, which is contradictory. */ + if (!DICT_TF2_FLAG_IS_SET(indexed_table, DICT_TF2_FTS_AUX_HEX_NAME) + && indexed_table->fts != NULL && add_fts_idx) { + my_error(ER_INNODB_FT_AUX_NOT_HEX_ID, MYF(0)); + goto err_exit_no_heap; + } + + /* Check existing index definitions for too-long column + prefixes as well, in case max_col_len shrunk. */ + for (const dict_index_t* index + = dict_table_get_first_index(indexed_table); + index; + index = dict_table_get_next_index(index)) { + if (index->type & DICT_FTS) { + DBUG_ASSERT(index->type == DICT_FTS + || (index->type & DICT_CORRUPT)); + + /* We need to drop any corrupted fts indexes + before we add a new fts index. */ + if (add_fts_idx && index->type & DICT_CORRUPT) { + ib_errf(m_user_thd, IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Fulltext index '%s' is corrupt. " + "you should drop this index first.", + index->name()); + + goto err_exit_no_heap; + } + + continue; + } + + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_field_t* field + = dict_index_get_nth_field(index, i); + if (field->prefix_len > max_col_len) { + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + max_col_len); + goto err_exit_no_heap; + } + } + } + + n_drop_index = 0; + n_drop_fk = 0; + + if (ha_alter_info->handler_flags + & (INNOBASE_ALTER_NOREBUILD | INNOBASE_ALTER_REBUILD + | INNOBASE_ALTER_INSTANT)) { + heap = mem_heap_create(1024); + + if (ha_alter_info->handler_flags + & ALTER_COLUMN_NAME) { + col_names = innobase_get_col_names( + ha_alter_info, altered_table, table, + indexed_table, heap); + } else { + col_names = NULL; + } + } else { + heap = NULL; + col_names = NULL; + } + + if (ha_alter_info->handler_flags + & ALTER_DROP_FOREIGN_KEY) { + DBUG_ASSERT(ha_alter_info->alter_info->drop_list.elements > 0); + + drop_fk = static_cast( + mem_heap_alloc( + heap, + ha_alter_info->alter_info->drop_list.elements + * sizeof(dict_foreign_t*))); + + for (Alter_drop& drop : ha_alter_info->alter_info->drop_list) { + if (drop.type != Alter_drop::FOREIGN_KEY) { + continue; + } + + dict_foreign_t* foreign; + + for (dict_foreign_set::iterator it + = m_prebuilt->table->foreign_set.begin(); + it != m_prebuilt->table->foreign_set.end(); + ++it) { + + foreign = *it; + const char* fid = strchr(foreign->id, '/'); + + DBUG_ASSERT(fid); + /* If no database/ prefix was present in + the FOREIGN KEY constraint name, compare + to the full constraint name. */ + fid = fid ? 
fid + 1 : foreign->id; + + if (!my_strcasecmp(system_charset_info, + fid, drop.name)) { + goto found_fk; + } + } + + my_error(ER_CANT_DROP_FIELD_OR_KEY, MYF(0), + drop.type_name(), drop.name); + goto err_exit; +found_fk: + for (ulint i = n_drop_fk; i--; ) { + if (drop_fk[i] == foreign) { + goto dup_fk; + } + } + drop_fk[n_drop_fk++] = foreign; +dup_fk: + continue; + } + + DBUG_ASSERT(n_drop_fk > 0); + + DBUG_ASSERT(n_drop_fk + <= ha_alter_info->alter_info->drop_list.elements); + } else { + drop_fk = NULL; + } + + if (ha_alter_info->index_drop_count) { + dict_index_t* drop_primary = NULL; + + DBUG_ASSERT(ha_alter_info->handler_flags + & (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX + | ALTER_DROP_UNIQUE_INDEX + | ALTER_DROP_PK_INDEX)); + /* Check which indexes to drop. */ + drop_index = static_cast( + mem_heap_alloc( + heap, (ha_alter_info->index_drop_count + 1) + * sizeof *drop_index)); + + for (uint i = 0; i < ha_alter_info->index_drop_count; i++) { + const KEY* key + = ha_alter_info->index_drop_buffer[i]; + dict_index_t* index + = dict_table_get_index_on_name( + indexed_table, key->name.str); + + if (!index) { + push_warning_printf( + m_user_thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_WRONG_INDEX, + "InnoDB could not find key" + " with name %s", key->name.str); + } else { + ut_ad(!index->to_be_dropped); + if (!index->is_primary()) { + drop_index[n_drop_index++] = index; + } else { + drop_primary = index; + } + } + } + + /* If all FULLTEXT indexes were removed, drop an + internal FTS_DOC_ID_INDEX as well, unless it exists in + the table. */ + + if (innobase_fulltext_exist(table) + && !innobase_fulltext_exist(altered_table) + && !DICT_TF2_FLAG_IS_SET( + indexed_table, DICT_TF2_FTS_HAS_DOC_ID)) { + dict_index_t* fts_doc_index + = indexed_table->fts_doc_id_index; + ut_ad(fts_doc_index); + + // Add some fault tolerance for non-debug builds. + if (fts_doc_index == NULL) { + goto check_if_can_drop_indexes; + } + + DBUG_ASSERT(!fts_doc_index->to_be_dropped); + + for (uint i = 0; i < table->s->keys; i++) { + if (!my_strcasecmp( + system_charset_info, + FTS_DOC_ID_INDEX_NAME, + table->key_info[i].name.str)) { + /* The index exists in the MySQL + data dictionary. Do not drop it, + even though it is no longer needed + by InnoDB fulltext search. */ + goto check_if_can_drop_indexes; + } + } + + drop_index[n_drop_index++] = fts_doc_index; + } + +check_if_can_drop_indexes: + /* Check if the indexes can be dropped. */ + + /* Prevent a race condition between DROP INDEX and + CREATE TABLE adding FOREIGN KEY constraints. */ + row_mysql_lock_data_dictionary(m_prebuilt->trx); + + if (!n_drop_index) { + drop_index = NULL; + } else { + /* Flag all indexes that are to be dropped. 
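+
+/* Illustrative sketch (not part of the upstream patch): the loop above
+matches a DROP FOREIGN KEY name against dict_foreign_t::id, which InnoDB
+stores as "db_name/constraint_name" (the prefix may be absent). A minimal
+self-contained version of the suffix comparison; plain strcasecmp() stands
+in for my_strcasecmp(system_charset_info, ...): */
+#include <cstring>	/* strchr() */
+#include <strings.h>	/* strcasecmp(); POSIX */
+
+/* Return true when the stored id "db/name" (or bare "name") refers to the
+constraint the user asked to drop. */
+static bool foreign_id_matches(const char* stored_id, const char* drop_name)
+{
+	const char* name = strchr(stored_id, '/');
+	name = name ? name + 1 : stored_id;	/* skip "db/", if present */
+	return strcasecmp(name, drop_name) == 0;
+}
+/* e.g. foreign_id_matches("test/fk_child", "FK_CHILD") is true. */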
*/ + for (ulint i = 0; i < n_drop_index; i++) { + ut_ad(!drop_index[i]->to_be_dropped); + drop_index[i]->to_be_dropped = 1; + } + } + + if (m_prebuilt->trx->check_foreigns) { + for (uint i = 0; i < n_drop_index; i++) { + dict_index_t* index = drop_index[i]; + + if (innobase_check_foreign_key_index( + ha_alter_info, index, + indexed_table, col_names, + m_prebuilt->trx, drop_fk, n_drop_fk)) { + row_mysql_unlock_data_dictionary( + m_prebuilt->trx); + m_prebuilt->trx->error_info = index; + print_error(HA_ERR_DROP_INDEX_FK, + MYF(0)); + goto err_exit; + } + } + + /* If a primary index is dropped, need to check + any depending foreign constraints get affected */ + if (drop_primary + && innobase_check_foreign_key_index( + ha_alter_info, drop_primary, + indexed_table, col_names, + m_prebuilt->trx, drop_fk, n_drop_fk)) { + row_mysql_unlock_data_dictionary(m_prebuilt->trx); + print_error(HA_ERR_DROP_INDEX_FK, MYF(0)); + goto err_exit; + } + } + + row_mysql_unlock_data_dictionary(m_prebuilt->trx); + } else { + drop_index = NULL; + } + + /* Check if any of the existing indexes are marked as corruption + and if they are, refuse adding more indexes. */ + if (ha_alter_info->handler_flags & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { + for (dict_index_t* index = dict_table_get_first_index(indexed_table); + index != NULL; index = dict_table_get_next_index(index)) { + + if (!index->to_be_dropped && index->is_committed() + && index->is_corrupted()) { + my_error(ER_INDEX_CORRUPT, MYF(0), index->name()); + goto err_exit; + } + } + } + + if (ha_alter_info->handler_flags + & ALTER_ADD_FOREIGN_KEY) { + ut_ad(!m_prebuilt->trx->check_foreigns); + + alter_fill_stored_column(altered_table, m_prebuilt->table, + &s_cols, &s_heap); + + add_fk = static_cast( + mem_heap_zalloc( + heap, + ha_alter_info->alter_info->key_list.elements + * sizeof(dict_foreign_t*))); + + if (!innobase_get_foreign_key_info( + ha_alter_info, table_share, + m_prebuilt->table, col_names, + drop_index, n_drop_index, + add_fk, &n_add_fk, m_prebuilt->trx, s_cols)) { +err_exit: + if (n_drop_index) { + row_mysql_lock_data_dictionary(m_prebuilt->trx); + + /* Clear the to_be_dropped flags, which might + have been set at this point. 
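+
+/* Illustrative sketch (not part of the upstream patch): the code above sets
+index->to_be_dropped before validating FOREIGN KEY dependencies and must
+clear the flags again on every error path (see the err_exit label below).
+The same protocol expressed as a scope guard; IndexStub is a hypothetical
+stand-in for dict_index_t: */
+#include <cstddef>
+
+struct IndexStub { unsigned to_be_dropped:1; };
+
+class ToBeDroppedGuard {
+	IndexStub**	m_idx;
+	size_t		m_n;
+	bool		m_committed = false;
+public:
+	ToBeDroppedGuard(IndexStub** idx, size_t n) : m_idx(idx), m_n(n) {
+		for (size_t i = 0; i < n; i++) idx[i]->to_be_dropped = 1;
+	}
+	void commit() { m_committed = true; }
+	~ToBeDroppedGuard() {
+		/* roll the flags back unless commit() was reached */
+		if (!m_committed)
+			for (size_t i = 0; i < m_n; i++)
+				m_idx[i]->to_be_dropped = 0;
+	}
+};
+/* The handler keeps explicit goto-based cleanup instead, mainly because the
+flags may only be toggled while holding the data dictionary latch. */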
*/ + for (ulint i = 0; i < n_drop_index; i++) { + ut_ad(drop_index[i]->is_committed()); + drop_index[i]->to_be_dropped = 0; + } + + row_mysql_unlock_data_dictionary( + m_prebuilt->trx); + } + + for (uint i = 0; i < n_add_fk; i++) { + if (add_fk[i]) { + dict_foreign_free(add_fk[i]); + } + } + + if (heap) { + mem_heap_free(heap); + } + + if (s_cols != NULL) { + UT_DELETE(s_cols); + mem_heap_free(s_heap); + } + + goto err_exit_no_heap; + } + + if (s_cols != NULL) { + UT_DELETE(s_cols); + mem_heap_free(s_heap); + } + } + + /** Alter shouldn't support if the foreign and referenced + index columns are modified */ + if (ha_alter_info->handler_flags + & ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE) { + + for (uint i= 0, n_v_col= 0; i < table->s->fields; + i++) { + Field* field = table->field[i]; + + /* Altering the virtual column is not + supported for inplace alter algorithm */ + if (field->vcol_info) { + n_v_col++; + continue; + } + + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + if (new_field.field == field) { + if (!field->is_equal(new_field)) { + goto field_changed; + } + break; + } + } + + continue; +field_changed: + const char* col_name= field->field_name.str; + dict_col_t *col= dict_table_get_nth_col( + m_prebuilt->table, i - n_v_col); + if (check_col_is_in_fk_indexes( + m_prebuilt->table, col, col_name, + span( + const_cast( + drop_fk), n_drop_fk), + span( + const_cast( + add_fk), n_add_fk))) + goto err_exit; + } + } + + if (ha_alter_info->handler_flags & ALTER_RENAME_INDEX) { + for (const Alter_inplace_info::Rename_key_pair& pair : + ha_alter_info->rename_keys) { + dict_index_t* index = dict_table_get_index_on_name( + indexed_table, pair.old_key->name.str); + + if (!index || index->is_corrupted()) { + my_error(ER_INDEX_CORRUPT, MYF(0), + index->name()); + goto err_exit; + } + } + } + + const ha_table_option_struct& alt_opt= + *ha_alter_info->create_info->option_struct; + + ha_innobase_inplace_ctx *ctx = NULL; + + if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA) + || ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_NOCREATE + | INNOBASE_ALTER_INSTANT)) + == ALTER_OPTIONS + && !alter_options_need_rebuild(ha_alter_info, table))) { + + DBUG_ASSERT(!m_prebuilt->trx->dict_operation_lock_mode); + online_retry_drop_indexes(m_prebuilt->table, m_user_thd); + + if (heap) { + ctx = new ha_innobase_inplace_ctx( + m_prebuilt, + drop_index, n_drop_index, + drop_fk, n_drop_fk, + add_fk, n_add_fk, + ha_alter_info->online, + heap, indexed_table, + col_names, ULINT_UNDEFINED, 0, 0, + (ha_alter_info->ignore + || !thd_is_strict_mode(m_user_thd)), + alt_opt.page_compressed, + alt_opt.page_compression_level); + ha_alter_info->handler_ctx = ctx; + } + + if ((ha_alter_info->handler_flags + & ALTER_DROP_VIRTUAL_COLUMN) + && prepare_inplace_drop_virtual(ha_alter_info, table)) { + DBUG_RETURN(true); + } + + if ((ha_alter_info->handler_flags + & ALTER_ADD_VIRTUAL_COLUMN) + && prepare_inplace_add_virtual( + ha_alter_info, altered_table, table)) { + DBUG_RETURN(true); + } + + if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA) + && alter_templ_needs_rebuild(altered_table, ha_alter_info, + ctx->new_table) + && ctx->new_table->n_v_cols > 0) { + /* Changing maria record structure may end up here only + if virtual columns were altered. In this case, however, + vc_templ should be rebuilt. Since we don't actually + change any stored data, we can just dispose vc_templ; + it will be recreated on next ha_innobase::open(). 
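+
+/* Illustrative sketch (not part of the upstream patch): the type-change loop
+above converts a server-side field index into an InnoDB stored-column
+ordinal with i - n_v_col, because TABLE::field[] interleaves stored and
+virtual columns while InnoDB numbers them separately. A self-contained
+model of that mapping: */
+#include <vector>
+#include <cstddef>
+#include <cstdint>	/* SIZE_MAX */
+
+struct FieldStub { bool is_virtual; };
+
+/* Return the stored-column ordinal of fields[field_no], or SIZE_MAX when
+the field is virtual and has no stored ordinal. */
+static size_t stored_col_no(const std::vector<FieldStub>& fields,
+			    size_t field_no)
+{
+	size_t n_virtual = 0;	/* virtual fields seen before field_no */
+	for (size_t i = 0; i < field_no; i++)
+		if (fields[i].is_virtual)
+			n_virtual++;
+	return fields[field_no].is_virtual ? SIZE_MAX : field_no - n_virtual;
+}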
*/ + + DBUG_ASSERT(ctx->new_table == ctx->old_table); + + dict_free_vc_templ(ctx->new_table->vc_templ); + UT_DELETE(ctx->new_table->vc_templ); + + ctx->new_table->vc_templ = NULL; + } + + +success: + /* Memorize the future transaction ID for committing + the data dictionary change, to be reported by + ha_innobase::table_version(). */ + m_prebuilt->trx_id = (ha_alter_info->handler_flags + & ~INNOBASE_INPLACE_IGNORE) + ? static_cast + (ha_alter_info->handler_ctx)->trx->id + : 0; + DBUG_RETURN(false); + } + + /* If we are to build a full-text search index, check whether + the table already has a DOC ID column. If not, we will need to + add a Doc ID hidden column and rebuild the primary index */ + if (innobase_fulltext_exist(altered_table)) { + ulint doc_col_no; + ulint num_v = 0; + + if (!innobase_fts_check_doc_id_col( + m_prebuilt->table, + altered_table, &fts_doc_col_no, &num_v)) { + + fts_doc_col_no = altered_table->s->fields - num_v; + add_fts_doc_id = true; + add_fts_doc_id_idx = true; + + } else if (fts_doc_col_no == ULINT_UNDEFINED) { + goto err_exit; + } + + switch (innobase_fts_check_doc_id_index( + m_prebuilt->table, altered_table, + &doc_col_no)) { + case FTS_NOT_EXIST_DOC_ID_INDEX: + add_fts_doc_id_idx = true; + break; + case FTS_INCORRECT_DOC_ID_INDEX: + my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0), + FTS_DOC_ID_INDEX_NAME); + goto err_exit; + case FTS_EXIST_DOC_ID_INDEX: + DBUG_ASSERT( + doc_col_no == fts_doc_col_no + || doc_col_no == ULINT_UNDEFINED + || (ha_alter_info->handler_flags + & (ALTER_STORED_COLUMN_ORDER + | ALTER_DROP_STORED_COLUMN + | ALTER_ADD_STORED_BASE_COLUMN))); + } + } + + /* See if an AUTO_INCREMENT column was added. */ + uint i = 0; + ulint num_v = 0; + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + const Field* field; + + DBUG_ASSERT(i < altered_table->s->fields); + + for (uint old_i = 0; table->field[old_i]; old_i++) { + if (new_field.field == table->field[old_i]) { + goto found_col; + } + } + + /* This is an added column. */ + DBUG_ASSERT(!new_field.field); + DBUG_ASSERT(ha_alter_info->handler_flags + & ALTER_ADD_COLUMN); + + field = altered_table->field[i]; + + DBUG_ASSERT((field->unireg_check + == Field::NEXT_NUMBER) + == !!(field->flags & AUTO_INCREMENT_FLAG)); + + if (field->flags & AUTO_INCREMENT_FLAG) { + if (add_autoinc_col_no != ULINT_UNDEFINED) { + /* This should have been blocked earlier. 
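+
+/* Illustrative sketch (not part of the upstream patch): the block above
+decides whether the rebuild must add the hidden FTS_DOC_ID column and/or
+FTS_DOC_ID_INDEX. The decision, condensed into a pure function over the
+results of the two probes (hypothetical stand-ins for
+innobase_fts_check_doc_id_col() and innobase_fts_check_doc_id_index()): */
+enum class DocIdIndexState { MISSING, INCORRECT, EXISTS };
+
+struct FtsDocIdPlan {
+	bool add_doc_id_col;
+	bool add_doc_id_index;
+	bool fail;	/* an incompatible FTS_DOC_ID_INDEX already exists */
+};
+
+static FtsDocIdPlan plan_fts_doc_id(bool have_doc_id_col,
+				    DocIdIndexState idx)
+{
+	FtsDocIdPlan p = {false, false, false};
+	if (!have_doc_id_col)
+		/* adding the column implies adding its unique index */
+		p.add_doc_id_col = p.add_doc_id_index = true;
+	switch (idx) {
+	case DocIdIndexState::MISSING:	 p.add_doc_id_index = true; break;
+	case DocIdIndexState::INCORRECT: p.fail = true; break;
+	case DocIdIndexState::EXISTS:	 break;
+	}
+	return p;
+}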
*/ + ut_ad(0); + my_error(ER_WRONG_AUTO_KEY, MYF(0)); + goto err_exit; + } + + /* Get the col no of the old table non-virtual column array */ + add_autoinc_col_no = i - num_v; + + autoinc_col_max_value = innobase_get_int_col_max_value(field); + } +found_col: + num_v += !new_field.stored_in_db(); + i++; + } + + DBUG_ASSERT(heap); + DBUG_ASSERT(m_user_thd == m_prebuilt->trx->mysql_thd); + DBUG_ASSERT(!ha_alter_info->handler_ctx); + + ha_alter_info->handler_ctx = new ha_innobase_inplace_ctx( + m_prebuilt, + drop_index, n_drop_index, + drop_fk, n_drop_fk, add_fk, n_add_fk, + ha_alter_info->online, + heap, m_prebuilt->table, col_names, + add_autoinc_col_no, + ha_alter_info->create_info->auto_increment_value, + autoinc_col_max_value, + ha_alter_info->ignore || !thd_is_strict_mode(m_user_thd), + alt_opt.page_compressed, alt_opt.page_compression_level); + + if (!prepare_inplace_alter_table_dict( + ha_alter_info, altered_table, table, + table_share->table_name.str, + info.flags(), info.flags2(), + fts_doc_col_no, add_fts_doc_id, + add_fts_doc_id_idx)) { + goto success; + } + + DBUG_RETURN(true); +} + +/* Check whether a columnn length change alter operation requires +to rebuild the template. +@param[in] altered_table TABLE object for new version of table. +@param[in] ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used + during in-place alter. +@param[in] table table being altered +@return TRUE if needs rebuild. */ +static +bool +alter_templ_needs_rebuild( + const TABLE* altered_table, + const Alter_inplace_info* ha_alter_info, + const dict_table_t* table) +{ + ulint i = 0; + + for (Field** fp = altered_table->field; *fp; fp++, i++) { + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { + for (ulint j=0; j < table->n_cols; j++) { + dict_col_t* cols + = dict_table_get_nth_col(table, j); + if (cf.length > cols->len) { + return(true); + } + } + } + } + + return(false); +} + +/** Alter the table structure in-place with operations +specified using Alter_inplace_info. +The level of concurrency allowed during this operation depends +on the return value from check_if_supported_inplace_alter(). + +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. 
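+
+/* Illustrative sketch (not part of the upstream patch): the helper defined
+above decides whether the cached virtual-column template is stale because a
+column definition grew. A simplified, self-contained version comparing new
+lengths against the lengths the template was built for (the InnoDB helper
+is coarser and compares every Create_field against every column): */
+#include <vector>
+#include <cstdint>
+
+static bool templ_needs_rebuild(const std::vector<uint32_t>& new_lens,
+				const std::vector<uint32_t>& cached_lens)
+{
+	for (size_t i = 0; i < new_lens.size() && i < cached_lens.size(); i++)
+		if (new_lens[i] > cached_lens[i])
+			return true;	/* some column got longer */
+	return false;
+}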
+ +@retval true Failure +@retval false Success +*/ + +bool +ha_innobase::inplace_alter_table( +/*=============================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) +{ + dberr_t error; + dict_add_v_col_t* add_v = NULL; + dict_vcol_templ_t* s_templ = NULL; + dict_vcol_templ_t* old_templ = NULL; + struct TABLE* eval_table = altered_table; + bool rebuild_templ = false; + DBUG_ENTER("inplace_alter_table"); + DBUG_ASSERT(!srv_read_only_mode); + + DEBUG_SYNC(m_user_thd, "innodb_inplace_alter_table_enter"); + + /* Ignore the inplace alter phase when table is empty */ + if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA) + || ha_alter_info->mdl_exclusive_after_prepare) { +ok_exit: + DEBUG_SYNC(m_user_thd, "innodb_after_inplace_alter_table"); + DBUG_RETURN(false); + } + + if ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_NOCREATE + | INNOBASE_ALTER_INSTANT)) + == ALTER_OPTIONS + && !alter_options_need_rebuild(ha_alter_info, table)) { + goto ok_exit; + } + + ha_innobase_inplace_ctx* ctx + = static_cast + (ha_alter_info->handler_ctx); + + DBUG_ASSERT(ctx); + DBUG_ASSERT(ctx->trx); + DBUG_ASSERT(ctx->prebuilt == m_prebuilt); + + if (ctx->is_instant()) goto ok_exit; + + dict_index_t* pk = dict_table_get_first_index(m_prebuilt->table); + ut_ad(pk != NULL); + + /* For partitioned tables this could be already allocated from a + previous partition invocation. For normal tables this is NULL. */ + UT_DELETE(ctx->m_stage); + + ctx->m_stage = UT_NEW_NOKEY(ut_stage_alter_t(pk)); + + if (!m_prebuilt->table->is_readable()) { + goto all_done; + } + + /* If we are doing a table rebuilding or having added virtual + columns in the same clause, we will need to build a table template + that carries translation information between MySQL TABLE and InnoDB + table, which indicates the virtual columns and their base columns + info. This is used to do the computation callback, so that the + data in base columns can be extracted send to server. + If the Column length changes and it is a part of virtual + index then we need to rebuild the template. */ + rebuild_templ + = ctx->need_rebuild() + || ((ha_alter_info->handler_flags + & ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE) + && alter_templ_needs_rebuild( + altered_table, ha_alter_info, ctx->new_table)); + + if ((ctx->new_table->n_v_cols > 0) && rebuild_templ) { + /* Save the templ if isn't NULL so as to restore the + original state in case of alter operation failures. */ + if (ctx->new_table->vc_templ != NULL && !ctx->need_rebuild()) { + old_templ = ctx->new_table->vc_templ; + } + s_templ = UT_NEW_NOKEY(dict_vcol_templ_t()); + + innobase_build_v_templ( + altered_table, ctx->new_table, s_templ, NULL, false); + + ctx->new_table->vc_templ = s_templ; + } else if (ctx->num_to_add_vcol > 0 && ctx->num_to_drop_vcol == 0) { + /* if there is ongoing drop virtual column, then we disallow + inplace add index on newly added virtual column, so it does + not need to come in here to rebuild template with add_v. 
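+
+/* Illustrative sketch (not part of the upstream patch): above, the current
+vc_templ pointer is remembered in old_templ before a freshly built template
+is installed, so a failed ALTER can put the original back. The same
+save/swap/restore pattern in isolation; TemplStub and TableStub are
+hypothetical stand-ins for dict_vcol_templ_t and dict_table_t: */
+struct TemplStub { int placeholder; };
+struct TableStub { TemplStub* vc_templ = nullptr; };
+
+struct TemplSwap {
+	TableStub&	table;
+	TemplStub*	saved;
+	TemplSwap(TableStub& t, TemplStub* fresh)
+		: table(t), saved(t.vc_templ) {
+		table.vc_templ = fresh;	/* install the new template */
+	}
+	void rollback() { table.vc_templ = saved; }	/* on ALTER failure */
+};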
+ Please also see the assertion in innodb_v_adjust_idx_col() */ + + s_templ = UT_NEW_NOKEY(dict_vcol_templ_t()); + + add_v = static_cast( + mem_heap_alloc(ctx->heap, sizeof *add_v)); + add_v->n_v_col = ctx->num_to_add_vcol; + add_v->v_col = ctx->add_vcol; + add_v->v_col_name = ctx->add_vcol_name; + + innobase_build_v_templ( + altered_table, ctx->new_table, s_templ, add_v, false); + old_templ = ctx->new_table->vc_templ; + ctx->new_table->vc_templ = s_templ; + } + + /* Drop virtual column without rebuild will keep dict table + unchanged, we use old table to evaluate virtual column value + in innobase_get_computed_value(). */ + if (!ctx->need_rebuild() && ctx->num_to_drop_vcol > 0) { + eval_table = table; + } + + /* Read the clustered index of the table and build + indexes based on this information using temporary + files and merge sort. */ + DBUG_EXECUTE_IF("innodb_OOM_inplace_alter", + error = DB_OUT_OF_MEMORY; goto oom;); + + error = row_merge_build_indexes( + m_prebuilt->trx, + m_prebuilt->table, ctx->new_table, + ctx->online, + ctx->add_index, ctx->add_key_numbers, ctx->num_to_add_index, + altered_table, ctx->defaults, ctx->col_map, + ctx->add_autoinc, ctx->sequence, ctx->skip_pk_sort, + ctx->m_stage, add_v, eval_table, ctx->allow_not_null, + ctx->change_col_collate.empty() + ? nullptr : &ctx->change_col_collate); + +#ifndef DBUG_OFF +oom: +#endif /* !DBUG_OFF */ + if (error == DB_SUCCESS && ctx->online && ctx->need_rebuild()) { + DEBUG_SYNC_C("row_log_table_apply1_before"); + error = row_log_table_apply( + ctx->thr, m_prebuilt->table, altered_table, + ctx->m_stage, ctx->new_table); + } + + /* Init online ddl status variables */ + onlineddl_rowlog_rows = 0; + onlineddl_rowlog_pct_used = 0; + onlineddl_pct_progress = 0; + + if (s_templ) { + ut_ad(ctx->need_rebuild() || ctx->num_to_add_vcol > 0 + || rebuild_templ); + dict_free_vc_templ(s_templ); + UT_DELETE(s_templ); + + ctx->new_table->vc_templ = old_templ; + } + + DEBUG_SYNC_C("inplace_after_index_build"); + + DBUG_EXECUTE_IF("create_index_fail", + error = DB_DUPLICATE_KEY; + m_prebuilt->trx->error_key_num = ULINT_UNDEFINED;); + + /* After an error, remove all those index definitions + from the dictionary which were defined. */ + + switch (error) { + KEY* dup_key; + default: + my_error_innodb(error, + table_share->table_name.str, + m_prebuilt->table->flags); + break; + all_done: + case DB_SUCCESS: + ut_d(dict_sys.freeze(SRW_LOCK_CALL)); + ut_d(dict_table_check_for_dup_indexes( + m_prebuilt->table, CHECK_PARTIAL_OK)); + ut_d(dict_sys.unfreeze()); + /* prebuilt->table->n_ref_count can be anything here, + given that we hold at most a shared lock on the table. */ + goto ok_exit; + case DB_DUPLICATE_KEY: + if (m_prebuilt->trx->error_key_num == ULINT_UNDEFINED + || ha_alter_info->key_count == 0) { + /* This should be the hidden index on + FTS_DOC_ID, or there is no PRIMARY KEY in the + table. Either way, we should be seeing and + reporting a bogus duplicate key error. 
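+
+/* Illustrative sketch (not part of the upstream patch): the calls above are
+the two phases of a LOCK=NONE rebuild: row_merge_build_indexes() bulk-loads
+the new indexes from a snapshot of the clustered index while concurrent DML
+is recorded in an online row log, and row_log_table_apply() then replays
+that log onto the rebuilt table. The control flow, schematically;
+build_indexes_from_snapshot() and replay_row_log() are hypothetical
+stand-ins: */
+enum class BuildErr { SUCCESS, DUPLICATE_KEY, LOG_TOO_BIG };
+
+static BuildErr build_indexes_from_snapshot();	/* phase 1: merge sort */
+static BuildErr replay_row_log();		/* phase 2: catch up DML */
+
+static BuildErr online_rebuild(bool online)
+{
+	BuildErr err = build_indexes_from_snapshot();
+	if (err == BuildErr::SUCCESS && online)
+		err = replay_row_log();
+	return err;
+}
+
+/* trivial stubs so the sketch stands alone */
+static BuildErr build_indexes_from_snapshot() { return BuildErr::SUCCESS; }
+static BuildErr replay_row_log() { return BuildErr::SUCCESS; }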
*/ + dup_key = NULL; + } else { + DBUG_ASSERT(m_prebuilt->trx->error_key_num + < ha_alter_info->key_count); + dup_key = &ha_alter_info->key_info_buffer[ + m_prebuilt->trx->error_key_num]; + } + print_keydup_error(altered_table, dup_key, MYF(0)); + break; + case DB_ONLINE_LOG_TOO_BIG: + DBUG_ASSERT(ctx->online); + my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0), + get_error_key_name(m_prebuilt->trx->error_key_num, + ha_alter_info, m_prebuilt->table)); + break; + case DB_INDEX_CORRUPT: + my_error(ER_INDEX_CORRUPT, MYF(0), + get_error_key_name(m_prebuilt->trx->error_key_num, + ha_alter_info, m_prebuilt->table)); + break; + case DB_DECRYPTION_FAILED: + String str; + const char* engine= table_type(); + get_error_message(HA_ERR_DECRYPTION_FAILED, &str); + my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, + str.c_ptr(), engine); + break; + } + + /* prebuilt->table->n_ref_count can be anything here, given + that we hold at most a shared lock on the table. */ + m_prebuilt->trx->error_info = NULL; + ctx->trx->error_state = DB_SUCCESS; + + DBUG_RETURN(true); +} + +/** Free the modification log for online table rebuild. +@param table table that was being rebuilt online */ +static +void +innobase_online_rebuild_log_free( +/*=============================*/ + dict_table_t* table) +{ + dict_index_t* clust_index = dict_table_get_first_index(table); + ut_ad(dict_sys.locked()); + clust_index->lock.x_lock(SRW_LOCK_CALL); + + if (clust_index->online_log) { + ut_ad(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_CREATION); + clust_index->online_status = ONLINE_INDEX_COMPLETE; + row_log_free(clust_index->online_log); + clust_index->online_log = NULL; + DEBUG_SYNC_C("innodb_online_rebuild_log_free_aborted"); + } + + DBUG_ASSERT(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); + clust_index->lock.x_unlock(); +} + +/** For each user column, which is part of an index which is not going to be +dropped, it checks if the column number of the column is same as col_no +argument passed. +@param[in] table table +@param[in] col_no column number +@param[in] is_v if this is a virtual column +@param[in] only_committed whether to consider only committed indexes +@retval true column exists +@retval false column does not exist, true if column is system column or +it is in the index. */ +static +bool +check_col_exists_in_indexes( + const dict_table_t* table, + ulint col_no, + bool is_v, + bool only_committed = false) +{ + /* This function does not check system columns */ + if (!is_v && dict_table_get_nth_col(table, col_no)->mtype == DATA_SYS) { + return(true); + } + + for (const dict_index_t* index = dict_table_get_first_index(table); + index; + index = dict_table_get_next_index(index)) { + + if (only_committed + ? 
!index->is_committed() + : index->to_be_dropped) { + continue; + } + + for (ulint i = 0; i < index->n_user_defined_cols; i++) { + const dict_col_t* idx_col + = dict_index_get_nth_col(index, i); + + if (is_v && idx_col->is_virtual()) { + const dict_v_col_t* v_col = reinterpret_cast< + const dict_v_col_t*>(idx_col); + if (v_col->v_pos == col_no) { + return(true); + } + } + + if (!is_v && !idx_col->is_virtual() + && dict_col_get_no(idx_col) == col_no) { + return(true); + } + } + } + + return(false); +} + +/** Rollback a secondary index creation, drop the indexes with +temparary index prefix +@param user_table InnoDB table +@param table the TABLE +@param locked TRUE=table locked, FALSE=may need to do a lazy drop +@param trx the transaction +@param alter_trx transaction which takes S-lock on the table + while creating the index */ +static +void +innobase_rollback_sec_index( + dict_table_t* user_table, + const TABLE* table, + bool locked, + trx_t* trx, + const trx_t* alter_trx=NULL) +{ + row_merge_drop_indexes(trx, user_table, locked, alter_trx); + + /* Free the table->fts only if there is no FTS_DOC_ID + in the table */ + if (user_table->fts + && !DICT_TF2_FLAG_IS_SET(user_table, + DICT_TF2_FTS_HAS_DOC_ID) + && !innobase_fulltext_exist(table)) { + user_table->fts->~fts_t(); + user_table->fts = nullptr; + } +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Roll back the changes made during prepare_inplace_alter_table() +and inplace_alter_table() inside the storage engine. Note that the +allowed level of concurrency during this operation will be the same as +for inplace_alter_table() and thus might be higher than during +prepare_inplace_alter_table(). (E.g concurrent writes were blocked +during prepare, but might not be during commit). + +@param ha_alter_info Data used during in-place alter. +@param table the TABLE +@param prebuilt the prebuilt struct +@retval true Failure +@retval false Success +*/ +inline bool rollback_inplace_alter_table(Alter_inplace_info *ha_alter_info, + const TABLE *table, + row_prebuilt_t *prebuilt) +{ + bool fail= false; + ha_innobase_inplace_ctx *ctx= static_cast + (ha_alter_info->handler_ctx); + + DBUG_ENTER("rollback_inplace_alter_table"); + + DEBUG_SYNC_C("innodb_rollback_inplace_alter_table"); + if (!ctx) + /* If we have not started a transaction yet, + (almost) nothing has been or needs to be done. */ + dict_sys.lock(SRW_LOCK_CALL); + else if (ctx->trx->state == TRX_STATE_NOT_STARTED) + goto free_and_exit; + else if (ctx->new_table) + { + ut_ad(ctx->trx->state == TRX_STATE_ACTIVE); + const bool fts_exist= (ctx->new_table->flags2 & + (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)) || + ctx->adding_fulltext_index(); + if (ctx->need_rebuild()) + { + if (fts_exist) + { + fts_optimize_remove_table(ctx->new_table); + purge_sys.stop_FTS(*ctx->new_table); + } + + dberr_t err= lock_table_for_trx(ctx->new_table, ctx->trx, LOCK_X); + if (fts_exist) + { + if (err == DB_SUCCESS) + err= fts_lock_common_tables(ctx->trx, *ctx->new_table); + for (const dict_index_t* index= ctx->new_table->indexes.start; + err == DB_SUCCESS && index; index= index->indexes.next) + if (index->type & DICT_FTS) + err= fts_lock_index_tables(ctx->trx, *index); + } + if (err == DB_SUCCESS) + err= lock_sys_tables(ctx->trx); + + row_mysql_lock_data_dictionary(ctx->trx); + /* Detach ctx->new_table from dict_index_t::online_log. 
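+
+/* Illustrative sketch (not part of the upstream patch): the helper above is
+used to decide whether dict_col_t::ord_part may be cleared, i.e. whether
+any surviving index still references the column. A self-contained model
+with plain integers as column numbers: */
+#include <vector>
+
+struct IndexModel {
+	bool			to_be_dropped;
+	std::vector<unsigned>	user_cols;	/* column numbers in the key */
+};
+
+static bool col_used_by_surviving_index(
+	const std::vector<IndexModel>&	indexes,
+	unsigned			col_no)
+{
+	for (const IndexModel& index : indexes) {
+		if (index.to_be_dropped)
+			continue;	/* this index is going away */
+		for (unsigned c : index.user_cols)
+			if (c == col_no)
+				return true;
+	}
+	return false;
+}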
*/ + innobase_online_rebuild_log_free(ctx->old_table); + + ut_d(const bool last_handle=) ctx->new_table->release(); + ut_ad(last_handle); + if (err == DB_SUCCESS) + err= ctx->trx->drop_table(*ctx->new_table); + + if (err == DB_SUCCESS) + for (const dict_index_t* index= ctx->new_table->indexes.start; index; + index= index->indexes.next) + if (index->type & DICT_FTS) + if (dberr_t err2= fts_drop_index_tables(ctx->trx, *index)) + err= err2; + + if (err != DB_SUCCESS) + { + my_error_innodb(err, table->s->table_name.str, ctx->new_table->flags); + fail= true; + } + } + else + { + DBUG_ASSERT(!(ha_alter_info->handler_flags & ALTER_ADD_PK_INDEX)); + DBUG_ASSERT(ctx->old_table == prebuilt->table); + uint &innodb_lock_wait_timeout= + thd_lock_wait_timeout(ctx->trx->mysql_thd); + const uint save_timeout= innodb_lock_wait_timeout; + innodb_lock_wait_timeout= ~0U; /* infinite */ + dict_index_t *old_clust_index= ctx->old_table->indexes.start; + old_clust_index->lock.x_lock(SRW_LOCK_CALL); + old_clust_index->online_log= nullptr; + old_clust_index->lock.x_unlock(); + if (fts_exist) + { + const dict_index_t *fts_index= nullptr; + for (ulint a= 0; a < ctx->num_to_add_index; a++) + { + const dict_index_t *index = ctx->add_index[a]; + if (index->type & DICT_FTS) + fts_index= index; + } + + /* Remove the fts table from fts_optimize_wq if there are + no FTS secondary index exist other than newly added one */ + if (fts_index && + (ib_vector_is_empty(prebuilt->table->fts->indexes) || + (ib_vector_size(prebuilt->table->fts->indexes) == 1 && + fts_index == static_cast( + ib_vector_getp(prebuilt->table->fts->indexes, 0))))) + fts_optimize_remove_table(prebuilt->table); + + purge_sys.stop_FTS(*prebuilt->table); + ut_a(!fts_index || !fts_lock_index_tables(ctx->trx, *fts_index)); + ut_a(!fts_lock_common_tables(ctx->trx, *ctx->new_table)); + ut_a(!lock_sys_tables(ctx->trx)); + } + else + { + ut_a(!lock_table_for_trx(dict_sys.sys_indexes, ctx->trx, LOCK_X)); + ut_a(!lock_table_for_trx(dict_sys.sys_fields, ctx->trx, LOCK_X)); + } + innodb_lock_wait_timeout= save_timeout; + DEBUG_SYNC_C("innodb_rollback_after_fts_lock"); + row_mysql_lock_data_dictionary(ctx->trx); + ctx->rollback_instant(); + innobase_rollback_sec_index(ctx->old_table, table, + ha_alter_info->alter_info->requested_lock == + Alter_info::ALTER_TABLE_LOCK_EXCLUSIVE, + ctx->trx, prebuilt->trx); + ctx->clean_new_vcol_index(); + ctx->cleanup_col_collation(); + ut_d(dict_table_check_for_dup_indexes(ctx->old_table, CHECK_ABORTED_OK)); + } + + DEBUG_SYNC(ctx->trx->mysql_thd, "before_commit_rollback_inplace"); + commit_unlock_and_unlink(ctx->trx); + if (fts_exist) + purge_sys.resume_FTS(); + if (ctx->old_table->fts) + { + dict_sys.lock(SRW_LOCK_CALL); + ut_ad(fts_check_cached_index(ctx->old_table)); + fts_optimize_add_table(ctx->old_table); + dict_sys.unlock(); + } + goto free_and_exit; + } + else + { +free_and_exit: + DBUG_ASSERT(ctx->prebuilt == prebuilt); + ctx->trx->free(); + ctx->trx= nullptr; + + dict_sys.lock(SRW_LOCK_CALL); + + if (ctx->add_vcol) + { + for (ulint i = 0; i < ctx->num_to_add_vcol; i++) + ctx->add_vcol[i].~dict_v_col_t(); + ctx->num_to_add_vcol= 0; + ctx->add_vcol= nullptr; + } + + for (ulint i= 0; i < ctx->num_to_add_fk; i++) + dict_foreign_free(ctx->add_fk[i]); + /* Clear the to_be_dropped flags in the data dictionary cache. + The flags may already have been cleared, in case an error was + detected in commit_inplace_alter_table(). 
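+
+/* Illustrative sketch (not part of the upstream patch): above, the session's
+innodb_lock_wait_timeout is forced to ~0U ("infinite") while the rollback
+acquires locks it must not abandon, and is then restored from save_timeout.
+The same save/override/restore protocol as an RAII guard: */
+struct LockTimeoutOverride {
+	unsigned&	var;	/* reference to the session variable */
+	unsigned	saved;
+	explicit LockTimeoutOverride(unsigned& v) : var(v), saved(v) {
+		var = ~0U;	/* wait forever; the rollback must succeed */
+	}
+	~LockTimeoutOverride() { var = saved; }
+};
+/* usage: { LockTimeoutOverride guard(timeout); take_locks(); } */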
*/ + for (ulint i= 0; i < ctx->num_to_drop_index; i++) + { + dict_index_t *index= ctx->drop_index[i]; + DBUG_ASSERT(index->is_committed()); + index->to_be_dropped= 0; + } + } + + DBUG_ASSERT(!prebuilt->table->indexes.start->online_log); + DBUG_ASSERT(prebuilt->table->indexes.start->online_status == + ONLINE_INDEX_COMPLETE); + + /* Reset dict_col_t::ord_part for unindexed columns */ + for (ulint i= 0; i < dict_table_get_n_cols(prebuilt->table); i++) + { + dict_col_t &col= prebuilt->table->cols[i]; + if (col.ord_part && !check_col_exists_in_indexes(prebuilt->table, i, false, + true)) + col.ord_part= 0; + } + + for (ulint i = 0; i < dict_table_get_n_v_cols(prebuilt->table); i++) + { + dict_col_t &col = prebuilt->table->v_cols[i].m_col; + if (col.ord_part && !check_col_exists_in_indexes(prebuilt->table, i, true, + true)) + col.ord_part= 0; + } + dict_sys.unlock(); + trx_commit_for_mysql(prebuilt->trx); + prebuilt->trx_id = 0; + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + DBUG_RETURN(fail); +} + +/** Drop a FOREIGN KEY constraint from the data dictionary tables. +@param trx data dictionary transaction +@param table_name Table name in MySQL +@param foreign_id Foreign key constraint identifier +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_drop_foreign_try( +/*======================*/ + trx_t* trx, + const char* table_name, + const char* foreign_id) +{ + DBUG_ENTER("innobase_drop_foreign_try"); + + DBUG_ASSERT(trx->dict_operation); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(dict_sys.locked()); + + /* Drop the constraint from the data dictionary. */ + static const char sql[] = + "PROCEDURE DROP_FOREIGN_PROC () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FOREIGN WHERE ID=:id;\n" + "DELETE FROM SYS_FOREIGN_COLS WHERE ID=:id;\n" + "END;\n"; + + dberr_t error; + pars_info_t* info; + + info = pars_info_create(); + pars_info_add_str_literal(info, "id", foreign_id); + + trx->op_info = "dropping foreign key constraint from dictionary"; + error = que_eval_sql(info, sql, trx); + trx->op_info = ""; + + DBUG_EXECUTE_IF("ib_drop_foreign_error", + error = DB_OUT_OF_FILE_SPACE;); + + if (error != DB_SUCCESS) { + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + +/** Rename a column in the data dictionary tables. 
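+
+/* Illustrative sketch (not part of the upstream patch): dictionary changes
+such as DROP_FOREIGN_PROC above are written as small InnoDB SQL procedures
+whose named parameters are bound through pars_info_t before que_eval_sql()
+runs them. The general shape, with exec_dict_sql() as a hypothetical
+stand-in for the pars_info_create()/que_eval_sql() pair: */
+#include <string>
+#include <map>
+
+using Params = std::map<std::string, std::string>;
+
+static int exec_dict_sql(const std::string& sql, const Params& params);
+
+/* Both dictionary tables are updated in one procedure, so the change is
+atomic within the data dictionary transaction. */
+static int drop_foreign_sketch(const std::string& foreign_id)
+{
+	static const char sql[] =
+		"PROCEDURE DROP_FOREIGN_PROC () IS\n"
+		"BEGIN\n"
+		"DELETE FROM SYS_FOREIGN WHERE ID=:id;\n"
+		"DELETE FROM SYS_FOREIGN_COLS WHERE ID=:id;\n"
+		"END;\n";
+	return exec_dict_sql(sql, {{"id", foreign_id}});
+}
+
+static int exec_dict_sql(const std::string&, const Params&) { return 0; }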
+@param[in] ctx ALTER TABLE context +@param[in,out] trx Data dictionary transaction +@param[in] table_name Table name in MySQL +@param[in] from old column name +@param[in] to new column name +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_rename_column_try( + const ha_innobase_inplace_ctx& ctx, + trx_t* trx, + const char* table_name, + const char* from, + const char* to) +{ + dberr_t error; + bool clust_has_wide_format = false; + + DBUG_ENTER("innobase_rename_column_try"); + + DBUG_ASSERT(trx->dict_operation); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(dict_sys.locked()); + + if (ctx.need_rebuild()) { + goto rename_foreign; + } + + error = DB_SUCCESS; + + trx->op_info = "renaming column in SYS_FIELDS"; + + for (const dict_index_t* index = dict_table_get_first_index( + ctx.old_table); + index != NULL; + index = dict_table_get_next_index(index)) { + + bool wide_format = false; + for (size_t i = 0; i < dict_index_get_n_fields(index); i++) { + dict_field_t* field= dict_index_get_nth_field(index, i); + if (field->prefix_len || field->descending) { + wide_format = true; + break; + } + } + + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_field_t& f = index->fields[i]; + DBUG_ASSERT(!f.name == f.col->is_dropped()); + + if (!f.name || my_strcasecmp(system_charset_info, + f.name, from)) { + continue; + } + + pars_info_t* info = pars_info_create(); + ulint pos = wide_format + ? i << 16 | f.prefix_len + | !!f.descending << 15 + : i; + pars_info_add_ull_literal(info, "indexid", index->id); + pars_info_add_int4_literal(info, "nth", pos); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FIELDS_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_FIELDS SET COL_NAME=:new\n" + "WHERE INDEX_ID=:indexid\n" + "AND POS=:nth;\n" + "END;\n", trx); + DBUG_EXECUTE_IF("ib_rename_column_error", + error = DB_OUT_OF_FILE_SPACE;); + + if (error != DB_SUCCESS) { + goto err_exit; + } + + if (!wide_format || !clust_has_wide_format + || f.prefix_len || f.descending) { + continue; + } + + /* For secondary indexes, the + wide_format check can be 'polluted' + by PRIMARY KEY column prefix or descending + field. Try also the simpler encoding + of SYS_FIELDS.POS. 
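+
+/* Illustrative sketch (not part of the upstream patch): as computed above,
+SYS_FIELDS.POS switches to a packed format whenever some field of the index
+has a column prefix or is descending: bits 16 and up hold the field
+ordinal, bit 15 the descending flag, bits 0..14 the prefix length.
+Encode/decode helpers for that layout: */
+#include <cstdint>
+#include <cassert>
+
+static uint32_t sys_fields_pos(uint32_t ordinal, uint32_t prefix_len,
+			       bool descending, bool wide_format)
+{
+	if (!wide_format)
+		return ordinal;	/* simple format: POS is just the ordinal */
+	assert(prefix_len < (1U << 15));
+	return ordinal << 16 | (descending ? 1U : 0U) << 15 | prefix_len;
+}
+
+static void decode_wide_pos(uint32_t pos, uint32_t& ordinal,
+			    uint32_t& prefix_len, bool& descending)
+{
+	ordinal    = pos >> 16;
+	descending = (pos >> 15) & 1;
+	prefix_len = pos & 0x7fff;
+}
+/* e.g. field #2, prefix 10, descending: 2<<16 | 1<<15 | 10 = 0x2800a */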
*/ + info = pars_info_create(); + + pars_info_add_ull_literal(info, "indexid", index->id); + pars_info_add_int4_literal(info, "nth", i); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FIELDS_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_FIELDS SET COL_NAME=:new\n" + "WHERE INDEX_ID=:indexid\n" + "AND POS=:nth;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + goto err_exit; + } + } + + if (index == dict_table_get_first_index(ctx.old_table)) { + clust_has_wide_format = wide_format; + } + } + + if (error != DB_SUCCESS) { +err_exit: + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + DBUG_RETURN(true); + } + +rename_foreign: + trx->op_info = "renaming column in SYS_FOREIGN_COLS"; + + std::set fk_evict; + bool foreign_modified; + + for (dict_foreign_set::const_iterator it = ctx.old_table->foreign_set.begin(); + it != ctx.old_table->foreign_set.end(); + ++it) { + + dict_foreign_t* foreign = *it; + foreign_modified = false; + + for (unsigned i = 0; i < foreign->n_fields; i++) { + if (my_strcasecmp(system_charset_info, + foreign->foreign_col_names[i], + from)) { + continue; + } + + /* Ignore the foreign key rename if fk info + is being dropped. */ + if (innobase_dropping_foreign( + foreign, ctx.drop_fk, + ctx.num_to_drop_fk)) { + continue; + } + + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + pars_info_add_int4_literal(info, "nth", i); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FOREIGN_F_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_FOREIGN_COLS\n" + "SET FOR_COL_NAME=:new\n" + "WHERE ID=:id AND POS=:nth;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + goto err_exit; + } + foreign_modified = true; + } + + if (foreign_modified) { + fk_evict.insert(foreign); + } + } + + for (dict_foreign_set::const_iterator it + = ctx.old_table->referenced_set.begin(); + it != ctx.old_table->referenced_set.end(); + ++it) { + + foreign_modified = false; + dict_foreign_t* foreign = *it; + + for (unsigned i = 0; i < foreign->n_fields; i++) { + if (my_strcasecmp(system_charset_info, + foreign->referenced_col_names[i], + from)) { + continue; + } + + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + pars_info_add_int4_literal(info, "nth", i); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FOREIGN_R_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_FOREIGN_COLS\n" + "SET REF_COL_NAME=:new\n" + "WHERE ID=:id AND POS=:nth;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + goto err_exit; + } + foreign_modified = true; + } + + if (foreign_modified) { + fk_evict.insert(foreign); + } + } + + /* Reload the foreign key info for instant table too. */ + if (ctx.need_rebuild() || ctx.is_instant()) { + std::for_each(fk_evict.begin(), fk_evict.end(), + dict_foreign_remove_from_cache); + } + + trx->op_info = ""; + DBUG_RETURN(false); +} + +/** Rename columns in the data dictionary tables. +@param ha_alter_info Data used during in-place alter. 
+@param ctx In-place ALTER TABLE context +@param table the TABLE +@param trx data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_rename_columns_try( +/*========================*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* table, + trx_t* trx, + const char* table_name) +{ + uint i = 0; + + DBUG_ASSERT(ctx->need_rebuild()); + DBUG_ASSERT(ha_alter_info->handler_flags + & ALTER_COLUMN_NAME); + + for (Field** fp = table->field; *fp; fp++, i++) { + if (!((*fp)->flags & FIELD_IS_RENAMED)) { + continue; + } + + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field == *fp) { + if (innobase_rename_column_try( + *ctx, trx, table_name, + cf.field->field_name.str, + cf.field_name.str)) { + return(true); + } + goto processed_field; + } + } + + ut_error; +processed_field: + continue; + } + + return(false); +} + +/** Enlarge a column in the data dictionary tables. +@param ctx In-place ALTER TABLE context +@param trx data dictionary transaction +@param table_name Table name in MySQL +@param pos 0-based index to user_table->cols[] or user_table->v_cols[] +@param f new column +@param is_v if it's a virtual column +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_rename_or_enlarge_column_try( + ha_innobase_inplace_ctx*ctx, + trx_t* trx, + const char* table_name, + ulint pos, + const Field& f, + bool is_v) +{ + dict_col_t* col; + dict_table_t* user_table = ctx->old_table; + + DBUG_ENTER("innobase_rename_or_enlarge_column_try"); + DBUG_ASSERT(!ctx->need_rebuild()); + + DBUG_ASSERT(trx->dict_operation); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(dict_sys.locked()); + + ulint n_base; + + if (is_v) { + dict_v_col_t* v_col= dict_table_get_nth_v_col(user_table, pos); + pos = dict_create_v_col_pos(v_col->v_pos, v_col->m_col.ind); + col = &v_col->m_col; + n_base = v_col->num_base; + } else { + col = dict_table_get_nth_col(user_table, pos); + n_base = 0; + } + + unsigned prtype; + uint8_t mtype; + uint16_t len; + get_type(f, prtype, mtype, len); + DBUG_ASSERT(!dtype_is_string_type(col->mtype) + || col->mbminlen == f.charset()->mbminlen); + DBUG_ASSERT(col->len <= len); + +#ifdef UNIV_DEBUG + ut_ad(col->mbminlen <= col->mbmaxlen); + switch (mtype) { + case DATA_MYSQL: + if (!(prtype & DATA_BINARY_TYPE) || user_table->not_redundant() + || col->mbminlen != col->mbmaxlen) { + /* NOTE: we could allow this when !(prtype & + DATA_BINARY_TYPE) and ROW_FORMAT is not REDUNDANT and + mbminlenlen == len); + break; + case DATA_BINARY: + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_DECIMAL: + case DATA_BLOB: + break; + default: + ut_ad(!((col->prtype ^ prtype) & ~DATA_VERSIONED)); + ut_ad(col->mtype == mtype); + ut_ad(col->len == len); + } +#endif /* UNIV_DEBUG */ + + const char* col_name = col->name(*user_table); + const bool same_name = !strcmp(col_name, f.field_name.str); + + if (!same_name + && innobase_rename_column_try(*ctx, trx, table_name, + col_name, f.field_name.str)) { + DBUG_RETURN(true); + } + + if (same_name + && col->prtype == prtype && col->mtype == mtype + && col->len == len) { + DBUG_RETURN(false); + } + + DBUG_RETURN(innodb_insert_sys_columns(user_table->id, pos, + f.field_name.str, + mtype, prtype, len, + n_base, trx, true)); +} + +/** Rename or enlarge columns in the data dictionary cache +as part of commit_try_norebuild(). 
+@param ha_alter_info Data used during in-place alter. +@param ctx In-place ALTER TABLE context +@param altered_table metadata after ALTER TABLE +@param table metadata before ALTER TABLE +@param trx data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_rename_or_enlarge_columns_try( + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* altered_table, + const TABLE* table, + trx_t* trx, + const char* table_name) +{ + DBUG_ENTER("innobase_rename_or_enlarge_columns_try"); + + if (!(ha_alter_info->handler_flags + & (ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE + | ALTER_COLUMN_NAME))) { + DBUG_RETURN(false); + } + + ulint i = 0; + ulint num_v = 0; + + for (Field** fp = table->field; *fp; fp++, i++) { + const bool is_v = !(*fp)->stored_in_db(); + ulint idx = is_v ? num_v++ : i - num_v; + + Field** af = altered_table->field; + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field == *fp) { + if (innobase_rename_or_enlarge_column_try( + ctx, trx, table_name, + idx, **af, is_v)) { + DBUG_RETURN(true); + } + break; + } + af++; + } + } + + DBUG_RETURN(false); +} + +/** Rename or enlarge columns in the data dictionary cache +as part of commit_cache_norebuild(). +@param ha_alter_info Data used during in-place alter. +@param altered_table metadata after ALTER TABLE +@param table metadata before ALTER TABLE +@param user_table InnoDB table that was being altered */ +static MY_ATTRIBUTE((nonnull)) +void +innobase_rename_or_enlarge_columns_cache( +/*=====================================*/ + Alter_inplace_info* ha_alter_info, + const TABLE* altered_table, + const TABLE* table, + dict_table_t* user_table) +{ + if (!(ha_alter_info->handler_flags + & (ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE + | ALTER_COLUMN_NAME))) { + return; + } + + uint i = 0; + ulint num_v = 0; + + for (Field** fp = table->field; *fp; fp++, i++) { + const bool is_virtual = !(*fp)->stored_in_db(); + + Field** af = altered_table->field; + for (Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field != *fp) { + af++; + continue; + } + + ulint col_n = is_virtual ? num_v : i - num_v; + dict_col_t *col = is_virtual + ? &dict_table_get_nth_v_col(user_table, col_n) + ->m_col + : dict_table_get_nth_col(user_table, col_n); + const bool is_string= dtype_is_string_type(col->mtype); + DBUG_ASSERT(col->mbminlen + == (is_string + ? (*af)->charset()->mbminlen : 0)); + unsigned prtype; + uint8_t mtype; + uint16_t len; + get_type(**af, prtype, mtype, len); + DBUG_ASSERT(is_string == dtype_is_string_type(mtype)); + + col->prtype = prtype; + col->mtype = mtype; + col->len = len; + col->mbmaxlen = is_string + ? (*af)->charset()->mbmaxlen & 7: 0; + + if ((*fp)->flags & FIELD_IS_RENAMED) { + dict_mem_table_col_rename( + user_table, col_n, + cf.field->field_name.str, + (*af)->field_name.str, is_virtual); + } + + break; + } + + if (is_virtual) { + num_v++; + } + } +} + +/** Set the auto-increment value of the table on commit. 
+@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@return whether the operation failed (and my_error() was called) */ +static MY_ATTRIBUTE((nonnull)) +bool +commit_set_autoinc( + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* altered_table, + const TABLE* old_table) +{ + DBUG_ENTER("commit_set_autoinc"); + + if (!altered_table->found_next_number_field) { + /* There is no AUTO_INCREMENT column in the table + after the ALTER operation. */ + } else if (ctx->add_autoinc != ULINT_UNDEFINED) { + ut_ad(ctx->need_rebuild()); + /* An AUTO_INCREMENT column was added. Get the last + value from the sequence, which may be based on a + supplied AUTO_INCREMENT value. */ + ib_uint64_t autoinc = ctx->sequence.last(); + ctx->new_table->autoinc = autoinc; + /* Bulk index creation does not update + PAGE_ROOT_AUTO_INC, so we must persist the "last used" + value here. */ + btr_write_autoinc(dict_table_get_first_index(ctx->new_table), + autoinc - 1, true); + } else if ((ha_alter_info->handler_flags + & ALTER_CHANGE_CREATE_OPTION) + && (ha_alter_info->create_info->used_fields + & HA_CREATE_USED_AUTO)) { + + if (!ctx->old_table->space) { + my_error(ER_TABLESPACE_DISCARDED, MYF(0), + old_table->s->table_name.str); + DBUG_RETURN(true); + } + + /* An AUTO_INCREMENT value was supplied by the user. + It must be persisted to the data file. */ + const Field* ai = old_table->found_next_number_field; + ut_ad(!strcmp(dict_table_get_col_name(ctx->old_table, + innodb_col_no(ai)), + ai->field_name.str)); + + ib_uint64_t autoinc + = ha_alter_info->create_info->auto_increment_value; + if (autoinc == 0) { + autoinc = 1; + } + + if (autoinc >= ctx->old_table->autoinc) { + /* Persist the predecessor of the + AUTO_INCREMENT value as the last used one. */ + ctx->new_table->autoinc = autoinc--; + } else { + /* Mimic ALGORITHM=COPY in the following scenario: + + CREATE TABLE t (a SERIAL); + INSERT INTO t SET a=100; + ALTER TABLE t AUTO_INCREMENT = 1; + INSERT INTO t SET a=NULL; + SELECT * FROM t; + + By default, ALGORITHM=INPLACE would reset the + sequence to 1, while after ALGORITHM=COPY, the + last INSERT would use a value larger than 100. + + We could only search the tree to know current + max counter in the table and compare. */ + const dict_col_t* autoinc_col + = dict_table_get_nth_col(ctx->old_table, + innodb_col_no(ai)); + dict_index_t* index + = dict_table_get_first_index(ctx->old_table); + while (index != NULL + && index->fields[0].col != autoinc_col) { + index = dict_table_get_next_index(index); + } + + ut_ad(index); + + ib_uint64_t max_in_table = index + ? row_search_max_autoinc(index) + : 0; + + if (autoinc <= max_in_table) { + ctx->new_table->autoinc = innobase_next_autoinc( + max_in_table, 1, + ctx->prebuilt->autoinc_increment, + ctx->prebuilt->autoinc_offset, + innobase_get_int_col_max_value(ai)); + /* Persist the maximum value as the + last used one. */ + autoinc = max_in_table; + } else { + /* Persist the predecessor of the + AUTO_INCREMENT value as the last used one. */ + ctx->new_table->autoinc = autoinc--; + } + } + + btr_write_autoinc(dict_table_get_first_index(ctx->new_table), + autoinc, true); + } else if (ctx->need_rebuild()) { + /* No AUTO_INCREMENT value was specified. + Copy it from the old table. 
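+
+/* Illustrative sketch (not part of the upstream patch): commit_set_autoinc()
+above persists the predecessor of the next AUTO_INCREMENT value in
+PAGE_ROOT_AUTO_INC and clamps a user-supplied value against the current
+maximum in the table, so ALGORITHM=INPLACE matches ALGORITHM=COPY. A
+condensed model (the real code steps with innobase_next_autoinc() using
+the session's auto_increment_increment/offset; here we step by 1): */
+#include <cstdint>
+
+struct AutoincDecision {
+	uint64_t next;		/* new in-memory dict_table_t::autoinc */
+	uint64_t persisted;	/* value written to PAGE_ROOT_AUTO_INC */
+};
+
+static AutoincDecision set_autoinc(uint64_t requested, uint64_t cur_next,
+				   uint64_t max_in_table)
+{
+	if (requested == 0)
+		requested = 1;		/* AUTO_INCREMENT counts from 1 */
+	if (requested >= cur_next)
+		return {requested, requested - 1};
+	if (requested <= max_in_table)	/* cannot go below existing rows */
+		return {max_in_table + 1, max_in_table};
+	return {requested, requested - 1};
+}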
*/ + ctx->new_table->autoinc = ctx->old_table->autoinc; + /* The persistent value was already copied in + prepare_inplace_alter_table_dict() when ctx->new_table + was created. If this was a LOCK=NONE operation, the + AUTO_INCREMENT values would be updated during + row_log_table_apply(). If this was LOCK!=NONE, + the table contents could not possibly have changed + between prepare_inplace and commit_inplace. */ + } + + DBUG_RETURN(false); +} + +/** Add or drop foreign key constraints to the data dictionary tables, +but do not touch the data dictionary cache. +@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param trx Data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success +*/ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_update_foreign_try( +/*========================*/ + ha_innobase_inplace_ctx*ctx, + trx_t* trx, + const char* table_name) +{ + ulint foreign_id; + ulint i; + + DBUG_ENTER("innobase_update_foreign_try"); + + foreign_id = dict_table_get_highest_foreign_id(ctx->new_table); + + foreign_id++; + + for (i = 0; i < ctx->num_to_add_fk; i++) { + dict_foreign_t* fk = ctx->add_fk[i]; + + ut_ad(fk->foreign_table == ctx->new_table + || fk->foreign_table == ctx->old_table); + + dberr_t error = dict_create_add_foreign_id( + &foreign_id, ctx->old_table->name.m_name, fk); + + if (error != DB_SUCCESS) { + my_error(ER_TOO_LONG_IDENT, MYF(0), + fk->id); + DBUG_RETURN(true); + } + + if (!fk->foreign_index) { + fk->foreign_index = dict_foreign_find_index( + ctx->new_table, ctx->col_names, + fk->foreign_col_names, + fk->n_fields, fk->referenced_index, TRUE, + fk->type + & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL), + NULL, NULL, NULL); + if (!fk->foreign_index) { + my_error(ER_FK_INCORRECT_OPTION, + MYF(0), table_name, fk->id); + DBUG_RETURN(true); + } + } + + /* The fk->foreign_col_names[] uses renamed column + names, while the columns in ctx->old_table have not + been renamed yet. */ + error = dict_create_add_foreign_to_dictionary( + ctx->old_table->name.m_name, fk, trx); + + DBUG_EXECUTE_IF( + "innodb_test_cannot_add_fk_system", + error = DB_ERROR;); + + if (error != DB_SUCCESS) { + my_error(ER_FK_FAIL_ADD_SYSTEM, MYF(0), + fk->id); + DBUG_RETURN(true); + } + } + + for (i = 0; i < ctx->num_to_drop_fk; i++) { + dict_foreign_t* fk = ctx->drop_fk[i]; + + DBUG_ASSERT(fk->foreign_table == ctx->old_table); + + if (innobase_drop_foreign_try(trx, table_name, fk->id)) { + DBUG_RETURN(true); + } + } + + DBUG_RETURN(false); +} + +/** Update the foreign key constraint definitions in the data dictionary cache +after the changes to data dictionary tables were committed. +@param ctx In-place ALTER TABLE context +@param user_thd MySQL connection +@return InnoDB error code (should always be DB_SUCCESS) */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +innobase_update_foreign_cache( +/*==========================*/ + ha_innobase_inplace_ctx* ctx, + THD* user_thd) +{ + dict_table_t* user_table; + dberr_t err = DB_SUCCESS; + + DBUG_ENTER("innobase_update_foreign_cache"); + + ut_ad(dict_sys.locked()); + + user_table = ctx->old_table; + + /* Discard the added foreign keys, because we will + load them from the data dictionary. */ + for (ulint i = 0; i < ctx->num_to_add_fk; i++) { + dict_foreign_t* fk = ctx->add_fk[i]; + dict_foreign_free(fk); + } + + if (ctx->need_rebuild()) { + /* The rebuilt table is already using the renamed + column names. 
No need to pass col_names or to drop + constraints from the data dictionary cache. */ + DBUG_ASSERT(!ctx->col_names); + user_table = ctx->new_table; + } else { + /* Drop the foreign key constraints if the + table was not rebuilt. If the table is rebuilt, + there would not be any foreign key contraints for + it yet in the data dictionary cache. */ + for (ulint i = 0; i < ctx->num_to_drop_fk; i++) { + dict_foreign_t* fk = ctx->drop_fk[i]; + dict_foreign_remove_from_cache(fk); + } + } + + /* Load the old or added foreign keys from the data dictionary + and prevent the table from being evicted from the data + dictionary cache (work around the lack of WL#6049). */ + dict_names_t fk_tables; + + err = dict_load_foreigns(user_table->name.m_name, + ctx->col_names, 1, true, + DICT_ERR_IGNORE_FK_NOKEY, + fk_tables); + + if (err == DB_CANNOT_ADD_CONSTRAINT) { + fk_tables.clear(); + + /* It is possible there are existing foreign key are + loaded with "foreign_key checks" off, + so let's retry the loading with charset_check is off */ + err = dict_load_foreigns(user_table->name.m_name, + ctx->col_names, 1, false, + DICT_ERR_IGNORE_NONE, + fk_tables); + + /* The load with "charset_check" off is successful, warn + the user that the foreign key has loaded with mis-matched + charset */ + if (err == DB_SUCCESS) { + push_warning_printf( + user_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ALTER_INFO, + "Foreign key constraints for table '%s'" + " are loaded with charset check off", + user_table->name.m_name); + } + } + + /* For complete loading of foreign keys, all associated tables must + also be loaded. */ + while (err == DB_SUCCESS && !fk_tables.empty()) { + const char *f = fk_tables.front(); + if (!dict_sys.load_table({f, strlen(f)})) { + err = DB_TABLE_NOT_FOUND; + ib::error() + << "Failed to load table " + << table_name_t(const_cast(f)) + << " which has a foreign key constraint with" + << user_table->name; + break; + } + + fk_tables.pop_front(); + } + + DBUG_RETURN(err); +} + +/** Changes SYS_COLUMNS.PRTYPE for one column. +@param[in,out] trx transaction +@param[in] table_name table name +@param[in] tableid table ID as in SYS_TABLES +@param[in] pos column position +@param[in] prtype new precise type +@return boolean flag +@retval true on failure +@retval false on success */ +static +bool +vers_change_field_try( + trx_t* trx, + const char* table_name, + const table_id_t tableid, + const ulint pos, + const ulint prtype) +{ + DBUG_ENTER("vers_change_field_try"); + + pars_info_t* info = pars_info_create(); + + pars_info_add_int4_literal(info, "prtype", prtype); + pars_info_add_ull_literal(info,"tableid", tableid); + pars_info_add_int4_literal(info, "pos", pos); + + dberr_t error = que_eval_sql(info, + "PROCEDURE CHANGE_COLUMN_MTYPE () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS SET PRTYPE=:prtype\n" + "WHERE TABLE_ID=:tableid AND POS=:pos;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + +/** Changes fields WITH/WITHOUT SYSTEM VERSIONING property in SYS_COLUMNS. 
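+
+/* Illustrative sketch (not part of the upstream patch): above, the foreign
+keys are first reloaded with the strict charset check; on
+DB_CANNOT_ADD_CONSTRAINT the load is retried with the check disabled and a
+warning is pushed, so constraints created under relaxed settings still come
+back. The retry shape; load_foreigns() is a hypothetical stand-in: */
+#include <cstdio>
+
+enum class DictErr { SUCCESS, CANNOT_ADD_CONSTRAINT };
+
+static DictErr load_foreigns(bool strict_charset_check);
+
+static DictErr reload_foreign_keys(const char* table_name)
+{
+	DictErr err = load_foreigns(true);
+	if (err == DictErr::CANNOT_ADD_CONSTRAINT) {
+		err = load_foreigns(false);	/* relaxed retry */
+		if (err == DictErr::SUCCESS)
+			std::fprintf(stderr, "Warning: foreign keys for '%s'"
+				     " loaded with charset check off\n",
+				     table_name);
+	}
+	return err;
+}
+
+static DictErr load_foreigns(bool) { return DictErr::SUCCESS; }	/* stub */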
+@param[in] ha_alter_info alter info +@param[in] ctx alter inplace context +@param[in] trx transaction +@param[in] table old table +@return boolean flag +@retval true on failure +@retval false on success */ +static +bool +vers_change_fields_try( + const Alter_inplace_info* ha_alter_info, + const ha_innobase_inplace_ctx* ctx, + trx_t* trx, + const TABLE* table) +{ + DBUG_ENTER("vers_change_fields_try"); + + DBUG_ASSERT(ha_alter_info); + DBUG_ASSERT(ctx); + + for (const Create_field& create_field : ha_alter_info->alter_info->create_list) { + if (!create_field.field) { + continue; + } + if (create_field.versioning + == Column_definition::VERSIONING_NOT_SET) { + continue; + } + + const dict_table_t* new_table = ctx->new_table; + const uint pos = innodb_col_no(create_field.field); + const dict_col_t* col = dict_table_get_nth_col(new_table, pos); + + DBUG_ASSERT(!col->vers_sys_start()); + DBUG_ASSERT(!col->vers_sys_end()); + + ulint new_prtype + = create_field.versioning + == Column_definition::WITHOUT_VERSIONING + ? col->prtype & ~DATA_VERSIONED + : col->prtype | DATA_VERSIONED; + + if (vers_change_field_try(trx, table->s->table_name.str, + new_table->id, pos, + new_prtype)) { + DBUG_RETURN(true); + } + } + + DBUG_RETURN(false); +} + +/** Changes WITH/WITHOUT SYSTEM VERSIONING for fields +in the data dictionary cache. +@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param table MySQL table as it is before the ALTER operation */ +static +void +vers_change_fields_cache( + Alter_inplace_info* ha_alter_info, + const ha_innobase_inplace_ctx* ctx, + const TABLE* table) +{ + DBUG_ENTER("vers_change_fields_cache"); + + DBUG_ASSERT(ha_alter_info); + DBUG_ASSERT(ctx); + DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED); + + for (const Create_field& create_field : + ha_alter_info->alter_info->create_list) { + if (!create_field.field || create_field.field->vcol_info) { + continue; + } + dict_col_t* col = dict_table_get_nth_col( + ctx->new_table, innodb_col_no(create_field.field)); + + if (create_field.versioning + == Column_definition::WITHOUT_VERSIONING) { + + DBUG_ASSERT(!col->vers_sys_start()); + DBUG_ASSERT(!col->vers_sys_end()); + col->prtype &= ~DATA_VERSIONED; + } else if (create_field.versioning + == Column_definition::WITH_VERSIONING) { + + DBUG_ASSERT(!col->vers_sys_start()); + DBUG_ASSERT(!col->vers_sys_end()); + col->prtype |= DATA_VERSIONED; + } + } + + DBUG_VOID_RETURN; +} + +/** Commit the changes made during prepare_inplace_alter_table() +and inplace_alter_table() inside the data dictionary tables, +when rebuilding the table. 
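+
+/* Illustrative sketch (not part of the upstream patch): the versioning
+switch above reduces to setting or clearing a single bit in the column's
+precise type; DATA_VERSIONED_STUB is a hypothetical stand-in for the real
+DATA_VERSIONED flag value: */
+#include <cstdint>
+
+constexpr uint32_t DATA_VERSIONED_STUB = 1U << 23;	/* assumed bit */
+
+static uint32_t apply_versioning(uint32_t prtype, bool with_versioning)
+{
+	return with_versioning
+		? prtype | DATA_VERSIONED_STUB	/* WITH SYSTEM VERSIONING */
+		: prtype & ~DATA_VERSIONED_STUB;	/* WITHOUT */
+}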
+@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@param statistics_exist whether the persistent statistics tables exist +@param trx Data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success +*/ +inline MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +commit_try_rebuild( +/*===============*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + TABLE* altered_table, + const TABLE* old_table, + bool statistics_exist, + trx_t* trx, + const char* table_name) +{ + dict_table_t* rebuilt_table = ctx->new_table; + dict_table_t* user_table = ctx->old_table; + + DBUG_ENTER("commit_try_rebuild"); + DBUG_ASSERT(ctx->need_rebuild()); + DBUG_ASSERT(trx->dict_operation_lock_mode); + DBUG_ASSERT(!(ha_alter_info->handler_flags + & ALTER_DROP_FOREIGN_KEY) + || ctx->num_to_drop_fk > 0); + DBUG_ASSERT(ctx->num_to_drop_fk + <= ha_alter_info->alter_info->drop_list.elements); + + innobase_online_rebuild_log_free(user_table); + + for (dict_index_t* index = dict_table_get_first_index(rebuilt_table); + index; + index = dict_table_get_next_index(index)) { + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(index->is_committed()); + if (index->is_corrupted()) { + my_error(ER_INDEX_CORRUPT, MYF(0), index->name()); + DBUG_RETURN(true); + } + } + + if (innobase_update_foreign_try(ctx, trx, table_name)) { + DBUG_RETURN(true); + } + + /* Clear the to_be_dropped flag in the data dictionary cache + of user_table. */ + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + dict_index_t* index = ctx->drop_index[i]; + DBUG_ASSERT(index->table == user_table); + DBUG_ASSERT(index->is_committed()); + DBUG_ASSERT(index->to_be_dropped); + index->to_be_dropped = 0; + } + + if ((ha_alter_info->handler_flags + & ALTER_COLUMN_NAME) + && innobase_rename_columns_try(ha_alter_info, ctx, old_table, + trx, table_name)) { + DBUG_RETURN(true); + } + + /* The rebuilt table must inherit the tablespace-discarded + state from the "parent" table. */ + if (!user_table->space) { + rebuilt_table->file_unreadable = true; + rebuilt_table->flags2 |= DICT_TF2_DISCARDED; + } + + /* We can now rename the old table as a temporary table, + rename the new temporary table as the old table and drop the + old table. */ + char* old_name= mem_heap_strdup(ctx->heap, user_table->name.m_name); + + dberr_t error = row_rename_table_for_mysql(user_table->name.m_name, + ctx->tmp_name, trx, false); + if (error == DB_SUCCESS) { + error = row_rename_table_for_mysql( + rebuilt_table->name.m_name, old_name, trx, false); + if (error == DB_SUCCESS) { + /* The statistics for the surviving indexes will be + re-inserted in alter_stats_rebuild(). */ + if (statistics_exist) { + error = trx->drop_table_statistics(old_name); + } + if (error == DB_SUCCESS) { + error = trx->drop_table(*user_table); + } + } + } + + /* We must still be holding a table handle.
*/ + DBUG_ASSERT(user_table->get_ref_count() == 1); + DBUG_EXECUTE_IF("ib_rebuild_cannot_rename", error = DB_ERROR;); + + switch (error) { + case DB_SUCCESS: + DBUG_RETURN(false); + case DB_TABLESPACE_EXISTS: + ut_a(rebuilt_table->get_ref_count() == 1); + my_error(ER_TABLESPACE_EXISTS, MYF(0), ctx->tmp_name); + DBUG_RETURN(true); + case DB_DUPLICATE_KEY: + ut_a(rebuilt_table->get_ref_count() == 1); + my_error(ER_TABLE_EXISTS_ERROR, MYF(0), ctx->tmp_name); + DBUG_RETURN(true); + default: + my_error_innodb(error, table_name, user_table->flags); + DBUG_RETURN(true); + } +} + +/** Rename indexes in dictionary. +@param[in] ctx alter info context +@param[in] ha_alter_info Operation used during inplace alter +@param[out] trx transaction to change the index name + in dictionary +@retval true if renaming failed +@retval false on success */ +static +bool +rename_indexes_try( + const ha_innobase_inplace_ctx* ctx, + const Alter_inplace_info* ha_alter_info, + trx_t* trx) +{ + DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_RENAME_INDEX); + + for (const Alter_inplace_info::Rename_key_pair& pair : + ha_alter_info->rename_keys) { + dict_index_t* index = dict_table_get_index_on_name( + ctx->old_table, pair.old_key->name.str); + // This was checked previously in + // ha_innobase::prepare_inplace_alter_table() + ut_ad(index); + + if (rename_index_try(index, pair.new_key->name.str, trx)) { + return true; + } + } + + return false; +} + +/** Set of column numbers */ +typedef std::set<ulint, std::less<ulint>, ut_allocator<ulint> > col_set; + +/** Collect (not instantly dropped) columns from dropped indexes +@param[in] ctx In-place ALTER TABLE context +@param[in, out] drop_col_list list to be filled with the columns + that are part of the indexes being dropped +@param[in, out] drop_v_col_list list to be filled with the virtual + columns that are part of the indexes + being dropped */ +static +void +collect_columns_from_dropped_indexes( + const ha_innobase_inplace_ctx* ctx, + col_set& drop_col_list, + col_set& drop_v_col_list) +{ + for (ulint index_count = 0; index_count < ctx->num_to_drop_index; + index_count++) { + const dict_index_t* index = ctx->drop_index[index_count]; + + for (ulint col = 0; col < index->n_user_defined_cols; col++) { + const dict_col_t* idx_col + = dict_index_get_nth_col(index, col); + + if (idx_col->is_virtual()) { + const dict_v_col_t* v_col + = reinterpret_cast< + const dict_v_col_t*>(idx_col); + drop_v_col_list.insert(v_col->v_pos); + + } else { + ulint col_no = dict_col_get_no(idx_col); + if (ctx->col_map + && ctx->col_map[col_no] + == ULINT_UNDEFINED) { + // this column was instantly dropped + continue; + } + drop_col_list.insert(col_no); + } + } + } +} + +/** Change PAGE_COMPRESSED to ON or change the PAGE_COMPRESSION_LEVEL.
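+The new table flags are computed by clearing the old compression level and +setting the PAGE_COMPRESSED bit together with the new level. A sketch with +hypothetical bit positions POS_FLAG and POS_LEVEL (the real positions are +DICT_TF_POS_PAGE_COMPRESSION and DICT_TF_POS_PAGE_COMPRESSION_LEVEL): + unsigned f = old_flags & ~(0xFU << POS_LEVEL); /* clear the old level */ + f |= 1U << POS_FLAG /* PAGE_COMPRESSED=ON */ + | level << POS_LEVEL; /* new level, 1..9 */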
+@param[in] level PAGE_COMPRESSION_LEVEL +@param[in] table table before the change +@param[in,out] trx data dictionary transaction +@param[in] table_name table name in MariaDB +@return whether the operation succeeded */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +bool +innobase_page_compression_try( + uint level, + const dict_table_t* table, + trx_t* trx, + const char* table_name) +{ + DBUG_ENTER("innobase_page_compression_try"); + DBUG_ASSERT(level >= 1); + DBUG_ASSERT(level <= 9); + + unsigned flags = table->flags + & ~(0xFU << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + flags |= 1U << DICT_TF_POS_PAGE_COMPRESSION + | level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL; + + if (table->flags == flags) { + DBUG_RETURN(false); + } + + pars_info_t* info = pars_info_create(); + + pars_info_add_ull_literal(info, "id", table->id); + pars_info_add_int4_literal(info, "type", + dict_tf_to_sys_tables_type(flags)); + + dberr_t error = que_eval_sql(info, + "PROCEDURE CHANGE_COMPRESSION () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET TYPE=:type\n" + "WHERE ID=:id;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + +/** Evict the table from cache and reopen it. Drop outdated statistics. +@param thd MariaDB THD +@param table innodb table +@param table_name user-friendly table name for errors +@param ctx ALTER TABLE context +@return newly opened table */ +static dict_table_t *innobase_reload_table(THD *thd, dict_table_t *table, + const LEX_CSTRING &table_name, + ha_innobase_inplace_ctx &ctx) +{ + if (ctx.is_instant()) + { + for (auto i= ctx.old_n_v_cols; i--; ) + { + ctx.old_v_cols[i].~dict_v_col_t(); + const_cast<unsigned&>(ctx.old_n_v_cols)= 0; + } + } + + const table_id_t id= table->id; + table->release(); + dict_sys.remove(table); + return dict_table_open_on_id(id, true, DICT_TABLE_OP_NORMAL); +} + +/** Commit the changes made during prepare_inplace_alter_table() +and inplace_alter_table() inside the data dictionary tables, +when not rebuilding the table.
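+In outline, the function below (1) updates the PAGE_COMPRESSION flags in +SYS_TABLES if requested, (2) fails if any added index was flagged corrupted, +(3) updates FOREIGN KEY and system-versioning metadata, (4) renames the added +indexes into place and deletes the SYS_FIELDS/SYS_INDEXES rows (and index +statistics) of dropped indexes, (5) renames index statistics for renamed +indexes, and (6) applies column renames or enlargements, instant ALTER TABLE +changes, and virtual column additions or drops.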
+@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param old_table MySQL table as it is before the ALTER operation +@param trx Data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success +*/ +inline MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +commit_try_norebuild( +/*=================*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + TABLE* altered_table, + const TABLE* old_table, + trx_t* trx, + const char* table_name) +{ + DBUG_ENTER("commit_try_norebuild"); + DBUG_ASSERT(!ctx->need_rebuild()); + DBUG_ASSERT(trx->dict_operation_lock_mode); + DBUG_ASSERT(!(ha_alter_info->handler_flags + & ALTER_DROP_FOREIGN_KEY) + || ctx->num_to_drop_fk > 0); + DBUG_ASSERT(ctx->num_to_drop_fk + <= ha_alter_info->alter_info->drop_list.elements + || ctx->num_to_drop_vcol + == ha_alter_info->alter_info->drop_list.elements); + + if (ctx->page_compression_level + && innobase_page_compression_try(ctx->page_compression_level, + ctx->new_table, trx, + table_name)) { + DBUG_RETURN(true); + } + + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t* index = ctx->add_index[i]; + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(!index->is_committed()); + if (index->is_corrupted()) { + /* Report a duplicate key + error for the index that was + flagged corrupted, most likely + because a duplicate value was + inserted (directly or by + rollback) after + ha_innobase::inplace_alter_table() + completed. + TODO: report this as a corruption + with a detailed reason once + WL#6379 has been implemented. */ + my_error(ER_DUP_UNKNOWN_IN_INDEX, + MYF(0), index->name()); + DBUG_RETURN(true); + } + } + + if (innobase_update_foreign_try(ctx, trx, table_name)) { + DBUG_RETURN(true); + } + + if ((ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED) + && vers_change_fields_try(ha_alter_info, ctx, trx, old_table)) { + DBUG_RETURN(true); + } + + dberr_t error = DB_SUCCESS; + dict_index_t* index; + const char *op = "rename index to add"; + ulint num_fts_index = 0; + + /* We altered the table in place. Mark the indexes as committed. 
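+Note on the statistics renames further below: rows in +mysql.innodb_index_stats are renamed in two passes through temporary names +whose first byte is 0xff (which cannot collide with real index names), so +that swapping the names of two indexes never hits a duplicate key; roughly: + pass 1: old_name[i] -> "\xff<i>" for each renamed index i + pass 2: "\xff<i>" -> new_name[i]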
*/ + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + index = ctx->add_index[i]; + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(!index->is_committed()); + error = row_merge_rename_index_to_add( + trx, ctx->new_table->id, index->id); + if (error) { + goto handle_error; + } + } + + for (dict_index_t *index = UT_LIST_GET_FIRST(ctx->old_table->indexes); + index; index = UT_LIST_GET_NEXT(indexes, index)) { + if (index->type & DICT_FTS) { + num_fts_index++; + } + } + + char db[MAX_DB_UTF8_LEN], table[MAX_TABLE_UTF8_LEN]; + if (ctx->num_to_drop_index) { + dict_fs2utf8(ctx->old_table->name.m_name, + db, sizeof db, table, sizeof table); + } + + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + index = ctx->drop_index[i]; + DBUG_ASSERT(index->is_committed()); + DBUG_ASSERT(index->table == ctx->new_table); + DBUG_ASSERT(index->to_be_dropped); + op = "DROP INDEX"; + + static const char drop_index[] = + "PROCEDURE DROP_INDEX_PROC () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n" + "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n" + "END;\n"; + + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "indexid", index->id); + error = que_eval_sql(info, drop_index, trx); + + if (error == DB_SUCCESS && index->type & DICT_FTS) { + DBUG_ASSERT(index->table->fts); + DEBUG_SYNC_C("norebuild_fts_drop"); + error = fts_drop_index(index->table, index, trx); + ut_ad(num_fts_index); + num_fts_index--; + } + + if (error != DB_SUCCESS) { + goto handle_error; + } + + error = dict_stats_delete_from_index_stats(db, table, + index->name, trx); + switch (error) { + case DB_SUCCESS: + case DB_STATS_DO_NOT_EXIST: + continue; + default: + goto handle_error; + } + } + + if (const size_t size = ha_alter_info->rename_keys.size()) { + char tmp_name[5]; + char db[MAX_DB_UTF8_LEN], table[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(ctx->new_table->name.m_name, db, sizeof db, + table, sizeof table); + tmp_name[0]= (char)0xff; + for (size_t i = 0; error == DB_SUCCESS && i < size; i++) { + snprintf(tmp_name+1, sizeof(tmp_name)-1, "%zu", i); + error = dict_stats_rename_index(db, table, + ha_alter_info-> + rename_keys[i]. + old_key->name.str, + tmp_name, trx); + } + for (size_t i = 0; error == DB_SUCCESS && i < size; i++) { + snprintf(tmp_name+1, sizeof(tmp_name)-1, "%zu", i); + error = dict_stats_rename_index(db, table, tmp_name, + ha_alter_info + ->rename_keys[i]. 
+ new_key->name.str, + trx); + } + + switch (error) { + case DB_SUCCESS: + case DB_STATS_DO_NOT_EXIST: + break; + case DB_DUPLICATE_KEY: + my_error(ER_DUP_KEY, MYF(0), + "mysql.innodb_index_stats"); + DBUG_RETURN(true); + default: + goto handle_error; + } + } + + if ((ctx->old_table->flags2 & DICT_TF2_FTS) && !num_fts_index) { + error = fts_drop_tables(trx, *ctx->old_table); + if (error != DB_SUCCESS) { +handle_error: + switch (error) { + case DB_TOO_MANY_CONCURRENT_TRXS: + my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0)); + break; + case DB_LOCK_WAIT_TIMEOUT: + my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0)); + break; + default: + sql_print_error("InnoDB: %s: %s\n", op, + ut_strerr(error)); + DBUG_ASSERT(error == DB_IO_ERROR + || error == DB_LOCK_TABLE_FULL + || error == DB_DECRYPTION_FAILED + || error == DB_PAGE_CORRUPTED + || error == DB_CORRUPTION); + my_error(ER_INTERNAL_ERROR, MYF(0), op); + } + + DBUG_RETURN(true); + } + } + + if (innobase_rename_or_enlarge_columns_try(ha_alter_info, ctx, + altered_table, old_table, + trx, table_name)) { + DBUG_RETURN(true); + } + + if ((ha_alter_info->handler_flags & ALTER_RENAME_INDEX) + && rename_indexes_try(ctx, ha_alter_info, trx)) { + DBUG_RETURN(true); + } + + if (ctx->is_instant()) { + DBUG_RETURN(innobase_instant_try(ha_alter_info, ctx, + altered_table, old_table, + trx)); + } + + if (ha_alter_info->handler_flags + & (ALTER_DROP_VIRTUAL_COLUMN | ALTER_ADD_VIRTUAL_COLUMN)) { + if ((ha_alter_info->handler_flags & ALTER_DROP_VIRTUAL_COLUMN) + && innobase_drop_virtual_try(ha_alter_info, ctx->old_table, + trx)) { + DBUG_RETURN(true); + } + + if ((ha_alter_info->handler_flags & ALTER_ADD_VIRTUAL_COLUMN) + && innobase_add_virtual_try(ha_alter_info, ctx->old_table, + trx)) { + DBUG_RETURN(true); + } + + unsigned n_col = ctx->old_table->n_cols + - DATA_N_SYS_COLS; + unsigned n_v_col = ctx->old_table->n_v_cols + + ctx->num_to_add_vcol - ctx->num_to_drop_vcol; + + if (innodb_update_cols( + ctx->old_table, + dict_table_encode_n_col(n_col, n_v_col) + | unsigned(ctx->old_table->flags & DICT_TF_COMPACT) + << 31, trx)) { + DBUG_RETURN(true); + } + } + + DBUG_RETURN(false); +} + +/** Commit the changes to the data dictionary cache +after a successful commit_try_norebuild() call. 
+@param ha_alter_info algorithm=inplace context +@param ctx In-place ALTER TABLE context for the current partition +@param altered_table the TABLE after the ALTER +@param table the TABLE before the ALTER +@param trx Data dictionary transaction +(will be started and committed, for DROP INDEX) +@return whether all replacements were found for dropped indexes */ +inline MY_ATTRIBUTE((nonnull)) +bool +commit_cache_norebuild( +/*===================*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* altered_table, + const TABLE* table, + trx_t* trx) +{ + DBUG_ENTER("commit_cache_norebuild"); + DBUG_ASSERT(!ctx->need_rebuild()); + DBUG_ASSERT(ctx->new_table->space != fil_system.temp_space); + DBUG_ASSERT(!ctx->new_table->is_temporary()); + + bool found = true; + + if (ctx->page_compression_level) { + DBUG_ASSERT(ctx->new_table->space != fil_system.sys_space); +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */ +#endif + ctx->new_table->flags + = static_cast<uint16_t>( + (ctx->new_table->flags + & ~(0xFU + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL)) + | 1 << DICT_TF_POS_PAGE_COMPRESSION + | (ctx->page_compression_level & 0xF) + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL) + & ((1U << DICT_TF_BITS) - 1); +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + + if (fil_space_t* space = ctx->new_table->space) { + bool update = !(space->flags + & FSP_FLAGS_MASK_PAGE_COMPRESSION); + mysql_mutex_lock(&fil_system.mutex); + space->flags &= ~FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL; + space->flags |= ctx->page_compression_level + << FSP_FLAGS_MEM_COMPRESSION_LEVEL; + if (!space->full_crc32()) { + space->flags + |= FSP_FLAGS_MASK_PAGE_COMPRESSION; + } else if (!space->is_compressed()) { + space->flags |= static_cast<uint32_t>( + innodb_compression_algorithm) + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + } + mysql_mutex_unlock(&fil_system.mutex); + + if (update) { + /* Maybe we should introduce an undo + log record for updating tablespace + flags, and perform the update already + in innobase_page_compression_try(). + + If the server is killed before the + following mini-transaction commit + becomes durable, fsp_flags_try_adjust() + will perform the equivalent adjustment + and warn "adjusting FSP_SPACE_FLAGS". */ + mtr_t mtr; + mtr.start(); + if (buf_block_t* b = buf_page_get( + page_id_t(space->id, 0), + space->zip_size(), + RW_X_LATCH, &mtr)) { + byte* f = FSP_HEADER_OFFSET + + FSP_SPACE_FLAGS + + b->page.frame; + const auto sf = space->flags + & ~FSP_FLAGS_MEM_MASK; + if (mach_read_from_4(f) != sf) { + mtr.set_named_space(space); + mtr.write<4,mtr_t::FORCED>( + *b, f, sf); + } + } + mtr.commit(); + } + } + } + + col_set drop_list; + col_set v_drop_list; + + /* Check whether each column that was part of a dropped index is + still part of some index that is not being dropped. If it is not, + clear the ord_part flag of the column.
*/ + collect_columns_from_dropped_indexes(ctx, drop_list, v_drop_list); + + for (ulint col : drop_list) { + if (!check_col_exists_in_indexes(ctx->new_table, col, false)) { + ctx->new_table->cols[col].ord_part = 0; + } + } + + for (ulint col : v_drop_list) { + if (!check_col_exists_in_indexes(ctx->new_table, col, true)) { + ctx->new_table->v_cols[col].m_col.ord_part = 0; + } + } + + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t* index = ctx->add_index[i]; + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(!index->is_committed()); + index->change_col_info = nullptr; + index->set_committed(true); + } + + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + dict_index_t* index = ctx->drop_index[i]; + DBUG_ASSERT(index->is_committed()); + DBUG_ASSERT(index->table == ctx->new_table); + DBUG_ASSERT(index->to_be_dropped); + + if (!dict_foreign_replace_index(index->table, ctx->col_names, + index)) { + found = false; + } + + dict_index_remove_from_cache(index->table, index); + } + + fts_clear_all(ctx->old_table); + + if (!ctx->is_instant()) { + innobase_rename_or_enlarge_columns_cache( + ha_alter_info, altered_table, table, ctx->new_table); + } else { + ut_ad(ctx->col_map); + + if (fts_t* fts = ctx->new_table->fts) { + ut_ad(fts->doc_col != ULINT_UNDEFINED); + ut_ad(ctx->new_table->n_cols > DATA_N_SYS_COLS); + const ulint c = ctx->col_map[fts->doc_col]; + ut_ad(c < ulint(ctx->new_table->n_cols) + - DATA_N_SYS_COLS); + ut_d(const dict_col_t& col = ctx->new_table->cols[c]); + ut_ad(!col.is_nullable()); + ut_ad(!col.is_virtual()); + ut_ad(!col.is_added()); + ut_ad(col.prtype & DATA_UNSIGNED); + ut_ad(col.mtype == DATA_INT); + ut_ad(col.len == 8); + ut_ad(col.ord_part); + fts->doc_col = c; + } + + if (ha_alter_info->handler_flags & ALTER_DROP_STORED_COLUMN) { + const dict_index_t* index = ctx->new_table->indexes.start; + + for (const dict_field_t* f = index->fields, + * const end = f + index->n_fields; + f != end; f++) { + dict_col_t& c = *f->col; + if (c.is_dropped()) { + c.set_dropped(!c.is_nullable(), + DATA_LARGE_MTYPE(c.mtype) + || (!f->fixed_len + && c.len > 255), + f->fixed_len); + } + } + } + + if (!ctx->instant_table->persistent_autoinc) { + ctx->new_table->persistent_autoinc = 0; + } + } + + if (ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED) { + vers_change_fields_cache(ha_alter_info, ctx, table); + } + + if (ha_alter_info->handler_flags & ALTER_RENAME_INDEX) { + innobase_rename_indexes_cache(ctx, ha_alter_info); + } + + ctx->new_table->fts_doc_id_index + = ctx->new_table->fts + ? dict_table_get_index_on_name( + ctx->new_table, FTS_DOC_ID_INDEX_NAME) + : NULL; + DBUG_ASSERT((ctx->new_table->fts == NULL) + == (ctx->new_table->fts_doc_id_index == NULL)); + if (table->found_next_number_field + && !altered_table->found_next_number_field) { + ctx->prebuilt->table->persistent_autoinc = 0; + } + DBUG_RETURN(found); +} + +/** Adjust the persistent statistics after non-rebuilding ALTER TABLE. +Remove statistics for dropped indexes, add statistics for created indexes +and rename statistics for renamed indexes. 
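+(Statistics of dropped and renamed indexes were already adjusted in +commit_try_norebuild() via dict_stats_delete_from_index_stats() and +dict_stats_rename_index(); what remains here is to compute statistics for the +newly created non-FTS indexes, as the loop below does.)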
+@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param thd MySQL connection +*/ +static +void +alter_stats_norebuild( +/*==================*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx* ctx, + THD* thd) +{ + DBUG_ENTER("alter_stats_norebuild"); + DBUG_ASSERT(!ctx->need_rebuild()); + + if (!dict_stats_is_persistent_enabled(ctx->new_table)) { + DBUG_VOID_RETURN; + } + + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t* index = ctx->add_index[i]; + DBUG_ASSERT(index->table == ctx->new_table); + + if (!(index->type & DICT_FTS)) { + dict_stats_init(ctx->new_table); + dict_stats_update_for_index(index); + } + } + + DBUG_VOID_RETURN; +} + +/** Adjust the persistent statistics after rebuilding ALTER TABLE. +Remove statistics for dropped indexes, add statistics for created indexes +and rename statistics for renamed indexes. +@param table InnoDB table that was rebuilt by ALTER TABLE +@param table_name Table name in MySQL +@param thd MySQL connection +*/ +static +void +alter_stats_rebuild( +/*================*/ + dict_table_t* table, + const char* table_name, + THD* thd) +{ + DBUG_ENTER("alter_stats_rebuild"); + + if (!table->space + || !dict_stats_is_persistent_enabled(table)) { + DBUG_VOID_RETURN; + } + + dberr_t ret = dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT); + + if (ret != DB_SUCCESS) { + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ALTER_INFO, + "Error updating stats for table '%s'" + " after table rebuild: %s", + table_name, ut_strerr(ret)); + } + + DBUG_VOID_RETURN; +} + +/** Apply the log for the table rebuild operation. +@param[in] ctx Inplace Alter table context +@param[in] ha_alter_info ALTER TABLE information +@param[in] altered_table MySQL table that is being altered +@return true Failure, else false. */ +static bool alter_rebuild_apply_log( + ha_innobase_inplace_ctx* ctx, + Alter_inplace_info* ha_alter_info, + TABLE* altered_table) +{ + DBUG_ENTER("alter_rebuild_apply_log"); + + if (!ctx->online) { + DBUG_RETURN(false); + } + + /* We copied the table. Any indexes that were requested to be + dropped were not created in the copy of the table. Apply any + last bit of the rebuild log and then rename the tables. */ + dict_table_t* user_table = ctx->old_table; + + DEBUG_SYNC_C("row_log_table_apply2_before"); + + dict_vcol_templ_t* s_templ = NULL; + + if (ctx->new_table->n_v_cols > 0) { + s_templ = UT_NEW_NOKEY( + dict_vcol_templ_t()); + s_templ->vtempl = NULL; + + innobase_build_v_templ(altered_table, ctx->new_table, s_templ, + NULL, true); + ctx->new_table->vc_templ = s_templ; + } + + dberr_t error = row_log_table_apply( + ctx->thr, user_table, altered_table, + static_cast<ha_innobase_inplace_ctx*>( + ha_alter_info->handler_ctx)->m_stage, + ctx->new_table); + + if (s_templ) { + ut_ad(ctx->need_rebuild()); + dict_free_vc_templ(s_templ); + UT_DELETE(s_templ); + ctx->new_table->vc_templ = NULL; + } + + DBUG_RETURN(ctx->log_failure( + ha_alter_info, altered_table, error)); +} + +/** Commit or rollback the changes made during +prepare_inplace_alter_table() and inplace_alter_table() inside +the storage engine. Note that the allowed level of concurrency +during this operation will be the same as for +inplace_alter_table() and thus might be higher than during +prepare_inplace_alter_table(). (E.g. concurrent writes were +blocked during prepare, but might not be during commit). +@param altered_table TABLE object for new version of table.
+@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. +@param commit true => Commit, false => Rollback. +@retval true Failure +@retval false Success +*/ + +bool +ha_innobase::commit_inplace_alter_table( +/*====================================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info, + bool commit) +{ + ha_innobase_inplace_ctx*ctx0; + + ctx0 = static_cast<ha_innobase_inplace_ctx*> + (ha_alter_info->handler_ctx); + +#ifndef DBUG_OFF + uint failure_inject_count = 1; +#endif /* DBUG_OFF */ + + DBUG_ENTER("commit_inplace_alter_table"); + DBUG_ASSERT(!srv_read_only_mode); + DBUG_ASSERT(!ctx0 || ctx0->prebuilt == m_prebuilt); + DBUG_ASSERT(!ctx0 || ctx0->old_table == m_prebuilt->table); + + DEBUG_SYNC_C("innodb_commit_inplace_alter_table_enter"); + + DEBUG_SYNC_C("innodb_commit_inplace_alter_table_wait"); + + if (ctx0 != NULL && ctx0->m_stage != NULL) { + ctx0->m_stage->begin_phase_end(); + } + + if (!commit) { + /* A rollback is being requested. So far we may at + most have created stubs for ADD INDEX or a copy of the + table for rebuild. */ + DBUG_RETURN(rollback_inplace_alter_table( + ha_alter_info, table, m_prebuilt)); + } + + if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) { + DBUG_ASSERT(!ctx0); + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + if (table->found_next_number_field + && !altered_table->found_next_number_field) { + m_prebuilt->table->persistent_autoinc = 0; + /* Don't reset ha_alter_info->group_commit_ctx, so + that the partitioning engine will call this function + for all partitions. */ + } + else + ha_alter_info->group_commit_ctx = NULL; + DBUG_RETURN(false); + } + + DBUG_ASSERT(ctx0); + + inplace_alter_handler_ctx** ctx_array; + inplace_alter_handler_ctx* ctx_single[2]; + + if (ha_alter_info->group_commit_ctx) { + ctx_array = ha_alter_info->group_commit_ctx; + } else { + ctx_single[0] = ctx0; + ctx_single[1] = NULL; + ctx_array = ctx_single; + } + + DBUG_ASSERT(ctx0 == ctx_array[0]); + ut_ad(m_prebuilt->table == ctx0->old_table); + ha_alter_info->group_commit_ctx = NULL; + + const bool new_clustered = ctx0->need_rebuild(); + trx_t* const trx = ctx0->trx; + trx->op_info = "acquiring table lock"; + bool fts_exist = false; + for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) { + auto ctx = static_cast<ha_innobase_inplace_ctx*>(*pctx); + DBUG_ASSERT(ctx->prebuilt->trx == m_prebuilt->trx); + ut_ad(m_prebuilt != ctx->prebuilt || ctx == ctx0); + DBUG_ASSERT(new_clustered == ctx->need_rebuild()); + /* If decryption failed for the old table or the new table, + fail here.
*/ + if ((!ctx->old_table->is_readable() + && ctx->old_table->space) + || (!ctx->new_table->is_readable() + && ctx->new_table->space)) { + String str; + const char* engine= table_type(); + get_error_message(HA_ERR_DECRYPTION_FAILED, &str); + my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, str.c_ptr(), engine); + DBUG_RETURN(true); + } + if ((ctx->old_table->flags2 | ctx->new_table->flags2) + & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)) { + fts_exist = true; + } + } + + bool already_stopped= false; + for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) { + auto ctx = static_cast<ha_innobase_inplace_ctx*>(*pctx); + dberr_t error = DB_SUCCESS; + + if (fts_exist) { + purge_sys.stop_FTS(*ctx->old_table, already_stopped); + already_stopped = true; + } + + if (new_clustered && ctx->old_table->fts) { + ut_ad(!ctx->old_table->fts->add_wq); + fts_optimize_remove_table(ctx->old_table); + } + + dict_sys.freeze(SRW_LOCK_CALL); + for (auto f : ctx->old_table->referenced_set) { + if (dict_table_t* child = f->foreign_table) { + error = lock_table_for_trx(child, trx, LOCK_X); + if (error != DB_SUCCESS) { + break; + } + } + } + dict_sys.unfreeze(); + + if (ctx->new_table->fts) { + ut_ad(!ctx->new_table->fts->add_wq); + fts_optimize_remove_table(ctx->new_table); + fts_sync_during_ddl(ctx->new_table); + } + + /* Exclusively lock the table, to ensure that no other + transaction is holding locks on the table while we + change the table definition. Any recovered incomplete + transactions would be holding InnoDB locks only, not MDL. */ + if (error == DB_SUCCESS) { + error = lock_table_for_trx(ctx->new_table, trx, + LOCK_X); + } + + DBUG_EXECUTE_IF("deadlock_table_fail", + { + error= DB_DEADLOCK; + trx_rollback_for_mysql(trx); + }); + + if (error != DB_SUCCESS) { +lock_fail: + my_error_innodb( + error, table_share->table_name.str, 0); + if (fts_exist) { + purge_sys.resume_FTS(); + } + + /* A deadlock was encountered and the + transaction was rolled back. Restart the + transaction to remove the newly created table or + index from the data dictionary and table cache + in rollback_inplace_alter_table() */ + if (trx->state == TRX_STATE_NOT_STARTED) { + trx_start_for_ddl(trx); + } + + DBUG_RETURN(true); + } else if ((ctx->new_table->flags2 + & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)) + && (error = fts_lock_tables(trx, *ctx->new_table)) + != DB_SUCCESS) { + goto lock_fail; + } else if (!new_clustered) { + } else if ((error = lock_table_for_trx(ctx->old_table, trx, + LOCK_X)) + != DB_SUCCESS) { + goto lock_fail; + } else if ((ctx->old_table->flags2 + & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)) + && (error = fts_lock_tables(trx, *ctx->old_table)) + != DB_SUCCESS) { + goto lock_fail; + } + } + + DEBUG_SYNC(m_user_thd, "innodb_alter_commit_after_lock_table"); + + if (new_clustered) { + /* We are holding MDL_EXCLUSIVE as well as exclusive + InnoDB table locks. Let us apply any table rebuild log + before locking dict_sys.
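+For a rebuild, the accumulated changes are applied by row_log_table_apply() +(see alter_rebuild_apply_log() above); in the non-rebuild branch below, each +online-created index instead drains its own log via row_log_apply().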
*/ + for (inplace_alter_handler_ctx** pctx= ctx_array; *pctx; + pctx++) { + auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx); + DBUG_ASSERT(ctx->need_rebuild()); + if (alter_rebuild_apply_log(ctx, ha_alter_info, + altered_table)) { + if (fts_exist) { + purge_sys.resume_FTS(); + } + DBUG_RETURN(true); + } + } + } else { + dberr_t error= DB_SUCCESS; + for (inplace_alter_handler_ctx** pctx= ctx_array; *pctx; + pctx++) { + auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx); + + if (!ctx->online || !ctx->old_table->space + || !ctx->old_table->is_readable()) { + continue; + } + + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t *index= ctx->add_index[i]; + + ut_ad(!(index->type & + (DICT_FTS | DICT_SPATIAL))); + + index->lock.x_lock(SRW_LOCK_CALL); + if (!index->online_log) { + /* The online log would already + have been freed when the error + was detected in another index */ + index->lock.x_unlock(); + continue; + } + + if (index->is_corrupted()) { + /* The online index log has been + preserved to report the error + that happened during + row_log_apply() in a DML thread */ + error= row_log_get_error(index); +err_index: + ut_ad(error != DB_SUCCESS); + ctx->log_failure( + ha_alter_info, + altered_table, error); + row_log_free(index->online_log); + index->online_log= nullptr; + index->lock.x_unlock(); + + ctx->old_table->indexes.start + ->online_log= nullptr; + if (fts_exist) { + purge_sys.resume_FTS(); + } + MONITOR_ATOMIC_INC( + MONITOR_BACKGROUND_DROP_INDEX); + DBUG_RETURN(true); + } + + index->lock.x_unlock(); + + error = row_log_apply( + m_prebuilt->trx, index, altered_table, + ctx->m_stage); + + index->lock.x_lock(SRW_LOCK_CALL); + + if (error != DB_SUCCESS) { + goto err_index; + } + + row_log_free(index->online_log); + index->online_log= nullptr; + index->lock.x_unlock(); + } + + ctx->old_table->indexes.start->online_log= nullptr; + } + } + + dict_table_t *table_stats = nullptr, *index_stats = nullptr; + MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + dberr_t error = DB_SUCCESS; + if (!ctx0->old_table->is_stats_table() && + !ctx0->new_table->is_stats_table()) { + table_stats = dict_table_open_on_name( + TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (table_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats = dict_acquire_mdl_shared( + table_stats, m_user_thd, &mdl_table); + dict_sys.unfreeze(); + } + index_stats = dict_table_open_on_name( + INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (index_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats = dict_acquire_mdl_shared( + index_stats, m_user_thd, &mdl_index); + dict_sys.unfreeze(); + } + + if (table_stats && index_stats + && !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) + && !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) + && !(error = lock_table_for_trx(table_stats, + trx, LOCK_X))) { + error = lock_table_for_trx(index_stats, trx, LOCK_X); + } + } + + DBUG_EXECUTE_IF("stats_lock_fail", + error = DB_LOCK_WAIT_TIMEOUT; + trx_rollback_for_mysql(trx);); + + if (error == DB_SUCCESS) { + error = lock_sys_tables(trx); + } + if (error != DB_SUCCESS) { + if (table_stats) { + dict_table_close(table_stats, false, m_user_thd, + mdl_table); + } + if (index_stats) { + dict_table_close(index_stats, false, m_user_thd, + mdl_index); + } + my_error_innodb(error, table_share->table_name.str, 0); + if (fts_exist) { + purge_sys.resume_FTS(); + } + + if (trx->state == TRX_STATE_NOT_STARTED) { + /* Transaction may have been rolled back + due to a lock wait timeout, deadlock, + or a KILL statement.
So restart the + transaction to remove the newly created + table or index stubs from data dictionary + and table cache in + rollback_inplace_alter_table() */ + trx_start_for_ddl(trx); + } + + DBUG_RETURN(true); + } + + row_mysql_lock_data_dictionary(trx); + + /* Apply the changes to the data dictionary tables, for all + partitions. */ + for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) { + auto ctx = static_cast<ha_innobase_inplace_ctx*>(*pctx); + + DBUG_ASSERT(new_clustered == ctx->need_rebuild()); + if (ctx->need_rebuild() && !ctx->old_table->space) { + my_error(ER_TABLESPACE_DISCARDED, MYF(0), + table->s->table_name.str); +fail: + trx->rollback(); + ut_ad(!trx->fts_trx); + if (table_stats) { + dict_table_close(table_stats, true, m_user_thd, + mdl_table); + } + if (index_stats) { + dict_table_close(index_stats, true, m_user_thd, + mdl_index); + } + row_mysql_unlock_data_dictionary(trx); + if (fts_exist) { + purge_sys.resume_FTS(); + } + trx_start_for_ddl(trx); + DBUG_RETURN(true); + } + + if (commit_set_autoinc(ha_alter_info, ctx, + altered_table, table)) { + goto fail; + } + + if (ctx->need_rebuild()) { + ctx->tmp_name = dict_mem_create_temporary_tablename( + ctx->heap, ctx->new_table->name.m_name, + ctx->new_table->id); + + if (commit_try_rebuild(ha_alter_info, ctx, + altered_table, table, + table_stats && index_stats, + trx, + table_share->table_name.str)) { + goto fail; + } + } else if (commit_try_norebuild(ha_alter_info, ctx, + altered_table, table, trx, + table_share->table_name.str)) { + goto fail; + } +#ifndef DBUG_OFF + { + /* Generate a dynamic dbug text. */ + char buf[32]; + + snprintf(buf, sizeof buf, + "ib_commit_inplace_fail_%u", + failure_inject_count++); + + DBUG_EXECUTE_IF(buf, + my_error(ER_INTERNAL_ERROR, MYF(0), + "Injected error!"); + goto fail; + ); + } +#endif + } + + if (table_stats) { + dict_table_close(table_stats, true, m_user_thd, mdl_table); + } + if (index_stats) { + dict_table_close(index_stats, true, m_user_thd, mdl_index); + } + + /* Commit or roll back the changes to the data dictionary. */ + DEBUG_SYNC(m_user_thd, "innodb_alter_inplace_before_commit"); + + if (new_clustered) { + ut_ad(trx->has_logged_persistent()); + for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; + pctx++) { + auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx); + ut_ad(!strcmp(ctx->old_table->name.m_name, + ctx->tmp_name)); + ut_ad(ctx->new_table->get_ref_count() == 1); + const bool own = m_prebuilt == ctx->prebuilt; + trx_t* const user_trx = m_prebuilt->trx; + ctx->prebuilt->table->release(); + ctx->prebuilt->table = nullptr; + row_prebuilt_free(ctx->prebuilt); + /* Rebuild the prebuilt object. */ + ctx->prebuilt = row_create_prebuilt( + ctx->new_table, altered_table->s->reclength); + if (own) { + m_prebuilt = ctx->prebuilt; + } + trx_start_if_not_started(user_trx, true); + m_prebuilt->trx = user_trx; + } + } + + ut_ad(!trx->fts_trx); + + std::vector<pfs_os_file_t> deleted; + DBUG_EXECUTE_IF("innodb_alter_commit_crash_before_commit", + log_buffer_flush_to_disk(); DBUG_SUICIDE();); + /* The SQL layer recovery of ALTER TABLE will invoke + innodb_check_version() to know whether our trx->id, which we + reported via ha_innobase::table_version() after + ha_innobase::prepare_inplace_alter_table(), was committed. + + If this trx was committed (the log write below completed), + we will be able to recover our trx->id to + dict_table_t::def_trx_id from the data dictionary tables.
+ + For this logic to work, purge_sys.stop_SYS() and + purge_sys.resume_SYS() will ensure that the DB_TRX_ID that we + wrote to the SYS_ tables will be preserved until the SQL layer + has durably marked the ALTER TABLE operation as completed. + + During recovery, the purge of InnoDB transaction history will + not start until innodb_ddl_recovery_done(). */ + ha_alter_info->inplace_alter_table_committed = purge_sys.resume_SYS; + purge_sys.stop_SYS(); + trx->commit(deleted); + + /* At this point, the changes to the persistent storage have + been committed or rolled back. What remains to be done is to + update the in-memory structures, close some handles, release + temporary files, and (unless we rolled back) update persistent + statistics. */ + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + + DBUG_ASSERT(ctx->need_rebuild() == new_clustered); + + innobase_copy_frm_flags_from_table_share( + ctx->new_table, altered_table->s); + + if (new_clustered) { + DBUG_PRINT("to_be_dropped", + ("table: %s", ctx->old_table->name.m_name)); + + if (innobase_update_foreign_cache(ctx, m_user_thd) + != DB_SUCCESS + && m_prebuilt->trx->check_foreigns) { +foreign_fail: + push_warning_printf( + m_user_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ALTER_INFO, + "failed to load FOREIGN KEY" + " constraints"); + } + } else { + bool fk_fail = innobase_update_foreign_cache( + ctx, m_user_thd) != DB_SUCCESS; + + if (!commit_cache_norebuild(ha_alter_info, ctx, + altered_table, table, + trx)) { + fk_fail = true; + } + + if (fk_fail && m_prebuilt->trx->check_foreigns) { + goto foreign_fail; + } + } + + dict_mem_table_free_foreign_vcol_set(ctx->new_table); + dict_mem_table_fill_foreign_vcol_set(ctx->new_table); + } + + ut_ad(trx == ctx0->trx); + ctx0->trx = nullptr; + + /* Free the ctx->trx of other partitions, if any. We will only + use the ctx0->trx here. Others may have been allocated in + the prepare stage. */ + + for (inplace_alter_handler_ctx** pctx = &ctx_array[1]; *pctx; + pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + + if (ctx->trx) { + ctx->trx->rollback(); + ctx->trx->free(); + ctx->trx = NULL; + } + } + + /* MDEV-17468: Avoid this at least when ctx->is_instant(). + Currently dict_load_column_low() is the only place where + num_base for virtual columns is assigned to nonzero.
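+Hence, whenever virtual columns were added or dropped, or column collations +were changed, the condition below evicts the table from the dictionary cache +and reloads it via innobase_reload_table(), so that the virtual column +metadata is rebuilt from the data dictionary.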
*/ + if (ctx0->num_to_drop_vcol || ctx0->num_to_add_vcol + || (ctx0->new_table->n_v_cols && !new_clustered + && (ha_alter_info->alter_info->drop_list.elements + || ha_alter_info->alter_info->create_list.elements)) + || (ctx0->is_instant() + && m_prebuilt->table->n_v_cols + && ha_alter_info->handler_flags & ALTER_STORED_COLUMN_ORDER) + || !ctx0->change_col_collate.empty()) { + DBUG_ASSERT(ctx0->old_table->get_ref_count() == 1); + ut_ad(ctx0->prebuilt == m_prebuilt); + + for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; + pctx++) { + auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx); + ctx->prebuilt->table = innobase_reload_table( + m_user_thd, ctx->prebuilt->table, + table->s->table_name, *ctx); + innobase_copy_frm_flags_from_table_share( + ctx->prebuilt->table, altered_table->s); + } + + unlock_and_close_files(deleted, trx); + log_write_up_to(trx->commit_lsn, true); + DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit", + DBUG_SUICIDE();); + trx->free(); + if (fts_exist) { + purge_sys.resume_FTS(); + } + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + /* There is no need to reset dict_table_t::persistent_autoinc + as the table is reloaded */ + DBUG_RETURN(false); + } + + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (*pctx); + DBUG_ASSERT(ctx->need_rebuild() == new_clustered); + + /* Publish the created fulltext index, if any. + Note that a fulltext index can be created without + creating the clustered index, if there already exists + a suitable FTS_DOC_ID column. If not, one will be + created, implying new_clustered */ + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t* index = ctx->add_index[i]; + + if (index->type & DICT_FTS) { + DBUG_ASSERT(index->type == DICT_FTS); + /* We reset DICT_TF2_FTS here because the bit + is left unset when a drop precedes the add. */ + DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS); + fts_add_index(index, ctx->new_table); + } + } + + ut_d(dict_table_check_for_dup_indexes( + ctx->new_table, CHECK_ALL_COMPLETE)); + + /* Start/Restart the FTS background operations. */ + if (ctx->new_table->fts) { + fts_optimize_add_table(ctx->new_table); + } + + ut_d(dict_table_check_for_dup_indexes( + ctx->new_table, CHECK_ABORTED_OK)); + +#ifdef UNIV_DEBUG + if (!(ctx->new_table->fts != NULL + && ctx->new_table->fts->cache->sync->in_progress)) { + ut_a(fts_check_cached_index(ctx->new_table)); + } +#endif + } + + unlock_and_close_files(deleted, trx); + log_write_up_to(trx->commit_lsn, true); + DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit", + DBUG_SUICIDE();); + trx->free(); + if (fts_exist) { + purge_sys.resume_FTS(); + } + + /* TODO: The following code could be executed + while allowing concurrent access to the table + (MDL downgrade). */ + + if (new_clustered) { + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (*pctx); + DBUG_ASSERT(ctx->need_rebuild()); + + alter_stats_rebuild( + ctx->new_table, table->s->table_name.str, + m_user_thd); + } + } else { + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (*pctx); + DBUG_ASSERT(!ctx->need_rebuild()); + + alter_stats_norebuild(ha_alter_info, ctx, m_user_thd); + } + } + + innobase_parse_hint_from_comment( + m_user_thd, m_prebuilt->table, altered_table->s); + + /* TODO: Also perform DROP TABLE and DROP INDEX after + the MDL downgrade.
*/ + +#ifndef DBUG_OFF + dict_index_t* clust_index = dict_table_get_first_index( + ctx0->prebuilt->table); + DBUG_ASSERT(!clust_index->online_log); + DBUG_ASSERT(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); + + for (dict_index_t* index = clust_index; + index; + index = dict_table_get_next_index(index)) { + DBUG_ASSERT(!index->to_be_dropped); + } +#endif /* DBUG_OFF */ + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + DBUG_RETURN(false); +} + +/** +@param thd the session +@param start_value the lower bound +@param max_value the upper bound (inclusive) */ + +ib_sequence_t::ib_sequence_t( + THD* thd, + ulonglong start_value, + ulonglong max_value) + : + m_max_value(max_value), + m_increment(0), + m_offset(0), + m_next_value(start_value), + m_eof(false) +{ + if (thd != 0 && m_max_value > 0) { + + thd_get_autoinc(thd, &m_offset, &m_increment); + + if (m_increment > 1 || m_offset > 1) { + + /* If there is an offset or increment specified + then we need to work out the exact next value. */ + + m_next_value = innobase_next_autoinc( + start_value, 1, + m_increment, m_offset, m_max_value); + + } else if (start_value == 0) { + /* The next value can never be 0. */ + m_next_value = 1; + } + } else { + m_eof = true; + } +} + +/** +Postfix increment +@return the next value to insert */ + +ulonglong +ib_sequence_t::operator++(int) UNIV_NOTHROW +{ + ulonglong current = m_next_value; + + ut_ad(!m_eof); + ut_ad(m_max_value > 0); + + m_next_value = innobase_next_autoinc( + current, 1, m_increment, m_offset, m_max_value); + + if (m_next_value == m_max_value && current == m_next_value) { + m_eof = true; + } + + return(current); +} diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc new file mode 100644 index 00000000..b00308d7 --- /dev/null +++ b/storage/innobase/handler/i_s.cc @@ -0,0 +1,6506 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file handler/i_s.cc +InnoDB INFORMATION SCHEMA tables interface to MySQL. 
+ +Created July 18, 2007 Vasil Dimov +*******************************************************/ + +#include "univ.i" +#include +#include + +#include +#include +#include + +#include "i_s.h" +#include "btr0pcur.h" +#include "btr0types.h" +#include "dict0dict.h" +#include "dict0load.h" +#include "buf0buddy.h" +#include "buf0buf.h" +#include "ibuf0ibuf.h" +#include "dict0mem.h" +#include "dict0types.h" +#include "srv0start.h" +#include "trx0i_s.h" +#include "trx0trx.h" +#include "srv0mon.h" +#include "pars0pars.h" +#include "fts0types.h" +#include "fts0opt.h" +#include "fts0priv.h" +#include "btr0btr.h" +#include "page0zip.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "dict0crea.h" +#include "fts0vlc.h" +#include "scope.h" +#include "log.h" + +/** The latest successfully looked up innodb_fts_aux_table */ +table_id_t innodb_ft_aux_table_id; + +/** structure associates a name string with a file page type and/or buffer +page state. */ +struct buf_page_desc_t{ + const char* type_str; /*!< String explain the page + type/state */ + ulint type_value; /*!< Page type or page state */ +}; + +/** We also define I_S_PAGE_TYPE_INDEX as the Index Page's position +in i_s_page_type[] array */ +#define I_S_PAGE_TYPE_INDEX 1 + +/** Any unassigned FIL_PAGE_TYPE will be treated as unknown. */ +#define I_S_PAGE_TYPE_UNKNOWN FIL_PAGE_TYPE_UNKNOWN + +/** R-tree index page */ +#define I_S_PAGE_TYPE_RTREE (FIL_PAGE_TYPE_LAST + 1) + +/** Change buffer B-tree page */ +#define I_S_PAGE_TYPE_IBUF (FIL_PAGE_TYPE_LAST + 2) + +#define I_S_PAGE_TYPE_LAST I_S_PAGE_TYPE_IBUF + +#define I_S_PAGE_TYPE_BITS 4 + +/** Name string for File Page Types */ +static buf_page_desc_t i_s_page_type[] = { + {"ALLOCATED", FIL_PAGE_TYPE_ALLOCATED}, + {"INDEX", FIL_PAGE_INDEX}, + {"UNDO_LOG", FIL_PAGE_UNDO_LOG}, + {"INODE", FIL_PAGE_INODE}, + {"IBUF_FREE_LIST", FIL_PAGE_IBUF_FREE_LIST}, + {"IBUF_BITMAP", FIL_PAGE_IBUF_BITMAP}, + {"SYSTEM", FIL_PAGE_TYPE_SYS}, + {"TRX_SYSTEM", FIL_PAGE_TYPE_TRX_SYS}, + {"FILE_SPACE_HEADER", FIL_PAGE_TYPE_FSP_HDR}, + {"EXTENT_DESCRIPTOR", FIL_PAGE_TYPE_XDES}, + {"BLOB", FIL_PAGE_TYPE_BLOB}, + {"COMPRESSED_BLOB", FIL_PAGE_TYPE_ZBLOB}, + {"COMPRESSED_BLOB2", FIL_PAGE_TYPE_ZBLOB2}, + {"UNKNOWN", I_S_PAGE_TYPE_UNKNOWN}, + {"RTREE_INDEX", I_S_PAGE_TYPE_RTREE}, + {"IBUF_INDEX", I_S_PAGE_TYPE_IBUF}, + {"PAGE COMPRESSED", FIL_PAGE_PAGE_COMPRESSED}, + {"PAGE COMPRESSED AND ENCRYPTED", FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED}, +}; + +/** This structure defines information we will fetch from pages +currently cached in the buffer pool. 
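+(The page_type member below is stored in I_S_PAGE_TYPE_BITS = 4 bits, so +every code up to I_S_PAGE_TYPE_LAST, including the pseudo types +I_S_PAGE_TYPE_RTREE and I_S_PAGE_TYPE_IBUF defined above, must fit in that +width.)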
It will be used to populate +table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE */ +struct buf_page_info_t{ + ulint block_id; /*!< Buffer Pool block ID */ + /** page identifier */ + page_id_t id; + uint32_t access_time; /*!< Time of first access */ + uint32_t state; /*!< buf_page_t::state() */ +#ifdef BTR_CUR_HASH_ADAPT + unsigned hashed:1; /*!< Whether hash index has been + built on this page */ +#endif /* BTR_CUR_HASH_ADAPT */ + unsigned is_old:1; /*!< TRUE if the block is in the old + blocks in buf_pool.LRU_old */ + unsigned freed_page_clock:31; /*!< the value of + buf_pool.freed_page_clock */ + unsigned zip_ssize:PAGE_ZIP_SSIZE_BITS; + /*!< Compressed page size */ + unsigned compressed_only:1; /*!< ROW_FORMAT=COMPRESSED only */ + unsigned page_type:I_S_PAGE_TYPE_BITS; /*!< Page type */ + unsigned num_recs:UNIV_PAGE_SIZE_SHIFT_MAX-2; + /*!< Number of records on Page */ + unsigned data_size:UNIV_PAGE_SIZE_SHIFT_MAX; + /*!< Sum of the sizes of the records */ + lsn_t newest_mod; /*!< Log sequence number of + the youngest modification */ + lsn_t oldest_mod; /*!< Log sequence number of + the oldest modification */ + index_id_t index_id; /*!< Index ID if a index page */ +}; + +/* +Use the following types mapping: + +C type ST_FIELD_INFO::field_type +--------------------------------- +long MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS) + +long unsigned MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED) + +char* MYSQL_TYPE_STRING +(field_length=n) + +float MYSQL_TYPE_FLOAT +(field_length=0 is ignored) + +void* MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED) + +boolean (if else) MYSQL_TYPE_LONG +(field_length=1) + +time_t MYSQL_TYPE_DATETIME +(field_length=0 ignored) +--------------------------------- +*/ + +/** +Common function to fill any of the dynamic tables: +INFORMATION_SCHEMA.innodb_trx +INFORMATION_SCHEMA.innodb_locks +INFORMATION_SCHEMA.innodb_lock_waits +@retval false if access to the table is blocked +@retval true if something should be filled in */ +static bool trx_i_s_common_fill_table(THD *thd, TABLE_LIST *tables) +{ + DBUG_ENTER("trx_i_s_common_fill_table"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) + DBUG_RETURN(false); + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* update the cache */ + trx_i_s_cache_start_write(trx_i_s_cache); + trx_i_s_possibly_fetch_data_into_cache(trx_i_s_cache); + trx_i_s_cache_end_write(trx_i_s_cache); + + if (trx_i_s_cache_is_truncated(trx_i_s_cache)) + sql_print_warning("InnoDB: Data in %.*s truncated due to memory limit" + " of %u bytes", + int(tables->schema_table_name.length), + tables->schema_table_name.str, + TRX_I_S_MEM_LIMIT); + + DBUG_RETURN(true); +} + +/*******************************************************************//** +Unbind a dynamic INFORMATION_SCHEMA table. +@return 0 on success */ +static +int +i_s_common_deinit( +/*==============*/ + void* p); /*!< in/out: table schema object */ +/*******************************************************************//** +Auxiliary function to store time_t value in MYSQL_TYPE_DATETIME +field. 
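+For example, fill_innodb_trx_from_cache() below stores the transaction start +time as + field_store_time_t(fields[IDX_TRX_STARTED], (time_t) row->trx_started);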
+@return 0 on success */ +static +int +field_store_time_t( +/*===============*/ + Field* field, /*!< in/out: target field for storage */ + time_t time) /*!< in: value to store */ +{ + MYSQL_TIME my_time; + struct tm tm_time; + + if (time) { +#if 0 + /* use this if you are sure that `variables' and `time_zone' + are always initialized */ + thd->variables.time_zone->gmt_sec_to_TIME( + &my_time, (my_time_t) time); +#else + localtime_r(&time, &tm_time); + localtime_to_TIME(&my_time, &tm_time); + my_time.time_type = MYSQL_TIMESTAMP_DATETIME; +#endif + } else { + memset(&my_time, 0, sizeof(my_time)); + } + + /* JAN: TODO: MySQL 5.7 + return(field->store_time(&my_time, MYSQL_TIMESTAMP_DATETIME)); + */ + return(field->store_time(&my_time)); +} + +/*******************************************************************//** +Auxiliary function to store char* value in MYSQL_TYPE_STRING field. +@return 0 on success */ +static +int +field_store_string( +/*===============*/ + Field* field, /*!< in/out: target field for storage */ + const char* str) /*!< in: NUL-terminated utf-8 string, + or NULL */ +{ + if (!str) { + field->set_null(); + return 0; + } + + field->set_notnull(); + return field->store(str, uint(strlen(str)), system_charset_info); +} + +#ifdef BTR_CUR_HASH_ADAPT +# define I_S_AHI 1 /* Include the IS_HASHED column */ +#else +# define I_S_AHI 0 /* Omit the IS_HASHED column */ +#endif + +static const LEX_CSTRING isolation_level_values[] = +{ + { STRING_WITH_LEN("READ UNCOMMITTED") }, + { STRING_WITH_LEN("READ COMMITTED") }, + { STRING_WITH_LEN("REPEATABLE READ") }, + { STRING_WITH_LEN("SERIALIZABLE") } +}; + +static TypelibBuffer<4> isolation_level_values_typelib(isolation_level_values); + +namespace Show { + +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_trx */ +static ST_FIELD_INFO innodb_trx_fields_info[]= +{ +#define IDX_TRX_ID 0 + Column("trx_id", ULonglong(), NOT_NULL), + +#define IDX_TRX_STATE 1 + Column("trx_state", Varchar(13), NOT_NULL), + +#define IDX_TRX_STARTED 2 + Column("trx_started", Datetime(0), NOT_NULL), + +#define IDX_TRX_REQUESTED_LOCK_ID 3 + Column("trx_requested_lock_id", + Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NULLABLE), + +#define IDX_TRX_WAIT_STARTED 4 + Column("trx_wait_started", Datetime(0), NULLABLE), + +#define IDX_TRX_WEIGHT 5 + Column("trx_weight", ULonglong(), NOT_NULL), + +#define IDX_TRX_MYSQL_THREAD_ID 6 + Column("trx_mysql_thread_id", ULonglong(), NOT_NULL), + +#define IDX_TRX_QUERY 7 + Column("trx_query", Varchar(TRX_I_S_TRX_QUERY_MAX_LEN), NULLABLE), + +#define IDX_TRX_OPERATION_STATE 8 + Column("trx_operation_state", Varchar(64), NULLABLE), + +#define IDX_TRX_TABLES_IN_USE 9 + Column("trx_tables_in_use", ULonglong(), NOT_NULL), + +#define IDX_TRX_TABLES_LOCKED 10 + Column("trx_tables_locked", ULonglong(), NOT_NULL), + +#define IDX_TRX_LOCK_STRUCTS 11 + Column("trx_lock_structs", ULonglong(), NOT_NULL), + +#define IDX_TRX_LOCK_MEMORY_BYTES 12 + Column("trx_lock_memory_bytes", ULonglong(), NOT_NULL), + +#define IDX_TRX_ROWS_LOCKED 13 + Column("trx_rows_locked", ULonglong(), NOT_NULL), + +#define IDX_TRX_ROWS_MODIFIED 14 + Column("trx_rows_modified", ULonglong(), NOT_NULL), + +#define IDX_TRX_CONNCURRENCY_TICKETS 15 + Column("trx_concurrency_tickets", ULonglong(), NOT_NULL), + +#define IDX_TRX_ISOLATION_LEVEL 16 + Column("trx_isolation_level", + Enum(&isolation_level_values_typelib), NOT_NULL), + +#define IDX_TRX_UNIQUE_CHECKS 17 + Column("trx_unique_checks", SLong(1), NOT_NULL), + +#define IDX_TRX_FOREIGN_KEY_CHECKS 18 + 
Column("trx_foreign_key_checks", SLong(1), NOT_NULL), + +#define IDX_TRX_LAST_FOREIGN_KEY_ERROR 19 + Column("trx_last_foreign_key_error", + Varchar(TRX_I_S_TRX_FK_ERROR_MAX_LEN),NULLABLE), + +#define IDX_TRX_READ_ONLY 20 + Column("trx_is_read_only", SLong(1), NOT_NULL), + +#define IDX_TRX_AUTOCOMMIT_NON_LOCKING 21 + Column("trx_autocommit_non_locking", SLong(1), NOT_NULL), + + CEnd() +}; + +} // namespace Show + +/*******************************************************************//** +Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_trx +table with it. +@retval 0 on success +@retval 1 on failure */ +static int fill_innodb_trx_from_cache(THD *thd, TABLE_LIST *tables, Item*) +{ + ulint rows_num; + char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + ulint i; + + DBUG_ENTER("fill_innodb_trx_from_cache"); + + if (!trx_i_s_common_fill_table(thd, tables)) { + DBUG_RETURN(0); + } + + struct cache + { + cache() { trx_i_s_cache_start_read(trx_i_s_cache); } + ~cache() { trx_i_s_cache_end_read(trx_i_s_cache); } + } c; + + Field** fields = tables->table->field; + + rows_num = trx_i_s_cache_get_rows_used(trx_i_s_cache, + I_S_INNODB_TRX); + + for (i = 0; i < rows_num; i++) { + + i_s_trx_row_t* row; + + row = (i_s_trx_row_t*) + trx_i_s_cache_get_nth_row( + trx_i_s_cache, I_S_INNODB_TRX, i); + + /* trx_id */ + OK(fields[IDX_TRX_ID]->store(row->trx_id, true)); + + /* trx_state */ + OK(field_store_string(fields[IDX_TRX_STATE], + row->trx_state)); + + /* trx_started */ + OK(field_store_time_t(fields[IDX_TRX_STARTED], + (time_t) row->trx_started)); + + /* trx_requested_lock_id */ + /* trx_wait_started */ + if (row->trx_wait_started != 0) { + + OK(field_store_string( + fields[IDX_TRX_REQUESTED_LOCK_ID], + trx_i_s_create_lock_id( + row->requested_lock_row, + lock_id, sizeof(lock_id)))); + /* field_store_string() sets it no notnull */ + + OK(field_store_time_t( + fields[IDX_TRX_WAIT_STARTED], + (time_t) row->trx_wait_started)); + fields[IDX_TRX_WAIT_STARTED]->set_notnull(); + } else { + + fields[IDX_TRX_REQUESTED_LOCK_ID]->set_null(); + fields[IDX_TRX_WAIT_STARTED]->set_null(); + } + + /* trx_weight */ + OK(fields[IDX_TRX_WEIGHT]->store(row->trx_weight, true)); + + /* trx_mysql_thread_id */ + OK(fields[IDX_TRX_MYSQL_THREAD_ID]->store( + row->trx_mysql_thread_id, true)); + + /* trx_query */ + if (row->trx_query) { + /* store will do appropriate character set + conversion check */ + fields[IDX_TRX_QUERY]->store( + row->trx_query, + static_cast(strlen(row->trx_query)), + row->trx_query_cs); + fields[IDX_TRX_QUERY]->set_notnull(); + } else { + fields[IDX_TRX_QUERY]->set_null(); + } + + /* trx_operation_state */ + OK(field_store_string(fields[IDX_TRX_OPERATION_STATE], + row->trx_operation_state)); + + /* trx_tables_in_use */ + OK(fields[IDX_TRX_TABLES_IN_USE]->store( + row->trx_tables_in_use, true)); + + /* trx_tables_locked */ + OK(fields[IDX_TRX_TABLES_LOCKED]->store( + row->trx_tables_locked, true)); + + /* trx_lock_structs */ + OK(fields[IDX_TRX_LOCK_STRUCTS]->store( + row->trx_lock_structs, true)); + + /* trx_lock_memory_bytes */ + OK(fields[IDX_TRX_LOCK_MEMORY_BYTES]->store( + row->trx_lock_memory_bytes, true)); + + /* trx_rows_locked */ + OK(fields[IDX_TRX_ROWS_LOCKED]->store( + row->trx_rows_locked, true)); + + /* trx_rows_modified */ + OK(fields[IDX_TRX_ROWS_MODIFIED]->store( + row->trx_rows_modified, true)); + + /* trx_concurrency_tickets */ + OK(fields[IDX_TRX_CONNCURRENCY_TICKETS]->store(0, true)); + + /* trx_isolation_level */ + OK(fields[IDX_TRX_ISOLATION_LEVEL]->store( + 1 + 
row->trx_isolation_level, true)); + + /* trx_unique_checks */ + OK(fields[IDX_TRX_UNIQUE_CHECKS]->store( + row->trx_unique_checks, true)); + + /* trx_foreign_key_checks */ + OK(fields[IDX_TRX_FOREIGN_KEY_CHECKS]->store( + row->trx_foreign_key_checks, true)); + + /* trx_last_foreign_key_error */ + OK(field_store_string(fields[IDX_TRX_LAST_FOREIGN_KEY_ERROR], + row->trx_foreign_key_error)); + + /* trx_is_read_only*/ + OK(fields[IDX_TRX_READ_ONLY]->store( + row->trx_is_read_only, true)); + + /* trx_is_autocommit_non_locking */ + OK(fields[IDX_TRX_AUTOCOMMIT_NON_LOCKING]->store( + row->trx_is_autocommit_non_locking, true)); + + OK(schema_table_store_record(thd, tables->table)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_trx +@return 0 on success */ +static +int +innodb_trx_init( +/*============*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_trx_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_trx_fields_info; + schema->fill_table = fill_innodb_trx_from_cache; + + DBUG_RETURN(0); +} + +static struct st_mysql_information_schema i_s_info = +{ + MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION +}; + +/** version number reported by SHOW PLUGINS */ +constexpr unsigned i_s_version= MYSQL_VERSION_MAJOR << 8 | MYSQL_VERSION_MINOR; + +struct st_maria_plugin i_s_innodb_trx = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_TRX", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB transactions", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_trx_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +static const LEX_CSTRING lock_mode_values[] = +{ + { STRING_WITH_LEN("S") }, + { STRING_WITH_LEN("S,GAP") }, + { STRING_WITH_LEN("X") }, + { STRING_WITH_LEN("X,GAP") }, + { STRING_WITH_LEN("IS") }, + { STRING_WITH_LEN("IS,GAP") }, + { STRING_WITH_LEN("IX") }, + { STRING_WITH_LEN("IX,GAP") }, + { STRING_WITH_LEN("AUTO_INC") } +}; + +static TypelibBuffer<9> lock_mode_values_typelib(lock_mode_values); + +static const LEX_CSTRING lock_type_values[] = +{ + { STRING_WITH_LEN("RECORD") }, + { STRING_WITH_LEN("TABLE") } +}; + +static TypelibBuffer<2> lock_type_values_typelib(lock_type_values); + +namespace Show { +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */ +static ST_FIELD_INFO innodb_locks_fields_info[]= +{ +#define IDX_LOCK_ID 0 + Column("lock_id", Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NOT_NULL), + +#define IDX_LOCK_TRX_ID 1 + Column("lock_trx_id", ULonglong(), NOT_NULL), + +#define IDX_LOCK_MODE 2 + Column("lock_mode", Enum(&lock_mode_values_typelib), NOT_NULL), + +#define IDX_LOCK_TYPE 3 + Column("lock_type", Enum(&lock_type_values_typelib), NOT_NULL), + +#define IDX_LOCK_TABLE 4 + Column("lock_table", Varchar(1024), NOT_NULL), + +#define IDX_LOCK_INDEX 5 + Column("lock_index", Varchar(1024), NULLABLE), + +#define IDX_LOCK_SPACE 6 + 
Column("lock_space", ULong(), NULLABLE), + +#define IDX_LOCK_PAGE 7 + Column("lock_page", ULong(), NULLABLE), + +#define IDX_LOCK_REC 8 + Column("lock_rec", ULong(), NULLABLE), + +#define IDX_LOCK_DATA 9 + Column("lock_data", Varchar(TRX_I_S_LOCK_DATA_MAX_LEN), NULLABLE), + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_locks +table with it. +@return 0 on success */ +static +int +fill_innodb_locks_from_cache( +/*=========================*/ + THD* thd, /*!< in: MySQL client connection */ + TABLE_LIST* tables, /*!< in/out: fill this table */ + Item*) +{ + ulint rows_num; + char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + ulint i; + + DBUG_ENTER("fill_innodb_locks_from_cache"); + + if (!trx_i_s_common_fill_table(thd, tables)) { + DBUG_RETURN(0); + } + + struct cache + { + cache() { trx_i_s_cache_start_read(trx_i_s_cache); } + ~cache() { trx_i_s_cache_end_read(trx_i_s_cache); } + } c; + + Field** fields = tables->table->field; + + rows_num = trx_i_s_cache_get_rows_used(trx_i_s_cache, + I_S_INNODB_LOCKS); + + for (i = 0; i < rows_num; i++) { + + i_s_locks_row_t* row; + char buf[MAX_FULL_NAME_LEN + 1]; + const char* bufend; + + row = (i_s_locks_row_t*) + trx_i_s_cache_get_nth_row( + trx_i_s_cache, I_S_INNODB_LOCKS, i); + + /* lock_id */ + trx_i_s_create_lock_id(row, lock_id, sizeof(lock_id)); + OK(field_store_string(fields[IDX_LOCK_ID], + lock_id)); + + /* lock_trx_id */ + OK(fields[IDX_LOCK_TRX_ID]->store(row->lock_trx_id, true)); + + /* lock_mode */ + OK(fields[IDX_LOCK_MODE]->store(row->lock_mode, true)); + + /* lock_type */ + OK(fields[IDX_LOCK_TYPE]->store( + row->lock_index ? 1 : 2, true)); + + /* lock_table */ + bufend = innobase_convert_name(buf, sizeof(buf), + row->lock_table, + strlen(row->lock_table), + thd); + OK(fields[IDX_LOCK_TABLE]->store( + buf, uint(bufend - buf), system_charset_info)); + + if (row->lock_index) { + /* record lock */ + OK(field_store_string(fields[IDX_LOCK_INDEX], + row->lock_index)); + OK(fields[IDX_LOCK_SPACE]->store( + row->lock_page.space(), true)); + fields[IDX_LOCK_SPACE]->set_notnull(); + OK(fields[IDX_LOCK_PAGE]->store( + row->lock_page.page_no(), true)); + fields[IDX_LOCK_PAGE]->set_notnull(); + OK(fields[IDX_LOCK_REC]->store( + row->lock_rec, true)); + fields[IDX_LOCK_REC]->set_notnull(); + OK(field_store_string(fields[IDX_LOCK_DATA], + row->lock_data)); + } else { + fields[IDX_LOCK_INDEX]->set_null(); + fields[IDX_LOCK_SPACE]->set_null(); + fields[IDX_LOCK_REC]->set_null(); + fields[IDX_LOCK_DATA]->set_null(); + } + + OK(schema_table_store_record(thd, tables->table)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_locks +@return 0 on success */ +static +int +innodb_locks_init( +/*==============*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_locks_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_locks_fields_info; + schema->fill_table = fill_innodb_locks_from_cache; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_locks = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_LOCKS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + 
plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB conflicting locks", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_locks_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +namespace Show { +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */ +static ST_FIELD_INFO innodb_lock_waits_fields_info[]= +{ +#define IDX_REQUESTING_TRX_ID 0 + Column("requesting_trx_id", ULonglong(), NOT_NULL), + +#define IDX_REQUESTED_LOCK_ID 1 + Column("requested_lock_id", Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NOT_NULL), + +#define IDX_BLOCKING_TRX_ID 2 + Column("blocking_trx_id", ULonglong(), NOT_NULL), + +#define IDX_BLOCKING_LOCK_ID 3 + Column("blocking_lock_id", Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NOT_NULL), + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Read data from cache buffer and fill the +INFORMATION_SCHEMA.innodb_lock_waits table with it. +@return 0 on success */ +static +int +fill_innodb_lock_waits_from_cache( +/*==============================*/ + THD* thd, /*!< in: used to call + schema_table_store_record() */ + TABLE_LIST* tables, /*!< in/out: fill this table */ + Item*) +{ + ulint rows_num; + char requested_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + char blocking_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + ulint i; + + DBUG_ENTER("fill_innodb_lock_waits_from_cache"); + + if (!trx_i_s_common_fill_table(thd, tables)) { + DBUG_RETURN(0); + } + + struct cache + { + cache() { trx_i_s_cache_start_read(trx_i_s_cache); } + ~cache() { trx_i_s_cache_end_read(trx_i_s_cache); } + } c; + + Field** fields = tables->table->field; + + rows_num = trx_i_s_cache_get_rows_used(trx_i_s_cache, + I_S_INNODB_LOCK_WAITS); + + for (i = 0; i < rows_num; i++) { + + i_s_lock_waits_row_t* row; + + row = (i_s_lock_waits_row_t*) + trx_i_s_cache_get_nth_row( + trx_i_s_cache, I_S_INNODB_LOCK_WAITS, i); + + /* requesting_trx_id */ + OK(fields[IDX_REQUESTING_TRX_ID]->store( + row->requested_lock_row->lock_trx_id, true)); + + /* requested_lock_id */ + OK(field_store_string( + fields[IDX_REQUESTED_LOCK_ID], + trx_i_s_create_lock_id( + row->requested_lock_row, + requested_lock_id, + sizeof(requested_lock_id)))); + + /* blocking_trx_id */ + OK(fields[IDX_BLOCKING_TRX_ID]->store( + row->blocking_lock_row->lock_trx_id, true)); + + /* blocking_lock_id */ + OK(field_store_string( + fields[IDX_BLOCKING_LOCK_ID], + trx_i_s_create_lock_id( + row->blocking_lock_row, + blocking_lock_id, + sizeof(blocking_lock_id)))); + + OK(schema_table_store_record(thd, tables->table)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_lock_waits +@return 0 on success */ +static +int +innodb_lock_waits_init( +/*===================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_lock_waits_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_lock_waits_fields_info; + schema->fill_table = fill_innodb_lock_waits_from_cache; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_lock_waits = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + 
MYSQL_INFORMATION_SCHEMA_PLUGIN,
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	&i_s_info,
+
+	/* plugin name */
+	/* const char* */
+	"INNODB_LOCK_WAITS",
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	plugin_author,
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	"InnoDB which lock is blocking which",
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	PLUGIN_LICENSE_GPL,
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	innodb_lock_waits_init,
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	i_s_common_deinit,
+
+	i_s_version, nullptr, nullptr, PACKAGE_VERSION,
+	MariaDB_PLUGIN_MATURITY_STABLE
+};
+
+namespace Show {
+/* Fields of the dynamic table information_schema.innodb_cmp. */
+static ST_FIELD_INFO	i_s_cmp_fields_info[] =
+{
+	Column("page_size", SLong(5), NOT_NULL, "Compressed Page Size"),
+	Column("compress_ops", SLong(), NOT_NULL, "Total Number of Compressions"),
+	Column("compress_ops_ok", SLong(), NOT_NULL, "Total Number of"
+	       " Successful Compressions"),
+	Column("compress_time", SLong(), NOT_NULL, "Total Duration of"
+	       " Compressions, in Seconds"),
+	Column("uncompress_ops", SLong(), NOT_NULL, "Total Number of Decompressions"),
+	Column("uncompress_time", SLong(), NOT_NULL, "Total Duration of"
+	       " Decompressions, in Seconds"),
+	CEnd(),
+};
+} // namespace Show
+
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp or
+innodb_cmp_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_fill_low(
+/*=============*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		,	/*!< in: condition (ignored) */
+	ibool		reset)	/*!< in: TRUE=reset cumulated counts */
+{
+	TABLE*	table	= (TABLE*) tables->table;
+	int	status	= 0;
+
+	DBUG_ENTER("i_s_cmp_fill_low");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	for (uint i = 0; i < PAGE_ZIP_SSIZE_MAX; i++) {
+		page_zip_stat_t*	zip_stat = &page_zip_stat[i];
+
+		table->field[0]->store(UNIV_ZIP_SIZE_MIN << i);
+
+		/* The cumulated counts are not protected by any
+		mutex. Thus, some operation in page0zip.cc could
+		increment a counter between the time we read it and
+		clear it. We could introduce mutex protection, but it
+		could cause a measurable performance hit in
+		page0zip.cc. */
+		table->field[1]->store(zip_stat->compressed, true);
+		table->field[2]->store(zip_stat->compressed_ok, true);
+		table->field[3]->store(zip_stat->compressed_usec / 1000000,
+				       true);
+		table->field[4]->store(zip_stat->decompressed, true);
+		table->field[5]->store(zip_stat->decompressed_usec / 1000000,
+				       true);
+
+		if (reset) {
+			new (zip_stat) page_zip_stat_t();
+		}
+
+		if (schema_table_store_record(thd, table)) {
+			status = 1;
+			break;
+		}
+	}
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp.
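+A typical query against this table looks like (illustrative only):
+  SELECT page_size, compress_ops, compress_ops_ok
+  FROM information_schema.innodb_cmp;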
+@return 0 on success, 1 on failure */ +static +int +i_s_cmp_fill( +/*=========*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_fill_low(thd, tables, cond, FALSE)); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_reset_fill( +/*===============*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_fill_low(thd, tables, cond, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp. +@return 0 on success */ +static +int +i_s_cmp_init( +/*=========*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmp_fields_info; + schema->fill_table = i_s_cmp_fill; + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp_reset. +@return 0 on success */ +static +int +i_s_cmp_reset_init( +/*===============*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_reset_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmp_fields_info; + schema->fill_table = i_s_cmp_reset_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_cmp = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMP", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compression", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmp_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +struct st_maria_plugin i_s_innodb_cmp_reset = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMP_RESET", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compression;" + " reset cumulated counts", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmp_reset_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +namespace Show { +/* Fields of the dynamic tables +information_schema.innodb_cmp_per_index and +information_schema.innodb_cmp_per_index_reset. 
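+(These per-index statistics are only collected while the
+innodb_cmp_per_index_enabled system variable is ON.)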
*/ +static ST_FIELD_INFO i_s_cmp_per_index_fields_info[]= +{ +#define IDX_DATABASE_NAME 0 + Column("database_name", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define IDX_TABLE_NAME 1 /* FIXME: this is in my_charset_filename! */ + Column("table_name", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define IDX_INDEX_NAME 2 + Column("index_name", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define IDX_COMPRESS_OPS 3 + Column("compress_ops", SLong(), NOT_NULL), + +#define IDX_COMPRESS_OPS_OK 4 + Column("compress_ops_ok", SLong(), NOT_NULL), + +#define IDX_COMPRESS_TIME 5 + Column("compress_time", SLong(), NOT_NULL), + +#define IDX_UNCOMPRESS_OPS 6 + Column("uncompress_ops", SLong(), NOT_NULL), + +#define IDX_UNCOMPRESS_TIME 7 + Column("uncompress_time", SLong(), NOT_NULL), + + CEnd() +}; + +} // namespace Show + +/*******************************************************************//** +Fill the dynamic table +information_schema.innodb_cmp_per_index or +information_schema.innodb_cmp_per_index_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_fill_low( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* , /*!< in: condition (ignored) */ + ibool reset) /*!< in: TRUE=reset cumulated counts */ +{ + TABLE* table = tables->table; + Field** fields = table->field; + int status = 0; + + DBUG_ENTER("i_s_cmp_per_index_fill_low"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* Create a snapshot of the stats so we do not bump into lock + order violations with dict_sys.latch below. */ + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index_t snap (page_zip_stat_per_index); + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + + dict_sys.freeze(SRW_LOCK_CALL); + + page_zip_stat_per_index_t::iterator iter; + ulint i; + + for (iter = snap.begin(), i = 0; iter != snap.end(); iter++, i++) { + + if (dict_index_t* index + = dict_index_get_if_in_cache_low(iter->first)) { + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(index->table->name.m_name, + db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + status = field_store_string(fields[IDX_DATABASE_NAME], + db_utf8) + || field_store_string(fields[IDX_TABLE_NAME], + table_utf8) + || field_store_string(fields[IDX_INDEX_NAME], + index->name); + } else { + /* index not found */ + char name[MY_INT64_NUM_DECIMAL_DIGITS + + sizeof "index_id: "]; + fields[IDX_DATABASE_NAME]->set_null(); + fields[IDX_TABLE_NAME]->set_null(); + fields[IDX_INDEX_NAME]->set_notnull(); + status = fields[IDX_INDEX_NAME]->store( + name, + uint(snprintf(name, sizeof name, + "index_id: " IB_ID_FMT, + iter->first)), + system_charset_info); + } + + if (status + || fields[IDX_COMPRESS_OPS]->store( + iter->second.compressed, true) + || fields[IDX_COMPRESS_OPS_OK]->store( + iter->second.compressed_ok, true) + || fields[IDX_COMPRESS_TIME]->store( + iter->second.compressed_usec / 1000000, true) + || fields[IDX_UNCOMPRESS_OPS]->store( + iter->second.decompressed, true) + || fields[IDX_UNCOMPRESS_TIME]->store( + iter->second.decompressed_usec / 1000000, true) + || schema_table_store_record(thd, table)) { + status = 1; + break; + } + /* Release and reacquire the dict_sys.latch to allow other + threads to proceed. 
This could eventually result in the + contents of INFORMATION_SCHEMA.innodb_cmp_per_index being + inconsistent, but it is an acceptable compromise. */ + if (i == 1000) { + dict_sys.unfreeze(); + i = 0; + dict_sys.freeze(SRW_LOCK_CALL); + } + } + + dict_sys.unfreeze(); + + if (reset) { + page_zip_reset_stat_per_index(); + } + + DBUG_RETURN(status); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp_per_index. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_fill( +/*===================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_per_index_fill_low(thd, tables, cond, FALSE)); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp_per_index_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_reset_fill( +/*=========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_per_index_fill_low(thd, tables, cond, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp_per_index. +@return 0 on success */ +static +int +i_s_cmp_per_index_init( +/*===================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmp_per_index_fields_info; + schema->fill_table = i_s_cmp_per_index_fill; + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp_per_index_reset. 
+@return 0 on success */ +static +int +i_s_cmp_per_index_reset_init( +/*=========================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_reset_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmp_per_index_fields_info; + schema->fill_table = i_s_cmp_per_index_reset_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_cmp_per_index = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMP_PER_INDEX", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compression (per index)", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmp_per_index_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +struct st_maria_plugin i_s_innodb_cmp_per_index_reset = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMP_PER_INDEX_RESET", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compression (per index);" + " reset cumulated counts", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmp_per_index_reset_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +namespace Show { +/* Fields of the dynamic table information_schema.innodb_cmpmem. */ +static ST_FIELD_INFO i_s_cmpmem_fields_info[] = +{ + Column("page_size", SLong(5), NOT_NULL, "Buddy Block Size"), + Column("buffer_pool_instance", SLong(), NOT_NULL, "Buffer Pool Id"), + Column("pages_used", SLong(), NOT_NULL, "Currently in Use"), + Column("pages_free", SLong(), NOT_NULL, "Currently Available"), + Column("relocation_ops", SLonglong(), NOT_NULL, "Total Number of Relocations"), + Column("relocation_time", SLong(), NOT_NULL, "Total Duration of Relocations," + " in Seconds"), + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmpmem or +innodb_cmpmem_reset. 
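+A typical query against this table looks like (illustrative only):
+  SELECT page_size, pages_used, pages_free
+  FROM information_schema.innodb_cmpmem;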
+@return 0 on success, 1 on failure */ +static +int +i_s_cmpmem_fill_low( +/*================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* , /*!< in: condition (ignored) */ + ibool reset) /*!< in: TRUE=reset cumulated counts */ +{ + TABLE* table = (TABLE*) tables->table; + + DBUG_ENTER("i_s_cmpmem_fill_low"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + ulint zip_free_len_local[BUF_BUDDY_SIZES_MAX + 1]; + buf_buddy_stat_t buddy_stat_local[BUF_BUDDY_SIZES_MAX + 1]; + + /* Save buddy stats for buffer pool in local variables. */ + mysql_mutex_lock(&buf_pool.mutex); + + for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { + zip_free_len_local[x] = (x < BUF_BUDDY_SIZES) ? + UT_LIST_GET_LEN(buf_pool.zip_free[x]) : 0; + + buddy_stat_local[x] = buf_pool.buddy_stat[x]; + + if (reset) { + /* This is protected by buf_pool.mutex. */ + buf_pool.buddy_stat[x].relocated = 0; + buf_pool.buddy_stat[x].relocated_usec = 0; + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + + for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { + buf_buddy_stat_t* buddy_stat = &buddy_stat_local[x]; + + Field **field = table->field; + + (*field++)->store(BUF_BUDDY_LOW << x); + (*field++)->store(0, true); + (*field++)->store(buddy_stat->used, true); + (*field++)->store(zip_free_len_local[x], true); + (*field++)->store(buddy_stat->relocated, true); + (*field)->store(buddy_stat->relocated_usec / 1000000, true); + + if (schema_table_store_record(thd, table)) { + DBUG_RETURN(1); + } + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmpmem. +@return 0 on success, 1 on failure */ +static +int +i_s_cmpmem_fill( +/*============*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmpmem_fill_low(thd, tables, cond, FALSE)); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmpmem_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmpmem_reset_fill( +/*==================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmpmem_fill_low(thd, tables, cond, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmpmem. +@return 0 on success */ +static +int +i_s_cmpmem_init( +/*============*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmpmem_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmpmem_fields_info; + schema->fill_table = i_s_cmpmem_fill; + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmpmem_reset. 
+@return 0 on success */ +static +int +i_s_cmpmem_reset_init( +/*==================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmpmem_reset_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmpmem_fields_info; + schema->fill_table = i_s_cmpmem_reset_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_cmpmem = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMPMEM", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compressed buffer pool", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmpmem_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +struct st_maria_plugin i_s_innodb_cmpmem_reset = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMPMEM_RESET", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compressed buffer pool;" + " reset cumulated counts", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmpmem_reset_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +static const LEX_CSTRING metric_type_values[] = +{ + { STRING_WITH_LEN("value") }, + { STRING_WITH_LEN("status_counter") }, + { STRING_WITH_LEN("set_owner") }, + { STRING_WITH_LEN("set_member") }, + { STRING_WITH_LEN("counter") } +}; + +static TypelibBuffer<5> metric_type_values_typelib(metric_type_values); + +namespace Show { +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_metrics */ +static ST_FIELD_INFO innodb_metrics_fields_info[]= +{ +#define METRIC_NAME 0 + Column("NAME", Varchar(NAME_LEN + 1), NOT_NULL), + +#define METRIC_SUBSYS 1 + Column("SUBSYSTEM", Varchar(NAME_LEN + 1), NOT_NULL), + +#define METRIC_VALUE_START 2 + Column("COUNT", SLonglong(), NOT_NULL), + +#define METRIC_MAX_VALUE_START 3 + Column("MAX_COUNT", SLonglong(), NULLABLE), + +#define METRIC_MIN_VALUE_START 4 + Column("MIN_COUNT", SLonglong(), NULLABLE), + +#define METRIC_AVG_VALUE_START 5 + Column("AVG_COUNT", Float(MAX_FLOAT_STR_LENGTH), NULLABLE), + +#define METRIC_VALUE_RESET 6 + Column("COUNT_RESET", SLonglong(), NOT_NULL), + +#define METRIC_MAX_VALUE_RESET 7 + Column("MAX_COUNT_RESET", SLonglong(), NULLABLE), + +#define METRIC_MIN_VALUE_RESET 8 + Column("MIN_COUNT_RESET", SLonglong(), NULLABLE), + +#define METRIC_AVG_VALUE_RESET 9 + Column("AVG_COUNT_RESET", Float(MAX_FLOAT_STR_LENGTH), NULLABLE), + +#define METRIC_START_TIME 10 + Column("TIME_ENABLED", Datetime(0), NULLABLE), + +#define METRIC_STOP_TIME 
11 + Column("TIME_DISABLED", Datetime(0), NULLABLE), + +#define METRIC_TIME_ELAPSED 12 + Column("TIME_ELAPSED", SLonglong(), NULLABLE), + +#define METRIC_RESET_TIME 13 + Column("TIME_RESET", Datetime(0), NULLABLE), + +#define METRIC_STATUS 14 + Column("ENABLED", SLong(1), NOT_NULL), + +#define METRIC_TYPE 15 + Column("TYPE", Enum(&metric_type_values_typelib), NOT_NULL), + +#define METRIC_DESC 16 + Column("COMMENT", Varchar(NAME_LEN + 1), NOT_NULL), + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Fill the information schema metrics table. +@return 0 on success */ +static +int +i_s_metrics_fill( +/*=============*/ + THD* thd, /*!< in: thread */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + int count; + Field** fields; + double time_diff = 0; + monitor_info_t* monitor_info; + mon_type_t min_val; + mon_type_t max_val; + + DBUG_ENTER("i_s_metrics_fill"); + fields = table_to_fill->field; + + for (count = 0; count < NUM_MONITOR; count++) { + monitor_info = srv_mon_get_info((monitor_id_t) count); + + /* A good place to sanity check the Monitor ID */ + ut_a(count == monitor_info->monitor_id); + + /* If the item refers to a Module, nothing to fill, + continue. */ + if ((monitor_info->monitor_type & MONITOR_MODULE) + || (monitor_info->monitor_type & MONITOR_HIDDEN)) { + continue; + } + + /* If this is an existing "status variable", and + its corresponding counter is still on, we need + to calculate the result from its corresponding + counter. */ + if (monitor_info->monitor_type & MONITOR_EXISTING + && MONITOR_IS_ON(count)) { + srv_mon_process_existing_counter((monitor_id_t) count, + MONITOR_GET_VALUE); + } + + /* Fill in counter's basic information */ + OK(field_store_string(fields[METRIC_NAME], + monitor_info->monitor_name)); + + OK(field_store_string(fields[METRIC_SUBSYS], + monitor_info->monitor_module)); + + OK(field_store_string(fields[METRIC_DESC], + monitor_info->monitor_desc)); + + /* Fill in counter values */ + OK(fields[METRIC_VALUE_RESET]->store( + MONITOR_VALUE(count), FALSE)); + + OK(fields[METRIC_VALUE_START]->store( + MONITOR_VALUE_SINCE_START(count), FALSE)); + + /* If the max value is MAX_RESERVED, counter max + value has not been updated. Set the column value + to NULL. */ + if (MONITOR_MAX_VALUE(count) == MAX_RESERVED + || MONITOR_MAX_MIN_NOT_INIT(count)) { + fields[METRIC_MAX_VALUE_RESET]->set_null(); + } else { + OK(fields[METRIC_MAX_VALUE_RESET]->store( + MONITOR_MAX_VALUE(count), FALSE)); + fields[METRIC_MAX_VALUE_RESET]->set_notnull(); + } + + /* If the min value is MAX_RESERVED, counter min + value has not been updated. Set the column value + to NULL. 
*/
+		if (MONITOR_MIN_VALUE(count) == MIN_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MIN_VALUE_RESET]->set_null();
+		} else {
+			OK(fields[METRIC_MIN_VALUE_RESET]->store(
+				   MONITOR_MIN_VALUE(count), FALSE));
+			fields[METRIC_MIN_VALUE_RESET]->set_notnull();
+		}
+
+		/* Calculate the max value since counter started */
+		max_val = srv_mon_calc_max_since_start((monitor_id_t) count);
+
+		if (max_val == MAX_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MAX_VALUE_START]->set_null();
+		} else {
+			OK(fields[METRIC_MAX_VALUE_START]->store(
+				   max_val, FALSE));
+			fields[METRIC_MAX_VALUE_START]->set_notnull();
+		}
+
+		/* Calculate the min value since counter started */
+		min_val = srv_mon_calc_min_since_start((monitor_id_t) count);
+
+		if (min_val == MIN_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MIN_VALUE_START]->set_null();
+		} else {
+			OK(fields[METRIC_MIN_VALUE_START]->store(
+				   min_val, FALSE));
+
+			fields[METRIC_MIN_VALUE_START]->set_notnull();
+		}
+
+		/* If the monitor has ever been enabled (whether or not
+		it is currently disabled), fill the METRIC_START_TIME
+		and METRIC_TIME_ELAPSED fields */
+		if (MONITOR_FIELD(count, mon_start_time)) {
+			OK(field_store_time_t(fields[METRIC_START_TIME],
+				(time_t)MONITOR_FIELD(count, mon_start_time)));
+			fields[METRIC_START_TIME]->set_notnull();
+
+			/* If the monitor is enabled, TIME_ELAPSED is the
+			time difference between the current time and the
+			time when the monitor was enabled. Otherwise, it
+			is the time difference between the time when the
+			monitor was enabled and the time when it was
+			disabled */
+			if (MONITOR_IS_ON(count)) {
+				time_diff = difftime(time(NULL),
+					MONITOR_FIELD(count, mon_start_time));
+			} else {
+				time_diff = difftime(
+					MONITOR_FIELD(count, mon_stop_time),
+					MONITOR_FIELD(count, mon_start_time));
+			}
+
+			OK(fields[METRIC_TIME_ELAPSED]->store(
+				   time_diff));
+			fields[METRIC_TIME_ELAPSED]->set_notnull();
+		} else {
+			fields[METRIC_START_TIME]->set_null();
+			fields[METRIC_TIME_ELAPSED]->set_null();
+			time_diff = 0;
+		}
+
+		/* Unless MONITOR_NO_AVERAGE is set, we must
+		calculate the average value. If this is a monitor set
+		owner marked by MONITOR_SET_OWNER, divide
+		the value by another counter (the number of calls)
+		designated by monitor_info->monitor_related_id.
+		Otherwise, average the counter value over the time
+		between when the counter was enabled and when it was
+		disabled or last sampled.
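+		Informally: for a set owner, AVG_COUNT = COUNT / the
+		related counter's value; otherwise AVG_COUNT = COUNT /
+		seconds since the monitor was enabled, and
+		AVG_COUNT_RESET uses the seconds since the last reset
+		instead.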
+		*/
+		if ((monitor_info->monitor_type
+		     & (MONITOR_NO_AVERAGE | MONITOR_SET_OWNER))
+		    == MONITOR_SET_OWNER
+		    && monitor_info->monitor_related_id) {
+			mon_type_t value_start
+				= MONITOR_VALUE_SINCE_START(
+					monitor_info->monitor_related_id);
+
+			if (value_start) {
+				OK(fields[METRIC_AVG_VALUE_START]->store(
+					   MONITOR_VALUE_SINCE_START(count)
+					   / value_start, FALSE));
+
+				fields[METRIC_AVG_VALUE_START]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_START]->set_null();
+			}
+
+			if (mon_type_t related_value =
+			    MONITOR_VALUE(monitor_info->monitor_related_id)) {
+				OK(fields[METRIC_AVG_VALUE_RESET]
+				   ->store(MONITOR_VALUE(count)
+					   / related_value, false));
+				fields[METRIC_AVG_VALUE_RESET]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_RESET]->set_null();
+			}
+		} else if (!(monitor_info->monitor_type
+			     & (MONITOR_NO_AVERAGE
+				| MONITOR_DISPLAY_CURRENT))) {
+			if (time_diff != 0) {
+				OK(fields[METRIC_AVG_VALUE_START]->store(
+					(double) MONITOR_VALUE_SINCE_START(
+						count) / time_diff));
+				fields[METRIC_AVG_VALUE_START]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_START]->set_null();
+			}
+
+			if (MONITOR_FIELD(count, mon_reset_time)) {
+				/* calculate the time difference since the
+				last reset */
+				if (MONITOR_IS_ON(count)) {
+					time_diff = difftime(
+						time(NULL), MONITOR_FIELD(
+							count, mon_reset_time));
+				} else {
+					time_diff = difftime(
+					MONITOR_FIELD(count, mon_stop_time),
+					MONITOR_FIELD(count, mon_reset_time));
+				}
+			} else {
+				time_diff = 0;
+			}
+
+			if (time_diff != 0) {
+				OK(fields[METRIC_AVG_VALUE_RESET]->store(
+					static_cast<double>(
+						MONITOR_VALUE(count))
+					/ time_diff));
+				fields[METRIC_AVG_VALUE_RESET]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_RESET]->set_null();
+			}
+		} else {
+			fields[METRIC_AVG_VALUE_START]->set_null();
+			fields[METRIC_AVG_VALUE_RESET]->set_null();
+		}
+
+		if (MONITOR_IS_ON(count)) {
+			/* If the monitor is on, the stop time is set
+			to NULL */
+			fields[METRIC_STOP_TIME]->set_null();
+
+			/* Display the latest Monitor Reset Time only if
+			the Monitor counter is on. */
+			if (MONITOR_FIELD(count, mon_reset_time)) {
+				OK(field_store_time_t(
+					   fields[METRIC_RESET_TIME],
+					   (time_t)MONITOR_FIELD(
+						   count, mon_reset_time)));
+				fields[METRIC_RESET_TIME]->set_notnull();
+			} else {
+				fields[METRIC_RESET_TIME]->set_null();
+			}
+
+			OK(fields[METRIC_STATUS]->store(1, true));
+		} else {
+			if (MONITOR_FIELD(count, mon_stop_time)) {
+				OK(field_store_time_t(fields[METRIC_STOP_TIME],
+				(time_t)MONITOR_FIELD(count, mon_stop_time)));
+				fields[METRIC_STOP_TIME]->set_notnull();
+			} else {
+				fields[METRIC_STOP_TIME]->set_null();
+			}
+
+			fields[METRIC_RESET_TIME]->set_null();
+
+			OK(fields[METRIC_STATUS]->store(0, true));
+		}
+
+		uint metric_type;
+
+		if (monitor_info->monitor_type & MONITOR_DISPLAY_CURRENT) {
+			metric_type = 1; /* "value" */
+		} else if (monitor_info->monitor_type & MONITOR_EXISTING) {
+			metric_type = 2; /* "status_counter" */
+		} else if (monitor_info->monitor_type & MONITOR_SET_OWNER) {
+			metric_type = 3; /* "set_owner" */
+		} else if (monitor_info->monitor_type & MONITOR_SET_MEMBER) {
+			metric_type = 4; /* "set_member" */
+		} else {
+			metric_type = 5; /* "counter" */
+		}
+
+		OK(fields[METRIC_TYPE]->store(metric_type, true));
+
+		OK(schema_table_store_record(thd, table_to_fill));
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Function to fill the information schema metrics tables.
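+Example (illustrative only):
+  SELECT name, subsystem, count
+  FROM information_schema.innodb_metrics WHERE enabled = 1;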
+@return 0 on success */ +static +int +i_s_metrics_fill_table( +/*===================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + DBUG_ENTER("i_s_metrics_fill_table"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + i_s_metrics_fill(thd, tables->table); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_metrics +@return 0 on success */ +static +int +innodb_metrics_init( +/*================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_metrics_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_metrics_fields_info; + schema->fill_table = i_s_metrics_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_metrics = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_METRICS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB Metrics Info", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_metrics_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_ft_default_stopword */ +static ST_FIELD_INFO i_s_stopword_fields_info[]= +{ +#define STOPWORD_VALUE 0 + Column("value", Varchar(TRX_ID_MAX_LEN + 1), NOT_NULL), + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_ft_default_stopword. +@return 0 on success, 1 on failure */ +static +int +i_s_stopword_fill( +/*==============*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + Field** fields; + ulint i = 0; + TABLE* table = (TABLE*) tables->table; + + DBUG_ENTER("i_s_stopword_fill"); + + fields = table->field; + + /* Fill with server default stopword list in array + fts_default_stopword */ + while (fts_default_stopword[i]) { + OK(field_store_string(fields[STOPWORD_VALUE], + fts_default_stopword[i])); + + OK(schema_table_store_record(thd, table)); + i++; + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_ft_default_stopword. 
+@return 0 on success */
+static
+int
+i_s_stopword_init(
+/*==============*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_stopword_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = Show::i_s_stopword_fields_info;
+	schema->fill_table = i_s_stopword_fill;
+
+	DBUG_RETURN(0);
+}
+
+struct st_maria_plugin	i_s_innodb_ft_default_stopword =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	MYSQL_INFORMATION_SCHEMA_PLUGIN,
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	&i_s_info,
+
+	/* plugin name */
+	/* const char* */
+	"INNODB_FT_DEFAULT_STOPWORD",
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	plugin_author,
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	"Default stopword list for InnoDB Full Text Search",
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	PLUGIN_LICENSE_GPL,
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	i_s_stopword_init,
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	i_s_common_deinit,
+
+	i_s_version, nullptr, nullptr, PACKAGE_VERSION,
+	MariaDB_PLUGIN_MATURITY_STABLE
+};
+
+namespace Show {
+/* Fields of the dynamic tables INFORMATION_SCHEMA.INNODB_FT_DELETED and
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED */
+static ST_FIELD_INFO	i_s_fts_doc_fields_info[]=
+{
+#define	I_S_FTS_DOC_ID			0
+	Column("DOC_ID", ULonglong(), NOT_NULL),
+	CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED or
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_generic_fill(
+/*=========================*/
+	THD*		thd,		/*!< in: thread */
+	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
+	ibool		being_deleted)	/*!< in: BEING_DELETED table */
+{
+	Field**		fields;
+	TABLE*		table = (TABLE*) tables->table;
+	trx_t*		trx;
+	fts_table_t	fts_table;
+	fts_doc_ids_t*	deleted;
+	dict_table_t*	user_table;
+
+	DBUG_ENTER("i_s_fts_deleted_generic_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	MDL_ticket* mdl_ticket = nullptr;
+	user_table = dict_table_open_on_id(
+		innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL,
+		thd, &mdl_ticket);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	} else if (!dict_table_has_fts_index(user_table)
+		   || !user_table->is_readable()) {
+		dict_table_close(user_table, false, thd, mdl_ticket);
+		DBUG_RETURN(0);
+	}
+
+	deleted = fts_doc_ids_create();
+
+	trx = trx_create();
+	trx->op_info = "Select for FTS DELETE TABLE";
+
+	FTS_INIT_FTS_TABLE(&fts_table,
+			   (being_deleted) ?
"BEING_DELETED" : "DELETED", + FTS_COMMON_TABLE, user_table); + + fts_table_fetch_doc_ids(trx, &fts_table, deleted); + + dict_table_close(user_table, false, thd, mdl_ticket); + + trx->free(); + + fields = table->field; + + int ret = 0; + + for (ulint j = 0; j < ib_vector_size(deleted->doc_ids); ++j) { + doc_id_t doc_id; + + doc_id = *(doc_id_t*) ib_vector_get_const(deleted->doc_ids, j); + + BREAK_IF(ret = fields[I_S_FTS_DOC_ID]->store(doc_id, true)); + + BREAK_IF(ret = schema_table_store_record(thd, table)); + } + + fts_doc_ids_free(deleted); + + DBUG_RETURN(ret); +} + +/*******************************************************************//** +Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED +@return 0 on success, 1 on failure */ +static +int +i_s_fts_deleted_fill( +/*=================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (ignored) */ +{ + DBUG_ENTER("i_s_fts_deleted_fill"); + + DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, FALSE)); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED +@return 0 on success */ +static +int +i_s_fts_deleted_init( +/*=================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_fts_deleted_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_fts_doc_fields_info; + schema->fill_table = i_s_fts_deleted_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_ft_deleted = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_FT_DELETED", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "INNODB AUXILIARY FTS DELETED TABLE", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_fts_deleted_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +/*******************************************************************//** +Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED +@return 0 on success, 1 on failure */ +static +int +i_s_fts_being_deleted_fill( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (ignored) */ +{ + DBUG_ENTER("i_s_fts_being_deleted_fill"); + + DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED +@return 0 on success */ +static +int +i_s_fts_being_deleted_init( +/*=======================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_fts_deleted_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_fts_doc_fields_info; + schema->fill_table = i_s_fts_being_deleted_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_ft_being_deleted = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + 
MYSQL_INFORMATION_SCHEMA_PLUGIN,
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	&i_s_info,
+
+	/* plugin name */
+	/* const char* */
+	"INNODB_FT_BEING_DELETED",
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	plugin_author,
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	"INNODB AUXILIARY FTS BEING DELETED TABLE",
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	PLUGIN_LICENSE_GPL,
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	i_s_fts_being_deleted_init,
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	i_s_common_deinit,
+
+	i_s_version, nullptr, nullptr, PACKAGE_VERSION,
+	MariaDB_PLUGIN_MATURITY_STABLE
+};
+
+
+namespace Show {
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED and
+INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE */
+static ST_FIELD_INFO	i_s_fts_index_fields_info[]=
+{
+#define	I_S_FTS_WORD			0
+	Column("WORD", Varchar(FTS_MAX_WORD_LEN + 1), NOT_NULL),
+
+#define	I_S_FTS_FIRST_DOC_ID		1
+	Column("FIRST_DOC_ID", ULonglong(), NOT_NULL),
+
+#define	I_S_FTS_LAST_DOC_ID		2
+	Column("LAST_DOC_ID", ULonglong(), NOT_NULL),
+
+#define	I_S_FTS_DOC_COUNT		3
+	Column("DOC_COUNT", ULonglong(), NOT_NULL),
+
+#define	I_S_FTS_ILIST_DOC_ID		4
+	Column("DOC_ID", ULonglong(), NOT_NULL),
+
+#define	I_S_FTS_ILIST_DOC_POS		5
+	Column("POSITION", ULonglong(), NOT_NULL),
+	CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Go through the Doc Node and its ilist, fill the dynamic table
+INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED for one FTS index on the table.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill_one_index(
+/*===============================*/
+	fts_index_cache_t*	index_cache,	/*!< in: FTS index cache */
+	THD*			thd,		/*!< in: thread */
+	fts_string_t*		conv_str,	/*!< in/out: buffer */
+	TABLE_LIST*		tables)		/*!< in/out: tables to fill */
+{
+	TABLE*			table = (TABLE*) tables->table;
+	Field**			fields;
+	CHARSET_INFO*		index_charset;
+	const ib_rbt_node_t*	rbt_node;
+	uint			dummy_errors;
+	char*			word_str;
+
+	DBUG_ENTER("i_s_fts_index_cache_fill_one_index");
+
+	fields = table->field;
+
+	index_charset = index_cache->charset;
+	conv_str->f_n_char = 0;
+
+	int	ret = 0;
+
+	/* Go through each word in the index cache */
+	for (rbt_node = rbt_first(index_cache->words);
+	     rbt_node;
+	     rbt_node = rbt_next(index_cache->words, rbt_node)) {
+		fts_tokenizer_word_t*	word;
+
+		word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+		/* Convert word from index charset to system_charset_info */
+		if (index_charset->cset != system_charset_info->cset) {
+			conv_str->f_n_char = my_convert(
+				reinterpret_cast<char*>(conv_str->f_str),
+				static_cast<uint32>(conv_str->f_len),
+				system_charset_info,
+				reinterpret_cast<char*>(word->text.f_str),
+				static_cast<uint32>(word->text.f_len),
+				index_charset, &dummy_errors);
+			ut_ad(conv_str->f_n_char <= conv_str->f_len);
+			conv_str->f_str[conv_str->f_n_char] = 0;
+			word_str = reinterpret_cast<char*>(conv_str->f_str);
+		} else {
+			word_str = reinterpret_cast<char*>(word->text.f_str);
+		}
+
+		/* Decode the ilist, and display Doc ID and word position */
+		for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+			fts_node_t*	node;
+			const byte*	ptr;
+			ulint		decoded = 0;
+			doc_id_t	doc_id = 0;
+
+			node = static_cast<fts_node_t*>(ib_vector_get(
+				word->nodes, i));
+
+			ptr = node->ilist;
+
+			while (decoded < node->ilist_size) {
+
+				doc_id += fts_decode_vlc(&ptr);
+
+				/* Get position info */
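+				/* Each position is a vlc-encoded
+				delta; a 0 byte terminates the
+				position list for this doc id. */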
+				while (*ptr) {
+
+					OK(field_store_string(
+						   fields[I_S_FTS_WORD],
+						   word_str));
+
+					OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+						   node->first_doc_id,
+						   true));
+
+					OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+						   node->last_doc_id,
+						   true));
+
+					OK(fields[I_S_FTS_DOC_COUNT]->store(
+						   node->doc_count, true));
+
+					OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+						   doc_id, true));
+
+					OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+						   fts_decode_vlc(&ptr), true));
+
+					OK(schema_table_store_record(
+						   thd, table));
+				}
+
+				++ptr;
+
+				decoded = ptr - (byte*) node->ilist;
+			}
+		}
+	}
+
+	DBUG_RETURN(ret);
+}
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill(
+/*=====================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	dict_table_t*	user_table;
+	fts_cache_t*	cache;
+
+	DBUG_ENTER("i_s_fts_index_cache_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	MDL_ticket* mdl_ticket = nullptr;
+	user_table = dict_table_open_on_id(
+		innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL,
+		thd, &mdl_ticket);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	if (!user_table->fts || !user_table->fts->cache) {
+		dict_table_close(user_table, false, thd, mdl_ticket);
+		DBUG_RETURN(0);
+	}
+
+	cache = user_table->fts->cache;
+
+	int	ret = 0;
+	fts_string_t	conv_str;
+	byte		word[HA_FT_MAXBYTELEN + 1];
+	conv_str.f_len = sizeof word;
+	conv_str.f_str = word;
+
+	mysql_mutex_lock(&cache->lock);
+
+	for (ulint i = 0; i < ib_vector_size(cache->indexes); i++) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		BREAK_IF(ret = i_s_fts_index_cache_fill_one_index(
+				 index_cache, thd, &conv_str, tables));
+	}
+
+	mysql_mutex_unlock(&cache->lock);
+	dict_table_close(user_table, false, thd, mdl_ticket);
+
+	DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE
+@return 0 on success */
+static
+int
+i_s_fts_index_cache_init(
+/*=====================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_fts_index_cache_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = Show::i_s_fts_index_fields_info;
+	schema->fill_table = i_s_fts_index_cache_fill;
+
+	DBUG_RETURN(0);
+}
+
+struct st_maria_plugin	i_s_innodb_ft_index_cache =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	MYSQL_INFORMATION_SCHEMA_PLUGIN,
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	&i_s_info,
+
+	/* plugin name */
+	/* const char* */
+	"INNODB_FT_INDEX_CACHE",
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	plugin_author,
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	"INNODB AUXILIARY FTS INDEX CACHED",
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	PLUGIN_LICENSE_GPL,
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	i_s_fts_index_cache_init,
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	i_s_common_deinit,
+
+	i_s_version, nullptr, nullptr, PACKAGE_VERSION,
+	MariaDB_PLUGIN_MATURITY_STABLE
+};
+
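+/* Usage sketch for the INNODB_FT_* tables above (illustrative;
+'db_name/table_name' is a placeholder, and rows are only produced for
+the table named by the global innodb_ft_aux_table variable):
+
+	SET GLOBAL innodb_ft_aux_table = 'db_name/table_name';
+	SELECT * FROM information_schema.innodb_ft_index_cache;
+*/
+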
+/*******************************************************************//** +Go through a FTS index auxiliary table, fetch its rows and fill +FTS word cache structure. +@return DB_SUCCESS on success, otherwise error code */ +static +dberr_t +i_s_fts_index_table_fill_selected( +/*==============================*/ + dict_index_t* index, /*!< in: FTS index */ + ib_vector_t* words, /*!< in/out: vector to hold + fetched words */ + ulint selected, /*!< in: selected FTS index */ + fts_string_t* word) /*!< in: word to select */ +{ + pars_info_t* info; + fts_table_t fts_table; + trx_t* trx; + que_t* graph; + dberr_t error; + fts_fetch_t fetch; + char table_name[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + fetch.read_arg = words; + fetch.read_record = fts_optimize_index_fetch_node; + fetch.total_memory = 0; + + DBUG_EXECUTE_IF("fts_instrument_result_cache_limit", + fts_result_cache_limit = 8192; + ); + + trx = trx_create(); + + trx->op_info = "fetching FTS index nodes"; + + pars_info_bind_function(info, "my_func", fetch.read_record, &fetch); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + FTS_INIT_INDEX_TABLE(&fts_table, fts_get_suffix(selected), + FTS_INDEX_TABLE, index); + fts_get_table_name(&fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + &fts_table, info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT word, doc_count, first_doc_id, last_doc_id," + " ilist\n" + " FROM $table_name WHERE word >= :word;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + + break; + } else { + fts_sql_rollback(trx); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "Lock wait timeout reading" + " FTS index. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << "Error occurred while reading" + " FTS index: " << error; + break; + } + } + } + + que_graph_free(graph); + + trx->free(); + + if (fetch.total_memory >= fts_result_cache_limit) { + error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + } + + return(error); +} + +/*******************************************************************//** +Free words. */ +static +void +i_s_fts_index_table_free_one_fetch( +/*===============================*/ + ib_vector_t* words) /*!< in: words fetched */ +{ + for (ulint i = 0; i < ib_vector_size(words); i++) { + fts_word_t* word; + + word = static_cast<fts_word_t*>(ib_vector_get(words, i)); + + for (ulint j = 0; j < ib_vector_size(word->nodes); j++) { + fts_node_t* node; + + node = static_cast<fts_node_t*> (ib_vector_get( + word->nodes, j)); + ut_free(node->ilist); + } + + fts_word_free(word); + } + + ib_vector_reset(words); +} + +/*******************************************************************//** +Go through words, fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE. 
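+When has_more is set, the last word of the batch was cut short by the +result cache limit and will be refetched by the caller, so it is skipped here.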
+@return 0 on success, 1 on failure */ +static +int +i_s_fts_index_table_fill_one_fetch( +/*===============================*/ + CHARSET_INFO* index_charset, /*!< in: FTS index charset */ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + ib_vector_t* words, /*!< in: words fetched */ + fts_string_t* conv_str, /*!< in: string for conversion*/ + bool has_more) /*!< in: has more to fetch */ +{ + TABLE* table = (TABLE*) tables->table; + Field** fields; + uint dummy_errors; + char* word_str; + ulint words_size; + int ret = 0; + + DBUG_ENTER("i_s_fts_index_table_fill_one_fetch"); + + fields = table->field; + + words_size = ib_vector_size(words); + if (has_more) { + /* the last word is not fetched completely. */ + ut_ad(words_size > 1); + words_size -= 1; + } + + /* Go through each word in the index cache */ + for (ulint i = 0; i < words_size; i++) { + fts_word_t* word; + + word = static_cast<fts_word_t*>(ib_vector_get(words, i)); + + word->text.f_str[word->text.f_len] = 0; + + /* Convert word from index charset to system_charset_info */ + if (index_charset->cset != system_charset_info->cset) { + conv_str->f_n_char = my_convert( + reinterpret_cast<char*>(conv_str->f_str), + static_cast<uint32>(conv_str->f_len), + system_charset_info, + reinterpret_cast<const char*>(word->text.f_str), + static_cast<uint32>(word->text.f_len), + index_charset, &dummy_errors); + ut_ad(conv_str->f_n_char <= conv_str->f_len); + conv_str->f_str[conv_str->f_n_char] = 0; + word_str = reinterpret_cast<char*>(conv_str->f_str); + } else { + word_str = reinterpret_cast<char*>(word->text.f_str); + } + + /* Decode the ilist, and display Doc ID and word position */ + for (ulint i = 0; i < ib_vector_size(word->nodes); i++) { + fts_node_t* node; + const byte* ptr; + ulint decoded = 0; + doc_id_t doc_id = 0; + + node = static_cast<fts_node_t*> (ib_vector_get( + word->nodes, i)); + + ptr = node->ilist; + + while (decoded < node->ilist_size) { + doc_id += fts_decode_vlc(&ptr); + + /* Get position info */ + while (*ptr) { + + OK(field_store_string( + fields[I_S_FTS_WORD], + word_str)); + + OK(fields[I_S_FTS_FIRST_DOC_ID]->store( + longlong(node->first_doc_id), true)); + + OK(fields[I_S_FTS_LAST_DOC_ID]->store( + longlong(node->last_doc_id), true)); + + OK(fields[I_S_FTS_DOC_COUNT]->store( + node->doc_count, true)); + + OK(fields[I_S_FTS_ILIST_DOC_ID]->store( + longlong(doc_id), true)); + + OK(fields[I_S_FTS_ILIST_DOC_POS]->store( + fts_decode_vlc(&ptr), true)); + + OK(schema_table_store_record( + thd, table)); + } + + ++ptr; + + decoded = ptr - (byte*) node->ilist; + } + } + } + + DBUG_RETURN(ret); +} + +/*******************************************************************//** +Go through a FTS index and its auxiliary tables, fetch rows in each table +and fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE. 
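+Rows are fetched in batches bounded by fts_result_cache_limit; a +DB_FTS_EXCEED_RESULT_CACHE_LIMIT status means more rows remain, and the +scan resumes from the last word fetched.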
+@return 0 on success, 1 on failure */ +static +int +i_s_fts_index_table_fill_one_index( +/*===============================*/ + dict_index_t* index, /*!< in: FTS index */ + THD* thd, /*!< in: thread */ + fts_string_t* conv_str, /*!< in/out: buffer */ + TABLE_LIST* tables) /*!< in/out: tables to fill */ +{ + ib_vector_t* words; + mem_heap_t* heap; + CHARSET_INFO* index_charset; + dberr_t error; + int ret = 0; + + DBUG_ENTER("i_s_fts_index_table_fill_one_index"); + DBUG_ASSERT(!dict_index_is_online_ddl(index)); + + heap = mem_heap_create(1024); + + words = ib_vector_create(ib_heap_allocator_create(heap), + sizeof(fts_word_t), 256); + + index_charset = fts_index_get_charset(index); + + /* Iterate through each auxiliary table as described in + fts_index_selector */ + for (ulint selected = 0; selected < FTS_NUM_AUX_INDEX; selected++) { + fts_string_t word; + bool has_more = false; + + word.f_str = NULL; + word.f_len = 0; + word.f_n_char = 0; + + do { + /* Fetch from index */ + error = i_s_fts_index_table_fill_selected( + index, words, selected, &word); + + if (error == DB_SUCCESS) { + has_more = false; + } else if (error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT) { + has_more = true; + } else { + i_s_fts_index_table_free_one_fetch(words); + ret = 1; + goto func_exit; + } + + if (has_more) { + fts_word_t* last_word; + + /* Prepare start point for next fetch */ + last_word = static_cast<fts_word_t*>(ib_vector_last(words)); + ut_ad(last_word != NULL); + fts_string_dup(&word, &last_word->text, heap); + } + + /* Fill into tables */ + ret = i_s_fts_index_table_fill_one_fetch( + index_charset, thd, tables, words, conv_str, + has_more); + i_s_fts_index_table_free_one_fetch(words); + + if (ret != 0) { + goto func_exit; + } + } while (has_more); + } + +func_exit: + mem_heap_free(heap); + + DBUG_RETURN(ret); +} +/*******************************************************************//** +Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE +@return 0 on success, 1 on failure */ +static +int +i_s_fts_index_table_fill( +/*=====================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (ignored) */ +{ + dict_table_t* user_table; + dict_index_t* index; + + DBUG_ENTER("i_s_fts_index_table_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + MDL_ticket* mdl_ticket = nullptr; + user_table = dict_table_open_on_id( + innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL, + thd, &mdl_ticket); + + if (!user_table) { + DBUG_RETURN(0); + } + + int ret = 0; + fts_string_t conv_str; + conv_str.f_len = system_charset_info->mbmaxlen + * FTS_MAX_WORD_LEN_IN_CHAR; + conv_str.f_str = static_cast<byte*>(ut_malloc_nokey(conv_str.f_len)); + + for (index = dict_table_get_first_index(user_table); + index; index = dict_table_get_next_index(index)) { + if (index->type & DICT_FTS) { + BREAK_IF(ret = i_s_fts_index_table_fill_one_index( + index, thd, &conv_str, tables)); + } + } + + dict_table_close(user_table, false, thd, mdl_ticket); + + ut_free(conv_str.f_str); + + DBUG_RETURN(ret); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE +@return 0 on success */ +static +int +i_s_fts_index_table_init( +/*=====================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_fts_index_table_init"); + ST_SCHEMA_TABLE* schema = 
(ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_fts_index_fields_info; + schema->fill_table = i_s_fts_index_table_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_ft_index_table = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_FT_INDEX_TABLE", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "INNODB AUXILIARY FTS INDEX TABLE", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_fts_index_table_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +namespace Show { +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG */ +static ST_FIELD_INFO i_s_fts_config_fields_info[]= +{ +#define FTS_CONFIG_KEY 0 + Column("KEY", Varchar(NAME_LEN + 1), NOT_NULL), + +#define FTS_CONFIG_VALUE 1 + Column("VALUE", Varchar(NAME_LEN + 1), NOT_NULL), + + CEnd() +}; +} // namespace Show + +static const char* fts_config_key[] = { + FTS_OPTIMIZE_LIMIT_IN_SECS, + FTS_SYNCED_DOC_ID, + FTS_STOPWORD_TABLE_NAME, + FTS_USE_STOPWORD, + NULL +}; + +/*******************************************************************//** +Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG +@return 0 on success, 1 on failure */ +static +int +i_s_fts_config_fill( +/*================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (ignored) */ +{ + Field** fields; + TABLE* table = (TABLE*) tables->table; + trx_t* trx; + fts_table_t fts_table; + dict_table_t* user_table; + ulint i = 0; + dict_index_t* index = NULL; + unsigned char str[FTS_MAX_CONFIG_VALUE_LEN + 1]; + + DBUG_ENTER("i_s_fts_config_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + MDL_ticket* mdl_ticket = nullptr; + user_table = dict_table_open_on_id( + innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL, + thd, &mdl_ticket); + + if (!user_table) { + DBUG_RETURN(0); + } + + if (!dict_table_has_fts_index(user_table)) { + dict_table_close(user_table, false, thd, mdl_ticket); + DBUG_RETURN(0); + } + + fields = table->field; + + trx = trx_create(); + trx->op_info = "Select for FTS CONFIG TABLE"; + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, user_table); + + if (!ib_vector_is_empty(user_table->fts->indexes)) { + index = (dict_index_t*) ib_vector_getp_const( + user_table->fts->indexes, 0); + DBUG_ASSERT(!dict_index_is_online_ddl(index)); + } + + int ret = 0; + + while (fts_config_key[i]) { + fts_string_t value; + char* key_name; + ulint allocated = FALSE; + + value.f_len = FTS_MAX_CONFIG_VALUE_LEN; + + value.f_str = str; + + if (index + && strcmp(fts_config_key[i], FTS_TOTAL_WORD_COUNT) == 0) { + key_name = fts_config_create_index_param_name( + fts_config_key[i], index); + allocated = TRUE; + } else { + key_name = (char*) fts_config_key[i]; + } + + fts_config_get_value(trx, &fts_table, key_name, &value); + + if (allocated) { + ut_free(key_name); + } + + BREAK_IF(ret = 
field_store_string( + fields[FTS_CONFIG_KEY], fts_config_key[i])); + + BREAK_IF(ret = field_store_string( + fields[FTS_CONFIG_VALUE], + reinterpret_cast<const char*>(value.f_str))); + + BREAK_IF(ret = schema_table_store_record(thd, table)); + + i++; + } + + fts_sql_commit(trx); + + dict_table_close(user_table, false, thd, mdl_ticket); + + trx->free(); + + DBUG_RETURN(ret); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG +@return 0 on success */ +static +int +i_s_fts_config_init( +/*=================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_fts_config_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_fts_config_fields_info; + schema->fill_table = i_s_fts_config_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_ft_config = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_FT_CONFIG", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "INNODB AUXILIARY FTS CONFIG TABLE", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_fts_config_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/* Fields of the dynamic table INNODB_BUFFER_POOL_STATS. 
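Each query returns a single row; POOL_ID is stored as a constant 0, since this server has a single buffer pool instance.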
*/ +static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[]= +{ +#define IDX_BUF_STATS_POOL_ID 0 + Column("POOL_ID", ULong(), NOT_NULL), + +#define IDX_BUF_STATS_POOL_SIZE 1 + Column("POOL_SIZE", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_FREE_BUFFERS 2 + Column("FREE_BUFFERS", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_LRU_LEN 3 + Column("DATABASE_PAGES", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_OLD_LRU_LEN 4 + Column("OLD_DATABASE_PAGES", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_FLUSH_LIST_LEN 5 + Column("MODIFIED_DATABASE_PAGES", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PENDING_ZIP 6 + Column("PENDING_DECOMPRESS", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PENDING_READ 7 + Column("PENDING_READS",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_FLUSH_LRU 8 + Column("PENDING_FLUSH_LRU",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_FLUSH_LIST 9 + Column("PENDING_FLUSH_LIST", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_YOUNG 10 + Column("PAGES_MADE_YOUNG",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_NOT_YOUNG 11 + Column("PAGES_NOT_MADE_YOUNG",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_YOUNG_RATE 12 + Column("PAGES_MADE_YOUNG_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE 13 + Column("PAGES_MADE_NOT_YOUNG_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_READ 14 + Column("NUMBER_PAGES_READ",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_CREATED 15 + Column("NUMBER_PAGES_CREATED",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_WRITTEN 16 + Column("NUMBER_PAGES_WRITTEN",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_READ_RATE 17 + Column("PAGES_READ_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_CREATE_RATE 18 + Column("PAGES_CREATE_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_WRITTEN_RATE 19 + Column("PAGES_WRITTEN_RATE",Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_GET 20 + Column("NUMBER_PAGES_GET", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_HIT_RATE 21 + Column("HIT_RATE", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_MADE_YOUNG_PCT 22 + Column("YOUNG_MAKE_PER_THOUSAND_GETS", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_NOT_MADE_YOUNG_PCT 23 + Column("NOT_YOUNG_MAKE_PER_THOUSAND_GETS", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_READ_AHEAD 24 + Column("NUMBER_PAGES_READ_AHEAD", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_READ_AHEAD_EVICTED 25 + Column("NUMBER_READ_AHEAD_EVICTED", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_READ_AHEAD_RATE 26 + Column("READ_AHEAD_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_READ_AHEAD_EVICT_RATE 27 + Column("READ_AHEAD_EVICTED_RATE",Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_LRU_IO_SUM 28 + Column("LRU_IO_TOTAL", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_LRU_IO_CUR 29 + Column("LRU_IO_CURRENT", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_UNZIP_SUM 30 + Column("UNCOMPRESS_TOTAL",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_UNZIP_CUR 31 + Column("UNCOMPRESS_CURRENT", ULonglong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/** Fill INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS +@param[in,out] thd connection +@param[in,out] tables tables to fill +@return 0 on success, 1 on failure */ +static int i_s_innodb_stats_fill(THD *thd, TABLE_LIST * tables, Item *) +{ + TABLE* table; + Field** fields; + buf_pool_info_t info; + 
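+ /* One buf_stats_get_pool_info() snapshot fills every column of the single result row; note that HIT_RATE below is per thousand page gets, not a percentage. */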
+ DBUG_ENTER("i_s_innodb_stats_fill"); + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* Only allow the PROCESS privilege holder to access the stats */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + buf_stats_get_pool_info(&info); + + table = tables->table; + + fields = table->field; + + OK(fields[IDX_BUF_STATS_POOL_ID]->store(0, true)); + + OK(fields[IDX_BUF_STATS_POOL_SIZE]->store(info.pool_size, true)); + + OK(fields[IDX_BUF_STATS_LRU_LEN]->store(info.lru_len, true)); + + OK(fields[IDX_BUF_STATS_OLD_LRU_LEN]->store(info.old_lru_len, true)); + + OK(fields[IDX_BUF_STATS_FREE_BUFFERS]->store( + info.free_list_len, true)); + + OK(fields[IDX_BUF_STATS_FLUSH_LIST_LEN]->store( + info.flush_list_len, true)); + + OK(fields[IDX_BUF_STATS_PENDING_ZIP]->store(info.n_pend_unzip, true)); + + OK(fields[IDX_BUF_STATS_PENDING_READ]->store(info.n_pend_reads, true)); + + OK(fields[IDX_BUF_STATS_FLUSH_LRU]->store( + info.n_pending_flush_lru, true)); + + OK(fields[IDX_BUF_STATS_FLUSH_LIST]->store( + info.n_pending_flush_list, true)); + + OK(fields[IDX_BUF_STATS_PAGE_YOUNG]->store( + info.n_pages_made_young, true)); + + OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG]->store( + info.n_pages_not_made_young, true)); + + OK(fields[IDX_BUF_STATS_PAGE_YOUNG_RATE]->store( + info.page_made_young_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE]->store( + info.page_not_made_young_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_READ]->store(info.n_pages_read, true)); + + OK(fields[IDX_BUF_STATS_PAGE_CREATED]->store( + info.n_pages_created, true)); + + OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store( + info.n_pages_written, true)); + + OK(fields[IDX_BUF_STATS_GET]->store(info.n_page_gets, true)); + + OK(fields[IDX_BUF_STATS_PAGE_READ_RATE]->store( + info.pages_read_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_CREATE_RATE]->store( + info.pages_created_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_WRITTEN_RATE]->store( + info.pages_written_rate)); + + if (info.n_page_get_delta) { + if (info.page_read_delta <= info.n_page_get_delta) { + OK(fields[IDX_BUF_STATS_HIT_RATE]->store( + static_cast<double>( + 1000 - (1000 * info.page_read_delta + / info.n_page_get_delta)))); + } else { + OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0)); + } + + OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store( + 1000 * info.young_making_delta + / info.n_page_get_delta, true)); + + OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store( + 1000 * info.not_young_making_delta + / info.n_page_get_delta, true)); + } else { + OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0, true)); + OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(0, true)); + OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store(0, true)); + } + + OK(fields[IDX_BUF_STATS_READ_AHEAD]->store( + info.n_ra_pages_read, true)); + + OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICTED]->store( + info.n_ra_pages_evicted, true)); + + OK(fields[IDX_BUF_STATS_READ_AHEAD_RATE]->store( + info.pages_readahead_rate)); + + OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICT_RATE]->store( + info.pages_evicted_rate)); + + OK(fields[IDX_BUF_STATS_LRU_IO_SUM]->store(info.io_sum, true)); + + OK(fields[IDX_BUF_STATS_LRU_IO_CUR]->store(info.io_cur, true)); + + OK(fields[IDX_BUF_STATS_UNZIP_SUM]->store(info.unzip_sum, true)); + + OK(fields[IDX_BUF_STATS_UNZIP_CUR]->store(info.unzip_cur, true)); + + DBUG_RETURN(schema_table_store_record(thd, table)); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS. 
+@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buffer_pool_stats_init( +/*==============================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("i_s_innodb_buffer_pool_stats_init"); + + schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p); + + schema->fields_info = Show::i_s_innodb_buffer_stats_fields_info; + schema->fill_table = i_s_innodb_stats_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_buffer_stats = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_BUFFER_POOL_STATS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB Buffer Pool Statistics Information ", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_innodb_buffer_pool_stats_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +/** These must correspond to the first values of buf_page_state */ +static const LEX_CSTRING page_state_values[] = +{ + { STRING_WITH_LEN("NOT_USED") }, + { STRING_WITH_LEN("MEMORY") }, + { STRING_WITH_LEN("REMOVE_HASH") }, + { STRING_WITH_LEN("FILE_PAGE") }, +}; + +static const TypelibBuffer<4> page_state_values_typelib(page_state_values); + +static const LEX_CSTRING io_values[] = +{ + { STRING_WITH_LEN("IO_NONE") }, + { STRING_WITH_LEN("IO_READ") }, + { STRING_WITH_LEN("IO_WRITE") } +}; + + +static TypelibBuffer<3> io_values_typelib(io_values); + +namespace Show { +/* Fields of the dynamic table INNODB_BUFFER_POOL_PAGE. 
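The IS_HASHED column exists only in builds with BTR_CUR_HASH_ADAPT; the I_S_AHI term in the #defines below keeps the column offsets right in either build.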
*/ +static ST_FIELD_INFO i_s_innodb_buffer_page_fields_info[]= +{ +#define IDX_BUFFER_POOL_ID 0 + Column("POOL_ID", ULong(), NOT_NULL), + +#define IDX_BUFFER_BLOCK_ID 1 + Column("BLOCK_ID", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_SPACE 2 + Column("SPACE", ULong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_NUM 3 + Column("PAGE_NUMBER", ULong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_TYPE 4 + Column("PAGE_TYPE", Varchar(64), NULLABLE), + +#define IDX_BUFFER_PAGE_FLUSH_TYPE 5 + Column("FLUSH_TYPE", ULong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_FIX_COUNT 6 + Column("FIX_COUNT", ULong(), NOT_NULL), + +#ifdef BTR_CUR_HASH_ADAPT +#define IDX_BUFFER_PAGE_HASHED 7 + Column("IS_HASHED", SLong(1), NOT_NULL), +#endif /* BTR_CUR_HASH_ADAPT */ +#define IDX_BUFFER_PAGE_NEWEST_MOD 7 + I_S_AHI + Column("NEWEST_MODIFICATION", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_OLDEST_MOD 8 + I_S_AHI + Column("OLDEST_MODIFICATION", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_ACCESS_TIME 9 + I_S_AHI + Column("ACCESS_TIME", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_TABLE_NAME 10 + I_S_AHI + Column("TABLE_NAME", Varchar(1024), NULLABLE), + +#define IDX_BUFFER_PAGE_INDEX_NAME 11 + I_S_AHI + Column("INDEX_NAME", Varchar(NAME_CHAR_LEN), NULLABLE), + +#define IDX_BUFFER_PAGE_NUM_RECS 12 + I_S_AHI + Column("NUMBER_RECORDS", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_DATA_SIZE 13 + I_S_AHI + Column("DATA_SIZE", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_ZIP_SIZE 14 + I_S_AHI + Column("COMPRESSED_SIZE", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_STATE 15 + I_S_AHI + Column("PAGE_STATE", Enum(&page_state_values_typelib), NOT_NULL), + +#define IDX_BUFFER_PAGE_IO_FIX 16 + I_S_AHI + Column("IO_FIX", Enum(&io_values_typelib), NOT_NULL), + +#define IDX_BUFFER_PAGE_IS_OLD 17 + I_S_AHI + Column("IS_OLD", SLong(1), NOT_NULL), + +#define IDX_BUFFER_PAGE_FREE_CLOCK 18 + I_S_AHI + Column("FREE_PAGE_CLOCK", ULonglong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Fill Information Schema table INNODB_BUFFER_PAGE with information +cached in the buf_page_info_t array +@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buffer_page_fill( +/*========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + const buf_page_info_t* info_array, /*!< in: array cached page + info */ + ulint num_page) /*!< in: number of page info + cached */ +{ + TABLE* table; + Field** fields; + + compile_time_assert(I_S_PAGE_TYPE_LAST < 1 << I_S_PAGE_TYPE_BITS); + + DBUG_ENTER("i_s_innodb_buffer_page_fill"); + + table = tables->table; + + fields = table->field; + + /* Iterate through the cached array and fill the I_S table rows */ + for (ulint i = 0; i < num_page; i++) { + const buf_page_info_t* page_info; + char table_name[MAX_FULL_NAME_LEN + 1]; + const char* table_name_end = NULL; + + page_info = info_array + i; + + OK(fields[IDX_BUFFER_POOL_ID]->store(0, true)); + + OK(fields[IDX_BUFFER_BLOCK_ID]->store( + page_info->block_id, true)); + + OK(fields[IDX_BUFFER_PAGE_SPACE]->store( + page_info->id.space(), true)); + + OK(fields[IDX_BUFFER_PAGE_NUM]->store( + page_info->id.page_no(), true)); + + OK(field_store_string( + fields[IDX_BUFFER_PAGE_TYPE], + i_s_page_type[page_info->page_type].type_str)); + + OK(fields[IDX_BUFFER_PAGE_FLUSH_TYPE]->store(0, true)); + + OK(fields[IDX_BUFFER_PAGE_FIX_COUNT]->store( + ~buf_page_t::LRU_MASK & page_info->state, true)); + +#ifdef 
BTR_CUR_HASH_ADAPT + OK(fields[IDX_BUFFER_PAGE_HASHED]->store( + page_info->hashed, true)); +#endif /* BTR_CUR_HASH_ADAPT */ + + OK(fields[IDX_BUFFER_PAGE_NEWEST_MOD]->store( + page_info->newest_mod, true)); + + OK(fields[IDX_BUFFER_PAGE_OLDEST_MOD]->store( + page_info->oldest_mod, true)); + + OK(fields[IDX_BUFFER_PAGE_ACCESS_TIME]->store( + page_info->access_time, true)); + + fields[IDX_BUFFER_PAGE_TABLE_NAME]->set_null(); + + fields[IDX_BUFFER_PAGE_INDEX_NAME]->set_null(); + + /* If this is an index page, fetch the index name + and table name */ + if (page_info->page_type == I_S_PAGE_TYPE_INDEX) { + bool ret = false; + + dict_sys.freeze(SRW_LOCK_CALL); + + const dict_index_t* index = + dict_index_get_if_in_cache_low( + page_info->index_id); + + if (index) { + table_name_end = innobase_convert_name( + table_name, sizeof(table_name), + index->table->name.m_name, + strlen(index->table->name.m_name), + thd); + + ret = fields[IDX_BUFFER_PAGE_TABLE_NAME] + ->store(table_name, + static_cast<uint>( + table_name_end + - table_name), + system_charset_info) + || fields[IDX_BUFFER_PAGE_INDEX_NAME] + ->store(index->name, + uint(strlen(index->name)), + system_charset_info); + } + + dict_sys.unfreeze(); + + OK(ret); + + if (index) { + fields[IDX_BUFFER_PAGE_TABLE_NAME] + ->set_notnull(); + fields[IDX_BUFFER_PAGE_INDEX_NAME] + ->set_notnull(); + } + } + + OK(fields[IDX_BUFFER_PAGE_NUM_RECS]->store( + page_info->num_recs, true)); + + OK(fields[IDX_BUFFER_PAGE_DATA_SIZE]->store( + page_info->data_size, true)); + + OK(fields[IDX_BUFFER_PAGE_ZIP_SIZE]->store( + page_info->zip_ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << page_info->zip_ssize + : 0, true)); + + static_assert(buf_page_t::NOT_USED == 0, "compatibility"); + static_assert(buf_page_t::MEMORY == 1, "compatibility"); + static_assert(buf_page_t::REMOVE_HASH == 2, "compatibility"); + + OK(fields[IDX_BUFFER_PAGE_STATE]->store( + std::min<uint32_t>(3, page_info->state) + 1, true)); + + static_assert(buf_page_t::UNFIXED == 1U << 29, "comp."); + static_assert(buf_page_t::READ_FIX == 4U << 29, "comp."); + static_assert(buf_page_t::WRITE_FIX == 5U << 29, "comp."); + + unsigned io_fix = page_info->state >> 29; + if (io_fix < 4) { + io_fix = 1; + } else if (io_fix > 5) { + io_fix = 3; + } else { + io_fix -= 2; + } + + OK(fields[IDX_BUFFER_PAGE_IO_FIX]->store(io_fix, true)); + + OK(fields[IDX_BUFFER_PAGE_IS_OLD]->store( + page_info->is_old, true)); + + OK(fields[IDX_BUFFER_PAGE_FREE_CLOCK]->store( + page_info->freed_page_clock, true)); + + OK(schema_table_store_record(thd, table)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Set the appropriate page type in a buf_page_info_t structure */ +static +void +i_s_innodb_set_page_type( +/*=====================*/ + buf_page_info_t*page_info, /*!< in/out: structure to fill with + scanned info */ + const byte* frame) /*!< in: buffer frame */ +{ + uint16_t page_type = fil_page_get_type(frame); + + if (fil_page_type_is_index(page_type)) { + const page_t* page = (const page_t*) frame; + + page_info->index_id = btr_page_get_index_id(page); + + /* FIL_PAGE_INDEX and FIL_PAGE_RTREE are a bit special: + their values are defined as 17855 and 17854, so we cannot + use them to index into the i_s_page_type[] array; their + entry in i_s_page_type[] is I_S_PAGE_TYPE_INDEX (1) for + index pages or I_S_PAGE_TYPE_IBUF for + change buffer index pages */ + if (page_type == FIL_PAGE_RTREE) { + page_info->page_type = I_S_PAGE_TYPE_RTREE; + } else if (page_info->index_id + == 
static_cast<index_id_t>(DICT_IBUF_ID_MIN + + IBUF_SPACE_ID)) { + page_info->page_type = I_S_PAGE_TYPE_IBUF; + } else { + ut_ad(page_type == FIL_PAGE_INDEX + || page_type == FIL_PAGE_TYPE_INSTANT); + page_info->page_type = I_S_PAGE_TYPE_INDEX; + } + + page_info->data_size = uint16_t(page_header_get_field( + page, PAGE_HEAP_TOP) - (page_is_comp(page) + ? PAGE_NEW_SUPREMUM_END + : PAGE_OLD_SUPREMUM_END) + - page_header_get_field(page, PAGE_GARBAGE)); + + page_info->num_recs = page_get_n_recs(page) & ((1U << 14) - 1); + } else if (page_type > FIL_PAGE_TYPE_LAST) { + /* Encountered an unknown page type */ + page_info->page_type = I_S_PAGE_TYPE_UNKNOWN; + } else { + /* Make sure we get the right index into the + i_s_page_type[] array */ + ut_a(page_type == i_s_page_type[page_type].type_value); + + page_info->page_type = page_type & 0xf; + } +} +/*******************************************************************//** +Scan pages in the buffer pool and collect their general information +into the buf_page_info_t array, which is zero-filled beforehand, so any +field not initialized in this function defaults to 0 */ +static +void +i_s_innodb_buffer_page_get_info( +/*============================*/ + const buf_page_t*bpage, /*!< in: buffer pool page to scan */ + ulint pos, /*!< in: buffer block position in + buffer pool or in the LRU list */ + buf_page_info_t*page_info) /*!< in: zero filled info structure; + out: structure filled with scanned + info */ +{ + page_info->block_id = pos; + + static_assert(buf_page_t::NOT_USED == 0, "compatibility"); + static_assert(buf_page_t::MEMORY == 1, "compatibility"); + static_assert(buf_page_t::REMOVE_HASH == 2, "compatibility"); + static_assert(buf_page_t::UNFIXED == 1U << 29, "compatibility"); + static_assert(buf_page_t::READ_FIX == 4U << 29, "compatibility"); + static_assert(buf_page_t::WRITE_FIX == 5U << 29, "compatibility"); + + page_info->state = bpage->state(); + + if (page_info->state < buf_page_t::UNFIXED) { + page_info->page_type = I_S_PAGE_TYPE_UNKNOWN; + page_info->compressed_only = false; + } else { + const byte* frame; + + page_info->id = bpage->id(); + + page_info->oldest_mod = bpage->oldest_modification(); + + page_info->access_time = bpage->access_time; + + page_info->zip_ssize = bpage->zip.ssize; + + page_info->is_old = bpage->old; + + page_info->freed_page_clock = bpage->freed_page_clock; + + if (page_info->state >= buf_page_t::READ_FIX + && page_info->state < buf_page_t::WRITE_FIX) { + page_info->page_type = I_S_PAGE_TYPE_UNKNOWN; + page_info->newest_mod = 0; + return; + } + + page_info->compressed_only = !bpage->frame, + frame = bpage->frame; + if (UNIV_LIKELY(frame != nullptr)) { +#ifdef BTR_CUR_HASH_ADAPT + /* Note: this may be a false positive, that + is, block->index will not always be set to + NULL when the last adaptive hash index + reference is dropped. */ + page_info->hashed = + reinterpret_cast<const buf_block_t*>(bpage) + ->index != nullptr; +#endif /* BTR_CUR_HASH_ADAPT */ + } else { + ut_ad(page_info->zip_ssize); + frame = bpage->zip.data; + } + + page_info->newest_mod = mach_read_from_8(FIL_PAGE_LSN + frame); + i_s_innodb_set_page_type(page_info, frame); + } +} + +/*******************************************************************//** +This is the function that goes through each block of the buffer pool +and fetches information for the INFORMATION_SCHEMA.INNODB_BUFFER_PAGE table. 
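+Blocks are scanned chunk by chunk in batches of at most MAX_BUF_INFO_CACHED +entries, and buf_pool.mutex is released between batches, so the result is a +diagnostic view rather than a consistent snapshot.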
+@param[in,out] thd connection +@param[in,out] tables tables to fill +@return 0 on success, 1 on failure */ +static int i_s_innodb_buffer_page_fill(THD *thd, TABLE_LIST *tables, Item *) +{ + int status = 0; + mem_heap_t* heap; + + DBUG_ENTER("i_s_innodb_buffer_page_fill"); + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(10000); + + for (ulint n = 0; + n < ut_min(buf_pool.n_chunks, buf_pool.n_chunks_new); n++) { + const buf_block_t* block; + ulint n_blocks; + buf_page_info_t* info_buffer; + ulint num_page; + ulint mem_size; + ulint chunk_size; + ulint num_to_process = 0; + ulint block_id = 0; + + /* Get buffer block of the nth chunk */ + block = buf_pool.chunks[n].blocks; + chunk_size = buf_pool.chunks[n].size; + num_page = 0; + + while (chunk_size > 0) { + /* we cache at most MAX_BUF_INFO_CACHED + buffer page info entries */ + num_to_process = ut_min(chunk_size, + (ulint)MAX_BUF_INFO_CACHED); + + mem_size = num_to_process * sizeof(buf_page_info_t); + + /* For each chunk, we'll pre-allocate information + structures to cache the page information read from + the buffer pool. Doing so before obtaining any mutex */ + info_buffer = (buf_page_info_t*) mem_heap_zalloc( + heap, mem_size); + + /* Obtain appropriate mutexes. Since this is diagnostic + buffer pool info printout, we are not required to + preserve the overall consistency, so we can + release the mutex periodically */ + mysql_mutex_lock(&buf_pool.mutex); + + /* Go through each block in the chunk */ + for (n_blocks = num_to_process; n_blocks--; block++) { + i_s_innodb_buffer_page_get_info( + &block->page, block_id, + info_buffer + num_page); + block_id++; + num_page++; + } + + mysql_mutex_unlock(&buf_pool.mutex); + + /* Fill in information schema table with information + just collected from the buffer chunk scan */ + status = i_s_innodb_buffer_page_fill( + thd, tables, info_buffer, + num_page); + + /* If something goes wrong, break and return */ + if (status) { + break; + } + + mem_heap_empty(heap); + chunk_size -= num_to_process; + num_page = 0; + } + } + + mem_heap_free(heap); + + DBUG_RETURN(status); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE. 
+@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buffer_page_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("i_s_innodb_buffer_page_init"); + + schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p); + + schema->fields_info = Show::i_s_innodb_buffer_page_fields_info; + schema->fill_table = i_s_innodb_buffer_page_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_buffer_page = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_BUFFER_PAGE", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB Buffer Page Information", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_innodb_buffer_page_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +static ST_FIELD_INFO i_s_innodb_buf_page_lru_fields_info[] = +{ +#define IDX_BUF_LRU_POOL_ID 0 + Column("POOL_ID", ULong(), NOT_NULL), + +#define IDX_BUF_LRU_POS 1 + Column("LRU_POSITION", ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_SPACE 2 + Column("SPACE", ULong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_NUM 3 + Column("PAGE_NUMBER", ULong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_TYPE 4 + Column("PAGE_TYPE", Varchar(64), NULLABLE), + +#define IDX_BUF_LRU_PAGE_FLUSH_TYPE 5 + Column("FLUSH_TYPE", ULong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_FIX_COUNT 6 + Column("FIX_COUNT", ULong(), NOT_NULL), + +#ifdef BTR_CUR_HASH_ADAPT +#define IDX_BUF_LRU_PAGE_HASHED 7 + Column("IS_HASHED", SLong(1), NOT_NULL), +#endif /* BTR_CUR_HASH_ADAPT */ +#define IDX_BUF_LRU_PAGE_NEWEST_MOD 7 + I_S_AHI + Column("NEWEST_MODIFICATION",ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_OLDEST_MOD 8 + I_S_AHI + Column("OLDEST_MODIFICATION",ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_ACCESS_TIME 9 + I_S_AHI + Column("ACCESS_TIME",ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_TABLE_NAME 10 + I_S_AHI + Column("TABLE_NAME", Varchar(1024), NULLABLE), + +#define IDX_BUF_LRU_PAGE_INDEX_NAME 11 + I_S_AHI + Column("INDEX_NAME", Varchar(NAME_CHAR_LEN), NULLABLE), + +#define IDX_BUF_LRU_PAGE_NUM_RECS 12 + I_S_AHI + Column("NUMBER_RECORDS", ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_DATA_SIZE 13 + I_S_AHI + Column("DATA_SIZE", ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_ZIP_SIZE 14 + I_S_AHI + Column("COMPRESSED_SIZE",ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_STATE 15 + I_S_AHI + Column("COMPRESSED", SLong(1), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_IO_FIX 16 + I_S_AHI + Column("IO_FIX", Enum(&io_values_typelib), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_IS_OLD 17 + I_S_AHI + Column("IS_OLD", SLong(1), NULLABLE), + +#define IDX_BUF_LRU_PAGE_FREE_CLOCK 18 + I_S_AHI + Column("FREE_PAGE_CLOCK", ULonglong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Fill Information Schema table INNODB_BUFFER_PAGE_LRU with information +cached in the buf_page_info_t array +@return 0 on success, 1 on failure */ +static +int 
+i_s_innodb_buf_page_lru_fill( +/*=========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + const buf_page_info_t* info_array, /*!< in: array cached page + info */ + ulint num_page) /*!< in: number of page info + cached */ +{ + DBUG_ENTER("i_s_innodb_buf_page_lru_fill"); + + TABLE* table = tables->table; + Field** fields = table->field; + + /* Iterate through the cached array and fill the I_S table rows */ + for (ulint i = 0; i < num_page; i++) { + const buf_page_info_t* page_info; + char table_name[MAX_FULL_NAME_LEN + 1]; + const char* table_name_end = NULL; + + page_info = info_array + i; + + OK(fields[IDX_BUF_LRU_POOL_ID]->store(0, true)); + + OK(fields[IDX_BUF_LRU_POS]->store( + page_info->block_id, true)); + + OK(fields[IDX_BUF_LRU_PAGE_SPACE]->store( + page_info->id.space(), true)); + + OK(fields[IDX_BUF_LRU_PAGE_NUM]->store( + page_info->id.page_no(), true)); + + OK(field_store_string( + fields[IDX_BUF_LRU_PAGE_TYPE], + i_s_page_type[page_info->page_type].type_str)); + + OK(fields[IDX_BUF_LRU_PAGE_FLUSH_TYPE]->store(0, true)); + + OK(fields[IDX_BUF_LRU_PAGE_FIX_COUNT]->store( + ~buf_page_t::LRU_MASK & page_info->state, true)); + +#ifdef BTR_CUR_HASH_ADAPT + OK(fields[IDX_BUF_LRU_PAGE_HASHED]->store( + page_info->hashed, true)); +#endif /* BTR_CUR_HASH_ADAPT */ + + OK(fields[IDX_BUF_LRU_PAGE_NEWEST_MOD]->store( + page_info->newest_mod, true)); + + OK(fields[IDX_BUF_LRU_PAGE_OLDEST_MOD]->store( + page_info->oldest_mod, true)); + + OK(fields[IDX_BUF_LRU_PAGE_ACCESS_TIME]->store( + page_info->access_time, true)); + + fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->set_null(); + + fields[IDX_BUF_LRU_PAGE_INDEX_NAME]->set_null(); + + /* If this is an index page, fetch the index name + and table name */ + if (page_info->page_type == I_S_PAGE_TYPE_INDEX) { + bool ret = false; + + dict_sys.freeze(SRW_LOCK_CALL); + + const dict_index_t* index = + dict_index_get_if_in_cache_low( + page_info->index_id); + + if (index) { + table_name_end = innobase_convert_name( + table_name, sizeof(table_name), + index->table->name.m_name, + strlen(index->table->name.m_name), + thd); + + ret = fields[IDX_BUF_LRU_PAGE_TABLE_NAME] + ->store(table_name, + static_cast<uint>( + table_name_end + - table_name), + system_charset_info) + || fields[IDX_BUF_LRU_PAGE_INDEX_NAME] + ->store(index->name, + uint(strlen(index->name)), + system_charset_info); + } + + dict_sys.unfreeze(); + + OK(ret); + + if (index) { + fields[IDX_BUF_LRU_PAGE_TABLE_NAME] + ->set_notnull(); + fields[IDX_BUF_LRU_PAGE_INDEX_NAME] + ->set_notnull(); + } + } + + OK(fields[IDX_BUF_LRU_PAGE_NUM_RECS]->store( + page_info->num_recs, true)); + + OK(fields[IDX_BUF_LRU_PAGE_DATA_SIZE]->store( + page_info->data_size, true)); + + OK(fields[IDX_BUF_LRU_PAGE_ZIP_SIZE]->store( + page_info->zip_ssize + ? 
512 << page_info->zip_ssize : 0, true)); + + OK(fields[IDX_BUF_LRU_PAGE_STATE]->store( + page_info->compressed_only, true)); + + static_assert(buf_page_t::UNFIXED == 1U << 29, "comp."); + static_assert(buf_page_t::READ_FIX == 4U << 29, "comp."); + static_assert(buf_page_t::WRITE_FIX == 5U << 29, "comp."); + + unsigned io_fix = page_info->state >> 29; + if (io_fix < 4) { + io_fix = 1; + } else if (io_fix > 5) { + io_fix = 3; + } else { + io_fix -= 2; + } + + OK(fields[IDX_BUF_LRU_PAGE_IO_FIX]->store(io_fix, true)); + + OK(fields[IDX_BUF_LRU_PAGE_IS_OLD]->store( + page_info->is_old, true)); + + OK(fields[IDX_BUF_LRU_PAGE_FREE_CLOCK]->store( + page_info->freed_page_clock, true)); + + OK(schema_table_store_record(thd, table)); + } + + DBUG_RETURN(0); +} + +/** Fill the table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU. +@param[in] thd thread +@param[in,out] tables tables to fill +@return 0 on success, 1 on failure */ +static int i_s_innodb_fill_buffer_lru(THD *thd, TABLE_LIST *tables, Item *) +{ + int status = 0; + buf_page_info_t* info_buffer; + ulint lru_pos = 0; + const buf_page_t* bpage; + ulint lru_len; + + DBUG_ENTER("i_s_innodb_fill_buffer_lru"); + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to any users that do not hold PROCESS_ACL */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + /* Acquire the mutex before allocating info_buffer, since + UT_LIST_GET_LEN(buf_pool.LRU) could change */ + mysql_mutex_lock(&buf_pool.mutex); + + lru_len = UT_LIST_GET_LEN(buf_pool.LRU); + + /* MY_WME makes my_malloc() print an error message if the + allocation fails */ + info_buffer = (buf_page_info_t*) my_malloc(PSI_INSTRUMENT_ME, + lru_len * sizeof *info_buffer, MYF(MY_WME | MY_ZEROFILL)); + + if (!info_buffer) { + status = 1; + goto exit; + } + + /* Walk through the buffer pool's LRU list and collect the buffer + page information */ + bpage = UT_LIST_GET_LAST(buf_pool.LRU); + + while (bpage != NULL) { + /* Use the same function that collects buffer info for + INNODB_BUFFER_PAGE to get buffer page info */ + i_s_innodb_buffer_page_get_info(bpage, lru_pos, + (info_buffer + lru_pos)); + + bpage = UT_LIST_GET_PREV(LRU, bpage); + + lru_pos++; + } + + ut_ad(lru_pos == lru_len); + ut_ad(lru_pos == UT_LIST_GET_LEN(buf_pool.LRU)); + +exit: + mysql_mutex_unlock(&buf_pool.mutex); + + if (info_buffer) { + status = i_s_innodb_buf_page_lru_fill( + thd, tables, info_buffer, lru_len); + + my_free(info_buffer); + } + + DBUG_RETURN(status); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU. 
+@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buffer_page_lru_init( +/*============================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("i_s_innodb_buffer_page_lru_init"); + + schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p); + + schema->fields_info = Show::i_s_innodb_buf_page_lru_fields_info; + schema->fill_table = i_s_innodb_fill_buffer_lru; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_buffer_page_lru = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_BUFFER_PAGE_LRU", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB Buffer Page in LRU", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_innodb_buffer_page_lru_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +/*******************************************************************//** +Unbind a dynamic INFORMATION_SCHEMA table. +@return 0 */ +static int i_s_common_deinit(void*) +{ + DBUG_ENTER("i_s_common_deinit"); + + /* Do nothing */ + + DBUG_RETURN(0); +} + +static const LEX_CSTRING row_format_values[] = +{ + { STRING_WITH_LEN("Redundant") }, + { STRING_WITH_LEN("Compact") }, + { STRING_WITH_LEN("Compressed") }, + { STRING_WITH_LEN("Dynamic") } +}; + +static TypelibBuffer<4> row_format_values_typelib(row_format_values); + +static const LEX_CSTRING space_type_values[] = +{ + { STRING_WITH_LEN("Single") }, + { STRING_WITH_LEN("System") } +}; + +static TypelibBuffer<2> space_type_values_typelib(space_type_values); + +namespace Show { +/** SYS_TABLES ***************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLES */ +static ST_FIELD_INFO innodb_sys_tables_fields_info[]= +{ +#define SYS_TABLES_ID 0 + Column("TABLE_ID", ULonglong(), NOT_NULL), + +#define SYS_TABLES_NAME 1 + Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NOT_NULL), + +#define SYS_TABLES_FLAG 2 + Column("FLAG", SLong(), NOT_NULL), + +#define SYS_TABLES_NUM_COLUMN 3 + Column("N_COLS", ULong(), NOT_NULL), + +#define SYS_TABLES_SPACE 4 + Column("SPACE", ULong(), NOT_NULL), + +#define SYS_TABLES_ROW_FORMAT 5 + Column("ROW_FORMAT", Enum(&row_format_values_typelib), NULLABLE), + +#define SYS_TABLES_ZIP_PAGE_SIZE 6 + Column("ZIP_PAGE_SIZE", ULong(), NOT_NULL), + +#define SYS_TABLES_SPACE_TYPE 7 + Column("SPACE_TYPE", Enum(&space_type_values_typelib), NULLABLE), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Populate information_schema.innodb_sys_tables table with information +from SYS_TABLES. 
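+ROW_FORMAT is derived from the table flags: Redundant when the COMPACT flag +is clear, Compact without atomic BLOBs, Compressed when ZIP_SSIZE is nonzero, +and Dynamic otherwise.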
+@return 0 on success */ +static +int +i_s_dict_fill_sys_tables( +/*=====================*/ + THD* thd, /*!< in: thread */ + dict_table_t* table, /*!< in: table */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + ulint compact = DICT_TF_GET_COMPACT(table->flags); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS( + table->flags); + const ulint zip_size = dict_tf_get_zip_size(table->flags); + const char* row_format; + + if (!compact) { + row_format = "Redundant"; + } else if (!atomic_blobs) { + row_format = "Compact"; + } else if (DICT_TF_GET_ZIP_SSIZE(table->flags)) { + row_format = "Compressed"; + } else { + row_format = "Dynamic"; + } + + DBUG_ENTER("i_s_dict_fill_sys_tables"); + + fields = table_to_fill->field; + + OK(fields[SYS_TABLES_ID]->store(longlong(table->id), TRUE)); + + OK(field_store_string(fields[SYS_TABLES_NAME], table->name.m_name)); + + OK(fields[SYS_TABLES_FLAG]->store(table->flags)); + + OK(fields[SYS_TABLES_NUM_COLUMN]->store(table->n_cols)); + + OK(fields[SYS_TABLES_SPACE]->store(table->space_id, true)); + + OK(field_store_string(fields[SYS_TABLES_ROW_FORMAT], row_format)); + + OK(fields[SYS_TABLES_ZIP_PAGE_SIZE]->store(zip_size, true)); + + OK(field_store_string(fields[SYS_TABLES_SPACE_TYPE], + table->space_id ? "Single" : "System")); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} + +/** Convert one SYS_TABLES record to dict_table_t. +@param pcur persistent cursor position on SYS_TABLES record +@param mtr mini-transaction (nullptr=use the dict_sys cache) +@param rec record to read from (nullptr=use the dict_sys cache) +@param table the converted dict_table_t +@return error message +@retval nullptr on success */ +static const char *i_s_sys_tables_rec(const btr_pcur_t &pcur, mtr_t *mtr, + const rec_t *rec, dict_table_t **table) +{ + static_assert(DICT_FLD__SYS_TABLES__NAME == 0, "compatibility"); + size_t len; + if (rec_get_1byte_offs_flag(pcur.old_rec)) + { + len= rec_1_get_field_end_info(pcur.old_rec, 0); + if (len & REC_1BYTE_SQL_NULL_MASK) + return "corrupted SYS_TABLES.NAME"; + } + else + { + len= rec_2_get_field_end_info(pcur.old_rec, 0); + static_assert(REC_2BYTE_EXTERN_MASK == 16384, "compatibility"); + if (len >= REC_2BYTE_EXTERN_MASK) + return "corrupted SYS_TABLES.NAME"; + } + + if (rec) + return dict_load_table_low(mtr, false, rec, table); + + *table= dict_sys.load_table + (span<const char>{reinterpret_cast<const char*>(pcur.old_rec), len}); + return *table ? 
nullptr : "Table not found in cache"; +} + +/*******************************************************************//** +Function to go through each record in SYS_TABLES table, and fill the +information_schema.innodb_sys_tables table with related table information +@return 0 on success */ +static +int +i_s_sys_tables_fill_table( +/*======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_tables_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + for (const rec_t *rec = dict_startscan_system(&pcur, &mtr, + dict_sys.sys_tables); + rec; rec = dict_getnext_system(&pcur, &mtr)) { + if (rec_get_deleted_flag(rec, 0)) { + continue; + } + + const char* err_msg; + dict_table_t* table_rec; + + /* Create and populate a dict_table_t structure with + information from SYS_TABLES row */ + err_msg = i_s_sys_tables_rec(pcur, &mtr, rec, &table_rec); + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + i_s_dict_fill_sys_tables(thd, table_rec, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + if (table_rec) { + dict_mem_table_free(table_rec); + } + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + } + + mtr.commit(); + dict_sys.unlock(); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tables +@return 0 on success */ +static +int +innodb_sys_tables_init( +/*===================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_tables_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_tables_fields_info; + schema->fill_table = i_s_sys_tables_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_tables = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_TABLES", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_TABLES", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_tables_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_TABLESTATS ***********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLESTATS */ +static ST_FIELD_INFO innodb_sys_tablestats_fields_info[]= +{ +#define SYS_TABLESTATS_ID 0 + Column("TABLE_ID", ULonglong(), NOT_NULL), + +#define SYS_TABLESTATS_NAME 1 + Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_TABLESTATS_INIT 2 + Column("STATS_INITIALIZED", SLong(1), NOT_NULL), + +#define SYS_TABLESTATS_NROW 3 + Column("NUM_ROWS", ULonglong(), NOT_NULL), + 
+#define SYS_TABLESTATS_CLUST_SIZE 4 + Column("CLUST_INDEX_SIZE", ULonglong(), NOT_NULL), + +#define SYS_TABLESTATS_INDEX_SIZE 5 + Column("OTHER_INDEX_SIZE", ULonglong(), NOT_NULL), + +#define SYS_TABLESTATS_MODIFIED 6 + Column("MODIFIED_COUNTER", ULonglong(), NOT_NULL), + +#define SYS_TABLESTATS_AUTONINC 7 + Column("AUTOINC", ULonglong(), NOT_NULL), + +#define SYS_TABLESTATS_TABLE_REF_COUNT 8 + Column("REF_COUNT", SLong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/** Populate information_schema.innodb_sys_tablestats table with a table, +and release exclusive dict_sys.latch. +@param[in] thd connection +@param[in,out] table InnoDB table metadata +@param[in,out] table_to_fill INFORMATION_SCHEMA.INNODB_SYS_TABLESTATS +@return 0 on success */ +static +int +i_s_dict_fill_sys_tablestats(THD* thd, dict_table_t *table, + TABLE* table_to_fill) +{ + DBUG_ENTER("i_s_dict_fill_sys_tablestats"); + + Field **fields= table_to_fill->field; + + { + table->stats_mutex_lock(); + auto _ = make_scope_exit([table]() { + table->stats_mutex_unlock(); dict_sys.unlock(); }); + + OK(fields[SYS_TABLESTATS_ID]->store(longlong(table->id), TRUE)); + + OK(field_store_string(fields[SYS_TABLESTATS_NAME], + table->name.m_name)); + OK(fields[SYS_TABLESTATS_INIT]->store(table->stat_initialized, true)); + + if (table->stat_initialized) + { + OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows, true)); + + OK(fields[SYS_TABLESTATS_CLUST_SIZE]-> + store(table->stat_clustered_index_size, true)); + + OK(fields[SYS_TABLESTATS_INDEX_SIZE]-> + store(table->stat_sum_of_other_index_sizes, true)); + + OK(fields[SYS_TABLESTATS_MODIFIED]-> + store(table->stat_modified_counter, true)); + } + else + { + OK(fields[SYS_TABLESTATS_NROW]->store(0, true)); + OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(0, true)); + OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(0, true)); + OK(fields[SYS_TABLESTATS_MODIFIED]->store(0, true)); + } + + OK(fields[SYS_TABLESTATS_AUTONINC]->store(table->autoinc, true)); + + OK(fields[SYS_TABLESTATS_TABLE_REF_COUNT]-> + store(table->get_ref_count(), true)); + } + + OK(schema_table_store_record(thd, table_to_fill)); + DBUG_RETURN(0); +} + +/*******************************************************************//** +Function to go through each record in SYS_TABLES table, and fill the +information_schema.innodb_sys_tablestats table with table statistics +related information +@return 0 on success */ +static +int +i_s_sys_tables_fill_table_stats( +/*============================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_tables_fill_table_stats"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_tables); + + while (rec) { + const char* err_msg; + dict_table_t* table_rec = nullptr; + + mtr.commit(); + /* Fetch the dict_table_t structure corresponding to + this SYS_TABLES record */ + err_msg = i_s_sys_tables_rec(pcur, nullptr, nullptr, + &table_rec); + + if (UNIV_LIKELY(!err_msg)) { + i_s_dict_fill_sys_tablestats(thd, table_rec, + tables->table); + } else { + ut_ad(!table_rec); + dict_sys.unlock(); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + /* 
Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tablestats +@return 0 on success */ +static +int +innodb_sys_tablestats_init( +/*=======================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_tablestats_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_tablestats_fields_info; + schema->fill_table = i_s_sys_tables_fill_table_stats; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_tablestats = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_TABLESTATS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_TABLESTATS", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_tablestats_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_INDEXES **************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_INDEXES */ +static ST_FIELD_INFO innodb_sysindex_fields_info[]= +{ +#define SYS_INDEX_ID 0 + Column("INDEX_ID", ULonglong(), NOT_NULL), + +#define SYS_INDEX_NAME 1 + Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_INDEX_TABLE_ID 2 + Column("TABLE_ID", ULonglong(), NOT_NULL), + +#define SYS_INDEX_TYPE 3 + Column("TYPE", SLong(), NOT_NULL), + +#define SYS_INDEX_NUM_FIELDS 4 + Column("N_FIELDS", SLong(), NOT_NULL), + +#define SYS_INDEX_PAGE_NO 5 + Column("PAGE_NO", SLong(), NULLABLE), + +#define SYS_INDEX_SPACE 6 + Column("SPACE", SLong(), NULLABLE), + +#define SYS_INDEX_MERGE_THRESHOLD 7 + Column("MERGE_THRESHOLD", SLong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to populate the information_schema.innodb_sys_indexes table with +collected index information +@return 0 on success */ +static +int +i_s_dict_fill_sys_indexes( +/*======================*/ + THD* thd, /*!< in: thread */ + table_id_t table_id, /*!< in: table id */ + ulint space_id, /*!< in: tablespace id */ + dict_index_t* index, /*!< in: populated dict_index_t + struct with index info */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_indexes"); + + fields = table_to_fill->field; + + if (*index->name == *TEMP_INDEX_PREFIX_STR) { + /* Since TEMP_INDEX_PREFIX_STR is not valid UTF-8, we + need to convert it to something else. 
*/
+		*const_cast<char*>(index->name()) = '?';
+	}
+
+	OK(fields[SYS_INDEX_NAME]->store(index->name,
+					 uint(strlen(index->name)),
+					 system_charset_info));
+
+	OK(fields[SYS_INDEX_ID]->store(longlong(index->id), true));
+
+	OK(fields[SYS_INDEX_TABLE_ID]->store(longlong(table_id), true));
+
+	OK(fields[SYS_INDEX_TYPE]->store(index->type, true));
+
+	OK(fields[SYS_INDEX_NUM_FIELDS]->store(index->n_fields));
+
+	/* FIL_NULL is ULINT32_UNDEFINED */
+	if (index->page == FIL_NULL) {
+		fields[SYS_INDEX_PAGE_NO]->set_null();
+	} else {
+		fields[SYS_INDEX_PAGE_NO]->set_notnull();
+		OK(fields[SYS_INDEX_PAGE_NO]->store(index->page, true));
+	}
+
+	if (space_id == FIL_NULL) {
+		fields[SYS_INDEX_SPACE]->set_null();
+	} else {
+		fields[SYS_INDEX_SPACE]->set_notnull();
+		OK(fields[SYS_INDEX_SPACE]->store(space_id, true));
+	}
+
+	OK(fields[SYS_INDEX_MERGE_THRESHOLD]->store(index->merge_threshold,
+						    true));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in the SYS_INDEXES table, and fill the
+information_schema.innodb_sys_indexes table with related index information
+@return 0 on success */
+static
+int
+i_s_sys_indexes_fill_table(
+/*=======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_sys_indexes_fill_table");
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	dict_sys.lock(SRW_LOCK_CALL);
+	mtr_start(&mtr);
+
+	/* Start the scan of the SYS_INDEXES table */
+	rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_indexes);
+
+	/* Process each record in the table */
+	while (rec) {
+		const char*	err_msg;
+		table_id_t	table_id;
+		ulint		space_id;
+		dict_index_t	index_rec;
+
+		/* Populate a dict_index_t structure with information from
+		a SYS_INDEXES row */
+		err_msg = dict_process_sys_indexes_rec(heap, rec, &index_rec,
+						       &table_id);
+		const byte* field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_INDEXES__SPACE, &space_id);
+		space_id = space_id == 4 ?
mach_read_from_4(field) + : ULINT_UNDEFINED; + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + if (int err = i_s_dict_fill_sys_indexes( + thd, table_id, space_id, &index_rec, + tables->table)) { + mem_heap_free(heap); + DBUG_RETURN(err); + } + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_indexes +@return 0 on success */ +static +int +innodb_sys_indexes_init( +/*====================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_indexes_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sysindex_fields_info; + schema->fill_table = i_s_sys_indexes_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_indexes = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_INDEXES", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_INDEXES", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_indexes_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_COLUMNS **************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_COLUMNS */ +static ST_FIELD_INFO innodb_sys_columns_fields_info[]= +{ +#define SYS_COLUMN_TABLE_ID 0 + Column("TABLE_ID", ULonglong(), NOT_NULL), + +#define SYS_COLUMN_NAME 1 + Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_COLUMN_POSITION 2 + Column("POS", ULonglong(), NOT_NULL), + +#define SYS_COLUMN_MTYPE 3 + Column("MTYPE", SLong(), NOT_NULL), + +#define SYS_COLUMN__PRTYPE 4 + Column("PRTYPE", SLong(), NOT_NULL), + +#define SYS_COLUMN_COLUMN_LEN 5 + Column("LEN", SLong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to populate the information_schema.innodb_sys_columns with +related column information +@return 0 on success */ +static +int +i_s_dict_fill_sys_columns( +/*======================*/ + THD* thd, /*!< in: thread */ + table_id_t table_id, /*!< in: table ID */ + const char* col_name, /*!< in: column name */ + dict_col_t* column, /*!< in: dict_col_t struct holding + more column information */ + ulint nth_v_col, /*!< in: virtual column, its + sequence number (nth virtual col) */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_columns"); + + fields = table_to_fill->field; + + OK(fields[SYS_COLUMN_TABLE_ID]->store((longlong) table_id, TRUE)); + + OK(field_store_string(fields[SYS_COLUMN_NAME], col_name)); + + if 
(column->is_virtual()) { + ulint pos = dict_create_v_col_pos(nth_v_col, column->ind); + OK(fields[SYS_COLUMN_POSITION]->store(pos, true)); + } else { + OK(fields[SYS_COLUMN_POSITION]->store(column->ind, true)); + } + + OK(fields[SYS_COLUMN_MTYPE]->store(column->mtype)); + + OK(fields[SYS_COLUMN__PRTYPE]->store(column->prtype)); + + OK(fields[SYS_COLUMN_COLUMN_LEN]->store(column->len)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to fill information_schema.innodb_sys_columns with information +collected by scanning SYS_COLUMNS table. +@return 0 on success */ +static +int +i_s_sys_columns_fill_table( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + const char* col_name; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_columns_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_columns); + + while (rec) { + const char* err_msg; + dict_col_t column_rec; + table_id_t table_id; + ulint nth_v_col; + + /* populate a dict_col_t structure with information from + a SYS_COLUMNS row */ + err_msg = dict_process_sys_columns_rec(heap, rec, &column_rec, + &table_id, &col_name, + &nth_v_col); + + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + i_s_dict_fill_sys_columns(thd, table_id, col_name, + &column_rec, nth_v_col, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + mem_heap_free(heap); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_columns +@return 0 on success */ +static +int +innodb_sys_columns_init( +/*====================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_columns_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_columns_fields_info; + schema->fill_table = i_s_sys_columns_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_columns = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_COLUMNS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_COLUMNS", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_columns_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + 
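+/* The dictionary scans in this file (the i_s_sys_*_fill_table()
+functions reading the SYS_* tables) all follow the same latching
+discipline: a SYS_* record is decoded while dict_sys.latch and the
+mini-transaction still protect it, both are released before the decoded
+copy is handed to schema_table_store_record(), and both are re-acquired
+before the persistent cursor is advanced. A minimal sketch of that
+shared pattern follows; the function name and the numbered placeholder
+comments are illustrative only, not part of the dictionary API: */
+#if 0
+static int i_s_sys_scan_outline(THD* thd, TABLE* table_to_fill)
+{
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+
+	mtr.start();
+	dict_sys.lock(SRW_LOCK_CALL);
+
+	for (const rec_t* rec = dict_startscan_system(&pcur, &mtr,
+						      dict_sys.sys_columns);
+	     rec; rec = dict_getnext_system(&pcur, &mtr)) {
+		/* 1. Decode rec into a private copy while the page is
+		still latched. */
+
+		mtr.commit();
+		dict_sys.unlock();
+
+		/* 2. Store the decoded copy into table_to_fill with
+		schema_table_store_record(), or push a warning if the
+		record could not be decoded. */
+
+		/* 3. Re-latch before advancing the cursor. */
+		mtr.start();
+		dict_sys.lock(SRW_LOCK_CALL);
+	}
+
+	mtr.commit();
+	dict_sys.unlock();
+	return 0;
+}
+#endif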
+namespace Show {
+/** SYS_VIRTUAL **************************************************/
+/** Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_VIRTUAL */
+static ST_FIELD_INFO innodb_sys_virtual_fields_info[]=
+{
+#define SYS_VIRTUAL_TABLE_ID 0
+  Column("TABLE_ID", ULonglong(), NOT_NULL),
+
+#define SYS_VIRTUAL_POS 1
+  Column("POS", ULong(), NOT_NULL),
+
+#define SYS_VIRTUAL_BASE_POS 2
+  Column("BASE_POS", ULong(), NOT_NULL),
+
+  CEnd()
+};
+} // namespace Show
+
+/** Function to populate the information_schema.innodb_sys_virtual with
+related information
+@param[in]	thd		thread
+@param[in]	table_id	table ID
+@param[in]	pos		virtual column position
+@param[in]	base_pos	base column position
+@param[in,out]	table_to_fill	fill this table
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_virtual(
+	THD*		thd,
+	table_id_t	table_id,
+	ulint		pos,
+	ulint		base_pos,
+	TABLE*		table_to_fill)
+{
+	Field**	fields;
+
+	DBUG_ENTER("i_s_dict_fill_sys_virtual");
+
+	fields = table_to_fill->field;
+
+	OK(fields[SYS_VIRTUAL_TABLE_ID]->store(table_id, true));
+
+	OK(fields[SYS_VIRTUAL_POS]->store(pos, true));
+
+	OK(fields[SYS_VIRTUAL_BASE_POS]->store(base_pos, true));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+
+/** Function to fill information_schema.innodb_sys_virtual with information
+collected by scanning the SYS_VIRTUAL table.
+@param[in]	thd	thread
+@param[in,out]	tables	tables to fill
+@param[in]	item	condition (not used)
+@return 0 on success */
+static
+int
+i_s_sys_virtual_fill_table(
+	THD*		thd,
+	TABLE_LIST*	tables,
+	Item*	)
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	ulint		pos;
+	ulint		base_pos;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_sys_virtual_fill_table");
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL) || !dict_sys.sys_virtual) {
+		DBUG_RETURN(0);
+	}
+
+	mtr.start();
+	dict_sys.lock(SRW_LOCK_CALL);
+
+	rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_virtual);
+
+	while (rec) {
+		const char*	err_msg;
+		table_id_t	table_id;
+
+		/* extract the virtual column position and its base
+		column position from a SYS_VIRTUAL row */
+		err_msg = dict_process_sys_virtual_rec(rec,
+						       &table_id, &pos,
+						       &base_pos);
+
+		mtr.commit();
+		dict_sys.unlock();
+
+		if (!err_msg) {
+			i_s_dict_fill_sys_virtual(thd, table_id, pos, base_pos,
+						  tables->table);
+		} else {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
+		}
+
+		/* Get the next record */
+		mtr.start();
+		dict_sys.lock(SRW_LOCK_CALL);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr.commit();
+	dict_sys.unlock();
+
+	DBUG_RETURN(0);
+}
+
+/** Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_virtual
+@param[in,out]	p	table schema object
+@return 0 on success */
+static
+int
+innodb_sys_virtual_init(
+	void*	p)
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_sys_virtual_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = Show::innodb_sys_virtual_fields_info;
+	schema->fill_table = i_s_sys_virtual_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+struct st_maria_plugin	i_s_innodb_sys_virtual =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	MYSQL_INFORMATION_SCHEMA_PLUGIN,
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	&i_s_info,
+
+	/* plugin name */
+	/* const char* */
+	"INNODB_SYS_VIRTUAL",
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	plugin_author,
+
+	/* general descriptive text (for
SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_VIRTUAL", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_virtual_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +namespace Show { +/** SYS_FIELDS ***************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FIELDS */ +static ST_FIELD_INFO innodb_sys_fields_fields_info[]= +{ +#define SYS_FIELD_INDEX_ID 0 + Column("INDEX_ID", ULonglong(), NOT_NULL), + +#define SYS_FIELD_NAME 1 + Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_FIELD_POS 2 + Column("POS", ULong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to fill information_schema.innodb_sys_fields with information +collected by scanning SYS_FIELDS table. +@return 0 on success */ +static +int +i_s_dict_fill_sys_fields( +/*=====================*/ + THD* thd, /*!< in: thread */ + index_id_t index_id, /*!< in: index id for the field */ + dict_field_t* field, /*!< in: table */ + ulint pos, /*!< in: Field position */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_fields"); + + fields = table_to_fill->field; + + OK(fields[SYS_FIELD_INDEX_ID]->store(index_id, true)); + + OK(field_store_string(fields[SYS_FIELD_NAME], field->name)); + + OK(fields[SYS_FIELD_POS]->store(pos, true)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to go through each record in SYS_FIELDS table, and fill the +information_schema.innodb_sys_fields table with related index field +information +@return 0 on success */ +static +int +i_s_sys_fields_fill_table( +/*======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + index_id_t last_id; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_fields_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mtr.start(); + + /* will save last index id so that we know whether we move to + the next index. 
This is used to calculate prefix length */ + last_id = 0; + + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_fields); + + while (rec) { + ulint pos; + const char* err_msg; + index_id_t index_id; + dict_field_t field_rec; + + /* Populate a dict_field_t structure with information from + a SYS_FIELDS row */ + err_msg = dict_process_sys_fields_rec(heap, rec, &field_rec, + &pos, &index_id, last_id); + + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + i_s_dict_fill_sys_fields(thd, index_id, &field_rec, + pos, tables->table); + last_id = index_id; + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_fields +@return 0 on success */ +static +int +innodb_sys_fields_init( +/*===================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_field_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_fields_fields_info; + schema->fill_table = i_s_sys_fields_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_fields = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_FIELDS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_FIELDS", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_fields_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_FOREIGN ********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN */ +static ST_FIELD_INFO innodb_sys_foreign_fields_info[]= +{ +#define SYS_FOREIGN_ID 0 + Column("ID", Varchar(NAME_LEN + 1), NOT_NULL), + +#define SYS_FOREIGN_FOR_NAME 1 + Column("FOR_NAME", Varchar(NAME_LEN + 1), NOT_NULL), + +#define SYS_FOREIGN_REF_NAME 2 + Column("REF_NAME", Varchar(NAME_LEN + 1), NOT_NULL), + +#define SYS_FOREIGN_NUM_COL 3 + Column("N_COLS", ULong(), NOT_NULL), + +#define SYS_FOREIGN_TYPE 4 + Column("TYPE", ULong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to fill information_schema.innodb_sys_foreign with information +collected by scanning SYS_FOREIGN table. 
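+The N_COLS column is the number of columns in the foreign key, and TYPE is
+the dict_foreign_t::type bitmask of referential action flags (for example,
+1 = ON DELETE CASCADE, 2 = ON DELETE SET NULL, 4 = ON UPDATE CASCADE,
+8 = ON UPDATE SET NULL).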
+@return 0 on success */ +static +int +i_s_dict_fill_sys_foreign( +/*======================*/ + THD* thd, /*!< in: thread */ + dict_foreign_t* foreign, /*!< in: table */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_foreign"); + + fields = table_to_fill->field; + + OK(field_store_string(fields[SYS_FOREIGN_ID], foreign->id)); + + OK(field_store_string(fields[SYS_FOREIGN_FOR_NAME], + foreign->foreign_table_name)); + + OK(field_store_string(fields[SYS_FOREIGN_REF_NAME], + foreign->referenced_table_name)); + + OK(fields[SYS_FOREIGN_NUM_COL]->store(foreign->n_fields)); + + OK(fields[SYS_FOREIGN_TYPE]->store(foreign->type)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Function to populate INFORMATION_SCHEMA.innodb_sys_foreign table. Loop +through each record in SYS_FOREIGN, and extract the foreign key +information. +@return 0 on success */ +static +int +i_s_sys_foreign_fill_table( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_foreign_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL) || !dict_sys.sys_foreign) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_foreign); + + while (rec) { + const char* err_msg; + dict_foreign_t foreign_rec; + + /* Populate a dict_foreign_t structure with information from + a SYS_FOREIGN row */ + err_msg = dict_process_sys_foreign_rec(heap, rec, &foreign_rec); + + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + i_s_dict_fill_sys_foreign(thd, &foreign_rec, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + mem_heap_free(heap); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign +@return 0 on success */ +static +int +innodb_sys_foreign_init( +/*====================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_foreign_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_foreign_fields_info; + schema->fill_table = i_s_sys_foreign_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_foreign = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_FOREIGN", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_FOREIGN", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int 
(*)(void*); */ + innodb_sys_foreign_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_FOREIGN_COLS ********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN_COLS */ +static ST_FIELD_INFO innodb_sys_foreign_cols_fields_info[]= +{ +#define SYS_FOREIGN_COL_ID 0 + Column("ID", Varchar(NAME_LEN + 1), NOT_NULL), + +#define SYS_FOREIGN_COL_FOR_NAME 1 + Column("FOR_COL_NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_FOREIGN_COL_REF_NAME 2 + Column("REF_COL_NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_FOREIGN_COL_POS 3 + Column("POS", ULong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to fill information_schema.innodb_sys_foreign_cols with information +collected by scanning SYS_FOREIGN_COLS table. +@return 0 on success */ +static +int +i_s_dict_fill_sys_foreign_cols( +/*==========================*/ + THD* thd, /*!< in: thread */ + const char* name, /*!< in: foreign key constraint name */ + const char* for_col_name, /*!< in: referencing column name*/ + const char* ref_col_name, /*!< in: referenced column + name */ + ulint pos, /*!< in: column position */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_foreign_cols"); + + fields = table_to_fill->field; + + OK(field_store_string(fields[SYS_FOREIGN_COL_ID], name)); + + OK(field_store_string(fields[SYS_FOREIGN_COL_FOR_NAME], for_col_name)); + + OK(field_store_string(fields[SYS_FOREIGN_COL_REF_NAME], ref_col_name)); + + OK(fields[SYS_FOREIGN_COL_POS]->store(pos, true)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to populate INFORMATION_SCHEMA.innodb_sys_foreign_cols table. Loop +through each record in SYS_FOREIGN_COLS, and extract the foreign key column +information and fill the INFORMATION_SCHEMA.innodb_sys_foreign_cols table. 
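+One row is produced for each column of a constraint; POS is the 0-based
+position of the column pair within its foreign key.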
+@return 0 on success */ +static +int +i_s_sys_foreign_cols_fill_table( +/*============================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_foreign_cols_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL) + || !dict_sys.sys_foreign_cols) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_foreign_cols); + + while (rec) { + const char* err_msg; + const char* name; + const char* for_col_name; + const char* ref_col_name; + ulint pos; + + /* Extract necessary information from a SYS_FOREIGN_COLS row */ + err_msg = dict_process_sys_foreign_col_rec( + heap, rec, &name, &for_col_name, &ref_col_name, &pos); + + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + i_s_dict_fill_sys_foreign_cols( + thd, name, for_col_name, ref_col_name, pos, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign_cols +@return 0 on success */ +static +int +innodb_sys_foreign_cols_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_foreign_cols_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_foreign_cols_fields_info; + schema->fill_table = i_s_sys_foreign_cols_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_foreign_cols = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_FOREIGN_COLS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_FOREIGN_COLS", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_foreign_cols_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_TABLESPACES ********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES */ +static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[]= +{ +#define SYS_TABLESPACES_SPACE 0 + Column("SPACE", ULong(), NOT_NULL), + +#define SYS_TABLESPACES_NAME 1 + Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NOT_NULL), + +#define SYS_TABLESPACES_FLAGS 2 + Column("FLAG", ULong(), NOT_NULL), + +#define SYS_TABLESPACES_ROW_FORMAT 3 + Column("ROW_FORMAT", Varchar(22), NULLABLE), + +#define 
SYS_TABLESPACES_PAGE_SIZE 4
+  Column("PAGE_SIZE", ULong(), NOT_NULL),
+
+#define SYS_TABLESPACES_FILENAME 5
+  Column("FILENAME", Varchar(FN_REFLEN), NOT_NULL),
+
+#define SYS_TABLESPACES_FS_BLOCK_SIZE 6
+  Column("FS_BLOCK_SIZE", ULong(), NOT_NULL),
+
+#define SYS_TABLESPACES_FILE_SIZE 7
+  Column("FILE_SIZE", ULonglong(), NOT_NULL),
+
+#define SYS_TABLESPACES_ALLOC_SIZE 8
+  Column("ALLOCATED_SIZE", ULonglong(), NOT_NULL),
+
+  CEnd()
+};
+} // namespace Show
+
+extern size_t os_file_get_fs_block_size(const char *path);
+
+/** Produce one row of INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES.
+@param thd connection
+@param s tablespace
+@param t output table
+@return 0 on success */
+static int i_s_sys_tablespaces_fill(THD *thd, const fil_space_t &s, TABLE *t)
+{
+  DBUG_ENTER("i_s_sys_tablespaces_fill");
+  const char *row_format;
+
+  if (s.full_crc32() || is_system_tablespace(s.id))
+    row_format= nullptr;
+  else if (FSP_FLAGS_GET_ZIP_SSIZE(s.flags))
+    row_format= "Compressed";
+  else if (FSP_FLAGS_HAS_ATOMIC_BLOBS(s.flags))
+    row_format= "Dynamic";
+  else
+    row_format= "Compact or Redundant";
+
+  Field **fields= t->field;
+
+  OK(fields[SYS_TABLESPACES_SPACE]->store(s.id, true));
+  {
+    Field *f= fields[SYS_TABLESPACES_NAME];
+    const auto name= s.name();
+    if (name.data())
+    {
+      OK(f->store(name.data(), name.size(), system_charset_info));
+      f->set_notnull();
+    }
+    else if (srv_is_undo_tablespace(s.id))
+    {
+      char name[15];
+      snprintf(name, sizeof name, "innodb_undo%03u",
+               (s.id - srv_undo_space_id_start + 1));
+      OK(f->store(name, strlen(name), system_charset_info));
+      f->set_notnull();
+    }
+    else
+      f->set_null();
+  }
+
+  OK(fields[SYS_TABLESPACES_FLAGS]->store(s.flags, true));
+  OK(field_store_string(fields[SYS_TABLESPACES_ROW_FORMAT], row_format));
+  const char *filepath= s.chain.start->name;
+  OK(field_store_string(fields[SYS_TABLESPACES_FILENAME], filepath));
+
+  OK(fields[SYS_TABLESPACES_PAGE_SIZE]->store(s.physical_size(), true));
+  size_t fs_block_size;
+  os_file_size_t file= os_file_get_size(filepath);
+  if (file.m_total_size == os_offset_t(~0))
+  {
+    file.m_total_size= 0;
+    file.m_alloc_size= 0;
+    fs_block_size= 0;
+  }
+  else
+    fs_block_size= os_file_get_fs_block_size(filepath);
+
+  OK(fields[SYS_TABLESPACES_FS_BLOCK_SIZE]->store(fs_block_size, true));
+  OK(fields[SYS_TABLESPACES_FILE_SIZE]->store(file.m_total_size, true));
+  OK(fields[SYS_TABLESPACES_ALLOC_SIZE]->store(file.m_alloc_size, true));
+
+  OK(schema_table_store_record(thd, t));
+
+  DBUG_RETURN(0);
+}
+
+/** Populate INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES.
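+Each tablespace is visited while fil_system.freeze_space_list pins the
+space list; fil_system.mutex is released and only a shared latch on the
+tablespace is held while its row is produced.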
+@param thd connection +@param tables table to fill +@return 0 on success */ +static int i_s_sys_tablespaces_fill_table(THD *thd, TABLE_LIST *tables, Item*) +{ + DBUG_ENTER("i_s_sys_tablespaces_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + if (check_global_access(thd, PROCESS_ACL)) + DBUG_RETURN(0); + + int err= 0; + + mysql_mutex_lock(&fil_system.mutex); + fil_system.freeze_space_list++; + + for (fil_space_t &space : fil_system.space_list) + { + if (space.purpose == FIL_TYPE_TABLESPACE && !space.is_stopping() && + space.chain.start) + { + space.reacquire(); + mysql_mutex_unlock(&fil_system.mutex); + space.s_lock(); + err= i_s_sys_tablespaces_fill(thd, space, tables->table); + space.s_unlock(); + mysql_mutex_lock(&fil_system.mutex); + space.release(); + if (err) + break; + } + } + + fil_system.freeze_space_list--; + mysql_mutex_unlock(&fil_system.mutex); + if (err == DB_SUCCESS) + err= i_s_sys_tablespaces_fill(thd, *fil_system.temp_space, tables->table); + DBUG_RETURN(err); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES +@return 0 on success */ +static +int +innodb_sys_tablespaces_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_tablespaces_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_tablespaces_fields_info; + schema->fill_table = i_s_sys_tablespaces_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_tablespaces = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_TABLESPACES", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB tablespaces", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_tablespaces_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** TABLESPACES_ENCRYPTION ********************************************/ +/* Fields of the table INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION */ +static ST_FIELD_INFO innodb_tablespaces_encryption_fields_info[]= +{ +#define TABLESPACES_ENCRYPTION_SPACE 0 + Column("SPACE", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_NAME 1 + Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NULLABLE), + +#define TABLESPACES_ENCRYPTION_ENCRYPTION_SCHEME 2 + Column("ENCRYPTION_SCHEME", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_KEYSERVER_REQUESTS 3 + Column("KEYSERVER_REQUESTS", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_MIN_KEY_VERSION 4 + Column("MIN_KEY_VERSION", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_CURRENT_KEY_VERSION 5 + Column("CURRENT_KEY_VERSION", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER 6 + Column("KEY_ROTATION_PAGE_NUMBER", ULonglong(), NULLABLE), + +#define TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER 7 + Column("KEY_ROTATION_MAX_PAGE_NUMBER", ULonglong(), NULLABLE), + +#define 
TABLESPACES_ENCRYPTION_CURRENT_KEY_ID 8 + Column("CURRENT_KEY_ID", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING 9 + Column("ROTATING_OR_FLUSHING", SLong(1), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to fill INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION. +@param[in] thd thread handle +@param[in] space Tablespace +@param[in] table_to_fill I_S table to fill +@return 0 on success */ +static +int +i_s_dict_fill_tablespaces_encryption( + THD* thd, + fil_space_t* space, + TABLE* table_to_fill) +{ + Field** fields; + struct fil_space_crypt_status_t status; + DBUG_ENTER("i_s_dict_fill_tablespaces_encryption"); + + fields = table_to_fill->field; + + fil_space_crypt_get_status(space, &status); + + /* If tablespace id does not match, we did not find + encryption information for this tablespace. */ + if (!space->crypt_data || space->id != status.space) { + goto skip; + } + + OK(fields[TABLESPACES_ENCRYPTION_SPACE]->store(space->id, true)); + + { + const auto name = space->name(); + if (name.data()) { + OK(fields[TABLESPACES_ENCRYPTION_NAME]->store( + name.data(), name.size(), + system_charset_info)); + fields[TABLESPACES_ENCRYPTION_NAME]->set_notnull(); + } else if (srv_is_undo_tablespace(space->id)) { + char undo_name[sizeof "innodb_undo000"]; + snprintf(undo_name, sizeof undo_name, + "innodb_undo%03" PRIu32, space->id); + OK(fields[TABLESPACES_ENCRYPTION_NAME]->store( + undo_name, strlen(undo_name), + system_charset_info)); + fields[TABLESPACES_ENCRYPTION_NAME]->set_notnull(); + } else { + fields[TABLESPACES_ENCRYPTION_NAME]->set_null(); + } + } + + OK(fields[TABLESPACES_ENCRYPTION_ENCRYPTION_SCHEME]->store( + status.scheme, true)); + OK(fields[TABLESPACES_ENCRYPTION_KEYSERVER_REQUESTS]->store( + status.keyserver_requests, true)); + OK(fields[TABLESPACES_ENCRYPTION_MIN_KEY_VERSION]->store( + status.min_key_version, true)); + OK(fields[TABLESPACES_ENCRYPTION_CURRENT_KEY_VERSION]->store( + status.current_key_version, true)); + OK(fields[TABLESPACES_ENCRYPTION_CURRENT_KEY_ID]->store( + status.key_id, true)); + OK(fields[TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING]->store( + status.rotating || status.flushing, true)); + + if (status.rotating) { + fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]->set_notnull(); + OK(fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]->store( + status.rotate_next_page_number, true)); + fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER]->set_notnull(); + OK(fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER]->store( + status.rotate_max_page_number, true)); + } else { + fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER] + ->set_null(); + fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER] + ->set_null(); + } + + OK(schema_table_store_record(thd, table_to_fill)); + +skip: + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to populate INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION table. +Loop through each record in TABLESPACES_ENCRYPTION, and extract the column +information and fill the INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION table. 
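+Tablespaces for which no crypt_data exists are skipped rather than
+reported.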
+@return 0 on success */ +static +int +i_s_tablespaces_encryption_fill_table( +/*===========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + DBUG_ENTER("i_s_tablespaces_encryption_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + int err = 0; + mysql_mutex_lock(&fil_system.mutex); + fil_system.freeze_space_list++; + + for (fil_space_t& space : fil_system.space_list) { + if (space.purpose == FIL_TYPE_TABLESPACE + && !space.is_stopping()) { + space.reacquire(); + mysql_mutex_unlock(&fil_system.mutex); + space.s_lock(); + err = i_s_dict_fill_tablespaces_encryption( + thd, &space, tables->table); + space.s_unlock(); + mysql_mutex_lock(&fil_system.mutex); + space.release(); + if (err) { + break; + } + } + } + + fil_system.freeze_space_list--; + mysql_mutex_unlock(&fil_system.mutex); + DBUG_RETURN(err); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION +@return 0 on success */ +static +int +innodb_tablespaces_encryption_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_tablespaces_encryption_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_tablespaces_encryption_fields_info; + schema->fill_table = i_s_tablespaces_encryption_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_tablespaces_encryption = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_TABLESPACES_ENCRYPTION", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + "Google Inc", + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB TABLESPACES_ENCRYPTION", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_BSD, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_tablespaces_encryption_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; diff --git a/storage/innobase/handler/i_s.h b/storage/innobase/handler/i_s.h new file mode 100644 index 00000000..c8190a41 --- /dev/null +++ b/storage/innobase/handler/i_s.h @@ -0,0 +1,91 @@ +/***************************************************************************** + +Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file handler/i_s.h +InnoDB INFORMATION SCHEMA tables interface to MySQL. + +Created July 18, 2007 Vasil Dimov +Modified Dec 29, 2014 Jan Lindström +*******************************************************/ + +#ifndef i_s_h +#define i_s_h +#include "dict0types.h" + +const char plugin_author[] = "Oracle Corporation"; +const char maria_plugin_author[] = "MariaDB Corporation"; + +extern struct st_maria_plugin i_s_innodb_trx; +extern struct st_maria_plugin i_s_innodb_locks; +extern struct st_maria_plugin i_s_innodb_lock_waits; +extern struct st_maria_plugin i_s_innodb_cmp; +extern struct st_maria_plugin i_s_innodb_cmp_reset; +extern struct st_maria_plugin i_s_innodb_cmp_per_index; +extern struct st_maria_plugin i_s_innodb_cmp_per_index_reset; +extern struct st_maria_plugin i_s_innodb_cmpmem; +extern struct st_maria_plugin i_s_innodb_cmpmem_reset; +extern struct st_maria_plugin i_s_innodb_metrics; +extern struct st_maria_plugin i_s_innodb_ft_default_stopword; +extern struct st_maria_plugin i_s_innodb_ft_deleted; +extern struct st_maria_plugin i_s_innodb_ft_being_deleted; +extern struct st_maria_plugin i_s_innodb_ft_index_cache; +extern struct st_maria_plugin i_s_innodb_ft_index_table; +extern struct st_maria_plugin i_s_innodb_ft_config; +extern struct st_maria_plugin i_s_innodb_buffer_page; +extern struct st_maria_plugin i_s_innodb_buffer_page_lru; +extern struct st_maria_plugin i_s_innodb_buffer_stats; +extern struct st_maria_plugin i_s_innodb_sys_tables; +extern struct st_maria_plugin i_s_innodb_sys_tablestats; +extern struct st_maria_plugin i_s_innodb_sys_indexes; +extern struct st_maria_plugin i_s_innodb_sys_columns; +extern struct st_maria_plugin i_s_innodb_sys_fields; +extern struct st_maria_plugin i_s_innodb_sys_foreign; +extern struct st_maria_plugin i_s_innodb_sys_foreign_cols; +extern struct st_maria_plugin i_s_innodb_sys_tablespaces; +extern struct st_maria_plugin i_s_innodb_sys_virtual; +extern struct st_maria_plugin i_s_innodb_tablespaces_encryption; + +/** The latest successfully looked up innodb_fts_aux_table */ +extern table_id_t innodb_ft_aux_table_id; + +/** maximum number of buffer page info we would cache. */ +#define MAX_BUF_INFO_CACHED 10000 + +#define OK(expr) \ + if ((expr) != 0) { \ + DBUG_RETURN(1); \ + } + +#define BREAK_IF(expr) if ((expr)) break + +#define RETURN_IF_INNODB_NOT_STARTED(plugin_name) \ +do { \ + if (!srv_was_started) { \ + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, \ + ER_CANT_FIND_SYSTEM_REC, \ + "InnoDB: SELECTing from " \ + "INFORMATION_SCHEMA.%s but " \ + "the InnoDB storage engine " \ + "is not installed", plugin_name); \ + DBUG_RETURN(0); \ + } \ +} while (0) + +#endif /* i_s_h */ diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc new file mode 100644 index 00000000..b9e94a67 --- /dev/null +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -0,0 +1,4617 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2023, MariaDB Corporation. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ibuf/ibuf0ibuf.cc
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ibuf0ibuf.h"
+#include "btr0sea.h"
+
+/** Number of bits describing a single page */
+#define IBUF_BITS_PER_PAGE 4
+/** The start address for an insert buffer bitmap page bitmap */
+#define IBUF_BITMAP PAGE_DATA
+
+#include "buf0buf.h"
+#include "buf0rea.h"
+#include "fsp0fsp.h"
+#include "trx0sys.h"
+#include "fil0fil.h"
+#include "rem0rec.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "row0upd.h"
+#include "dict0boot.h"
+#include "fut0lst.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "que0que.h"
+#include "srv0start.h" /* srv_shutdown_state */
+#include "rem0cmp.h"
+#include "log.h"
+
+/* STRUCTURE OF AN INSERT BUFFER RECORD
+
+In versions < 4.1.x:
+
+1. The first field is the page number.
+2. The second field is an array which stores type info for each subsequent
+   field. We store the information which affects the ordering of records, and
+   also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it
+   is 10 bytes.
+3. Next we have the fields of the actual index record.
+
+In versions >= 4.1.x:
+
+Note that contrary to what we planned in the 1990's, there will only be one
+insert buffer tree, and that is in the system tablespace of InnoDB.
+
+1. The first field is the space id.
+2. The second field is a one-byte marker (0) which differentiates records from
+   the < 4.1.x storage format.
+3. The third field is the page number.
+4. The fourth field contains the type info, where we have also added 2 bytes to
+   store the charset. In the compressed table format of 5.0.x we must add more
+   information here so that we can build a dummy 'index' struct which 5.0.x
+   can use in the binary search on the index page in the ibuf merge phase.
+5. The rest of the fields contain the fields of the actual index record.
+
+In versions >= 5.0.3:
+
+The first byte of the fourth field is an additional marker (0) if the record
+is in the compact format. The presence of this marker can be detected by
+looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
+
+The high-order bit of the character set field in the type info is the
+"nullable" flag for the field.
+
+In versions >= 5.5:
+
+The optional marker byte at the start of the fourth field is replaced by
+three mandatory fields, totaling 4 bytes:
+
+  1. 2 bytes: Counter field, used to sort records within a (space id, page
+  no) in the order they were added. This is needed so that for example the
+  sequence of operations "INSERT x, DEL MARK x, INSERT x" is handled
+  correctly.
+
+  2. 1 byte: Operation type (see ibuf_op_t).
+
+  3. 1 byte: Flags.
Currently only one flag exists, IBUF_REC_COMPACT.
+
+To ensure older records, which do not have counters to enforce correct
+sorting, are merged before any new records, ibuf_insert checks if we're
+trying to insert to a position that contains old-style records, and if so,
+refuses the insert. Thus, ibuf pages are gradually converted to the new
+format as their corresponding buffer pool pages are read into memory.
+*/
+
+
+/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
+
+If an OS thread performs any operation that brings in disk pages from
+non-system tablespaces into the buffer pool, or creates such a page there,
+then the operation may have as a side effect an insert buffer index tree
+compression. Thus, the tree latch of the insert buffer tree may be acquired
+in the x-mode, and also the file space latch of the system tablespace may
+be acquired in the x-mode.
+
+Also, an insert to an index in a non-system tablespace can have the same
+effect. How do we know this cannot lead to a deadlock of OS threads? There
+is a problem with the i/o-handler threads: they break the latching order
+because they own x-latches to pages which are on a lower level than the
+insert buffer tree latch, its page latches, and the tablespace latch an
+insert buffer operation can reserve.
+
+The solution is the following: Let all the tree and page latches connected
+with the insert buffer be later in the latching order than the fsp latch and
+fsp page latches.
+
+Insert buffer pages must be such that the insert buffer is never invoked
+when these pages are accessed as this would result in a recursion violating
+the latching order. We let a special i/o-handler thread take care of i/o to
+the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap
+pages and the first inode page, which contains the inode of the ibuf tree: let
+us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead
+access both non-ibuf and ibuf pages.
+
+Then an i/o-handler for the insert buffer never needs to access the insert
+buffer tree recursively and thus obeys the latching order. On the other hand,
+other i/o-handlers for other tablespaces may require access to the insert
+buffer, but because all kinds of latches they need to access there are later
+in the latching order, no violation of the latching order occurs in this case,
+either.
+
+A problem is how to grow and contract an insert buffer tree. As it is later
+in the latching order than the fsp management, we have to reserve the fsp
+latch first, before adding or removing pages from the insert buffer tree.
+We let the insert buffer tree have its own file space management: a free
+list of pages linked to the tree root. To prevent recursive use of the
+insert buffer when adding pages to the tree, we must first load these pages
+to memory, obtaining a latch on them, and only after that add them to the
+free list of the insert buffer tree. Removing pages from the free list is
+more difficult. If there is an excess of pages in the free list of the
+ibuf tree, they might be needed if some thread reserves the fsp latch,
+intending to allocate more file space. So we do the following: if a thread
+reserves the fsp latch, we check the writer count field of the latch. If
+this field has value 1, it means that the thread did not own the latch
+before entering the fsp system, and the mtr of the thread contains no
+modifications to the fsp pages. Now we are free to reserve the ibuf latch,
+and check if there is an excess of pages in the free list.
We can then, in a +separate mini-transaction, take them out of the free list and free them to +the fsp system. + +To avoid deadlocks in the ibuf system, we divide file pages into three levels: + +(1) non-ibuf pages, +(2) ibuf tree pages and the pages in the ibuf tree free list, and +(3) ibuf bitmap pages. + +No OS thread is allowed to access higher level pages if it has latches to +lower level pages; even if the thread owns a B-tree latch it must not access +the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead +is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle +exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively +level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e., +it uses synchronous aio, it can access any pages, as long as it obeys the +access order rules. */ + +/** Operations that can currently be buffered. */ +ulong innodb_change_buffering; + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/** Dump the change buffer at startup */ +my_bool ibuf_dump; +/** Flag to control insert buffer debugging. */ +uint ibuf_debug; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +/** The insert buffer control structure */ +ibuf_t ibuf; + +/** @name Offsets to the per-page bits in the insert buffer bitmap */ +/* @{ */ +#define IBUF_BITMAP_FREE 0 /*!< Bits indicating the + amount of free space */ +#define IBUF_BITMAP_BUFFERED 2 /*!< TRUE if there are buffered + changes for the page */ +#define IBUF_BITMAP_IBUF 3 /*!< TRUE if page is a part of + the ibuf tree, excluding the + root page, or is in the free + list of the ibuf */ +/* @} */ + +#define IBUF_REC_FIELD_SPACE 0 /*!< in the pre-4.1 format, + the page number. later, the space_id */ +#define IBUF_REC_FIELD_MARKER 1 /*!< starting with 4.1, a marker + consisting of 1 byte that is 0 */ +#define IBUF_REC_FIELD_PAGE 2 /*!< starting with 4.1, the + page number */ +#define IBUF_REC_FIELD_METADATA 3 /* the metadata field */ +#define IBUF_REC_FIELD_USER 4 /* first user field */ + +/* Various constants for checking the type of an ibuf record and extracting +data from it. For details, see the description of the record format at the +top of this file. */ + +/** @name Format of the IBUF_REC_FIELD_METADATA of an insert buffer record +The fourth column in the MySQL 5.5 format contains an operation +type, counter, and some flags. 
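+For example, with the IBUF_REC_OFFSET_* layout defined below, a buffered
+operation with counter 5, operation type IBUF_OP_INSERT (0) and the
+IBUF_REC_COMPACT flag set begins with the four bytes 00 05 00 01
+(big-endian counter, then the type byte, then the flags byte).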
*/ +/* @{ */ +#define IBUF_REC_INFO_SIZE 4 /*!< Combined size of info fields at + the beginning of the fourth field */ + +/* Offsets for the fields at the beginning of the fourth field */ +#define IBUF_REC_OFFSET_COUNTER 0 /*!< Operation counter */ +#define IBUF_REC_OFFSET_TYPE 2 /*!< Type of operation */ +#define IBUF_REC_OFFSET_FLAGS 3 /*!< Additional flags */ + +/* Record flag masks */ +#define IBUF_REC_COMPACT 0x1 /*!< Set in + IBUF_REC_OFFSET_FLAGS if the + user index is in COMPACT + format or later */ + + +#ifndef SAFE_MUTEX +static +#endif /* SAFE_MUTEX */ +/** The mutex protecting the insert buffer */ +mysql_mutex_t ibuf_mutex, + /** The mutex covering pessimistic inserts into the change buffer */ + ibuf_pessimistic_insert_mutex; + +/** The area in pages from which contract looks for page numbers for merge */ +constexpr ulint IBUF_MERGE_AREA = 8; + +/** In ibuf_contract() at most this number of pages is read to memory in one +batch, in order to merge the entries for them in the change buffer */ +constexpr ulint IBUF_MAX_N_PAGES_MERGED = IBUF_MERGE_AREA; + +/* TODO: how to cope with drop table if there are records in the insert +buffer for the indexes of the table? Is there actually any problem, +because ibuf merge is done to a page when it is read in, and it is +still physically like the index page even if the index would have been +dropped! So, there seems to be no problem. */ + +/******************************************************************//** +Sets the flag in the current mini-transaction record indicating we're +inside an insert buffer routine. */ +UNIV_INLINE +void +ibuf_enter( +/*=======*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(!mtr->is_inside_ibuf()); + mtr->enter_ibuf(); +} + +/******************************************************************//** +Sets the flag in the current mini-transaction record indicating we're +exiting an insert buffer routine. */ +UNIV_INLINE +void +ibuf_exit( +/*======*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr->is_inside_ibuf()); + mtr->exit_ibuf(); +} + +/**************************************************************//** +Commits an insert buffer mini-transaction and sets the persistent +cursor latch mode to BTR_NO_LATCHES, that is, detaches the cursor. */ +UNIV_INLINE +void +ibuf_btr_pcur_commit_specify_mtr( +/*=============================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_d(ibuf_exit(mtr)); + btr_pcur_commit_specify_mtr(pcur, mtr); +} + +/******************************************************************//** +Gets the ibuf header page and x-latches it. +@return insert buffer header page */ +static +page_t* +ibuf_header_page_get( +/*=================*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(!ibuf_inside(mtr)); + + buf_block_t* block = buf_page_get( + page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO), + 0, RW_X_LATCH, mtr); + + return block ? block->page.frame : nullptr; +} + +/** Acquire the change buffer root page. 
+@param[in,out]	mtr	mini-transaction
+@return change buffer root page, SX-latched */
+static buf_block_t *ibuf_tree_root_get(mtr_t *mtr, dberr_t *err= nullptr)
+{
+  ut_ad(ibuf_inside(mtr));
+  mysql_mutex_assert_owner(&ibuf_mutex);
+
+  mtr_sx_lock_index(ibuf.index, mtr);
+
+  buf_block_t *block=
+    buf_page_get_gen(page_id_t{IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO},
+                     0, RW_SX_LATCH, nullptr, BUF_GET, mtr, err);
+  ut_ad(!block || ibuf.empty == page_is_empty(block->page.frame));
+  return block;
+}
+
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+void
+ibuf_close(void)
+/*============*/
+{
+	if (!ibuf.index) {
+		return;
+	}
+
+	mysql_mutex_destroy(&ibuf_pessimistic_insert_mutex);
+	mysql_mutex_destroy(&ibuf_mutex);
+
+	dict_table_t*	ibuf_table = ibuf.index->table;
+	ibuf.index->lock.free();
+	dict_mem_index_free(ibuf.index);
+	dict_mem_table_free(ibuf_table);
+	ibuf.index = NULL;
+}
+
+/******************************************************************//**
+Updates the size information of the ibuf, assuming the segment size has not
+changed. */
+static
+void
+ibuf_size_update(
+/*=============*/
+	const page_t*	root)	/*!< in: ibuf tree root */
+{
+	mysql_mutex_assert_owner(&ibuf_mutex);
+
+	ibuf.free_list_len = flst_get_len(root + PAGE_HEADER
+					  + PAGE_BTR_IBUF_FREE_LIST);
+
+	ibuf.height = 1 + btr_page_get_level(root);
+
+	/* the '1 +' is the ibuf header page */
+	ibuf.size = ibuf.seg_size - (1 + ibuf.free_list_len);
+}
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup and initializes
+the data structures for the insert buffer.
+@return DB_SUCCESS or failure */
+dberr_t
+ibuf_init_at_db_start(void)
+/*=======================*/
+{
+	page_t*		root;
+
+	ut_ad(!ibuf.index);
+	mtr_t	mtr;
+	mtr.start();
+	compile_time_assert(IBUF_SPACE_ID == TRX_SYS_SPACE);
+	compile_time_assert(IBUF_SPACE_ID == 0);
+	mtr.x_lock_space(fil_system.sys_space);
+	dberr_t err;
+	buf_block_t* header_page = buf_page_get_gen(
+		page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
+		0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err);
+
+	if (!header_page) {
+err_exit:
+		sql_print_error("InnoDB: The change buffer is corrupted"
+				" or has been removed on upgrade"
+				" to MariaDB 11.0 or later");
+		mtr.commit();
+		if (innodb_change_buffering == IBUF_USE_NONE) {
+			err = DB_SUCCESS;
+		}
+		return err;
+	}
+
+	fseg_n_reserved_pages(*header_page,
+			      IBUF_HEADER + IBUF_TREE_SEG_HEADER
+			      + header_page->page.frame, &ibuf.seg_size, &mtr);
+
+	do {
+		DBUG_EXECUTE_IF("intermittent_read_failure", continue;);
+		ut_ad(ibuf.seg_size >= 2);
+	} while (0);
+
+	if (buf_block_t* block =
+	    buf_page_get_gen(page_id_t(IBUF_SPACE_ID,
+				       FSP_IBUF_TREE_ROOT_PAGE_NO),
+			     0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err)) {
+		root = buf_block_get_frame(block);
+	} else {
+		goto err_exit;
+	}
+
+	DBUG_EXECUTE_IF("ibuf_init_corrupt",
+			err = DB_CORRUPTION;
+			goto err_exit;);
+
+	if (page_is_comp(root) || fil_page_get_type(root) != FIL_PAGE_INDEX
+	    || btr_page_get_index_id(root) != DICT_IBUF_ID_MIN) {
+		err = DB_CORRUPTION;
+		goto err_exit;
+	}
+
+	/* At startup we initialize ibuf to have a maximum of
+	CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the
+	buffer pool size. Once the ibuf struct is initialized this
+	value is updated with the user-supplied size by calling
+	ibuf_max_size_update().
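+
+	For example (a sketch, assuming the default value 25 of
+	CHANGE_BUFFER_DEFAULT_SIZE): with a 128 MiB buffer pool and
+	innodb_page_size=16k, the buffer pool holds 8192 pages, so the
+	initial limit computed below is 8192 * 25 / 100 = 2048 pages.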
*/ + ibuf.max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) + * CHANGE_BUFFER_DEFAULT_SIZE) / 100; + + mysql_mutex_init(ibuf_mutex_key, &ibuf_mutex, nullptr); + mysql_mutex_init(ibuf_pessimistic_insert_mutex_key, + &ibuf_pessimistic_insert_mutex, nullptr); + + mysql_mutex_lock(&ibuf_mutex); + ibuf_size_update(root); + mysql_mutex_unlock(&ibuf_mutex); + + ibuf.empty = page_is_empty(root); + mtr.commit(); + + ibuf.index = dict_mem_index_create( + dict_table_t::create( + {C_STRING_WITH_LEN("innodb_change_buffer")}, + fil_system.sys_space, 1, 0, 0, 0), + "CLUST_IND", + DICT_CLUSTERED | DICT_IBUF, 1); + ibuf.index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID; + ibuf.index->n_uniq = REC_MAX_N_FIELDS; + ibuf.index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key); +#ifdef BTR_CUR_ADAPT + ibuf.index->search_info = btr_search_info_create(ibuf.index->heap); +#endif /* BTR_CUR_ADAPT */ + ibuf.index->page = FSP_IBUF_TREE_ROOT_PAGE_NO; + ut_d(ibuf.index->cached = TRUE); + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + if (!ibuf_dump) { + return DB_SUCCESS; + } + ib::info() << "Dumping the change buffer"; + ibuf_mtr_start(&mtr); + btr_pcur_t pcur; + if (DB_SUCCESS + == pcur.open_leaf(true, ibuf.index, BTR_SEARCH_LEAF, &mtr)) { + while (btr_pcur_move_to_next_user_rec(&pcur, &mtr)) { + rec_print_old(stderr, btr_pcur_get_rec(&pcur)); + } + } + ibuf_mtr_commit(&mtr); + ib::info() << "Dumped the change buffer"; +#endif + + return DB_SUCCESS; +} + +/*********************************************************************//** +Updates the max_size value for ibuf. */ +void +ibuf_max_size_update( +/*=================*/ + ulint new_val) /*!< in: new value in terms of + percentage of the buffer pool size */ +{ + if (UNIV_UNLIKELY(!ibuf.index)) return; + ulint new_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) + * new_val) / 100; + mysql_mutex_lock(&ibuf_mutex); + ibuf.max_size = new_size; + mysql_mutex_unlock(&ibuf_mutex); +} + +# ifdef UNIV_DEBUG +/** Gets the desired bits for a given page from a bitmap page. +@param[in] page bitmap page +@param[in] page_id page id whose bits to get +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@param[in,out] mtr mini-transaction holding an x-latch on the +bitmap page +@return value of bits */ +# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, \ + MTR_MEMO_PAGE_X_FIX, mtr, bit) +# else /* UNIV_DEBUG */ +/** Gets the desired bits for a given page from a bitmap page. +@param[in] page bitmap page +@param[in] page_id page id whose bits to get +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@param[in,out] mtr mini-transaction holding an x-latch on the +bitmap page +@return value of bits */ +# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, bit) +# endif /* UNIV_DEBUG */ + +/** Gets the desired bits for a given page from a bitmap page. +@param[in] page bitmap page +@param[in] page_id page id whose bits to get +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] latch_type MTR_MEMO_PAGE_X_FIX, MTR_MEMO_BUF_FIX, ... +@param[in,out] mtr mini-transaction holding latch_type on the +bitmap page +@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... 
+@return value of bits */
+UNIV_INLINE
+ulint
+ibuf_bitmap_page_get_bits_low(
+	const page_t*		page,
+	const page_id_t		page_id,
+	ulint			zip_size,
+#ifdef UNIV_DEBUG
+	ulint			latch_type,
+	mtr_t*			mtr,
+#endif /* UNIV_DEBUG */
+	ulint			bit)
+{
+	ulint	byte_offset;
+	ulint	bit_offset;
+	ulint	map_byte;
+	ulint	value;
+	const ulint size = zip_size ? zip_size : srv_page_size;
+
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(bit < IBUF_BITS_PER_PAGE);
+	compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
+	ut_ad(mtr->memo_contains_page_flagged(page, latch_type));
+
+	bit_offset = (page_id.page_no() & (size - 1))
+		* IBUF_BITS_PER_PAGE + bit;
+
+	byte_offset = bit_offset / 8;
+	bit_offset = bit_offset % 8;
+
+	ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);
+
+	map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);
+
+	value = ut_bit_get_nth(map_byte, bit_offset);
+
+	if (bit == IBUF_BITMAP_FREE) {
+		ut_ad(bit_offset + 1 < 8);
+
+		value = value * 2 + ut_bit_get_nth(map_byte, bit_offset + 1);
+	}
+
+	return(value);
+}
+
+/** Sets the desired bit for a given page in a bitmap page.
+@tparam bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
+@param[in,out]	block		bitmap page
+@param[in]	page_id		page id whose bits to set
+@param[in]	physical_size	page size
+@param[in]	val		value to set
+@param[in,out]	mtr		mtr containing an x-latch to the bitmap page */
+template<ulint bit>
+static void
+ibuf_bitmap_page_set_bits(
+	buf_block_t*	block,
+	const page_id_t	page_id,
+	ulint		physical_size,
+	ulint		val,
+	mtr_t*		mtr)
+{
+	ulint	byte_offset;
+	ulint	bit_offset;
+
+	static_assert(bit < IBUF_BITS_PER_PAGE, "wrong bit");
+	compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->is_named_space(page_id.space()));
+
+	bit_offset = (page_id.page_no() % physical_size)
+		* IBUF_BITS_PER_PAGE + bit;
+
+	byte_offset = bit_offset / 8;
+	bit_offset = bit_offset % 8;
+
+	ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);
+
+	byte* map_byte = &block->page.frame[IBUF_BITMAP + byte_offset];
+	byte b = *map_byte;
+
+	if (bit == IBUF_BITMAP_FREE) {
+		ut_ad(bit_offset + 1 < 8);
+		ut_ad(val <= 3);
+		b &= static_cast<byte>(~(3U << bit_offset));
+		b |= static_cast<byte>(((val & 2) >> 1) << bit_offset
+				       | (val & 1) << (bit_offset + 1));
+	} else {
+		ut_ad(val <= 1);
+		b &= static_cast<byte>(~(1U << bit_offset));
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+		b |= static_cast<byte>(val << bit_offset);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+	}
+
+	mtr->write<1,mtr_t::MAYBE_NOP>(*block, map_byte, b);
+}
+
+/** Calculates the bitmap page number for a given page number.
+@param[in]	page_id		page id
+@param[in]	size		page size
+@return the bitmap page id where the file page is mapped */
+inline page_id_t ibuf_bitmap_page_no_calc(const page_id_t page_id, ulint size)
+{
+  if (!size)
+    size= srv_page_size;
+
+  return page_id_t(page_id.space(), FSP_IBUF_BITMAP_OFFSET
+                   + uint32_t(page_id.page_no() & ~(size - 1)));
+}
+
+/** Gets the ibuf bitmap page where the bits describing a given file page are
+stored.
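+For example (a sketch, with srv_page_size=16384 and zip_size=0):
+ibuf_bitmap_page_no_calc() above rounds the page number down to a
+multiple of 16384 and adds FSP_IBUF_BITMAP_OFFSET, so the bits for page
+20000 of a tablespace live on bitmap page 16384 + 1 = 16385 of the same
+tablespace.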
+@param[in]	page_id		page id of the file page
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	mtr		mini-transaction
+@return bitmap page where the file page is mapped, that is, the bitmap
+page containing the descriptor bits for the file page; the bitmap page
+is x-latched */
+static
+buf_block_t*
+ibuf_bitmap_get_map_page(
+	const page_id_t		page_id,
+	ulint			zip_size,
+	mtr_t*			mtr)
+{
+	return buf_page_get_gen(ibuf_bitmap_page_no_calc(page_id, zip_size),
+				zip_size, RW_X_LATCH, nullptr,
+				BUF_GET_POSSIBLY_FREED, mtr);
+}
+
+/************************************************************************//**
+Sets the free bits of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+UNIV_INLINE
+void
+ibuf_set_free_bits_low(
+/*===================*/
+	const buf_block_t*	block,	/*!< in: index page; free bits are set if
+					the index is non-clustered and page
+					level is 0 */
+	ulint			val,	/*!< in: value to set: < 4 */
+	mtr_t*			mtr)	/*!< in/out: mtr */
+{
+	ut_ad(mtr->is_named_space(block->page.id().space()));
+	if (!page_is_leaf(block->page.frame)) {
+		return;
+	}
+
+#ifdef UNIV_IBUF_DEBUG
+	ut_a(val <= ibuf_index_page_calc_free(block));
+#endif /* UNIV_IBUF_DEBUG */
+	const page_id_t id(block->page.id());
+
+	if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+		    id, block->zip_size(), mtr)) {
+		ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+			bitmap_page, id, block->physical_size(),
+			val, mtr);
+	}
+}
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+void
+ibuf_set_free_bits_func(
+/*====================*/
+	buf_block_t*	block,	/*!< in: index page of a non-clustered index;
+				free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+	ulint		max_val,/*!< in: ULINT_UNDEFINED or a maximum
+				value which the bits must have before
+				setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+	ulint		val)	/*!< in: value to set: < 4 */
+{
+  if (!page_is_leaf(block->page.frame))
+    return;
+
+  mtr_t mtr;
+  mtr.start();
+  const page_id_t id(block->page.id());
+  const fil_space_t *space= mtr.set_named_space_id(id.space());
+
+  if (buf_block_t *bitmap_page=
+      ibuf_bitmap_get_map_page(id, block->zip_size(), &mtr))
+  {
+    if (space->purpose != FIL_TYPE_TABLESPACE)
+      mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+#ifdef UNIV_IBUF_DEBUG
+    if (max_val != ULINT_UNDEFINED)
+    {
+      ulint old_val= ibuf_bitmap_page_get_bits(bitmap_page->page.frame, id,
+                                               block->zip_size(),
+                                               IBUF_BITMAP_FREE, &mtr);
+      ut_a(old_val <= max_val);
+    }
+
+    ut_a(val <= ibuf_index_page_calc_free(block));
+#endif /* UNIV_IBUF_DEBUG */
+
+    ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>
+      (bitmap_page, id, block->physical_size(), val, &mtr);
+  }
+
+  mtr.commit();
+}
+
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.
It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+void
+ibuf_reset_free_bits(
+/*=================*/
+	buf_block_t*	block)	/*!< in: index page; free bits are set to 0
+				if the index is a non-clustered
+				non-unique, and page level is 0 */
+{
+	ibuf_set_free_bits(block, 0, ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_low(
+/*======================*/
+	const buf_block_t*	block,		/*!< in: index page */
+	ulint			max_ins_size,	/*!< in: value of
+						maximum insert size
+						with reorganize before
+						the latest operation
+						performed to the page */
+	mtr_t*			mtr)		/*!< in/out: mtr */
+{
+	ulint	before;
+	ulint	after;
+
+	ut_a(!is_buf_block_get_page_zip(block));
+	ut_ad(mtr->is_named_space(block->page.id().space()));
+
+	before = ibuf_index_page_calc_free_bits(srv_page_size,
+						max_ins_size);
+
+	after = ibuf_index_page_calc_free(block);
+
+	/* This approach cannot be used on compressed pages, since the
+	computed value of "before" often does not match the current
+	state of the bitmap. This is because the free space may
+	increase or decrease when a compressed page is reorganized. */
+	if (before != after) {
+		ibuf_set_free_bits_low(block, after, mtr);
+	}
+}
+
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+	buf_block_t*	block,	/*!< in/out: index page */
+	mtr_t*		mtr)	/*!< in/out: mtr */
+{
+	ut_ad(page_is_leaf(block->page.frame));
+	ut_ad(block->zip_size());
+
+	ulint after = ibuf_index_page_calc_free_zip(block);
+
+	if (after == 0) {
+		/* We move the page to the front of the buffer pool LRU list:
+		the purpose of this is to prevent those pages to which we
+		cannot make inserts using the insert buffer from slipping
+		out of the buffer pool */
+
+		buf_page_make_young(&block->page);
+	}
+
+	if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+		    block->page.id(), block->zip_size(), mtr)) {
+
+		ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+			bitmap_page, block->page.id(),
+			block->physical_size(), after, mtr);
+	}
+}
+
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page. It is safe to set the free bits in the same
+mini-transaction that updated the pages.
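+
+As a sketch of the 2-bit IBUF_BITMAP_FREE encoding that
+ibuf_index_page_calc_free() produces: the reorganized free space is
+divided by 1/32 of the page size, so on a 16KiB page the stored value
+roughly means 0 = less than 512 bytes free, 1 = at least 512,
+2 = at least 1024, and 3 = at least 2048 bytes.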
*/ +void +ibuf_update_free_bits_for_two_pages_low( +/*====================================*/ + buf_block_t* block1, /*!< in: index page */ + buf_block_t* block2, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr->is_named_space(block1->page.id().space())); + ut_ad(block1->page.id().space() == block2->page.id().space()); + + /* Avoid deadlocks by acquiring multiple bitmap page latches in + a consistent order (smaller pointer first). */ + if (block1 > block2) + std::swap(block1, block2); + + ibuf_set_free_bits_low(block1, ibuf_index_page_calc_free(block1), mtr); + ibuf_set_free_bits_low(block2, ibuf_index_page_calc_free(block2), mtr); +} + +/** Returns TRUE if the page is one of the fixed address ibuf pages. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return TRUE if a fixed address ibuf i/o page */ +inline bool ibuf_fixed_addr_page(const page_id_t page_id, ulint zip_size) +{ + return(page_id == page_id_t(IBUF_SPACE_ID, IBUF_TREE_ROOT_PAGE_NO) + || ibuf_bitmap_page(page_id, zip_size)); +} + +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] x_latch FALSE if relaxed check (avoid latching the +bitmap page) +@param[in,out] mtr mtr which will contain an x-latch to the +bitmap page if the page is not one of the fixed address ibuf pages, or NULL, +in which case a new transaction is created. +@return TRUE if level 2 or level 3 page */ +bool +ibuf_page_low( + const page_id_t page_id, + ulint zip_size, +#ifdef UNIV_DEBUG + bool x_latch, +#endif /* UNIV_DEBUG */ + mtr_t* mtr) +{ + ibool ret; + mtr_t local_mtr; + + ut_ad(!recv_no_ibuf_operations); + ut_ad(x_latch || mtr == NULL); + + if (ibuf_fixed_addr_page(page_id, zip_size)) { + return(true); + } else if (page_id.space() != IBUF_SPACE_ID) { + return(false); + } + + compile_time_assert(IBUF_SPACE_ID == 0); + ut_ad(fil_system.sys_space->purpose == FIL_TYPE_TABLESPACE); + +#ifdef UNIV_DEBUG + if (!x_latch) { + mtr_start(&local_mtr); + + /* Get the bitmap page without a page latch, so that + we will not be violating the latching order when + another bitmap page has already been latched by this + thread. The page will be buffer-fixed, and thus it + cannot be removed or relocated while we are looking at + it. The contents of the page could change, but the + IBUF_BITMAP_IBUF bit that we are interested in should + not be modified by any other thread. Nobody should be + calling ibuf_add_free_page() or ibuf_remove_free_page() + while the page is linked to the insert buffer b-tree. 
*/ + buf_block_t* block = buf_page_get_gen( + ibuf_bitmap_page_no_calc(page_id, zip_size), + zip_size, RW_NO_LATCH, nullptr, BUF_GET, &local_mtr); + + ret = block + && ibuf_bitmap_page_get_bits_low( + block->page.frame, page_id, zip_size, + MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF); + + mtr_commit(&local_mtr); + return(ret); + } +#endif /* UNIV_DEBUG */ + + if (mtr == NULL) { + mtr = &local_mtr; + mtr_start(mtr); + } + + buf_block_t *block = ibuf_bitmap_get_map_page(page_id, zip_size, + mtr); + ret = block + && ibuf_bitmap_page_get_bits(block->page.frame, + page_id, zip_size, + IBUF_BITMAP_IBUF, mtr); + + if (mtr == &local_mtr) { + mtr_commit(mtr); + } + + return(ret); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(rec) +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Returns the page number field of an ibuf record. +@return page number */ +static +uint32_t +ibuf_rec_get_page_no_func( +/*======================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec) /*!< in: ibuf record */ +{ + const byte* field; + ulint len; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len); + + ut_a(len == 4); + + return(mach_read_from_4(field)); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(rec) +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Returns the space id field of an ibuf record. For < 4.1.x format records +returns 0. +@return space id */ +static +uint32_t +ibuf_rec_get_space_func( +/*====================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec) /*!< in: ibuf record */ +{ + const byte* field; + ulint len; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len); + + ut_a(len == 4); + + return(mach_read_from_4(field)); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \ + ibuf_rec_get_info_func(mtr,rec,op,comp,info_len,counter) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \ + ibuf_rec_get_info_func(rec,op,comp,info_len,counter) +#endif +/****************************************************************//** +Get various information about an ibuf record in >= 4.1.x format. 
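+
+For example (a sketch): a buffered delete-mark with counter 5 against a
+ROW_FORMAT=COMPACT index starts its metadata field with the four bytes
+00 05 01 01 (counter 5, type IBUF_OP_DELETE_MARK, flags
+IBUF_REC_COMPACT), followed by the per-column type information.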
*/
+static
+void
+ibuf_rec_get_info_func(
+/*===================*/
+#ifdef UNIV_DEBUG
+	mtr_t*		mtr,	/*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+	const rec_t*	rec,		/*!< in: ibuf record */
+	ibuf_op_t*	op,		/*!< out: operation type, or NULL */
+	ibool*		comp,		/*!< out: compact flag, or NULL */
+	ulint*		info_len,	/*!< out: length of info fields at the
+					start of the fourth field, or
+					NULL */
+	ulint*		counter)	/*!< out: counter value, or NULL */
+{
+	const byte*	types;
+	ulint		fields;
+	ulint		len;
+
+	/* Local variables to shadow arguments. */
+	ibuf_op_t	op_local;
+	ibool		comp_local;
+	ulint		info_len_local;
+	ulint		counter_local;
+
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(ibuf_inside(mtr));
+	fields = rec_get_n_fields_old(rec);
+	ut_a(fields > IBUF_REC_FIELD_USER);
+
+	types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
+
+	info_len_local = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+	compile_time_assert(IBUF_REC_INFO_SIZE
+			    < DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+	switch (info_len_local) {
+	case 0:
+	case 1:
+		op_local = IBUF_OP_INSERT;
+		comp_local = info_len_local;
+		ut_ad(!counter);
+		counter_local = ULINT_UNDEFINED;
+		break;
+
+	case IBUF_REC_INFO_SIZE:
+		op_local = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
+		comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT;
+		counter_local = mach_read_from_2(
+			types + IBUF_REC_OFFSET_COUNTER);
+		break;
+
+	default:
+		ut_error;
+	}
+
+	ut_a(op_local < IBUF_OP_COUNT);
+	ut_a((len - info_len_local) ==
+	     (fields - IBUF_REC_FIELD_USER)
+	     * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+	if (op) {
+		*op = op_local;
+	}
+
+	if (comp) {
+		*comp = comp_local;
+	}
+
+	if (info_len) {
+		*info_len = info_len_local;
+	}
+
+	if (counter) {
+		*counter = counter_local;
+	}
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(mtr,rec)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(rec)
+#endif
+
+/****************************************************************//**
+Returns the operation type field of an ibuf record.
+@return operation type */
+static
+ibuf_op_t
+ibuf_rec_get_op_type_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+	mtr_t*		mtr,	/*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+	const rec_t*	rec)	/*!< in: ibuf record */
+{
+	ulint		len;
+
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(ibuf_inside(mtr));
+	ut_ad(rec_get_n_fields_old(rec) > 2);
+
+	(void) rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+	if (len > 1) {
+		/* This is a < 4.1.x format record */
+
+		return(IBUF_OP_INSERT);
+	} else {
+		ibuf_op_t	op;
+
+		ibuf_rec_get_info(mtr, rec, &op, NULL, NULL, NULL);
+
+		return(op);
+	}
+}
+
+/****************************************************************//**
+Read the first two bytes from a record's fourth field (counter field in new
+records; something else in older records).
+@return "counter" field, or ULINT_UNDEFINED if for some reason it +can't be read */ +ulint +ibuf_rec_get_counter( +/*=================*/ + const rec_t* rec) /*!< in: ibuf record */ +{ + const byte* ptr; + ulint len; + + if (rec_get_n_fields_old(rec) <= IBUF_REC_FIELD_METADATA) { + + return(ULINT_UNDEFINED); + } + + ptr = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + if (len >= 2) { + + return(mach_read_from_2(ptr)); + } else { + + return(ULINT_UNDEFINED); + } +} + + +/** + Add accumulated operation counts to a permanent array. + Both arrays must be of size IBUF_OP_COUNT. +*/ +static void ibuf_add_ops(Atomic_counter *out, const ulint *in) +{ + for (auto i = 0; i < IBUF_OP_COUNT; i++) + out[i]+= in[i]; +} + + +/****************************************************************//** +Print operation counts. The array must be of size IBUF_OP_COUNT. */ +static +void +ibuf_print_ops( +/*===========*/ + const char* op_name,/*!< in: operation name */ + const Atomic_counter* ops, /*!< in: operation counts */ + FILE* file) /*!< in: file where to print */ +{ + static const char* op_names[] = { + "insert", + "delete mark", + "delete" + }; + + static_assert(array_elements(op_names) == IBUF_OP_COUNT, ""); + fputs(op_name, file); + + for (ulint i = 0; i < IBUF_OP_COUNT; i++) { + fprintf(file, "%s " ULINTPF "%s", op_names[i], + ulint{ops[i]}, (i < (IBUF_OP_COUNT - 1)) ? ", " : ""); + } + + putc('\n', file); +} + +/********************************************************************//** +Creates a dummy index for inserting a record to a non-clustered index. +@return dummy index */ +static +dict_index_t* +ibuf_dummy_index_create( +/*====================*/ + ulint n, /*!< in: number of fields */ + ibool comp) /*!< in: TRUE=use compact record format */ +{ + dict_table_t* table; + dict_index_t* index; + + table = dict_table_t::create({C_STRING_WITH_LEN("IBUF_DUMMY")}, + nullptr, n, 0, + comp ? DICT_TF_COMPACT : 0, 0); + + index = dict_mem_index_create(table, "IBUF_DUMMY", 0, n); + + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + ut_d(index->is_dummy = true); + + return(index); +} +/********************************************************************//** +Add a column to the dummy index */ +static +void +ibuf_dummy_index_add_col( +/*=====================*/ + dict_index_t* index, /*!< in: dummy index */ + const dtype_t* type, /*!< in: the data type of the column */ + ulint len) /*!< in: length of the column */ +{ + ulint i = index->table->n_def; + dict_mem_table_add_col(index->table, NULL, NULL, + dtype_get_mtype(type), + dtype_get_prtype(type), + dtype_get_len(type)); + dict_index_add_col(index, index->table, + dict_table_get_nth_col(index->table, i), len); +} +/********************************************************************//** +Deallocates a dummy index for inserting a record to a non-clustered index. 
*/
+static
+void
+ibuf_dummy_index_free(
+/*==================*/
+	dict_index_t*	index)	/*!< in, own: dummy index */
+{
+	dict_table_t*	table = index->table;
+
+	dict_mem_index_free(index);
+	dict_mem_table_free(table);
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex)	\
+	ibuf_build_entry_from_ibuf_rec_func(mtr,ibuf_rec,heap,pindex)
+#else /* UNIV_DEBUG */
+# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex)	\
+	ibuf_build_entry_from_ibuf_rec_func(ibuf_rec,heap,pindex)
+#endif
+
+/*********************************************************************//**
+Builds the entry used to
+
+1) IBUF_OP_INSERT: insert into a non-clustered index
+
+2) IBUF_OP_DELETE_MARK: find the record whose delete-mark flag we need to
+   activate
+
+3) IBUF_OP_DELETE: find the record we need to delete
+
+when we have the corresponding record in an ibuf index.
+
+NOTE that as we copy pointers to fields in ibuf_rec, the caller must
+hold a latch to the ibuf_rec page as long as the entry is used!
+
+@return own: entry to insert to a non-clustered index */
+static
+dtuple_t*
+ibuf_build_entry_from_ibuf_rec_func(
+/*================================*/
+#ifdef UNIV_DEBUG
+	mtr_t*		mtr,	/*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+	const rec_t*	ibuf_rec,	/*!< in: record in an insert buffer */
+	mem_heap_t*	heap,		/*!< in: heap where built */
+	dict_index_t**	pindex)		/*!< out, own: dummy index that
+					describes the entry */
+{
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	ulint		n_fields;
+	const byte*	types;
+	const byte*	data;
+	ulint		len;
+	ulint		info_len;
+	ulint		i;
+	ulint		comp;
+	dict_index_t*	index;
+
+	ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(ibuf_inside(mtr));
+
+	data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
+
+	ut_a(len == 1);
+	ut_a(*data == 0);
+	ut_a(rec_get_n_fields_old(ibuf_rec) > IBUF_REC_FIELD_USER);
+
+	n_fields = rec_get_n_fields_old(ibuf_rec) - IBUF_REC_FIELD_USER;
+
+	tuple = dtuple_create(heap, n_fields);
+
+	types = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
+
+	ibuf_rec_get_info(mtr, ibuf_rec, NULL, &comp, &info_len, NULL);
+
+	index = ibuf_dummy_index_create(n_fields, comp);
+
+	len -= info_len;
+	types += info_len;
+
+	ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+	for (i = 0; i < n_fields; i++) {
+		field = dtuple_get_nth_field(tuple, i);
+
+		data = rec_get_nth_field_old(
+			ibuf_rec, i + IBUF_REC_FIELD_USER, &len);
+
+		dfield_set_data(field, data, len);
+
+		dtype_new_read_for_order_and_null_size(
+			dfield_get_type(field),
+			types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+		ibuf_dummy_index_add_col(index, dfield_get_type(field), len);
+	}
+
+	index->n_core_null_bytes = static_cast<uint8_t>(
+		UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
+
+	/* Prevent an ut_ad() failure in page_zip_write_rec() by
+	adding system columns to the dummy table pointed to by the
+	dummy secondary index. The insert buffer is only used for
+	secondary indexes, whose records never contain any system
+	columns, such as DB_TRX_ID. */
+	ut_d(dict_table_add_system_columns(index->table, index->table->heap));
+
+	*pindex = index;
+
+	return(tuple);
+}
+
+/******************************************************************//**
+Get the data size.
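+(i.e., the sum of the stored lengths of the user fields; SQL NULL
+fields contribute their type-dependent NULL size)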
+@return size of fields */ +UNIV_INLINE +ulint +ibuf_rec_get_size( +/*==============*/ + const rec_t* rec, /*!< in: ibuf record */ + const byte* types, /*!< in: fields */ + ulint n_fields, /*!< in: number of fields */ + ulint comp) /*!< in: 0=ROW_FORMAT=REDUNDANT, + nonzero=ROW_FORMAT=COMPACT */ +{ + ulint i; + ulint field_offset; + ulint types_offset; + ulint size = 0; + + field_offset = IBUF_REC_FIELD_USER; + types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + + for (i = 0; i < n_fields; i++) { + ulint len; + dtype_t dtype; + + rec_get_nth_field_offs_old(rec, i + field_offset, &len); + + if (len != UNIV_SQL_NULL) { + size += len; + } else { + dtype_new_read_for_order_and_null_size(&dtype, types); + + size += dtype_get_sql_null_size(&dtype, comp); + } + + types += types_offset; + } + + return(size); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(rec) +#endif + +/********************************************************************//** +Returns the space taken by a stored non-clustered index entry if converted to +an index record. +@return size of index record in bytes + an upper limit of the space +taken in the page directory */ +static +ulint +ibuf_rec_get_volume_func( +/*=====================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* ibuf_rec)/*!< in: ibuf record */ +{ + ulint len; + const byte* data; + const byte* types; + ulint n_fields; + ulint data_size; + ulint comp; + ibuf_op_t op; + ulint info_len; + + ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(ibuf_rec) > 2); + + data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len); + ut_a(len == 1); + ut_a(*data == 0); + + types = rec_get_nth_field_old( + ibuf_rec, IBUF_REC_FIELD_METADATA, &len); + + ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL); + + if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) { + /* Delete-marking a record doesn't take any + additional space, and while deleting a record + actually frees up space, we have to play it safe and + pretend it takes no additional space (the record + might not exist, etc.). */ + + return(0); + } else if (comp) { + dtuple_t* entry; + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + + entry = ibuf_build_entry_from_ibuf_rec(mtr, ibuf_rec, + heap, &dummy_index); + + volume = rec_get_converted_size(dummy_index, entry, 0); + + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + + return(volume + page_dir_calc_reserved_space(1)); + } + + types += info_len; + n_fields = rec_get_n_fields_old(ibuf_rec) + - IBUF_REC_FIELD_USER; + + data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, comp); + + return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0) + + page_dir_calc_reserved_space(1)); +} + +/*********************************************************************//** +Builds the tuple to insert to an ibuf tree when we have an entry for a +non-clustered index. + +NOTE that the original entry must be kept because we copy pointers to +its fields. 
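+
+A sketch of the tuple layout built below, for an index entry of n user
+fields:
+
+ field 0: space id (4 bytes)
+ field 1: the marker byte 0x00
+ field 2: page number (4 bytes)
+ field 3: counter/type/flags info (when buffering is counter-based),
+	   followed by DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE bytes of type
+	   information per user field
+ fields 4 .. 4+n-1: the user fields copied from the entry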
+
+@return own: entry to insert into an ibuf index tree */
+static
+dtuple_t*
+ibuf_entry_build(
+/*=============*/
+	ibuf_op_t	op,	/*!< in: operation type */
+	dict_index_t*	index,	/*!< in: non-clustered index */
+	const dtuple_t*	entry,	/*!< in: entry for a non-clustered index */
+	ulint		space,	/*!< in: space id */
+	ulint		page_no,/*!< in: index page number where entry should
+				be inserted */
+	ulint		counter,/*!< in: counter value;
+				ULINT_UNDEFINED=not used */
+	mem_heap_t*	heap)	/*!< in: heap into which to build */
+{
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	const dfield_t*	entry_field;
+	ulint		n_fields;
+	byte*		buf;
+	byte*		ti;
+	byte*		type_info;
+	ulint		i;
+
+	ut_ad(counter != ULINT_UNDEFINED || op == IBUF_OP_INSERT);
+	ut_ad(counter == ULINT_UNDEFINED || counter <= 0xFFFF);
+	ut_ad(op < IBUF_OP_COUNT);
+
+	/* We have to build a tuple with the following fields:
+
+	1-4) These are described at the top of this file.
+
+	5) The rest of the fields are copied from the entry.
+
+	All fields in the tuple are ordered like the type binary in our
+	insert buffer tree. */
+
+	n_fields = dtuple_get_n_fields(entry);
+
+	tuple = dtuple_create(heap, n_fields + IBUF_REC_FIELD_USER);
+
+	/* 1) Space Id */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+	mach_write_to_4(buf, space);
+
+	dfield_set_data(field, buf, 4);
+
+	/* 2) Marker byte */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
+
+	/* We set the marker byte zero */
+
+	mach_write_to_1(buf, 0);
+
+	dfield_set_data(field, buf, 1);
+
+	/* 3) Page number */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+	mach_write_to_4(buf, page_no);
+
+	dfield_set_data(field, buf, 4);
+
+	/* 4) Type info, part #1 */
+
+	if (counter == ULINT_UNDEFINED) {
+		i = dict_table_is_comp(index->table) ? 1 : 0;
+	} else {
+		ut_ad(counter <= 0xFFFF);
+		i = IBUF_REC_INFO_SIZE;
+	}
+
+	ti = type_info = static_cast<byte*>(
+		mem_heap_alloc(
+			heap,
+			i + n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE));
+
+	switch (i) {
+	default:
+		ut_error;
+		break;
+	case 1:
+		/* set the flag for ROW_FORMAT=COMPACT */
+		*ti++ = 0;
+		/* fall through */
+	case 0:
+		/* the old format does not allow delete buffering */
+		ut_ad(op == IBUF_OP_INSERT);
+		break;
+	case IBUF_REC_INFO_SIZE:
+		mach_write_to_2(ti + IBUF_REC_OFFSET_COUNTER, counter);
+
+		ti[IBUF_REC_OFFSET_TYPE] = (byte) op;
+		ti[IBUF_REC_OFFSET_FLAGS] = dict_table_is_comp(index->table)
+			? IBUF_REC_COMPACT : 0;
+		ti += IBUF_REC_INFO_SIZE;
+		break;
+	}
+
+	/* 5+) Fields from the entry */
+
+	for (i = 0; i < n_fields; i++) {
+		ulint			fixed_len;
+		const dict_field_t*	ifield;
+
+		field = dtuple_get_nth_field(tuple, i + IBUF_REC_FIELD_USER);
+		entry_field = dtuple_get_nth_field(entry, i);
+		dfield_copy(field, entry_field);
+
+		ifield = dict_index_get_nth_field(index, i);
+		ut_ad(!ifield->descending);
+		/* Prefix index columns of fixed-length columns are of
+		fixed length. However, in the function call below,
+		dfield_get_type(entry_field) contains the fixed length
+		of the column in the clustered index. Replace it with
+		the fixed length of the secondary index column.
*/
+		fixed_len = ifield->fixed_len;
+
+#ifdef UNIV_DEBUG
+		if (fixed_len) {
+			/* dict_index_add_col() should guarantee these */
+			ut_ad(fixed_len <= (ulint)
+			      dfield_get_type(entry_field)->len);
+			if (ifield->prefix_len) {
+				ut_ad(ifield->prefix_len == fixed_len);
+			} else {
+				ut_ad(fixed_len == (ulint)
+				      dfield_get_type(entry_field)->len);
+			}
+		}
+#endif /* UNIV_DEBUG */
+
+		dtype_new_store_for_order_and_null_size(
+			ti, dfield_get_type(entry_field), fixed_len);
+		ti += DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+	}
+
+	/* 4) Type info, part #2 */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_METADATA);
+
+	dfield_set_data(field, type_info, ulint(ti - type_info));
+
+	/* Set all the types in the new tuple binary */
+
+	dtuple_set_types_binary(tuple, n_fields + IBUF_REC_FIELD_USER);
+
+	return(tuple);
+}
+
+/*********************************************************************//**
+Builds a search tuple used to search buffered inserts for an index page.
+This is for >= 4.1.x format records.
+@return own: search tuple */
+static
+dtuple_t*
+ibuf_search_tuple_build(
+/*====================*/
+	ulint		space,	/*!< in: space id */
+	ulint		page_no,/*!< in: index page number */
+	mem_heap_t*	heap)	/*!< in: heap into which to build */
+{
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	byte*		buf;
+
+	tuple = dtuple_create(heap, IBUF_REC_FIELD_METADATA);
+
+	/* Store the space id in tuple */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+	mach_write_to_4(buf, space);
+
+	dfield_set_data(field, buf, 4);
+
+	/* Store the new format record marker byte */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
+
+	mach_write_to_1(buf, 0);
+
+	dfield_set_data(field, buf, 1);
+
+	/* Store the page number in tuple */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+	mach_write_to_4(buf, page_no);
+
+	dfield_set_data(field, buf, 4);
+
+	dtuple_set_types_binary(tuple, IBUF_REC_FIELD_METADATA);
+
+	return(tuple);
+}
+
+/*********************************************************************//**
+Checks if there are enough pages in the free list of the ibuf tree that we
+dare to start a pessimistic insert to the insert buffer.
+@return whether enough free pages in list */
+static inline bool ibuf_data_enough_free_for_insert()
+{
+	mysql_mutex_assert_owner(&ibuf_mutex);
+
+	/* We want a big margin of free pages, because a B-tree can sometimes
+	grow in size also if records are deleted from it, as the node pointers
+	can change, and we must make sure that we are able to delete the
+	inserts buffered for pages that we read to the buffer pool, without
+	any risk of running out of free space in the insert buffer. */
+
+	return(ibuf.free_list_len >= (ibuf.size / 2) + 3 * ibuf.height);
+}
+
+/*********************************************************************//**
+Checks if there are enough pages in the free list of the ibuf tree that we
+should remove them and free to the file space management.
+@return TRUE if enough free pages in list */
+UNIV_INLINE
+ibool
+ibuf_data_too_much_free(void)
+/*=========================*/
+{
+	mysql_mutex_assert_owner(&ibuf_mutex);
+
+	return(ibuf.free_list_len >= 3 + (ibuf.size / 2) + 3 * ibuf.height);
+}
+
+/** Allocate a change buffer page.
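+A worked example of the free-list thresholds above: with ibuf.size =
+2048 pages and ibuf.height = 3, ibuf_data_enough_free_for_insert()
+requires at least 2048/2 + 3*3 = 1033 free pages, while
+ibuf_data_too_much_free() starts returning pages to the tablespace at
+3 more, i.e. 1036; the gap keeps the two operations from flapping.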
+@retval true on success
+@retval false if no space left */
+static bool ibuf_add_free_page()
+{
+	mtr_t	mtr;
+	page_t*	header_page;
+	buf_block_t* block;
+
+	mtr.start();
+	/* Acquire the fsp latch before the ibuf header, obeying the latching
+	order */
+	mtr.x_lock_space(fil_system.sys_space);
+	header_page = ibuf_header_page_get(&mtr);
+	if (!header_page) {
+		mtr.commit();
+		return false;
+	}
+
+	/* Allocate a new page: NOTE that if the page has been a part of a
+	non-clustered index which has subsequently been dropped, then the
+	page may have buffered inserts in the insert buffer, and these
+	should be deleted from there. These get deleted when the page
+	allocation creates the page in buffer. Thus the call below may end
+	up calling the insert buffer routines and, as we yet have no latches
+	to insert buffer tree pages, these routines can run without a risk
+	of a deadlock. This is the reason why we created a special ibuf
+	header page apart from the ibuf tree. */
+
+	dberr_t err;
+	block = fseg_alloc_free_page_general(
+		header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP,
+		false, &mtr, &mtr, &err);
+
+	if (!block) {
+		mtr.commit();
+		return false;
+	}
+
+	ut_ad(block->page.lock.not_recursive());
+	ibuf_enter(&mtr);
+	mysql_mutex_lock(&ibuf_mutex);
+
+	mtr.write<2>(*block, block->page.frame + FIL_PAGE_TYPE,
+		     FIL_PAGE_IBUF_FREE_LIST);
+	buf_block_t* ibuf_root = ibuf_tree_root_get(&mtr);
+	if (UNIV_UNLIKELY(!ibuf_root)) {
+corrupted:
+		/* Do not bother to try to free the allocated block, because
+		the change buffer is seriously corrupted already. */
+		mysql_mutex_unlock(&ibuf_mutex);
+		ibuf_mtr_commit(&mtr);
+		return false;
+	}
+
+	/* Add the page to the free list and update the ibuf size data */
+
+	err = flst_add_last(ibuf_root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+			    block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+			    &mtr);
+	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+		goto corrupted;
+	}
+
+	/* Set the bit indicating that this page is now an ibuf tree page
+	(level 2 page) */
+
+	const page_id_t page_id(block->page.id());
+	buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
+
+	if (UNIV_UNLIKELY(!bitmap_page)) {
+		goto corrupted;
+	}
+
+	ibuf.seg_size++;
+	ibuf.free_list_len++;
+
+	mysql_mutex_unlock(&ibuf_mutex);
+
+	ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(bitmap_page, page_id,
+						    srv_page_size, true, &mtr);
+	ibuf_mtr_commit(&mtr);
+	return true;
+}
+
+/*********************************************************************//**
+Removes a page from the free list and frees it to the fsp system.
*/
+static void ibuf_remove_free_page()
+{
+	mtr_t	mtr;
+	mtr_t	mtr2;
+	page_t*	header_page;
+
+	log_free_check();
+
+	mtr_start(&mtr);
+	/* Acquire the fsp latch before the ibuf header, obeying the latching
+	order */
+
+	mtr.x_lock_space(fil_system.sys_space);
+	header_page = ibuf_header_page_get(&mtr);
+
+	/* Prevent pessimistic inserts to insert buffer trees for a while */
+	ibuf_enter(&mtr);
+	mysql_mutex_lock(&ibuf_pessimistic_insert_mutex);
+	mysql_mutex_lock(&ibuf_mutex);
+
+	if (!header_page || !ibuf_data_too_much_free()) {
+early_exit:
+		mysql_mutex_unlock(&ibuf_mutex);
+		mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
+
+		ibuf_mtr_commit(&mtr);
+
+		return;
+	}
+
+	ibuf_mtr_start(&mtr2);
+
+	buf_block_t* root = ibuf_tree_root_get(&mtr2);
+
+	if (UNIV_UNLIKELY(!root)) {
+		ibuf_mtr_commit(&mtr2);
+		goto early_exit;
+	}
+
+	mysql_mutex_unlock(&ibuf_mutex);
+
+	const uint32_t page_no = flst_get_last(PAGE_HEADER
+					       + PAGE_BTR_IBUF_FREE_LIST
+					       + root->page.frame).page;
+
+	/* NOTE that we must release the latch on the ibuf tree root
+	because in fseg_free_page we access level 1 pages, and the root
+	is a level 2 page. */
+
+	ibuf_mtr_commit(&mtr2);
+	ibuf_exit(&mtr);
+
+	/* Since pessimistic inserts were prevented, we know that the
+	page is still in the free list. NOTE that also deletes may take
+	pages from the free list, but they take them from the start, and
+	the free list was so long that they cannot have taken the last
+	page from it. */
+
+	compile_time_assert(IBUF_SPACE_ID == 0);
+	const page_id_t	page_id{IBUF_SPACE_ID, page_no};
+	buf_block_t*	bitmap_page = nullptr;
+	dberr_t		err = fseg_free_page(
+		header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+		fil_system.sys_space, page_no, &mtr);
+
+	if (err != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	ibuf_enter(&mtr);
+
+	mysql_mutex_lock(&ibuf_mutex);
+
+	root = ibuf_tree_root_get(&mtr, &err);
+	if (UNIV_UNLIKELY(!root)) {
+		mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
+		goto func_exit;
+	}
+
+	ut_ad(page_no == flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST
+				       + root->page.frame).page);
+
+	/* Remove the page from the free list and update the ibuf size data */
+	if (buf_block_t* block =
+	    buf_page_get_gen(page_id, 0, RW_X_LATCH, nullptr, BUF_GET,
+			     &mtr, &err)) {
+		err = flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+				  block,
+				  PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+				  &mtr);
+	}
+
+	mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
+
+	if (err == DB_SUCCESS) {
+		ibuf.seg_size--;
+		ibuf.free_list_len--;
+		bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
+	}
+
+func_exit:
+	mysql_mutex_unlock(&ibuf_mutex);
+
+	if (bitmap_page) {
+		/* Set the bit indicating that this page is no more an
+		ibuf tree page (level 2 page) */
+		ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(
+			bitmap_page, page_id, srv_page_size, false, &mtr);
+	}
+
+	if (err == DB_SUCCESS) {
+		buf_page_free(fil_system.sys_space, page_no, &mtr);
+	}
+
+	ibuf_mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call.
*/ +void +ibuf_free_excess_pages(void) +/*========================*/ +{ + if (UNIV_UNLIKELY(!ibuf.index)) return; + /* Free at most a few pages at a time, so that we do not delay the + requested service too much */ + + for (ulint i = 0; i < 4; i++) { + + ibool too_much_free; + + mysql_mutex_lock(&ibuf_mutex); + too_much_free = ibuf_data_too_much_free(); + mysql_mutex_unlock(&ibuf_mutex); + + if (!too_much_free) { + return; + } + + ibuf_remove_free_page(); + } +} + +#ifdef UNIV_DEBUG +# define ibuf_get_merge_page_nos(rec,mtr,ids,pages,n_stored) \ + ibuf_get_merge_page_nos_func(rec,mtr,ids,pages,n_stored) +#else /* UNIV_DEBUG */ +# define ibuf_get_merge_page_nos(rec,mtr,ids,pages,n_stored) \ + ibuf_get_merge_page_nos_func(rec,ids,pages,n_stored) +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Reads page numbers from a leaf in an ibuf tree. +@return a lower limit for the combined volume of records which will be +merged */ +static +ulint +ibuf_get_merge_page_nos_func( +/*=========================*/ + const rec_t* rec, /*!< in: insert buffer record */ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction holding rec */ +#endif /* UNIV_DEBUG */ + uint32_t* space_ids,/*!< in/out: space id's of the pages */ + uint32_t* page_nos,/*!< in/out: buffer for at least + IBUF_MAX_N_PAGES_MERGED many page numbers; + the page numbers are in an ascending order */ + ulint* n_stored)/*!< out: number of page numbers stored to + page_nos in this function */ +{ + uint32_t prev_page_no; + uint32_t prev_space_id; + uint32_t first_page_no; + uint32_t first_space_id; + uint32_t rec_page_no; + uint32_t rec_space_id; + ulint sum_volumes; + ulint volume_for_page; + ulint rec_volume; + ulint limit; + ulint n_pages; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + + *n_stored = 0; + + if (page_rec_is_supremum(rec)) { + + rec = page_rec_get_prev_const(rec); + if (UNIV_UNLIKELY(!rec)) { +corruption: + ut_ad("corrupted page" == 0); + return 0; + } + } + + if (page_rec_is_infimum(rec)) { + rec = page_rec_get_next_const(rec); + if (!rec || page_rec_is_supremum(rec)) { + return 0; + } + } + + limit = ut_min(IBUF_MAX_N_PAGES_MERGED, + buf_pool_get_curr_size() / 4); + + first_page_no = ibuf_rec_get_page_no(mtr, rec); + first_space_id = ibuf_rec_get_space(mtr, rec); + n_pages = 0; + prev_page_no = 0; + prev_space_id = 0; + + /* Go backwards from the first rec until we reach the border of the + 'merge area', or the page start or the limit of storeable pages is + reached */ + + while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) { + + rec_page_no = ibuf_rec_get_page_no(mtr, rec); + rec_space_id = ibuf_rec_get_space(mtr, rec); + + if (rec_space_id != first_space_id + || (rec_page_no / IBUF_MERGE_AREA) + != (first_page_no / IBUF_MERGE_AREA)) { + + break; + } + + if (rec_page_no != prev_page_no + || rec_space_id != prev_space_id) { + n_pages++; + } + + prev_page_no = rec_page_no; + prev_space_id = rec_space_id; + + if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) { + goto corruption; + } + } + + rec = page_rec_get_next_const(rec); + + /* At the loop start there is no prev page; we mark this with a pair + of space id, page no (0, 0) for which there can never be entries in + the insert buffer */ + + prev_page_no = 0; + prev_space_id = 0; + sum_volumes = 0; + volume_for_page = 0; + + while (*n_stored < limit && rec) { + if (page_rec_is_supremum(rec)) { + /* When no more records 
available, mark this with + another 'impossible' pair of space id, page no */ + rec_page_no = 1; + rec_space_id = 0; + } else { + rec_page_no = ibuf_rec_get_page_no(mtr, rec); + rec_space_id = ibuf_rec_get_space(mtr, rec); + /* In the system tablespace the smallest + possible secondary index leaf page number is + bigger than FSP_DICT_HDR_PAGE_NO (7). + In all tablespaces, pages 0 and 1 are reserved + for the allocation bitmap and the change + buffer bitmap. In file-per-table tablespaces, + a file segment inode page will be created at + page 2 and the clustered index tree is created + at page 3. So for file-per-table tablespaces, + page 4 is the smallest possible secondary + index leaf page. CREATE TABLESPACE also initially + uses pages 2 and 3 for the first created table, + but that table may be dropped, allowing page 2 + to be reused for a secondary index leaf page. + To keep this assertion simple, just + make sure the page is >= 2. */ + ut_ad(rec_page_no >= FSP_FIRST_INODE_PAGE_NO); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED); +#endif + if ((rec_space_id != prev_space_id + || rec_page_no != prev_page_no) + && (prev_space_id != 0 || prev_page_no != 0)) { + + space_ids[*n_stored] = prev_space_id; + page_nos[*n_stored] = prev_page_no; + (*n_stored)++; + sum_volumes += volume_for_page; + + if (rec_space_id != first_space_id + || rec_page_no / IBUF_MERGE_AREA + != first_page_no / IBUF_MERGE_AREA) { + + break; + } + + volume_for_page = 0; + } + + if (rec_page_no == 1 && rec_space_id == 0) { + /* Supremum record */ + + break; + } + + rec_volume = ibuf_rec_get_volume(mtr, rec); + + volume_for_page += rec_volume; + + prev_page_no = rec_page_no; + prev_space_id = rec_space_id; + + rec = page_rec_get_next_const(rec); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(*n_stored <= IBUF_MAX_N_PAGES_MERGED); +#endif +#if 0 + fprintf(stderr, "Ibuf merge batch %lu pages %lu volume\n", + *n_stored, sum_volumes); +#endif + return(sum_volumes); +} + +/*******************************************************************//** +Get the matching records for space id. +@return current rec or NULL */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +const rec_t* +ibuf_get_user_rec( +/*===============*/ + btr_pcur_t* pcur, /*!< in: the current cursor */ + mtr_t* mtr) /*!< in: mini transaction */ +{ + do { + const rec_t* rec = btr_pcur_get_rec(pcur); + + if (page_rec_is_user_rec(rec)) { + return(rec); + } + } while (btr_pcur_move_to_next(pcur, mtr)); + + return(NULL); +} + +/*********************************************************************//** +Reads page numbers for a space id from an ibuf tree. 
+@return a lower limit for the combined volume of records which will be +merged */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ulint +ibuf_get_merge_pages( +/*=================*/ + btr_pcur_t* pcur, /*!< in/out: cursor */ + uint32_t space, /*!< in: space for which to merge */ + ulint limit, /*!< in: max page numbers to read */ + uint32_t* pages, /*!< out: pages read */ + uint32_t* spaces, /*!< out: spaces read */ + ulint* n_pages,/*!< out: number of pages read */ + mtr_t* mtr) /*!< in: mini transaction */ +{ + const rec_t* rec; + ulint volume = 0; + + *n_pages = 0; + + while ((rec = ibuf_get_user_rec(pcur, mtr)) != 0 + && ibuf_rec_get_space(mtr, rec) == space + && *n_pages < limit) { + + uint32_t page_no = ibuf_rec_get_page_no(mtr, rec); + + if (*n_pages == 0 || pages[*n_pages - 1] != page_no) { + spaces[*n_pages] = space; + pages[*n_pages] = page_no; + ++*n_pages; + } + + volume += ibuf_rec_get_volume(mtr, rec); + + btr_pcur_move_to_next(pcur, mtr); + } + + return(volume); +} + +/** +Delete a change buffer record. +@param[in] page_id page identifier +@param[in,out] pcur persistent cursor positioned on the record +@param[in] search_tuple search key for (space,page_no) +@param[in,out] mtr mini-transaction +@return whether mtr was committed (due to pessimistic operation) */ +static MY_ATTRIBUTE((warn_unused_result, nonnull)) +bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur, + const dtuple_t* search_tuple, mtr_t* mtr); + +/** Delete the change buffer records for the given page id +@param page_id page identifier */ +static void ibuf_delete_recs(const page_id_t page_id) +{ + if (!ibuf.index || srv_read_only_mode) + return; + dfield_t dfield[IBUF_REC_FIELD_METADATA]; + dtuple_t tuple {0,IBUF_REC_FIELD_METADATA,IBUF_REC_FIELD_METADATA, + dfield,0,nullptr +#ifdef UNIV_DEBUG + ,DATA_TUPLE_MAGIC_N +#endif + }; + byte space_id[4], page_no[4]; + + mach_write_to_4(space_id, page_id.space()); + mach_write_to_4(page_no, page_id.page_no()); + + dfield_set_data(&dfield[0], space_id, 4); + dfield_set_data(&dfield[1], field_ref_zero, 1); + dfield_set_data(&dfield[2], page_no, 4); + dtuple_set_types_binary(&tuple, IBUF_REC_FIELD_METADATA); + + mtr_t mtr; +loop: + btr_pcur_t pcur; + pcur.btr_cur.page_cur.index= ibuf.index; + ibuf_mtr_start(&mtr); + if (btr_pcur_open(&tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, &pcur, &mtr)) + goto func_exit; + if (!btr_pcur_is_on_user_rec(&pcur)) + { + ut_ad(btr_pcur_is_after_last_on_page(&pcur)); + goto func_exit; + } + + for (;;) + { + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + const rec_t* ibuf_rec = btr_pcur_get_rec(&pcur); + if (ibuf_rec_get_space(&mtr, ibuf_rec) != page_id.space() + || ibuf_rec_get_page_no(&mtr, ibuf_rec) != page_id.page_no()) + break; + /* Delete the record from ibuf */ + if (ibuf_delete_rec(page_id, &pcur, &tuple, &mtr)) + { + /* Deletion was pessimistic and mtr was committed: + we start from the beginning again */ + ut_ad(mtr.has_committed()); + goto loop; + } + + if (btr_pcur_is_after_last_on_page(&pcur)) + { + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + goto loop; + } + } +func_exit: + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); +} + +/** Merge the change buffer to some pages. 
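+The pages are read in through the buffer pool; applying the buffered
+changes is a side effect of completing the read (see the scenario
+comment inside the function). A minimal caller sketch, with
+hypothetical space_id/page_no values:
+
+  uint32_t spaces[1] = {space_id};
+  uint32_t pages[1] = {page_no};
+  ibuf_read_merge_pages(spaces, pages, 1);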
*/ +static void ibuf_read_merge_pages(const uint32_t* space_ids, + const uint32_t* page_nos, ulint n_stored) +{ + for (ulint i = 0; i < n_stored; i++) { + const uint32_t space_id = space_ids[i]; + fil_space_t* s = fil_space_t::get(space_id); + if (!s) { +tablespace_deleted: + /* The tablespace was not found: remove all + entries for it */ + ibuf_delete_for_discarded_space(space_id); + while (i + 1 < n_stored + && space_ids[i + 1] == space_id) { + i++; + } + continue; + } + + const ulint zip_size = s->zip_size(), size = s->size; + s->x_lock(); + s->release(); + mtr_t mtr; + + if (UNIV_LIKELY(page_nos[i] < size)) { + mtr.start(); + dberr_t err; + buf_block_t *block = + buf_page_get_gen(page_id_t(space_id, page_nos[i]), + zip_size, RW_X_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, + &mtr, &err, true); + bool remove = !block + || fil_page_get_type(block->page.frame) + != FIL_PAGE_INDEX + || !page_is_leaf(block->page.frame); + mtr.commit(); + if (err == DB_TABLESPACE_DELETED) { + s->x_unlock(); + goto tablespace_deleted; + } + if (!remove) { + s->x_unlock(); + continue; + } + } + + s->x_unlock(); + + if (srv_shutdown_state == SRV_SHUTDOWN_NONE + || srv_fast_shutdown) { + continue; + } + + /* The following code works around a hang when the + change buffer is corrupted, likely due to the + failure of ibuf_merge_or_delete_for_page() to + invoke ibuf_delete_recs() if (!bitmap_bits). + + It also introduced corruption by itself in the + following scenario: + + (1) We merged buffered changes in buf_page_get_gen() + (2) We committed the mini-transaction + (3) Redo log and the page with the merged changes is written + (4) A write completion callback thread evicts the page. + (5) Other threads buffer changes for that page. + (6) We will wrongly discard those newly buffered changes below. + + To prevent this scenario, we will only invoke this code + on shutdown. A call to ibuf_max_size_update(0) will cause + ibuf_insert_low() to refuse to insert anything into the + change buffer. */ + + /* Prevent an infinite loop, by removing entries from + the change buffer in the case the bitmap bits were + wrongly clear even though buffered changes exist. */ + ibuf_delete_recs(page_id_t(space_id, page_nos[i])); + } +} + +/** Contract the change buffer by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read +@retval 0 if ibuf.empty */ +ATTRIBUTE_COLD ulint ibuf_contract() +{ + if (UNIV_UNLIKELY(!ibuf.index)) return 0; + mtr_t mtr; + btr_cur_t cur; + ulint sum_sizes; + uint32_t page_nos[IBUF_MAX_N_PAGES_MERGED]; + uint32_t space_ids[IBUF_MAX_N_PAGES_MERGED]; + + ibuf_mtr_start(&mtr); + + if (cur.open_leaf(true, ibuf.index, BTR_SEARCH_LEAF, &mtr) != + DB_SUCCESS) { + return 0; + } + + ut_ad(page_validate(btr_cur_get_page(&cur), ibuf.index)); + + if (page_is_empty(btr_cur_get_page(&cur))) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. 
*/
+	ut_ad(ibuf.empty);
+	ut_ad(btr_cur_get_block(&cur)->page.id()
+	      == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
+
+	ibuf_mtr_commit(&mtr);
+
+	return(0);
+	}
+
+	ulint n_pages = 0;
+	sum_sizes = ibuf_get_merge_page_nos(btr_cur_get_rec(&cur), &mtr,
+					    space_ids, page_nos, &n_pages);
+	ibuf_mtr_commit(&mtr);
+
+	ibuf_read_merge_pages(space_ids, page_nos, n_pages);
+
+	return(sum_sizes + 1);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages referring to space_id
+to the buffer pool.
+@returns number of pages merged.*/
+ulint
+ibuf_merge_space(
+/*=============*/
+	ulint	space)	/*!< in: tablespace id to merge */
+{
+	if (UNIV_UNLIKELY(!ibuf.index)) return 0;
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+
+	dfield_t	dfield[IBUF_REC_FIELD_METADATA];
+	dtuple_t	tuple {0, IBUF_REC_FIELD_METADATA,
+		IBUF_REC_FIELD_METADATA,dfield,0,nullptr
+#ifdef UNIV_DEBUG
+		, DATA_TUPLE_MAGIC_N
+#endif
+	};
+	byte		space_id[4];
+
+	mach_write_to_4(space_id, space);
+
+	dfield_set_data(&dfield[0], space_id, 4);
+	dfield_set_data(&dfield[1], field_ref_zero, 1);
+	dfield_set_data(&dfield[2], field_ref_zero, 4);
+
+	dtuple_set_types_binary(&tuple, IBUF_REC_FIELD_METADATA);
+	ulint		n_pages = 0;
+
+	ut_ad(space < SRV_SPACE_ID_UPPER_BOUND);
+
+	log_free_check();
+	ibuf_mtr_start(&mtr);
+
+	/* Position the cursor on the first matching record. */
+
+	pcur.btr_cur.page_cur.index = ibuf.index;
+	dberr_t err = btr_pcur_open(&tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF,
+				    &pcur, &mtr);
+	ut_ad(err != DB_SUCCESS || page_validate(btr_pcur_get_page(&pcur),
+						 ibuf.index));
+
+	ulint		sum_sizes = 0;
+	uint32_t	pages[IBUF_MAX_N_PAGES_MERGED];
+	uint32_t	spaces[IBUF_MAX_N_PAGES_MERGED];
+
+	if (err != DB_SUCCESS) {
+	} else if (page_is_empty(btr_pcur_get_page(&pcur))) {
+		/* If a B-tree page is empty, it must be the root page
+		and the whole B-tree must be empty. InnoDB does not
+		allow empty B-tree pages other than the root. */
+		ut_ad(ibuf.empty);
+		ut_ad(btr_pcur_get_block(&pcur)->page.id()
+		      == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
+	} else {
+
+		sum_sizes = ibuf_get_merge_pages(
+			&pcur, uint32_t(space), IBUF_MAX_N_PAGES_MERGED,
+			&pages[0], &spaces[0], &n_pages,
+			&mtr);
+		ib::info() << "Size of pages merged " << sum_sizes;
+	}
+
+	ibuf_mtr_commit(&mtr);
+
+	if (n_pages > 0) {
+		ut_ad(n_pages <= UT_ARR_SIZE(pages));
+
+#ifdef UNIV_DEBUG
+		for (ulint i = 0; i < n_pages; ++i) {
+			ut_ad(spaces[i] == space);
+		}
+#endif /* UNIV_DEBUG */
+
+		ibuf_read_merge_pages(spaces, pages, n_pages);
+	}
+
+	return(n_pages);
+}
+
+/** Determine if a change buffer record has been encountered already.
+@param rec change buffer record in the MySQL 5.5 format
+@param hash hash table of encountered records
+@param size number of elements in hash
+@retval true if a distinct record
+@retval false if this may be duplicating an earlier record */
+static bool ibuf_get_volume_buffered_hash(const rec_t *rec, ulint *hash,
+                                          ulint size)
+{
+  ut_ad(rec_get_n_fields_old(rec) > IBUF_REC_FIELD_USER);
+  const ulint start= rec_get_field_start_offs(rec, IBUF_REC_FIELD_USER);
+  const ulint len= rec_get_data_size_old(rec) - start;
+  const uint32_t fold= my_crc32c(0, rec + start, len);
+  hash+= (fold / (CHAR_BIT * sizeof *hash)) % size;
+  ulint bitmask= static_cast<ulint>(1) << (fold % (CHAR_BIT * sizeof(*hash)));
+
+  if (*hash & bitmask)
+    return false;
+
+  /* We have not seen this record yet. Remember it.
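+     Note that this works like a small Bloom filter: each record is
+     folded to a single bit, so a fresh record may be misreported as
+     already seen (and the caller then skips the *n_recs increment),
+     but a remembered record is never reported as new, which keeps
+     the result a valid lower bound. A standalone sketch of the same
+     idea (illustrative only):
+
+       bool seen_before(uint32_t fold, ulint* bits, ulint n_words)
+       {
+         ulint* word = bits + (fold / (CHAR_BIT * sizeof *bits)) % n_words;
+         ulint mask = static_cast<ulint>(1)
+                      << (fold % (CHAR_BIT * sizeof *bits));
+         if (*word & mask) return true;
+         *word |= mask;
+         return false;
+       }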
*/ + *hash|= bitmask; + return true; +} + +#ifdef UNIV_DEBUG +# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \ + ibuf_get_volume_buffered_count_func(mtr,rec,hash,size,n_recs) +#else /* UNIV_DEBUG */ +# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \ + ibuf_get_volume_buffered_count_func(rec,hash,size,n_recs) +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Update the estimate of the number of records on a page, and +get the space taken by merging the buffered record to the index page. +@return size of index record in bytes + an upper limit of the space +taken in the page directory */ +static +ulint +ibuf_get_volume_buffered_count_func( +/*================================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: insert buffer record */ + ulint* hash, /*!< in/out: hash array */ + ulint size, /*!< in: number of elements in hash array */ + lint* n_recs) /*!< in/out: estimated number of records + on the page that rec points to */ +{ + ulint len; + ibuf_op_t ibuf_op; + const byte* types; + ulint n_fields; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + + n_fields = rec_get_n_fields_old(rec); + ut_ad(n_fields > IBUF_REC_FIELD_USER); + n_fields -= IBUF_REC_FIELD_USER; + + rec_get_nth_field_offs_old(rec, 1, &len); + /* This function is only invoked when buffering new + operations. All pre-4.1 records should have been merged + when the database was started up. */ + ut_a(len == 1); + + if (rec_get_deleted_flag(rec, 0)) { + /* This record has been merged already, + but apparently the system crashed before + the change was discarded from the buffer. + Pretend that the record does not exist. */ + return(0); + } + + types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + switch (UNIV_EXPECT(int(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE), + IBUF_REC_INFO_SIZE)) { + default: + ut_error; + case 0: + /* This ROW_TYPE=REDUNDANT record does not include an + operation counter. Exclude it from the *n_recs, + because deletes cannot be buffered if there are + old-style inserts buffered for the page. */ + + len = ibuf_rec_get_size(rec, types, n_fields, 0); + + return(len + + rec_get_converted_extra_size(len, n_fields, 0) + + page_dir_calc_reserved_space(1)); + case 1: + /* This ROW_TYPE=COMPACT record does not include an + operation counter. Exclude it from the *n_recs, + because deletes cannot be buffered if there are + old-style inserts buffered for the page. */ + goto get_volume_comp; + + case IBUF_REC_INFO_SIZE: + ibuf_op = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE]; + break; + } + + switch (ibuf_op) { + case IBUF_OP_INSERT: + /* Inserts can be done by updating a delete-marked record. + Because delete-mark and insert operations can be pointing to + the same records, we must not count duplicates. */ + case IBUF_OP_DELETE_MARK: + /* There must be a record to delete-mark. + See if this record has been already buffered. */ + if (n_recs && ibuf_get_volume_buffered_hash(rec, hash, size)) { + (*n_recs)++; + } + + if (ibuf_op == IBUF_OP_DELETE_MARK) { + /* Setting the delete-mark flag does not + affect the available space on the page. */ + return(0); + } + break; + case IBUF_OP_DELETE: + /* A record will be removed from the page. 
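+	   This undoes the increment that the matching buffered
+	   IBUF_OP_INSERT or IBUF_OP_DELETE_MARK contributed above; for
+	   example, two distinct buffered inserts followed by one
+	   buffered delete leave *n_recs == 1, i.e. at least one record
+	   is still expected on the page after the merge.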
*/ + if (n_recs) { + (*n_recs)--; + } + /* While deleting a record actually frees up space, + we have to play it safe and pretend that it takes no + additional space (the record might not exist, etc.). */ + return(0); + default: + ut_error; + } + + ut_ad(ibuf_op == IBUF_OP_INSERT); + +get_volume_comp: + { + dtuple_t* entry; + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + + entry = ibuf_build_entry_from_ibuf_rec( + mtr, rec, heap, &dummy_index); + + volume = rec_get_converted_size(dummy_index, entry, 0); + + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + + return(volume + page_dir_calc_reserved_space(1)); + } +} + +/*********************************************************************//** +Gets an upper limit for the combined size of entries buffered in the insert +buffer for a given page. +@return upper limit for the volume of buffered inserts for the index +page, in bytes; srv_page_size, if the entries for the index page span +several pages in the insert buffer */ +static +ulint +ibuf_get_volume_buffered( +/*=====================*/ + const btr_pcur_t*pcur, /*!< in: pcur positioned at a place in an + insert buffer tree where we would insert an + entry for the index page whose number is + page_no, latch mode has to be BTR_MODIFY_PREV + or BTR_MODIFY_TREE */ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: page number of an index page */ + lint* n_recs, /*!< in/out: minimum number of records on the + page after the buffered changes have been + applied, or NULL to disable the counting */ + mtr_t* mtr) /*!< in: mini-transaction of pcur */ +{ + ulint volume; + const rec_t* rec; + const page_t* page; + const page_t* prev_page; + const page_t* next_page; + /* bitmap of buffered recs */ + ulint hash_bitmap[128 / sizeof(ulint)]; + + ut_ad((pcur->latch_mode == BTR_MODIFY_PREV) + || (pcur->latch_mode == BTR_MODIFY_TREE)); + + /* Count the volume of inserts earlier in the alphabetical order than + pcur */ + + volume = 0; + + if (n_recs) { + memset(hash_bitmap, 0, sizeof hash_bitmap); + } + + rec = btr_pcur_get_rec(pcur); + page = page_align(rec); + ut_ad(page_validate(page, ibuf.index)); + + if (page_rec_is_supremum(rec) + && UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) { +corruption: + ut_ad("corrupted page" == 0); + return srv_page_size; + } + + uint32_t prev_page_no; + + for (; !page_rec_is_infimum(rec); ) { + ut_ad(page_align(rec) == page); + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + goto count_later; + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + + if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) { + goto corruption; + } + } + + /* Look at the previous page */ + + prev_page_no = btr_page_get_prev(page); + + if (prev_page_no == FIL_NULL) { + + goto count_later; + } + + if (buf_block_t* block = + buf_page_get(page_id_t(IBUF_SPACE_ID, prev_page_no), + 0, RW_X_LATCH, mtr)) { + prev_page = buf_block_get_frame(block); + ut_ad(page_validate(prev_page, ibuf.index)); + } else { + return srv_page_size; + } + + static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + + if (UNIV_UNLIKELY(memcmp_aligned<4>(prev_page + FIL_PAGE_NEXT, + page + FIL_PAGE_OFFSET, 4))) { + return srv_page_size; + } + + rec = page_rec_get_prev_const(page_get_supremum_rec(prev_page)); + + if (UNIV_UNLIKELY(!rec)) { + goto corruption; + } + + for (;;) { + ut_ad(page_align(rec) 
== prev_page); + + if (page_rec_is_infimum(rec)) { + + /* We cannot go to yet a previous page, because we + do not have the x-latch on it, and cannot acquire one + because of the latching order: we have to give up */ + + return(srv_page_size); + } + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + goto count_later; + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + + if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) { + goto corruption; + } + } + +count_later: + rec = btr_pcur_get_rec(pcur); + + if (!page_rec_is_supremum(rec)) { + rec = page_rec_get_next_const(rec); + } + + for (; !page_rec_is_supremum(rec); + rec = page_rec_get_next_const(rec)) { + if (UNIV_UNLIKELY(!rec)) { + return srv_page_size; + } + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + return(volume); + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } + + /* Look at the next page */ + + uint32_t next_page_no = btr_page_get_next(page); + + if (next_page_no == FIL_NULL) { + + return(volume); + } + + if (buf_block_t* block = + buf_page_get(page_id_t(IBUF_SPACE_ID, next_page_no), + 0, RW_X_LATCH, mtr)) { + next_page = buf_block_get_frame(block); + ut_ad(page_validate(next_page, ibuf.index)); + } else { + return srv_page_size; + } + + static_assert(FIL_PAGE_PREV % 4 == 0, "alignment"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + + if (UNIV_UNLIKELY(memcmp_aligned<4>(next_page + FIL_PAGE_PREV, + page + FIL_PAGE_OFFSET, 4))) { + return 0; + } + + rec = page_get_infimum_rec(next_page); + rec = page_rec_get_next_const(rec); + + for (; ; rec = page_rec_get_next_const(rec)) { + if (!rec || page_rec_is_supremum(rec)) { + /* We give up */ + return(srv_page_size); + } + + ut_ad(page_align(rec) == next_page); + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + return(volume); + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } +} + +/*********************************************************************//** +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. 
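+Because the ibuf tree is keyed on (space id, marker, page number), the
+last user record in key order carries the largest buffered space id;
+the function opens the rightmost leaf and steps back once, roughly:
+
+  pcur.open_leaf(false, ibuf.index, BTR_SEARCH_LEAF, &mtr);
+  btr_pcur_move_to_prev(&pcur, &mtr);
+
+and then hands the value to fil_set_max_space_id_if_bigger(), so that
+tablespace ids assigned after a restart cannot collide with ids that
+still have buffered entries.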
*/ +void +ibuf_update_max_tablespace_id(void) +/*===============================*/ +{ + if (UNIV_UNLIKELY(!ibuf.index)) return; + const rec_t* rec; + const byte* field; + ulint len; + btr_pcur_t pcur; + mtr_t mtr; + + ut_ad(!ibuf.index->table->not_redundant()); + + ibuf_mtr_start(&mtr); + + if (pcur.open_leaf(false, ibuf.index, BTR_SEARCH_LEAF, &mtr) + != DB_SUCCESS) { +func_exit: + ibuf_mtr_commit(&mtr); + return; + } + + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index)); + + if (!btr_pcur_move_to_prev(&pcur, &mtr) + || btr_pcur_is_before_first_on_page(&pcur)) { + goto func_exit; + } + + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len); + + ut_a(len == 4); + + const uint32_t max_space_id = mach_read_from_4(field); + + ibuf_mtr_commit(&mtr); + + /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */ + + fil_set_max_space_id_if_bigger(max_space_id); +} + +#ifdef UNIV_DEBUG +# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \ + ibuf_get_entry_counter_low_func(mtr,rec,space,page_no) +#else /* UNIV_DEBUG */ +# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \ + ibuf_get_entry_counter_low_func(rec,space,page_no) +#endif +/****************************************************************//** +Helper function for ibuf_get_entry_counter_func. Checks if rec is for +(space, page_no), and if so, reads counter value from it and returns +that + 1. +@retval ULINT_UNDEFINED if the record does not contain any counter +@retval 0 if the record is not for (space, page_no) +@retval 1 + previous counter value, otherwise */ +static +ulint +ibuf_get_entry_counter_low_func( +/*============================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction of rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: insert buffer record */ + ulint space, /*!< in: space id */ + ulint page_no) /*!< in: page number */ +{ + ulint counter; + const byte* field; + ulint len; + + ut_ad(ibuf_inside(mtr)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + + /* Check the tablespace identifier. */ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len); + + ut_a(len == 4); + + if (mach_read_from_4(field) != space) { + + return(0); + } + + /* Check the page offset. */ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len); + ut_a(len == 4); + + if (mach_read_from_4(field) != page_no) { + + return(0); + } + + /* Check if the record contains a counter field. 
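+	   The field length encodes this: each indexed column takes
+	   DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE (6) bytes of type info,
+	   and newer records prepend IBUF_REC_INFO_SIZE (4) bytes
+	   (2-byte counter, operation type, flags). Worked example,
+	   assuming those constant values: 3 columns with the info
+	   bytes give len = 4 + 3 * 6 = 22, and 22 % 6 == 4 ==
+	   IBUF_REC_INFO_SIZE, selecting the counter-bearing case
+	   below; remainders 0 and 1 mark the older counter-less
+	   formats.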
*/ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + switch (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + default: + ut_error; + case 0: /* ROW_FORMAT=REDUNDANT */ + case 1: /* ROW_FORMAT=COMPACT */ + return(ULINT_UNDEFINED); + + case IBUF_REC_INFO_SIZE: + counter = mach_read_from_2(field + IBUF_REC_OFFSET_COUNTER); + ut_a(counter < 0xFFFF); + return(counter + 1); + } +} + +#ifdef UNIV_DEBUG +# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \ + ibuf_get_entry_counter_func(space,page_no,rec,mtr,exact_leaf) +#else /* UNIV_DEBUG */ +# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \ + ibuf_get_entry_counter_func(space,page_no,rec,exact_leaf) +#endif /* UNIV_DEBUG */ + +/****************************************************************//** +Calculate the counter field for an entry based on the current +last record in ibuf for (space, page_no). +@return the counter field, or ULINT_UNDEFINED +if we should abort this insertion to ibuf */ +static +ulint +ibuf_get_entry_counter_func( +/*========================*/ + ulint space, /*!< in: space id of entry */ + ulint page_no, /*!< in: page number of entry */ + const rec_t* rec, /*!< in: the record preceding the + insertion point */ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction */ +#endif /* UNIV_DEBUG */ + ibool only_leaf) /*!< in: TRUE if this is the only + leaf page that can contain entries + for (space,page_no), that is, there + was no exact match for (space,page_no) + in the node pointer */ +{ + ut_ad(ibuf_inside(mtr)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_validate(page_align(rec), ibuf.index)); + + if (page_rec_is_supremum(rec)) { + /* This is just for safety. The record should be a + page infimum or a user record. */ + ut_ad(0); + return(ULINT_UNDEFINED); + } else if (!page_rec_is_infimum(rec)) { + return(ibuf_get_entry_counter_low(mtr, rec, space, page_no)); + } else if (only_leaf || !page_has_prev(page_align(rec))) { + /* The parent node pointer did not contain the + searched for (space, page_no), which means that the + search ended on the correct page regardless of the + counter value, and since we're at the infimum record, + there are no existing records. */ + return(0); + } else { + /* We used to read the previous page here. It would + break the latching order, because the caller has + buffer-fixed an insert buffer bitmap page. */ + return(ULINT_UNDEFINED); + } +} + + +/** Translates the ibuf free bits to the free space on a page in bytes. +@param[in] physical_size page_size +@param[in] bits value for ibuf bitmap bits +@return maximum insert size after reorganize for the page */ +inline ulint +ibuf_index_page_calc_free_from_bits(ulint physical_size, ulint bits) +{ + ut_ad(bits < 4); + ut_ad(physical_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); + + if (bits == 3) { + bits = 4; + } + + return bits * physical_size / IBUF_PAGE_SIZE_PER_FREE_SPACE; +} + +/** Buffer an operation in the insert/delete buffer, instead of doing it +directly to the disk page, if this is possible. 
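+Whether a buffered insert can fit is estimated from the change buffer
+bitmap free bits through ibuf_index_page_calc_free_from_bits() above.
+Worked example for the default 16KiB page: one bit step corresponds to
+16384 / IBUF_PAGE_SIZE_PER_FREE_SPACE (32) = 512 bytes, so bits == 2
+promises at least 1024 free bytes after reorganization, and bits == 3
+(counted as 4) at least 2048 bytes.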
+@param[in] mode BTR_MODIFY_PREV or BTR_INSERT_TREE +@param[in] op operation type +@param[in] no_counter TRUE=use 5.0.3 format; FALSE=allow delete +buffering +@param[in] entry index entry to insert +@param[in] entry_size rec_get_converted_size(index, entry) +@param[in,out] index index where to insert; must not be unique +or clustered +@param[in] page_id page id where to insert +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] thr query thread +@return DB_SUCCESS, DB_STRONG_FAIL or other error */ +static TRANSACTIONAL_TARGET MY_ATTRIBUTE((warn_unused_result)) +dberr_t +ibuf_insert_low( + btr_latch_mode mode, + ibuf_op_t op, + ibool no_counter, + const dtuple_t* entry, + ulint entry_size, + dict_index_t* index, + const page_id_t page_id, + ulint zip_size, + que_thr_t* thr) +{ + big_rec_t* dummy_big_rec; + btr_pcur_t pcur; + btr_cur_t* cursor; + dtuple_t* ibuf_entry; + mem_heap_t* offsets_heap = NULL; + mem_heap_t* heap; + rec_offs* offsets = NULL; + ulint buffered; + lint min_n_recs; + rec_t* ins_rec; + buf_block_t* bitmap_page; + buf_block_t* block = NULL; + page_t* root; + dberr_t err; + mtr_t mtr; + mtr_t bitmap_mtr; + + ut_a(!dict_index_is_clust(index)); + ut_ad(!dict_index_is_spatial(index)); + ut_ad(dtuple_check_typed(entry)); + ut_ad(!no_counter || op == IBUF_OP_INSERT); + ut_ad(page_id.space() == index->table->space_id); + ut_a(op < IBUF_OP_COUNT); + + /* Perform dirty comparison of ibuf.max_size and ibuf.size to + reduce ibuf_mutex contention. */ + if (ibuf.size >= ibuf.max_size) { + return(DB_STRONG_FAIL); + } + + heap = mem_heap_create(1024); + + /* Build the entry which contains the space id and the page number + as the first fields and the type information for other fields, and + which will be inserted to the insert buffer. Using a counter value + of 0xFFFF we find the last record for (space, page_no), from which + we can then read the counter value N and use N + 1 in the record we + insert. (We patch the ibuf_entry's counter field to the correct + value just before actually inserting the entry.) */ + + ibuf_entry = ibuf_entry_build( + op, index, entry, page_id.space(), page_id.page_no(), + no_counter ? ULINT_UNDEFINED : 0xFFFF, heap); + + /* Open a cursor to the insert buffer tree to calculate if we can add + the new entry to it without exceeding the free space limit for the + page. */ + + if (mode == BTR_INSERT_TREE) { + for (;;) { + mysql_mutex_lock(&ibuf_pessimistic_insert_mutex); + mysql_mutex_lock(&ibuf_mutex); + + if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) { + + break; + } + + mysql_mutex_unlock(&ibuf_mutex); + mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex); + + if (!ibuf_add_free_page()) { + + mem_heap_free(heap); + return(DB_STRONG_FAIL); + } + } + } + + ibuf_mtr_start(&mtr); + pcur.btr_cur.page_cur.index = ibuf.index; + + err = btr_pcur_open(ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr); + if (err != DB_SUCCESS) { +func_exit: + ibuf_mtr_commit(&mtr); + ut_free(pcur.old_rec_buf); + mem_heap_free(heap); + return err; + } + + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index)); + + /* Find out the volume of already buffered inserts for the same index + page */ + min_n_recs = 0; + buffered = ibuf_get_volume_buffered(&pcur, + page_id.space(), + page_id.page_no(), + op == IBUF_OP_DELETE + ? &min_n_recs + : NULL, &mtr); + + const ulint physical_size = zip_size ? 
zip_size : srv_page_size; + + if (op == IBUF_OP_DELETE + && (min_n_recs < 2 || buf_pool.watch_occurred(page_id))) { + /* The page could become empty after the record is + deleted, or the page has been read in to the buffer + pool. Refuse to buffer the operation. */ + + /* The buffer pool watch is needed for IBUF_OP_DELETE + because of latching order considerations. We can + check buf_pool_watch_occurred() only after latching + the insert buffer B-tree pages that contain buffered + changes for the page. We never buffer IBUF_OP_DELETE, + unless some IBUF_OP_INSERT or IBUF_OP_DELETE_MARK have + been previously buffered for the page. Because there + are buffered operations for the page, the insert + buffer B-tree page latches held by mtr will guarantee + that no changes for the user page will be merged + before mtr_commit(&mtr). We must not mtr_commit(&mtr) + until after the IBUF_OP_DELETE has been buffered. */ + +fail_exit: + if (mode == BTR_INSERT_TREE) { + mysql_mutex_unlock(&ibuf_mutex); + mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex); + } + + err = DB_STRONG_FAIL; + goto func_exit; + } + + /* After this point, the page could still be loaded to the + buffer pool, but we do not have to care about it, since we are + holding a latch on the insert buffer leaf page that contains + buffered changes for (space, page_no). If the page enters the + buffer pool, buf_page_t::read_complete() for (space, page_no) will + have to acquire a latch on the same insert buffer leaf page, + which it cannot do until we have buffered the IBUF_OP_DELETE + and done mtr_commit(&mtr) to release the latch. */ + + ibuf_mtr_start(&bitmap_mtr); + + bitmap_page = ibuf_bitmap_get_map_page(page_id, zip_size, &bitmap_mtr); + + /* We check if the index page is suitable for buffered entries */ + + if (!bitmap_page || buf_pool.page_hash_contains( + page_id, buf_pool.page_hash.cell_get(page_id.fold()))) { +commit_exit: + ibuf_mtr_commit(&bitmap_mtr); + goto fail_exit; + } else if (!lock_sys.rd_lock_try()) { + goto commit_exit; + } else { + hash_cell_t* cell = lock_sys.rec_hash.cell_get(page_id.fold()); + lock_sys.rec_hash.latch(cell)->acquire(); + const lock_t* lock = lock_sys_t::get_first(*cell, page_id); + lock_sys.rec_hash.latch(cell)->release(); + lock_sys.rd_unlock(); + if (lock) { + goto commit_exit; + } + } + + if (op == IBUF_OP_INSERT) { + ulint bits = ibuf_bitmap_page_get_bits( + bitmap_page->page.frame, page_id, physical_size, + IBUF_BITMAP_FREE, &bitmap_mtr); + + if (buffered + entry_size + page_dir_calc_reserved_space(1) + > ibuf_index_page_calc_free_from_bits(physical_size, + bits)) { + /* Release the bitmap page latch early. */ + ibuf_mtr_commit(&bitmap_mtr); + goto fail_exit; + } + } + + if (!no_counter) { + /* Patch correct counter value to the entry to + insert. This can change the insert position, which can + result in the need to abort in some cases. 
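+	   The entry was built with the maximum counter 0xFFFF, so the
+	   PAGE_CUR_LE search above lands after every existing entry
+	   for (space, page_no); the real value is the last buffered
+	   counter plus one. For example, if the preceding record for
+	   this page carries counter 41, the bytes below are rewritten
+	   so that the entry is inserted with counter 42.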
*/ + ulint counter = ibuf_get_entry_counter( + page_id.space(), page_id.page_no(), + btr_pcur_get_rec(&pcur), &mtr, + btr_pcur_get_btr_cur(&pcur)->low_match + < IBUF_REC_FIELD_METADATA); + dfield_t* field; + + if (counter == ULINT_UNDEFINED) { + goto commit_exit; + } + + field = dtuple_get_nth_field( + ibuf_entry, IBUF_REC_FIELD_METADATA); + mach_write_to_2( + (byte*) dfield_get_data(field) + + IBUF_REC_OFFSET_COUNTER, counter); + } + + /* Set the bitmap bit denoting that the insert buffer contains + buffered entries for this index page, if the bit is not set yet */ + index->set_modified(bitmap_mtr); + ibuf_bitmap_page_set_bits( + bitmap_page, page_id, physical_size, true, &bitmap_mtr); + ibuf_mtr_commit(&bitmap_mtr); + + cursor = btr_pcur_get_btr_cur(&pcur); + + if (mode == BTR_MODIFY_PREV) { + err = btr_cur_optimistic_insert( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + block = btr_cur_get_block(cursor); + ut_ad(block->page.id().space() == IBUF_SPACE_ID); + + /* If this is the root page, update ibuf.empty. */ + if (block->page.id().page_no() == FSP_IBUF_TREE_ROOT_PAGE_NO) { + const page_t* root = buf_block_get_frame(block); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + ibuf.empty = page_is_empty(root); + } + } else { + ut_ad(mode == BTR_INSERT_TREE); + + /* We acquire an sx-latch to the root page before the insert, + because a pessimistic insert releases the tree x-latch, + which would cause the sx-latching of the root after that to + break the latching order. */ + if (buf_block_t* ibuf_root = ibuf_tree_root_get(&mtr)) { + root = ibuf_root->page.frame; + } else { + err = DB_CORRUPTION; + mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex); + mysql_mutex_unlock(&ibuf_mutex); + goto ibuf_insert_done; + } + + err = btr_cur_optimistic_insert( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + } + + mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex); + ibuf_size_update(root); + mysql_mutex_unlock(&ibuf_mutex); + ibuf.empty = page_is_empty(root); + + block = btr_cur_get_block(cursor); + ut_ad(block->page.id().space() == IBUF_SPACE_ID); + } + +ibuf_insert_done: + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + + if (err == DB_SUCCESS && op != IBUF_OP_DELETE) { + /* Update the page max trx id field */ + page_update_max_trx_id(block, NULL, + thr_get_trx(thr)->id, &mtr); + } + + goto func_exit; +} + +/** Buffer an operation in the change buffer, instead of applying it +directly to the file page, if this is possible. Does not do it if the index +is clustered or unique. 
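+A caller-side sketch (hypothetical identifiers; the real callers are in
+the B-tree cursor code, which consults this when a secondary index leaf
+page is not in the buffer pool):
+
+  if (ibuf_insert(IBUF_OP_INSERT, entry, index,
+                  page_id_t(space_id, page_no), zip_size, thr)) {
+    // buffered; the secondary index leaf page is left untouched
+  }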
+@param[in] op operation type +@param[in] entry index entry to insert +@param[in,out] index index where to insert +@param[in] page_id page id where to insert +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] thr query thread +@return true if success */ +TRANSACTIONAL_TARGET +bool +ibuf_insert( + ibuf_op_t op, + const dtuple_t* entry, + dict_index_t* index, + const page_id_t page_id, + ulint zip_size, + que_thr_t* thr) +{ + if (!index->is_committed()) { + return false; + } + + dberr_t err; + ulint entry_size; + ibool no_counter; + /* Read the settable global variable only once in + this function, so that we will have a consistent view of it. */ + ibuf_use_t use = ibuf_use_t(innodb_change_buffering); + DBUG_ENTER("ibuf_insert"); + + DBUG_PRINT("ibuf", ("op: %d, space: " UINT32PF ", page_no: " UINT32PF, + op, page_id.space(), page_id.page_no())); + + ut_ad(dtuple_check_typed(entry)); + ut_ad(page_id.space() != SRV_TMP_SPACE_ID); + ut_ad(index->is_btree()); + ut_a(!dict_index_is_clust(index)); + ut_ad(!index->table->is_temporary()); + + no_counter = use <= IBUF_USE_INSERT; + + switch (op) { + case IBUF_OP_INSERT: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_DELETE: + case IBUF_USE_DELETE_MARK: + DBUG_RETURN(false); + case IBUF_USE_INSERT: + case IBUF_USE_INSERT_DELETE_MARK: + case IBUF_USE_ALL: + goto check_watch; + } + break; + case IBUF_OP_DELETE_MARK: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_INSERT: + DBUG_RETURN(false); + case IBUF_USE_DELETE_MARK: + case IBUF_USE_DELETE: + case IBUF_USE_INSERT_DELETE_MARK: + case IBUF_USE_ALL: + ut_ad(!no_counter); + goto check_watch; + } + break; + case IBUF_OP_DELETE: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_INSERT: + case IBUF_USE_INSERT_DELETE_MARK: + DBUG_RETURN(false); + case IBUF_USE_DELETE_MARK: + case IBUF_USE_DELETE: + case IBUF_USE_ALL: + ut_ad(!no_counter); + goto skip_watch; + } + break; + case IBUF_OP_COUNT: + break; + } + + /* unknown op or use */ + ut_error; + +check_watch: + /* If a thread attempts to buffer an insert on a page while a + purge is in progress on the same page, the purge must not be + buffered, because it could remove a record that was + re-inserted later. For simplicity, we block the buffering of + all operations on a page that has a purge pending. + + We do not check this in the IBUF_OP_DELETE case, because that + would always trigger the buffer pool watch during purge and + thus prevent the buffering of delete operations. We assume + that the issuer of IBUF_OP_DELETE has called + buf_pool_t::watch_set(). */ + + if (buf_pool.page_hash_contains( + page_id, buf_pool.page_hash.cell_get(page_id.fold()))) { + /* A buffer pool watch has been set or the + page has been read into the buffer pool. + Do not buffer the request. If a purge operation + is being buffered, have this request executed + directly on the page in the buffer pool after the + buffered entries for this page have been merged. 
*/ + DBUG_RETURN(false); + } + +skip_watch: + entry_size = rec_get_converted_size(index, entry, 0); + + if (entry_size + >= page_get_free_space_of_empty(dict_table_is_comp(index->table)) + / 2) { + + DBUG_RETURN(false); + } + + err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter, + entry, entry_size, + index, page_id, zip_size, thr); + if (err == DB_FAIL) { + err = ibuf_insert_low(BTR_INSERT_TREE, + op, no_counter, entry, entry_size, + index, page_id, zip_size, thr); + } + + ut_a(err == DB_SUCCESS || err == DB_STRONG_FAIL + || err == DB_TOO_BIG_RECORD); + + DBUG_RETURN(err == DB_SUCCESS); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/********************************************************************//** +During merge, inserts to an index page a secondary index entry extracted +from the insert buffer. +@return error code */ +static +dberr_t +ibuf_insert_to_index_page_low( +/*==========================*/ + const dtuple_t* entry, /*!< in: buffered entry to insert */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t* heap, /*!< in/out: memory heap */ + mtr_t* mtr, /*!< in/out: mtr */ + page_cur_t* page_cur)/*!< in/out: cursor positioned on the record + after which to insert the buffered entry */ +{ + if (page_cur_tuple_insert(page_cur, entry, offsets, &heap, 0, mtr)) + return DB_SUCCESS; + + /* Page reorganization or recompression should already have been + attempted by page_cur_tuple_insert(). Besides, per + ibuf_index_page_calc_free_zip() the page should not have been + recompressed or reorganized. */ + ut_ad(!is_buf_block_get_page_zip(page_cur->block)); + + /* If the record did not fit, reorganize */ + if (dberr_t err= btr_page_reorganize(page_cur, mtr)) + return err; + + /* This time the record must fit */ + if (page_cur_tuple_insert(page_cur, entry, offsets, &heap, 0, mtr)) + return DB_SUCCESS; + + return DB_CORRUPTION; +} + +/************************************************************************ +During merge, inserts to an index page a secondary index entry extracted +from the insert buffer. */ +static +dberr_t +ibuf_insert_to_index_page( +/*======================*/ + const dtuple_t* entry, /*!< in: buffered entry to insert */ + buf_block_t* block, /*!< in/out: index page where the buffered entry + should be placed */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t page_cur; + page_t* page = buf_block_get_frame(block); + rec_t* rec; + rec_offs* offsets; + mem_heap_t* heap; + + DBUG_PRINT("ibuf", ("page " UINT32PF ":" UINT32PF, + block->page.id().space(), + block->page.id().page_no())); + + ut_ad(!dict_index_is_online_ddl(index));// this is an ibuf_dummy index + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); +#ifdef BTR_CUR_HASH_ADAPT + /* A change buffer merge must occur before users are granted + any access to the page. No adaptive hash index entries may + point to a freshly read page. 
*/ + ut_ad(!block->index); + assert_block_ahi_empty(block); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_ad(mtr->is_named_space(block->page.id().space())); + + if (UNIV_UNLIKELY(dict_table_is_comp(index->table) + != (ibool)!!page_is_comp(page))) { + return DB_CORRUPTION; + } + + rec = page_rec_get_next(page_get_infimum_rec(page)); + + if (!rec || page_rec_is_supremum(rec)) { + return DB_CORRUPTION; + } + + if (!rec_n_fields_is_sane(index, rec, entry)) { + return DB_CORRUPTION; + } + + ulint up_match = 0, low_match = 0; + page_cur.index = index; + page_cur.block = block; + + if (page_cur_search_with_match(entry, PAGE_CUR_LE, + &up_match, &low_match, &page_cur, + nullptr)) { + return DB_CORRUPTION; + } + + dberr_t err = DB_SUCCESS; + + heap = mem_heap_create( + sizeof(upd_t) + + REC_OFFS_HEADER_SIZE * sizeof(*offsets) + + dtuple_get_n_fields(entry) + * (sizeof(upd_field_t) + sizeof *offsets)); + + if (UNIV_UNLIKELY(low_match == dtuple_get_n_fields(entry))) { + upd_t* update; + + rec = page_cur_get_rec(&page_cur); + + /* This is based on + row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF). */ + ut_ad(rec_get_deleted_flag(rec, page_is_comp(page))); + + offsets = rec_get_offsets(rec, index, NULL, index->n_fields, + ULINT_UNDEFINED, &heap); + update = row_upd_build_sec_rec_difference_binary( + rec, index, offsets, entry, heap); + + if (update->n_fields == 0) { + /* The records only differ in the delete-mark. + Clear the delete-mark, like we did before + Bug #56680 was fixed. */ + btr_rec_set_deleted(block, rec, mtr); + goto updated_in_place; + } + + /* Copy the info bits. Clear the delete-mark. */ + update->info_bits = rec_get_info_bits(rec, page_is_comp(page)); + update->info_bits &= byte(~REC_INFO_DELETED_FLAG); + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + /* We cannot invoke btr_cur_optimistic_update() here, + because we do not have a btr_cur_t or que_thr_t, + as the insert buffer merge occurs at a very low level. */ + if (!row_upd_changes_field_size_or_external(index, offsets, + update) + && (!page_zip || btr_cur_update_alloc_zip( + page_zip, &page_cur, offsets, + rec_offs_size(offsets), false, mtr))) { + /* This is the easy case. Do something similar + to btr_cur_update_in_place(). */ + rec = page_cur_get_rec(&page_cur); + btr_cur_upd_rec_in_place(rec, index, offsets, + update, block, mtr); + + DBUG_EXECUTE_IF( + "crash_after_log_ibuf_upd_inplace", + log_buffer_flush_to_disk(); + ib::info() << "Wrote log record for ibuf" + " update in place operation"; + DBUG_SUICIDE(); + ); + + goto updated_in_place; + } + + /* btr_cur_update_alloc_zip() may have changed this */ + rec = page_cur_get_rec(&page_cur); + + /* A collation may identify values that differ in + storage length. + Some examples (1 or 2 bytes): + utf8_turkish_ci: I = U+0131 LATIN SMALL LETTER DOTLESS I + utf8_general_ci: S = U+00DF LATIN SMALL LETTER SHARP S + utf8_general_ci: A = U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + + latin1_german2_ci: SS = U+00DF LATIN SMALL LETTER SHARP S + + Examples of a character (3-byte UTF-8 sequence) + identified with 2 or 4 characters (1-byte UTF-8 sequences): + + utf8_unicode_ci: 'II' = U+2171 SMALL ROMAN NUMERAL TWO + utf8_unicode_ci: '(10)' = U+247D PARENTHESIZED NUMBER TEN + */ + + /* Delete the different-length record, and insert the + buffered one. 
*/ + + page_cur_delete_rec(&page_cur, offsets, mtr); + if (!(page_cur_move_to_prev(&page_cur))) { + err = DB_CORRUPTION; + goto updated_in_place; + } + } else { + offsets = NULL; + } + + err = ibuf_insert_to_index_page_low(entry, &offsets, heap, mtr, + &page_cur); +updated_in_place: + mem_heap_free(heap); + + return err; +} + +/****************************************************************//** +During merge, sets the delete mark on a record for a secondary index +entry. */ +static +void +ibuf_set_del_mark( +/*==============*/ + const dtuple_t* entry, /*!< in: entry */ + buf_block_t* block, /*!< in/out: block */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t page_cur; + page_cur.block = block; + page_cur.index = index; + ulint up_match = 0, low_match = 0; + + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); + + if (!page_cur_search_with_match(entry, PAGE_CUR_LE, + &up_match, &low_match, &page_cur, + nullptr) + && low_match == dtuple_get_n_fields(entry)) { + rec_t* rec = page_cur_get_rec(&page_cur); + + /* Delete mark the old index record. According to a + comment in row_upd_sec_index_entry(), it can already + have been delete marked if a lock wait occurred in + row_ins_sec_index_entry() in a previous invocation of + row_upd_sec_index_entry(). */ + + if (UNIV_LIKELY + (!rec_get_deleted_flag( + rec, dict_table_is_comp(index->table)))) { + btr_rec_set_deleted(block, rec, mtr); + } + } else { + const page_t* page + = page_cur_get_page(&page_cur); + const buf_block_t* block + = page_cur_get_block(&page_cur); + + ib::error() << "Unable to find a record to delete-mark"; + fputs("InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, page_cur_get_rec(&page_cur), index); + + ib::error() << "page " << block->page.id() << " (" + << page_get_n_recs(page) << " records, index id " + << btr_page_get_index_id(page) << ")."; + + ib::error() << BUG_REPORT_MSG; + ut_ad(0); + } +} + +/****************************************************************//** +During merge, delete a record for a secondary index entry. */ +static +void +ibuf_delete( +/*========*/ + const dtuple_t* entry, /*!< in: entry */ + buf_block_t* block, /*!< in/out: block */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in/out: mtr; must be committed + before latching any further pages */ +{ + page_cur_t page_cur; + page_cur.block = block; + page_cur.index = index; + ulint up_match = 0, low_match = 0; + + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); + ut_ad(!index->is_spatial()); + ut_ad(!index->is_clust()); + + if (!page_cur_search_with_match(entry, PAGE_CUR_LE, + &up_match, &low_match, &page_cur, + nullptr) + && low_match == dtuple_get_n_fields(entry)) { + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + rec_t* rec = page_cur_get_rec(&page_cur); + + /* TODO: the below should probably be a separate function, + it's a bastardized version of btr_cur_optimistic_delete. */ + + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + mem_heap_t* heap = NULL; + ulint max_ins_size = 0; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + + if (page_get_n_recs(page) <= 1 + || !(REC_INFO_DELETED_FLAG + & rec_get_info_bits(rec, page_is_comp(page)))) { + /* Refuse to purge the last record or a + record that has not been marked for deletion. 
*/ + ib::error() << "Unable to purge a record"; + fputs("InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + fprintf(stderr, "\nspace " UINT32PF " offset " UINT32PF + " (%u records, index id %llu)\n" + "InnoDB: Submit a detailed bug report" + " to https://jira.mariadb.org/\n", + block->page.id().space(), + block->page.id().page_no(), + (unsigned) page_get_n_recs(page), + (ulonglong) btr_page_get_index_id(page)); + + ut_ad(0); + return; + } + + if (!page_zip) { + max_ins_size + = page_get_max_insert_size_after_reorganize( + page, 1); + } +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(&page_cur, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_zip) { + ibuf_update_free_bits_zip(block, mtr); + } else { + ibuf_update_free_bits_low(block, max_ins_size, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +} + +/*********************************************************************//** +Restores insert buffer tree cursor position +@return whether the position was restored */ +static MY_ATTRIBUTE((nonnull)) +bool +ibuf_restore_pos( +/*=============*/ + const page_id_t page_id,/*!< in: page identifier */ + const dtuple_t* search_tuple, + /*!< in: search tuple for entries of page_no */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_PURGE_TREE */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor whose + position is to be restored */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (UNIV_LIKELY(pcur->restore_position(mode, mtr) == + btr_pcur_t::SAME_ALL)) { + return true; + } + + if (fil_space_t* s = fil_space_t::get(page_id.space())) { + ib::error() << "ibuf cursor restoration fails!" + " ibuf record inserted to page " + << page_id + << " in file " << s->chain.start->name; + s->release(); + + ib::error() << BUG_REPORT_MSG; + + rec_print_old(stderr, btr_pcur_get_rec(pcur)); + rec_print_old(stderr, pcur->old_rec); + dtuple_print(stderr, search_tuple); + } + + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + return false; +} + +/** +Delete a change buffer record. +@param[in] page_id page identifier +@param[in,out] pcur persistent cursor positioned on the record +@param[in] search_tuple search key for (space,page_no) +@param[in,out] mtr mini-transaction +@return whether mtr was committed (due to pessimistic operation) */ +static MY_ATTRIBUTE((warn_unused_result, nonnull)) +bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur, + const dtuple_t* search_tuple, mtr_t* mtr) +{ + dberr_t err; + + ut_ad(ibuf_inside(mtr)); + ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur))); + ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) + == page_id.page_no()); + ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) + == page_id.space()); + + switch (btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, mtr)) { + case DB_FAIL: + break; + case DB_SUCCESS: + if (page_is_empty(btr_pcur_get_page(pcur))) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. */ + ut_d(const page_t* root = btr_pcur_get_page(pcur)); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + /* ibuf.empty is protected by the root page latch. 
+ Before the deletion, it had to be FALSE. */ + ut_ad(!ibuf.empty); + ibuf.empty = true; + } + /* fall through */ + default: + return(FALSE); + } + + /* We have to resort to a pessimistic delete from ibuf. + Delete-mark the record so that it will not be applied again, + in case the server crashes before the pessimistic delete is + made persistent. */ + btr_rec_set_deleted(btr_pcur_get_block(pcur), + btr_pcur_get_rec(pcur), mtr); + + btr_pcur_store_position(pcur, mtr); + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + + ibuf_mtr_start(mtr); + mysql_mutex_lock(&ibuf_mutex); + mtr_x_lock_index(ibuf.index, mtr); + + if (!ibuf_restore_pos(page_id, search_tuple, + BTR_PURGE_TREE_ALREADY_LATCHED, pcur, mtr)) { + mysql_mutex_unlock(&ibuf_mutex); + goto func_exit; + } + + if (buf_block_t* ibuf_root = ibuf_tree_root_get(mtr)) { + btr_cur_pessimistic_delete(&err, TRUE, + btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, false, mtr); + ut_a(err == DB_SUCCESS); + + ibuf_size_update(ibuf_root->page.frame); + ibuf.empty = page_is_empty(ibuf_root->page.frame); + } + + mysql_mutex_unlock(&ibuf_mutex); + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + +func_exit: + ut_ad(mtr->has_committed()); + btr_pcur_close(pcur); + + return(TRUE); +} + +/** Check whether buffered changes exist for a page. +@param[in] id page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return whether buffered changes exist */ +bool ibuf_page_exists(const page_id_t id, ulint zip_size) +{ + ut_ad(!fsp_is_system_temporary(id.space())); + + const ulint physical_size = zip_size ? zip_size : srv_page_size; + + if (ibuf_fixed_addr_page(id, physical_size) + || fsp_descr_page(id, physical_size)) { + return false; + } + + mtr_t mtr; + bool bitmap_bits = false; + + ibuf_mtr_start(&mtr); + if (const buf_block_t* bitmap_page = ibuf_bitmap_get_map_page( + id, zip_size, &mtr)) { + bitmap_bits = ibuf_bitmap_page_get_bits( + bitmap_page->page.frame, id, zip_size, + IBUF_BITMAP_BUFFERED, &mtr) != 0; + } + ibuf_mtr_commit(&mtr); + return bitmap_bits; +} + +/** Reset the bits in the bitmap page for the given block and page id. +@param b X-latched secondary index page (nullptr to discard changes) +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param mtr mini-transaction */ +static void ibuf_reset_bitmap(buf_block_t *b, page_id_t page_id, + ulint zip_size, mtr_t *mtr) +{ + buf_block_t *bitmap= ibuf_bitmap_get_map_page(page_id, zip_size, mtr); + if (!bitmap) + return; + + const ulint physical_size = zip_size ? zip_size : srv_page_size; + /* FIXME: update the bitmap byte only once! */ + ibuf_bitmap_page_set_bits(bitmap, page_id, + physical_size, false, mtr); + + if (b) + ibuf_bitmap_page_set_bits(bitmap, page_id, physical_size, + ibuf_index_page_calc_free(b), + mtr); +} + +/** When an index page is read from a disk to the buffer pool, this function +applies any buffered operations to the page and deletes the entries from the +insert buffer. If the page is not read, but created in the buffer pool, this +function deletes its buffered entries from the insert buffer; there can +exist entries for such a page if the page belonged to an index which +subsequently was dropped. 
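+A typical invocation is from the page read completion path, roughly:
+
+  dberr_t err = ibuf_merge_or_delete_for_page(block, block->page.id(),
+                                              block->zip_size());
+
+(sketch only; the real caller also decides between merging and
+discarding by passing the block or a null pointer).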
+@param block X-latched page to try to apply changes to, or NULL to discard +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return error code */ +dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block, + const page_id_t page_id, + ulint zip_size) +{ + if (trx_sys_hdr_page(page_id)) { + return DB_SUCCESS; + } + + ut_ad(!block || page_id == block->page.id()); + ut_ad(!block || block->page.frame); + ut_ad(!block || !block->page.is_ibuf_exist()); + ut_ad(!block || !block->page.is_reinit()); + ut_ad(!trx_sys_hdr_page(page_id)); + ut_ad(page_id < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0)); + + const ulint physical_size = zip_size ? zip_size : srv_page_size; + + if (ibuf_fixed_addr_page(page_id, physical_size) + || fsp_descr_page(page_id, physical_size)) { + return DB_SUCCESS; + } + + btr_pcur_t pcur; +#ifdef UNIV_IBUF_DEBUG + ulint volume = 0; +#endif /* UNIV_IBUF_DEBUG */ + dberr_t err = DB_SUCCESS; + mtr_t mtr; + + fil_space_t* space = fil_space_t::get(page_id.space()); + + if (UNIV_UNLIKELY(!space)) { + block = nullptr; + } else { + ulint bitmap_bits = 0; + + ibuf_mtr_start(&mtr); + + buf_block_t* bitmap_page = ibuf_bitmap_get_map_page( + page_id, zip_size, &mtr); + + if (bitmap_page + && fil_page_get_type(bitmap_page->page.frame) + != FIL_PAGE_TYPE_ALLOCATED) { + bitmap_bits = ibuf_bitmap_page_get_bits( + bitmap_page->page.frame, page_id, zip_size, + IBUF_BITMAP_BUFFERED, &mtr); + } + + ibuf_mtr_commit(&mtr); + + if (!bitmap_bits) { + done: + /* No changes are buffered for this page. */ + space->release(); + return DB_SUCCESS; + } + + if (!block + || DB_SUCCESS + == fseg_page_is_allocated(space, page_id.page_no())) { + ibuf_mtr_start(&mtr); + mtr.set_named_space(space); + ibuf_reset_bitmap(block, page_id, zip_size, &mtr); + ibuf_mtr_commit(&mtr); + if (!block + || btr_page_get_index_id(block->page.frame) + != DICT_IBUF_ID_MIN + IBUF_SPACE_ID) { + ibuf_delete_recs(page_id); + } + goto done; + } + } + + if (!block) { + } else if (!fil_page_index_page_check(block->page.frame) + || !page_is_leaf(block->page.frame)) { + space->set_corrupted(); + err = DB_CORRUPTION; + block = nullptr; + } else { + /* Move the ownership of the x-latch on the page to this OS + thread, so that we can acquire a second x-latch on it. This + is needed for the insert operations to the index page to pass + the debug checks. */ + + block->page.lock.claim_ownership(); + } + + mem_heap_t* heap = mem_heap_create(512); + + const dtuple_t* search_tuple = ibuf_search_tuple_build( + page_id.space(), page_id.page_no(), heap); + + /* Counts for merged & discarded operations. 
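+	   Both arrays are indexed by ibuf_op_t, e.g. mops[IBUF_OP_INSERT]
+	   counts buffered inserts that were applied to the page, while
+	   dops[IBUF_OP_INSERT] counts those discarded without applying
+	   (no target page, or the entry was already delete-marked); both
+	   are folded into ibuf.n_merged_ops and ibuf.n_discarded_ops by
+	   ibuf_add_ops() at the end of this function.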
*/ + ulint mops[IBUF_OP_COUNT]; + ulint dops[IBUF_OP_COUNT]; + + memset(mops, 0, sizeof(mops)); + memset(dops, 0, sizeof(dops)); + pcur.btr_cur.page_cur.index = ibuf.index; + +loop: + ibuf_mtr_start(&mtr); + + /* Position pcur in the insert buffer at the first entry for this + index page */ + if (btr_pcur_open_on_user_rec(search_tuple, + BTR_MODIFY_LEAF, &pcur, &mtr) + != DB_SUCCESS) { + err = DB_CORRUPTION; + goto reset_bit; + } + + if (block) { + block->page.fix(); + block->page.lock.x_lock_recursive(); + mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX); + } + + if (space) { + mtr.set_named_space(space); + } + + if (!btr_pcur_is_on_user_rec(&pcur)) { + ut_ad(btr_pcur_is_after_last_on_page(&pcur)); + goto reset_bit; + } + + for (;;) { + rec_t* rec; + + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + + rec = btr_pcur_get_rec(&pcur); + + /* Check if the entry is for this index page */ + if (ibuf_rec_get_page_no(&mtr, rec) != page_id.page_no() + || ibuf_rec_get_space(&mtr, rec) != page_id.space()) { + + if (block != NULL) { + page_header_reset_last_insert(block, &mtr); + } + + goto reset_bit; + } + + if (err) { + fputs("InnoDB: Discarding record\n ", stderr); + rec_print_old(stderr, rec); + fputs("\nInnoDB: from the insert buffer!\n\n", stderr); + } else if (block != NULL && !rec_get_deleted_flag(rec, 0)) { + /* Now we have at pcur a record which should be + applied on the index page; NOTE that the call below + copies pointers to fields in rec, and we must + keep the latch to the rec page until the + insertion is finished! */ + dtuple_t* entry; + trx_id_t max_trx_id; + dict_index_t* dummy_index; + ibuf_op_t op = ibuf_rec_get_op_type(&mtr, rec); + + max_trx_id = page_get_max_trx_id(page_align(rec)); + page_update_max_trx_id(block, + buf_block_get_page_zip(block), + max_trx_id, &mtr); + + ut_ad(page_validate(page_align(rec), ibuf.index)); + + entry = ibuf_build_entry_from_ibuf_rec( + &mtr, rec, heap, &dummy_index); + ut_ad(!dummy_index->table->space); + dummy_index->table->space = space; + dummy_index->table->space_id = space->id; + + ut_ad(page_validate(block->page.frame, dummy_index)); + + switch (op) { + case IBUF_OP_INSERT: +#ifdef UNIV_IBUF_DEBUG + volume += rec_get_converted_size( + dummy_index, entry, 0); + + volume += page_dir_calc_reserved_space(1); + + ut_a(volume <= (4U << srv_page_size_shift) + / IBUF_PAGE_SIZE_PER_FREE_SPACE); +#endif + ibuf_insert_to_index_page( + entry, block, dummy_index, &mtr); + break; + + case IBUF_OP_DELETE_MARK: + ibuf_set_del_mark( + entry, block, dummy_index, &mtr); + break; + + case IBUF_OP_DELETE: + ibuf_delete(entry, block, dummy_index, &mtr); + /* Because ibuf_delete() will latch an + insert buffer bitmap page, commit mtr + before latching any further pages. + Store and restore the cursor position. */ + ut_ad(rec == btr_pcur_get_rec(&pcur)); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(ibuf_rec_get_page_no(&mtr, rec) + == page_id.page_no()); + ut_ad(ibuf_rec_get_space(&mtr, rec) + == page_id.space()); + + /* Mark the change buffer record processed, + so that it will not be merged again in case + the server crashes between the following + mtr_commit() and the subsequent mtr_commit() + of deleting the change buffer record. 
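+	   The delete-mark is redo-logged by this mini-transaction, so
+	   after crash recovery the merge loop will see the mark and
+	   take the discard branch instead of applying the entry twice:
+	   the test
+
+	     else if (block != NULL && !rec_get_deleted_flag(rec, 0))
+
+	   above fails for the marked record, which is then only counted
+	   in dops[] and removed.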
*/
+ btr_rec_set_deleted<true>(
+ btr_pcur_get_block(&pcur),
+ btr_pcur_get_rec(&pcur), &mtr);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ ibuf_btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ ibuf_mtr_start(&mtr);
+ mtr.set_named_space(space);
+
+ block->page.lock.x_lock_recursive();
+ block->fix();
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
+
+ if (!ibuf_restore_pos(page_id, search_tuple,
+ BTR_MODIFY_LEAF,
+ &pcur, &mtr)) {
+
+ ut_ad(mtr.has_committed());
+ mops[op]++;
+ ibuf_dummy_index_free(dummy_index);
+ goto loop;
+ }
+
+ break;
+ default:
+ ut_error;
+ }
+
+ mops[op]++;
+
+ ibuf_dummy_index_free(dummy_index);
+ } else {
+ dops[ibuf_rec_get_op_type(&mtr, rec)]++;
+ }
+
+ /* Delete the record from ibuf */
+ if (ibuf_delete_rec(page_id, &pcur, search_tuple, &mtr)) {
+ /* Deletion was pessimistic and mtr was committed:
+ we start from the beginning again */
+
+ ut_ad(mtr.has_committed());
+ goto loop;
+ } else if (btr_pcur_is_after_last_on_page(&pcur)) {
+ ibuf_mtr_commit(&mtr);
+ goto loop;
+ }
+ }
+
+reset_bit:
+ if (space) {
+ ibuf_reset_bitmap(block, page_id, zip_size, &mtr);
+ }
+
+ ibuf_mtr_commit(&mtr);
+ ut_free(pcur.old_rec_buf);
+
+ if (space) {
+ space->release();
+ }
+
+ mem_heap_free(heap);
+
+ ibuf.n_merges++;
+ ibuf_add_ops(ibuf.n_merged_ops, mops);
+ ibuf_add_ops(ibuf.n_discarded_ops, dops);
+
+ return err;
+}
+
+/** Delete all change buffer entries for a tablespace,
+in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead.
+@param[in] space missing or to-be-discarded tablespace */
+void ibuf_delete_for_discarded_space(uint32_t space)
+{
+ if (UNIV_UNLIKELY(!ibuf.index)) return;
+
+ btr_pcur_t pcur;
+ const rec_t* ibuf_rec;
+ mtr_t mtr;
+
+ /* Counts for discarded operations. */
+ ulint dops[IBUF_OP_COUNT];
+
+ dfield_t dfield[IBUF_REC_FIELD_METADATA];
+ dtuple_t search_tuple {0,IBUF_REC_FIELD_METADATA,
+ IBUF_REC_FIELD_METADATA,dfield,0
+ ,nullptr
+#ifdef UNIV_DEBUG
+ ,DATA_TUPLE_MAGIC_N
+#endif /* UNIV_DEBUG */
+ };
+ byte space_id[4];
+ mach_write_to_4(space_id, space);
+
+ dfield_set_data(&dfield[0], space_id, 4);
+ dfield_set_data(&dfield[1], field_ref_zero, 1);
+ dfield_set_data(&dfield[2], field_ref_zero, 4);
+ dtuple_set_types_binary(&search_tuple, IBUF_REC_FIELD_METADATA);
+ /* Use page number 0 to build the search tuple so that we get the
+ cursor positioned at the first entry for this space id */
+
+ memset(dops, 0, sizeof(dops));
+ pcur.btr_cur.page_cur.index = ibuf.index;
+
+loop:
+ log_free_check();
+ ibuf_mtr_start(&mtr);
+
+ /* Position pcur in the insert buffer at the first entry for the
+ space */
+ if (btr_pcur_open_on_user_rec(&search_tuple,
+ BTR_MODIFY_LEAF, &pcur, &mtr)
+ != DB_SUCCESS) {
+ goto leave_loop;
+ }
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ ut_ad(btr_pcur_is_after_last_on_page(&pcur));
+ goto leave_loop;
+ }
+
+ for (;;) {
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+ ibuf_rec = btr_pcur_get_rec(&pcur);
+
+ /* Check if the entry is for this space */
+ if (ibuf_rec_get_space(&mtr, ibuf_rec) != space) {
+
+ goto leave_loop;
+ }
+
+ uint32_t page_no = ibuf_rec_get_page_no(&mtr, ibuf_rec);
+
+ dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
+
+ /* Delete the record from ibuf */
+ if (ibuf_delete_rec(page_id_t(space, page_no),
+ &pcur, &search_tuple, &mtr)) {
+ /* Deletion was pessimistic and mtr was committed:
+ we start from the beginning again */
+
+ ut_ad(mtr.has_committed());
+clear:
+ ut_free(pcur.old_rec_buf);
+ goto loop;
+ }
+
+ if (btr_pcur_is_after_last_on_page(&pcur)) {
+ ibuf_mtr_commit(&mtr);
+ goto clear;
+ }
+ }
+
+leave_loop:
+ ibuf_mtr_commit(&mtr);
+ ut_free(pcur.old_rec_buf);
+
+ ibuf_add_ops(ibuf.n_discarded_ops, dops);
+}
+
+/******************************************************************//**
+Checks whether the insert buffer is empty.
+@return true if empty */
+bool
+ibuf_is_empty(void)
+/*===============*/
+{
+ mtr_t mtr;
+
+ ibuf_mtr_start(&mtr);
+
+ ut_d(mysql_mutex_lock(&ibuf_mutex));
+ const buf_block_t* root = ibuf_tree_root_get(&mtr);
+ bool is_empty = root && page_is_empty(root->page.frame);
+ ut_ad(!root || is_empty == ibuf.empty);
+ ut_d(mysql_mutex_unlock(&ibuf_mutex));
+ ibuf_mtr_commit(&mtr);
+
+ return(is_empty);
+}
+
+/******************************************************************//**
+Prints info of ibuf. */
+void
+ibuf_print(
+/*=======*/
+ FILE* file) /*!< in: file where to print */
+{
+ if (UNIV_UNLIKELY(!ibuf.index)) return;
+
+ mysql_mutex_lock(&ibuf_mutex);
+ if (ibuf.empty)
+ {
+ mysql_mutex_unlock(&ibuf_mutex);
+ return;
+ }
+
+ const ulint size= ibuf.size;
+ const ulint free_list_len= ibuf.free_list_len;
+ const ulint seg_size= ibuf.seg_size;
+ mysql_mutex_unlock(&ibuf_mutex);
+
+ fprintf(file,
+ "-------------\n"
+ "INSERT BUFFER\n"
+ "-------------\n"
+ "size " ULINTPF ", free list len " ULINTPF ","
+ " seg size " ULINTPF ", " ULINTPF " merges\n",
+ size, free_list_len, seg_size, ulint{ibuf.n_merges});
+ ibuf_print_ops("merged operations:\n", ibuf.n_merged_ops, file);
+ ibuf_print_ops("discarded operations:\n", ibuf.n_discarded_ops, file);
+}
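+
+/* A minimal usage sketch, illustrative only and not part of the
+upstream sources; the two functions above are typically consumed
+together by the server's monitor output, roughly as follows:
+
+  if (!ibuf_is_empty()) {
+    // Dump size, free list length, segment size,
+    // and the merge/discard counters.
+    ibuf_print(stderr);
+  }
+*/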
+
+/** Check the insert buffer bitmaps on IMPORT TABLESPACE.
+@param[in] trx transaction
+@param[in,out] space tablespace being imported
+@return DB_SUCCESS or error code */
+dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
+{
+ ut_ad(trx->mysql_thd);
+ ut_ad(space->purpose == FIL_TYPE_IMPORT);
+
+ const unsigned zip_size = space->zip_size();
+ const unsigned physical_size = space->physical_size();
+
+ uint32_t size= std::min(space->free_limit, space->size);
+
+ if (size == 0) {
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ mtr_t mtr;
+
+ /* The two bitmap pages (allocation bitmap and ibuf bitmap) repeat
+ every page_size pages. For example if page_size is 16 KiB, then the
+ two bitmap pages repeat every 16 KiB * 16384 = 256 MiB. In the loop
+ below page_no is measured in number of pages since the beginning of
+ the space, as usual. */
+
+ for (uint32_t page_no = 0; page_no < size; page_no += physical_size) {
+ if (trx_is_interrupted(trx)) {
+ return(DB_INTERRUPTED);
+ }
+
+ mtr_start(&mtr);
+
+ buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+ page_id_t(space->id, page_no), zip_size, &mtr);
+ if (!bitmap_page) {
+ mtr.commit();
+ return DB_CORRUPTION;
+ }
+
+ if (buf_is_zeroes(span<const byte>(bitmap_page->page.frame,
+ physical_size))) {
+ /* This means that we got an all-zero page instead
+ of an ibuf bitmap page. The subsequent pages should
+ be all-zero too. */
+#ifdef UNIV_DEBUG
+ for (uint32_t curr_page = page_no + 1;
+ curr_page < physical_size; curr_page++) {
+
+ buf_block_t* block = buf_page_get(
+ page_id_t(space->id, curr_page),
+ zip_size, RW_S_LATCH, &mtr);
+ page_t* page = buf_block_get_frame(block);
+ ut_ad(buf_is_zeroes(span<const byte>(
+ page,
+ physical_size)));
+ }
+#endif /* UNIV_DEBUG */
+ mtr_commit(&mtr);
+ continue;
+ }
+
+ for (uint32_t i = FSP_IBUF_BITMAP_OFFSET + 1; i < physical_size;
+ i++) {
+ const uint32_t offset = page_no + i;
+ const page_id_t cur_page_id(space->id, offset);
+
+ if (ibuf_bitmap_page_get_bits(
+ bitmap_page->page.frame,
+ cur_page_id, zip_size,
+ IBUF_BITMAP_IBUF, &mtr)) {
+
+ mtr_commit(&mtr);
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "File %s page %u"
+ " is wrongly flagged to belong to the"
+ " insert buffer",
+ space->chain.start->name, offset);
+ return(DB_CORRUPTION);
+ }
+
+ if (ibuf_bitmap_page_get_bits(
+ bitmap_page->page.frame,
+ cur_page_id, zip_size,
+ IBUF_BITMAP_BUFFERED, &mtr)) {
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Buffered changes"
+ " for file %s page %u are lost",
+ space->chain.start->name, offset);
+
+ /* Tolerate this error, so that
+ slightly corrupted tables can be
+ imported and dumped. Clear the bit. */
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
+ bitmap_page, cur_page_id,
+ physical_size, false, &mtr);
+ }
+ }
+
+ mtr_commit(&mtr);
+ }
+
+ return(DB_SUCCESS);
+}
+
+void ibuf_set_bitmap_for_bulk_load(buf_block_t *block, mtr_t *mtr, bool reset)
+{
+ ut_a(page_is_leaf(block->page.frame));
+ const page_id_t id{block->page.id()};
+ const auto zip_size= block->zip_size();
+
+ if (buf_block_t *bitmap_page= ibuf_bitmap_get_map_page(id, zip_size, mtr))
+ {
+ if (ibuf_bitmap_page_get_bits(bitmap_page->page.frame, id, zip_size,
+ IBUF_BITMAP_BUFFERED, mtr))
+ ibuf_delete_recs(id);
+
+ ulint free_val= reset ? 0 : ibuf_index_page_calc_free(block);
+ /* FIXME: update the bitmap byte only once! */
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>
+ (bitmap_page, id, block->physical_size(), free_val, mtr);
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>
+ (bitmap_page, id, block->physical_size(), false, mtr);
+ }
+}
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
new file mode 100644
index 00000000..5a0401fa
--- /dev/null
+++ b/storage/innobase/include/btr0btr.h
@@ -0,0 +1,543 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0btr.h +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "dict0dict.h" +#include "data0data.h" +#include "rem0types.h" +#include "page0cur.h" +#include "btr0types.h" +#include "gis0type.h" + +#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level + (not really a hard limit). + Used in debug assertions + in btr_page_set_level and + btr_page_get_level */ + +/** Maximum record size which can be stored on a page, without using the +special big record storage structure */ +#define BTR_PAGE_MAX_REC_SIZE (srv_page_size / 2 - 200) + +/** @brief Maximum depth of a B-tree in InnoDB. + +Note that this isn't a maximum as such; none of the tree operations +avoid producing trees bigger than this. It is instead a "max depth +that other code must work with", useful for e.g. fixed-size arrays +that must store some information about each level in a tree. In other +words: if a B-tree with bigger depth than this is encountered, it is +not acceptable for it to lead to mysterious memory corruption, but it +is acceptable for the program to die with a clear assert failure. */ +#define BTR_MAX_LEVELS 100 + +#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \ + btr_latch_mode((latch_mode) & ~(BTR_INSERT \ + | BTR_DELETE_MARK \ + | BTR_RTREE_UNDO_INS \ + | BTR_RTREE_DELETE_MARK \ + | BTR_DELETE \ + | BTR_IGNORE_SEC_UNIQUE \ + | BTR_ALREADY_S_LATCHED \ + | BTR_LATCH_FOR_INSERT \ + | BTR_LATCH_FOR_DELETE)) + +#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode) \ + btr_latch_mode((latch_mode) \ + & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. +@return error code, or DB_SUCCESS */ +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Report a decryption failure. */ +ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index); + +/** Get an index page and declare its latching order level. +@param[in] index index tree +@param[in] page page number +@param[in] mode latch mode +@param[in] merge whether change buffer merge should be attempted +@param[in,out] mtr mini-transaction +@param[out] err error code +@return block */ +buf_block_t *btr_block_get(const dict_index_t &index, + uint32_t page, rw_lock_type_t mode, bool merge, + mtr_t *mtr, dberr_t *err= nullptr); + +/**************************************************************//** +Gets the index id field of a page. +@return index id */ +UNIV_INLINE +index_id_t +btr_page_get_index_id( +/*==================*/ + const page_t* page) /*!< in: index page */ + MY_ATTRIBUTE((warn_unused_result)); +/** Read the B-tree or R-tree PAGE_LEVEL. 
+@param page B-tree or R-tree page
+@return number of child page links to reach the leaf level
+@retval 0 for leaf pages */
+inline uint16_t btr_page_get_level(const page_t *page)
+{
+ uint16_t level= mach_read_from_2(my_assume_aligned<2>
+ (PAGE_HEADER + PAGE_LEVEL + page));
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+ return level;
+} MY_ATTRIBUTE((warn_unused_result))
+
+/** Read FIL_PAGE_NEXT.
+@param page buffer pool page
+@return next page number */
+inline uint32_t btr_page_get_next(const page_t* page)
+{
+ return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
+}
+
+/** Read FIL_PAGE_PREV.
+@param page buffer pool page
+@return previous page number */
+inline uint32_t btr_page_get_prev(const page_t* page)
+{
+ return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
+}
+
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+uint32_t
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Create the root node for a new index tree.
+@param[in] type type of the index
+@param[in,out] space tablespace where created
+@param[in] index_id index id
+@param[in] index index, or NULL to create a system table
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return page number of the created root
+@retval FIL_NULL if did not succeed */
+uint32_t
+btr_create(
+ ulint type,
+ fil_space_t* space,
+ index_id_t index_id,
+ dict_index_t* index,
+ mtr_t* mtr,
+ dberr_t* err)
+ MY_ATTRIBUTE((nonnull(2,5,6), warn_unused_result));
+
+/** Free a persistent index tree if it exists.
+@param[in,out] space tablespace
+@param[in] page root page number
+@param[in] index_id PAGE_INDEX_ID contents
+@param[in,out] mtr mini-transaction */
+void btr_free_if_exists(fil_space_t *space, uint32_t page,
+ index_id_t index_id, mtr_t *mtr);
+
+/** Drop a temporary table
+@param table temporary table */
+void btr_drop_temporary_table(const dict_table_t &table);
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
+@param[in,out] index clustered index
+@return the last used AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc(dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
+or fall back to MAX(auto_increment_column).
+@param[in] table table containing an AUTO_INCREMENT column
+@param[in] col_no index of the AUTO_INCREMENT column
+@return the AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
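+
+/* A minimal usage sketch, illustrative only and not part of the
+upstream sources; "index" stands for a clustered index pointer and the
+column number 0 is a placeholder:
+
+  ib_uint64_t autoinc = btr_read_autoinc(index);
+  if (autoinc == 0) {
+    // 0 may simply mean that no value was used yet; fall back to
+    // MAX(auto_increment_column) on the table.
+    autoinc = btr_read_autoinc_with_fallback(index->table, 0);
+  }
+  btr_write_autoinc(index, autoinc + 1);
+*/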
+
+/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
+@param[in,out] index clustered index
+@param[in] autoinc the AUTO_INCREMENT value
+@param[in] reset whether to reset the AUTO_INCREMENT
+ to a possibly smaller value than currently
+ exists in the page */
+void
+btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false)
+ MY_ATTRIBUTE((nonnull));
+
+/** Write instant ALTER TABLE metadata to a root page.
+@param[in,out] root clustered index root page
+@param[in] index clustered index with instant ALTER TABLE
+@param[in,out] mtr mini-transaction */
+void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr);
+
+ATTRIBUTE_COLD __attribute__((nonnull))
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in] index clustered index with instant ALTER TABLE
+@param[in] all whether to reset FIL_PAGE_TYPE as well
+@param[in,out] mtr mini-transaction */
+void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr);
+
+/*************************************************************//**
+Makes the tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed; we
+cannot reverse it. Therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@param cursor page cursor
+@param mtr mini-transaction
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Decide if the page should be split at the convergence point of inserts
+converging to the left.
+@param[in] cursor insert position
+@return the first record to be moved to the right half page
+@retval NULL if no split is recommended */
+rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor);
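+
+/* A minimal sketch, illustrative only and not part of the upstream
+sources, of how an insert path could consult the two split-point
+heuristics; "cursor" is a btr_cur_t* positioned at the insert point:
+
+  rec_t* split_rec = btr_page_get_split_rec_to_left(cursor);
+  if (!split_rec
+      && !btr_page_get_split_rec_to_right(cursor, &split_rec)) {
+    // Neither heuristic applies: split at the middle record instead.
+  }
+*/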
+
+/** Decide if the page should be split at the convergence point of inserts
+converging to the right.
+@param[in] cursor insert position
+@param[out] split_rec if split recommended, the first record
+ on the right half page, or
+ NULL if the to-be-inserted record
+ should be first
+@return whether split is recommended */
+bool
+btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec);
+
+/*************************************************************//**
+Splits an index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed; we cannot reverse it. Therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+@return inserted record */
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+dberr_t
+btr_insert_on_non_leaf_level(
+ ulint flags, /*!< in: undo logging and locking flags */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level, must be > 0 */
+ dtuple_t* tuple, /*!< in: the record to be inserted */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Set a child page pointer record as the predefined minimum record.
+@tparam has_prev whether the page is supposed to have a left sibling
+@param[in,out] rec leftmost record on a leftmost non-leaf page
+@param[in,out] block buffer pool block
+@param[in,out] mtr mini-transaction */
+template<bool has_prev= false>
+inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block,
+ mtr_t *mtr)
+{
+ ut_ad(block.page.frame == page_align(rec));
+ ut_ad(!page_is_leaf(block.page.frame));
+ ut_ad(has_prev == page_has_prev(block.page.frame));
+
+ rec-= page_rec_is_comp(rec) ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS;
+
+ if (block.page.zip.data)
+ /* This flag is computed from other contents on a ROW_FORMAT=COMPRESSED
+ page. We are not modifying the compressed page frame at all. */
+ *rec|= REC_INFO_MIN_REC_FLAG;
+ else
+ mtr->write<1>(block, rec, *rec | REC_INFO_MIN_REC_FLAG);
+}
+
+/** Seek to the parent page of a B-tree page.
+@param[in,out] mtr mini-transaction
+@param[in,out] cursor cursor pointing to the x-latched parent page
+@return whether the cursor was successfully positioned */
+bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor)
+ MY_ATTRIBUTE((nonnull,warn_unused_result));
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return TRUE */ +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((warn_unused_result)); +#endif /* UNIV_DEBUG */ +/*************************************************************//** +Tries to merge the page first to the left immediate brother if such a +brother exists, and the node pointers to the current page and to the +brother reside on the same page. If the left brother does not satisfy these +conditions, looks at the right brother. If the page is the only one on that +level lifts the records of the page to the father page, thus reducing the +tree height. It is assumed that mtr holds an x-latch on the tree and on the +page. If cursor is on the leaf level, mtr must also hold x-latches to +the brothers, if they exist. +@return error code +@retval DB_FAIL if the tree could not be merged */ +dberr_t +btr_compress( +/*=========*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + bool adjust, /*!< in: whether the cursor position should be + adjusted even when compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. */ +dberr_t +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index tree */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr, /*!< in/out: mini-transaction + for x-latching and initializing + the page */ + dberr_t* err) /*!< out: error code */ + MY_ATTRIBUTE((warn_unused_result)); +/** Empty an index page (possibly the root page). @see btr_page_create(). +@param[in,out] block page to be emptied +@param[in,out] page_zip compressed page frame, or NULL +@param[in] index index of the page +@param[in] level B-tree level of the page (0=leaf) +@param[in,out] mtr mini-transaction */ +void +btr_page_empty( + buf_block_t* block, + page_zip_des_t* page_zip, + dict_index_t* index, + ulint level, + mtr_t* mtr) + MY_ATTRIBUTE((nonnull(1, 3, 5))); +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). 
*/
+void
+btr_page_create(
+/*============*/
+ buf_block_t* block, /*!< in/out: page to be created */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr); /*!< in: mtr */
+
+/** Free an index page.
+@param[in,out] index index tree
+@param[in,out] block block to be freed
+@param[in,out] mtr mini-transaction
+@param[in] blob whether this is freeing a BLOB page
+@param[in] space_latched whether index->table->space->x_lock() was called */
+MY_ATTRIBUTE((nonnull))
+dberr_t btr_page_free(dict_index_t *index, buf_block_t *block, mtr_t *mtr,
+ bool blob= false, bool space_latched= false);
+
+/**************************************************************//**
+Gets the root node of a tree and x- or s-latches it.
+@return root page, x- or s-latched */
+buf_block_t*
+btr_root_block_get(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ rw_lock_type_t mode, /*!< in: either RW_S_LATCH
+ or RW_X_LATCH */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err); /*!< out: error code */
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize_block(
+ ulint z_level,/*!< in: compression level to be used
+ if dealing with compressed page */
+ buf_block_t* block, /*!< in/out: B-tree page */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull, warn_unused_result));
+
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+void
+btr_print_size(
+/*===========*/
+ dict_index_t* index) /*!< in: index tree */
+ MY_ATTRIBUTE((nonnull));
+/**************************************************************//**
+Prints directories and other info of all nodes in the index. */
+void
+btr_print_index(
+/*============*/
+ dict_index_t* index, /*!< in: index */
+ ulint width) /*!< in: print this many entries from start
+ and end */
+ MY_ATTRIBUTE((nonnull));
+#endif /* UNIV_BTR_PRINT */
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+ibool
+btr_index_rec_validate(
+/*===================*/
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index, /*!< in: index */
+ ibool dump_on_error) /*!< in: TRUE if the function
+ should print hex dump of record
+ and page on error */
+ MY_ATTRIBUTE((warn_unused_result));
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return DB_SUCCESS if ok, error code if not */
+dberr_t
+btr_validate_index(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ const trx_t* trx) /*!< in: transaction or 0 */
+ MY_ATTRIBUTE((warn_unused_result));
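+
+/* A minimal usage sketch, illustrative only and not part of the
+upstream sources: a CHECK TABLE style consistency check of one index,
+with no enclosing transaction (trx may be 0 per the declaration above):
+
+  if (btr_validate_index(index, 0) != DB_SUCCESS) {
+    // The tree is inconsistent; the caller would report corruption.
+  }
+*/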
+
+/** Remove a page from the level list of pages.
+@param[in] block page to remove
+@param[in] index index tree
+@param[in,out] mtr mini-transaction */
+dberr_t btr_level_list_remove(const buf_block_t& block,
+ const dict_index_t& index, mtr_t* mtr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*************************************************************//**
+If the page is the only one on its level, this function moves its
+records to the father page, thus reducing the tree height.
+@return father block */
+buf_block_t*
+btr_lift_page_up(
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page which is the only one on its
+ level; must not be empty: use
+ btr_discard_only_page_on_level if the last
+ record from the page should be removed */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
+ __attribute__((nonnull));
+
+#define BTR_N_LEAF_PAGES 1
+#define BTR_TOTAL_SIZE 2
+
+#include "btr0btr.inl"
+
+/****************************************************************
+Global variable controlling whether scrubbing should be performed */
+extern my_bool srv_immediate_scrub_data_uncompressed;
diff --git a/storage/innobase/include/btr0btr.inl b/storage/innobase/include/btr0btr.inl
new file mode 100644
index 00000000..9a9e39b6
--- /dev/null
+++ b/storage/innobase/include/btr0btr.inl
@@ -0,0 +1,111 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.ic
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+index_id_t
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID));
+}
+
+/** Set PAGE_LEVEL.
+@param[in,out] block buffer block
+@param[in] level page level
+@param[in,out] mtr mini-transaction */
+inline
+void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
+{
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+ constexpr uint16_t field= PAGE_HEADER + PAGE_LEVEL;
+ byte *b= my_assume_aligned<2>(&block->page.frame[field]);
+ if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, level) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<2>(&block->page.zip.data[field], b, 2);
+}
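+
+/* The setters in this file all follow the pattern of btr_page_set_level()
+above: the field is written to the uncompressed frame through the
+mini-transaction, and only if the MAYBE_NOP write changed anything and
+the block has a ROW_FORMAT=COMPRESSED copy is the field mirrored into
+page_zip. A minimal sketch, illustrative only and not part of the
+upstream sources, marking a page as a leaf:
+
+  btr_page_set_level(block, 0, mtr);
+*/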
+
+/** Set FIL_PAGE_NEXT.
+@param[in,out] block buffer block
+@param[in] next number of successor page
+@param[in,out] mtr mini-transaction */
+inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
+{
+ constexpr uint16_t field= FIL_PAGE_NEXT;
+ byte *b= my_assume_aligned<4>(&block->page.frame[field]);
+ if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, next) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
+}
+
+/** Set FIL_PAGE_PREV.
+@param[in,out] block buffer block
+@param[in] prev number of predecessor page
+@param[in,out] mtr mini-transaction */
+inline void btr_page_set_prev(buf_block_t *block, ulint prev, mtr_t *mtr)
+{
+ constexpr uint16_t field= FIL_PAGE_PREV;
+ byte *b= my_assume_aligned<4>(&block->page.frame[field]);
+ if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, prev) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
+}
+
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+uint32_t
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+ /* The child address is in the last field */
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1, &len);
+
+ ut_ad(len == 4);
+
+ uint32_t page_no = mach_read_from_4(field);
+ ut_ad(page_no > 1);
+
+ return(page_no);
+}
diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h
new file mode 100644
index 00000000..9fcea86d
--- /dev/null
+++ b/storage/innobase/include/btr0bulk.h
@@ -0,0 +1,371 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0bulk.h
+The B-tree bulk load
+
+Created 03/11/2014 Shaohua Wang
+*************************************************************************/
+
+#ifndef btr0bulk_h
+#define btr0bulk_h
+
+#include "dict0dict.h"
+#include "rem0types.h"
+#include "page0cur.h"
+
+#include <vector>
+
+/** Innodb B-tree index fill factor for bulk load. */
+extern uint innobase_fill_factor;
+
+/*
+The proper function call sequence of PageBulk is as follows:
+-- PageBulk::init
+-- PageBulk::insert
+-- PageBulk::finish
+-- PageBulk::compress(COMPRESSED table only)
+-- PageBulk::pageSplit(COMPRESSED table only)
+-- PageBulk::commit
+*/
+
+class PageBulk
+{
+public:
+ /** Constructor
+ @param[in] index B-tree index
+ @param[in] trx_id transaction id
+ @param[in] page_no page number
+ @param[in] level page level */
+ PageBulk(
+ dict_index_t* index,
+ trx_id_t trx_id,
+ uint32_t page_no,
+ ulint level)
+ :
+ m_heap(NULL),
+ m_index(index),
+ m_mtr(),
+ m_trx_id(trx_id),
+ m_block(NULL),
+ m_page(NULL),
+ m_page_zip(NULL),
+ m_cur_rec(NULL),
+ m_page_no(page_no),
+ m_level(level),
+ m_is_comp(dict_table_is_comp(index->table)),
+ m_heap_top(NULL),
+ m_rec_no(0),
+ m_free_space(0),
+ m_reserved_space(0),
+#ifdef UNIV_DEBUG
+ m_total_data(0),
+#endif /* UNIV_DEBUG */
+ m_modify_clock(0),
+ m_err(DB_SUCCESS)
+ {
+ ut_ad(!dict_index_is_spatial(m_index));
+ ut_ad(!m_index->table->is_temporary());
+ }
+
+ /** Destructor */
+ ~PageBulk()
+ {
+ mem_heap_free(m_heap);
+ }
+
+ /** Initialize members, allocate the page if needed, and start mtr.
+ Note: must be called once and only once, right after the constructor.
+ @return error code */
+ dberr_t init();
+
+ /** Insert a record in the page.
+ @param[in] rec record
+ @param[in] offsets record offsets */
+ inline void insert(const rec_t* rec, rec_offs* offsets);
+private:
+ /** Page format */
+ enum format { REDUNDANT, DYNAMIC, COMPRESSED };
+ /** Mark end of insertion to the page. Scan all records to set page
+ dirs, and set page header members.
+ @tparam fmt the page format */
+ template<format fmt> inline void finishPage();
+ /** Insert a record in the page.
+ @tparam fmt the page format
+ @param[in,out] rec record
+ @param[in] offsets record offsets */
+ template<format fmt> inline void insertPage(rec_t* rec, rec_offs* offsets);
+
+public:
+ /** Mark end of insertion to the page. Scan all records to set page
+ dirs, and set page header members. */
+ inline void finish();
+
+ /** @return whether finish() actually needs to do something */
+ inline bool needs_finish() const;
+
+ /** Commit mtr for a page
+ @param[in] success whether all inserts succeeded */
+ void commit(bool success);
+
+ /** Compress if it is a compressed table
+ @return true if compression succeeded or was not needed
+ @return false if compression failed */
+ bool compress();
+
+ /** Check whether the record needs to be stored externally.
+ @return true if the record needs external storage
+ @return false otherwise */
+ bool needExt(const dtuple_t* tuple, ulint rec_size);
+
+ /** Store external record
+ @param[in] big_rec external record
+ @param[in] offsets record offsets
+ @return error code */
+ dberr_t storeExt(const big_rec_t* big_rec, rec_offs* offsets);
+
+ /** Get node pointer
+ @return node pointer */
+ dtuple_t* getNodePtr();
+
+ /** Get split rec in the page. We split a page in half when compression
+ fails, and the split rec should be copied to the new page.
+ @return split rec */
+ rec_t* getSplitRec();
+
+ /** Copy all records after split rec including itself.
+ @param[in] rec split rec */
+ void copyIn(rec_t* split_rec);
+
+ /** Remove all records after split rec including itself.
+ @param[in] rec split rec */
+ void copyOut(rec_t* split_rec);
+
+ /** Set next page
+ @param[in] next_page_no next page no */
+ inline void setNext(ulint next_page_no);
+
+ /** Set previous page
+ @param[in] prev_page_no previous page no */
+ inline void setPrev(ulint prev_page_no);
+
+ /** Release block by committing mtr */
+ inline void release();
+
+ /** Start mtr and latch block */
+ inline void latch();
+
+ /** Check if required space is available in the page for the rec
+ to be inserted. We check fill factor & padding here.
+ @param[in] rec_size required length
+ @return true if space is available */
+ inline bool isSpaceAvailable(ulint rec_size);
+
+ /** Get page no */
+ uint32_t getPageNo() const { return m_page_no; }
+
+ /** Get page level */
+ ulint getLevel()
+ {
+ return(m_level);
+ }
+
+ /** Get record no */
+ ulint getRecNo()
+ {
+ return(m_rec_no);
+ }
+
+ /** Get page */
+ page_t* getPage()
+ {
+ return(m_page);
+ }
+
+ /** Get page zip */
+ page_zip_des_t* getPageZip()
+ {
+ return(m_page_zip);
+ }
+
+ dberr_t getError()
+ {
+ return(m_err);
+ }
+
+ void set_modified() { m_mtr.set_modified(*m_block); }
+
+ /* Memory heap for internal allocation */
+ mem_heap_t* m_heap;
+
+private:
+ /** The index B-tree */
+ dict_index_t* m_index;
+
+ /** The mini-transaction */
+ mtr_t m_mtr;
+
+ /** The transaction id */
+ trx_id_t m_trx_id;
+
+ /** The buffer block */
+ buf_block_t* m_block;
+
+ /** The page */
+ page_t* m_page;
+
+ /** The page zip descriptor */
+ page_zip_des_t* m_page_zip;
+
+ /** The current rec, just before the next insert rec */
+ rec_t* m_cur_rec;
+
+ /** The page no */
+ uint32_t m_page_no;
+
+ /** The page level in B-tree */
+ ulint m_level;
+
+ /** Flag: is page in compact format */
+ const bool m_is_comp;
+
+ /** The heap top in page for next insert */
+ byte* m_heap_top;
+
+ /** User record no */
+ ulint m_rec_no;
+
+ /** The free space left in the page */
+ ulint m_free_space;
+
+ /** The reserved space for fill factor */
+ ulint m_reserved_space;
+
+ /** The padding space for compressed page */
+ ulint m_padding_space;
+
+#ifdef UNIV_DEBUG
+ /** Total data in the page */
+ ulint m_total_data;
+#endif /* UNIV_DEBUG */
+
+ /** The modify clock value of the buffer block
+ when the block is re-pinned */
+ ib_uint64_t m_modify_clock;
+
+ /** Operation result DB_SUCCESS or error code */
+ dberr_t m_err;
+};
+
+typedef std::vector<PageBulk*, ut_allocator<PageBulk*> >
+ page_bulk_vector;
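+
+/* A minimal sketch, illustrative only and not part of the upstream
+sources, of the PageBulk call sequence documented near the top of this
+file, for an uncompressed page; "index", "trx_id", "page_no", "rec" and
+"offsets" are placeholders supplied by the caller:
+
+  PageBulk page_bulk(index, trx_id, page_no, 0);
+  dberr_t err = page_bulk.init();
+  if (err == DB_SUCCESS) {
+    page_bulk.insert(rec, offsets);
+    page_bulk.finish();
+    page_bulk.commit(true);  // commits the mini-transaction
+  }
+*/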
+
+class BtrBulk
+{
+public:
+ /** Constructor
+ @param[in] index B-tree index
+ @param[in] trx transaction */
+ BtrBulk(
+ dict_index_t* index,
+ const trx_t* trx)
+ :
+ m_index(index),
+ m_trx(trx)
+ {
+ ut_ad(!dict_index_is_spatial(index));
+ }
+
+ /** Insert a tuple
+ @param[in] tuple tuple to insert
+ @return error code */
+ dberr_t insert(dtuple_t* tuple)
+ {
+ return(insert(tuple, 0));
+ }
+
+ /** Finish the B-tree bulk load. We commit the last page in each level
+ and copy the last page in the top level to the root page of the index
+ if no error occurs.
+ @param[in] err whether bulk load was successful until now
+ @return error code */
+ dberr_t finish(dberr_t err);
+
+ /** Release all latches */
+ void release();
+
+ /** Re-latch all latches */
+ void latch();
+
+ table_name_t table_name() { return m_index->table->name; }
+
+private:
+ /** Insert a tuple to a page in a level
+ @param[in] tuple tuple to insert
+ @param[in] level B-tree level
+ @return error code */
+ dberr_t insert(dtuple_t* tuple, ulint level);
+
+ /** Split a page
+ @param[in] page_bulk page to split
+ @param[in] next_page_bulk next page
+ @return error code */
+ dberr_t pageSplit(PageBulk* page_bulk,
+ PageBulk* next_page_bulk);
+
+ /** Commit (finish) a page. We set the next/prev page no, compress a
+ page of a compressed table and split the page if compression fails,
+ insert a node pointer to the father page if needed, and commit the
+ mini-transaction.
+ @param[in] page_bulk page to commit
+ @param[in] next_page_bulk next page
+ @param[in] insert_father whether a node pointer needs to be
+ inserted into the father page
+ @return error code */
+ dberr_t pageCommit(PageBulk* page_bulk,
+ PageBulk* next_page_bulk,
+ bool insert_father);
+
+ /** Abort a page when an error occurs
+ @param[in] page_bulk page bulk object
+ Note: pageAbort() should be called only for a PageBulk object that is
+ no longer in m_page_bulks after pageCommit(); the objects remaining in
+ m_page_bulks are committed or aborted in finish(). */
+ void pageAbort(PageBulk* page_bulk)
+ {
+ page_bulk->commit(false);
+ }
+
+ /** Log free check */
+ inline void logFreeCheck();
+
+private:
+ /** B-tree index */
+ dict_index_t*const m_index;
+
+ /** Transaction */
+ const trx_t*const m_trx;
+
+ /** Root page level */
+ ulint m_root_level;
+
+ /** Page cursor vector for all levels */
+ page_bulk_vector m_page_bulks;
+};
+
+#endif
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
new file mode 100644
index 00000000..f6abc9f5
--- /dev/null
+++ b/storage/innobase/include/btr0cur.h
@@ -0,0 +1,855 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0cur.h +The index tree cursor + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#ifndef btr0cur_h +#define btr0cur_h + +#include "dict0dict.h" +#include "page0cur.h" +#include "btr0types.h" +#include "rem0types.h" +#include "gis0type.h" +#include "my_base.h" +#ifdef BTR_CUR_HASH_ADAPT +# include "srw_lock.h" +#endif + +/** Mode flags for btr_cur operations; these can be ORed */ +enum { + /** do no undo logging */ + BTR_NO_UNDO_LOG_FLAG = 1, + /** do no record lock checking */ + BTR_NO_LOCKING_FLAG = 2, + /** sys fields will be found in the update vector or inserted + entry */ + BTR_KEEP_SYS_FLAG = 4, + + /** no rollback */ + BTR_NO_ROLLBACK = BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG, + + /** btr_cur_pessimistic_update() must keep cursor position + when moving columns to big_rec */ + BTR_KEEP_POS_FLAG = 8, + /** the caller is creating the index or wants to bypass the + index->info.online creation log */ + BTR_CREATE_FLAG = 16, + /** the caller of btr_cur_optimistic_update() or + btr_cur_update_in_place() will take care of + updating IBUF_BITMAP_FREE */ + BTR_KEEP_IBUF_BITMAP = 32 +}; + +#include "que0types.h" +#include "row0types.h" + +#define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur) +#define btr_cur_get_block(cursor) ((cursor)->page_cur.block) +#define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec) + +/*********************************************************//** +Returns the compressed page on which the tree cursor is positioned. +@return pointer to compressed page, or NULL if the page is not compressed */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the page of a tree cursor. +@return pointer to page */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the index of a cursor. +@param cursor b-tree cursor +@return index */ +#define btr_cur_get_index(cursor) ((cursor)->index()) +/*********************************************************//** +Positions a tree cursor at a given record. */ +UNIV_INLINE +void +btr_cur_position( +/*=============*/ + dict_index_t* index, /*!< in: index */ + rec_t* rec, /*!< in: record in tree */ + buf_block_t* block, /*!< in: buffer block of rec */ + btr_cur_t* cursor);/*!< in: cursor */ + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] table table definition from the data dictionary +@return error code +@retval DB_SUCCESS if no error occurred */ +dberr_t +btr_cur_instant_init(dict_table_t* table) + ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result)); + +/** Initialize the n_core_null_bytes on first access to a clustered +index root page. 
+@param[in] index clustered index that is on its first access +@param[in] page clustered index root page +@return whether the page is corrupted */ +bool +btr_cur_instant_root_init(dict_index_t* index, const page_t* page) + ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result)); + +MY_ATTRIBUTE((warn_unused_result)) +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given non-leaf level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +cursor->up_match and cursor->low_match both will have sensible values. +Cursor is left at the place where an insert of the +search tuple should be performed in the B-tree. InnoDB does an insert +immediately after the cursor. Thus, the cursor may end up on a user record, +or on a page infimum record. +@param level the tree level of search +@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that + it cannot get compared to the node ptr page number field! +@param latch RW_S_LATCH or RW_X_LATCH +@param cursor tree cursor; the cursor page is s- or x-latched, but see also + above! +@param mtr mini-transaction +@return DB_SUCCESS on success or error code otherwise */ +dberr_t btr_cur_search_to_nth_level(ulint level, + const dtuple_t *tuple, + rw_lock_type_t rw_latch, + btr_cur_t *cursor, mtr_t *mtr); + +/*************************************************************//** +Tries to perform an insert to a page in an index tree, next to cursor. +It is assumed that mtr holds an x-latch on the page. The operation does +not succeed if there is too little space on the page. If there is just +one record on the page, the insert will always succeed; this is to +prevent trying to split a page with just one record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */ +dberr_t +btr_cur_optimistic_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction; + if this function returns DB_SUCCESS on + a leaf page of a secondary index in a + compressed tablespace, the caller must + mtr_commit(mtr) before latching + any further pages */ + MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result)); +/*************************************************************//** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. 
+@return DB_SUCCESS or error number */ +dberr_t +btr_cur_pessimistic_insert( +/*=======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /*!< in: cursor after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result)); +/*************************************************************//** +See if there is enough place in the page modification log to log +an update-in-place. + +@retval false if out of space; IBUF_BITMAP_FREE will be reset +outside mtr if the page was recompressed +@retval true if enough place; + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is +a secondary index leaf page. This has to be done either within the +same mini-transaction, or by invoking ibuf_reset_free_bits() before +mtr_commit(mtr). */ +bool +btr_cur_update_alloc_zip_func( +/*==========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + page_cur_t* cursor, /*!< in/out: B-tree page cursor */ +#ifdef UNIV_DEBUG + rec_offs* offsets,/*!< in/out: offsets of the cursor record */ +#endif /* UNIV_DEBUG */ + ulint length, /*!< in: size needed */ + bool create, /*!< in: true=delete-and-insert, + false=update-in-place */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \ + btr_cur_update_alloc_zip_func(page_zip,cursor,offsets,len,cr,mtr) +#else /* UNIV_DEBUG */ +# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \ + btr_cur_update_alloc_zip_func(page_zip,cursor,len,cr,mtr) +#endif /* UNIV_DEBUG */ + +/** Apply an update vector to a record. No field size changes are allowed. + +This is usually invoked on a clustered index. The only use case for a +secondary index is row_ins_sec_index_entry_by_modify() or its +counterpart in ibuf_insert_to_index_page(). +@param[in,out] rec index record +@param[in] index the index of the record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] update update vector +@param[in,out] block index page +@param[in,out] mtr mini-transaction */ +void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index, + const rec_offs *offsets, const upd_t *update, + buf_block_t *block, mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Updates a record when the update causes no size changes in its fields. 
+@return locking or undo log related error code, or +@retval DB_SUCCESS on success +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_update_in_place( +/*====================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); +/*************************************************************//** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. +@return error code, including +@retval DB_SUCCESS on success +@retval DB_OVERFLOW if the updated record does not fit +@retval DB_UNDERFLOW if the page would become too empty +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page */ +dberr_t +btr_cur_optimistic_update( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */ + const upd_t* update, /*!< in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); +/*************************************************************//** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. +@return DB_SUCCESS or error code */ +dberr_t +btr_cur_pessimistic_update( +/*=======================*/ + ulint flags, /*!< in: undo logging, locking, and rollback + flags */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap + that can be emptied */ + mem_heap_t* entry_heap, + /*!< in/out: memory heap for allocating + big_rec and the index tuple */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + upd_t* update, /*!< in/out: update vector; this is allowed to + also contain trx id and roll ptr fields. 
+ Non-updated columns that are moved offpage will + be appended to this. */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; must be committed + before latching any further pages */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); +/***********************************************************//** +Marks a clustered index record deleted. Writes an undo log record to +undo log on this delete marking. Writes in the trx id field the id +of the deleting transaction, and in the roll ptr field pointer to the +undo log record created. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +dberr_t +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + buf_block_t* block, /*!< in/out: buffer block of the record */ + rec_t* rec, /*!< in/out: record */ + dict_index_t* index, /*!< in: clustered index of the record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */ + que_thr_t* thr, /*!< in: query thread */ + const dtuple_t* entry, /*!< in: dtuple for the deleting record */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Tries to compress a page of the tree if it seems useful. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! +@return whether compression occurred */ +bool +btr_cur_compress_if_useful( +/*=======================*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + bool adjust, /*!< in: whether the cursor position should be + adjusted even when compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); +/*******************************************************//** +Removes the record on which the tree cursor is positioned. It is assumed +that the mtr has an x-latch on the page where the cursor is positioned, +but no latch on the whole tree. +@return error code +@retval DB_FAIL if the page would become too empty */ +dberr_t +btr_cur_optimistic_delete( +/*======================*/ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + cursor stays valid: if deletion succeeds, + on function exit it points to the successor + of the deleted record */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + mtr_t* mtr) /*!< in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. 
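+
+A minimal sketch of an invocation (assuming that no extents were reserved,
+so the caller must be prepared to retry on DB_OUT_OF_FILE_SPACE; compare
+BTR_CUR_RETRY_DELETE_N_TIMES below):
+@code
+	dberr_t	err;
+	if (btr_cur_pessimistic_delete(&err, FALSE, cursor, 0, false, mtr)) {
+		// compression occurred; the cursor may no longer be valid
+	}
+	if (err == DB_OUT_OF_FILE_SPACE) {
+		// commit the mini-transaction, wait, and retry
+	}
+@endcode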
+@return TRUE if compression occurred */
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+				the latter may occur because we may have
+				to update node pointers on upper levels,
+				and in the case of variable length keys
+				these may actually grow in size */
+	ibool		has_reserved_extents, /*!< in: TRUE if the
+				caller has already reserved enough free
+				extents so that the operation is known
+				to succeed */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
+				if compression does not occur, the cursor
+				stays valid: it points to successor of
+				deleted record on function exit */
+	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
+	bool		rollback,/*!< in: performing rollback? */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull));
+/** Delete the node pointer in a parent page.
+@param[in,out]	parent	cursor pointing to parent record
+@param[in,out]	mtr	mini-transaction */
+dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return end of log record or NULL */
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index);	/*!< in: index corresponding to page */
+/** Arguments to btr_estimate_n_rows_in_range */
+struct btr_pos_t
+{
+  btr_pos_t(dtuple_t *arg_tuple,
+            page_cur_mode_t arg_mode,
+            page_id_t arg_page_id)
+    :tuple(arg_tuple), mode(arg_mode), page_id(arg_page_id)
+  {}
+
+  dtuple_t*       tuple;   /* Range start or end. May be NULL */
+  page_cur_mode_t mode;    /* search mode for range */
+  page_id_t       page_id; /* Out: Page where we found the tuple */
+};
+
+/** Estimates the number of rows in a given index range. Search the left
+page first; if there are pages between the left and right ones, read a few
+pages to the right. If the right page is reached, fetch it and count the
+exact number of rows; otherwise, count the estimated (see
+btr_estimate_n_rows_in_range_on_level() for details) number of rows, and
+fetch the right page. When the leaves are reached, unlatch non-leaf pages
+except the right leaf parent. After the right leaf page is fetched, commit
+the mtr.
+@param[in]	index		index
+@param[in]	range_start	range start
+@param[in]	range_end	range end
+@return estimated number of rows */
+ha_rows btr_estimate_n_rows_in_range(dict_index_t *index,
+                                     btr_pos_t *range_start,
+                                     btr_pos_t *range_end);
+
+/** Gets the externally stored size of a record, in units of a database page.
+@param[in]	rec	record
+@param[in]	offsets	array returned by rec_get_offsets()
+@return externally stored part, in units of a database page */
+ulint
+btr_rec_get_externally_stored_len(
+	const rec_t*	rec,
+	const rec_offs*	offsets);
+
+/*******************************************************************//**
+Marks non-updated off-page fields as disowned by this record. The ownership
+must be transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of an externally stored field is allowed
+to free the field. */
+void
+btr_cur_disown_inherited_fields(
+/*============================*/
+	buf_block_t*	block,	/*!< in/out: index page */
+	rec_t*		rec,	/*!< in/out: record in a clustered index */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
+	const upd_t*	update,	/*!< in: update vector */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull(2,3,4,5,6)));
+
+/** Operation code for btr_store_big_rec_extern_fields(). */
+enum blob_op {
+	/** Store off-page columns for a freshly inserted record */
+	BTR_STORE_INSERT = 0,
+	/** Store off-page columns for an insert by update */
+	BTR_STORE_INSERT_UPDATE,
+	/** Store off-page columns for an update */
+	BTR_STORE_UPDATE,
+	/** Store off-page columns for a freshly inserted record by bulk */
+	BTR_STORE_INSERT_BULK
+};
+
+/*******************************************************************//**
+Determine if an operation on off-page columns is an update.
+@return TRUE if op != BTR_STORE_INSERT */
+UNIV_INLINE
+ibool
+btr_blob_op_is_update(
+/*==================*/
+	enum blob_op	op)	/*!< in: operation */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from the leaf node
+file segment of the index tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+btr_store_big_rec_extern_fields(
+/*============================*/
+	btr_pcur_t*	pcur,		/*!< in: a persistent cursor */
+	rec_offs*	offsets,	/*!< in/out: rec_get_offsets() on
+					pcur. The "external storage" flags
+					in offsets will correctly correspond
+					to rec when this function returns */
+	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
+					to be stored externally */
+	mtr_t*		btr_mtr,	/*!< in/out: mtr containing the
+					latches to the clustered index. Can be
+					committed and restarted. */
+	enum blob_op	op)	/*!< in: operation code */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management, if the field in data owns the externally stored field.
+In a rollback we may have the additional condition that the field
+must not be inherited. */
+void
+btr_free_externally_stored_field(
+/*=============================*/
+	dict_index_t*	index,		/*!< in: index of the data, the index
+					tree MUST be X-latched; if the tree
+					height is 1, then also the root page
+					must be X-latched! (this is relevant
+					in the case this function is called
+					from purge where 'data' is located on
+					an undo log page, not an index
+					page) */
+	byte*		field_ref,	/*!< in/out: field reference */
+	const rec_t*	rec,		/*!< in: record containing field_ref, for
+					page_zip_write_blob_ptr(), or NULL */
+	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index),
+					or NULL */
+	buf_block_t*	block,		/*!< in/out: page of field_ref */
+	ulint		i,		/*!< in: field number of field_ref;
+					ignored if rec == NULL */
+	bool		rollback,	/*!< in: performing rollback? */
+	mtr_t*		local_mtr)	/*!< in: mtr containing the latch */
+	MY_ATTRIBUTE((nonnull(1,2,5,8)));
+
+/** Copies the prefix of an externally stored field of a record.
+The clustered index record must be protected by a lock or a page latch.
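+
+A minimal sketch of a call (the buffer size is an arbitrary assumption;
+data and local_len are as described by the parameters below):
+@code
+	byte	buf[REC_VERSION_56_MAX_INDEX_COL_LEN];
+	ulint	copied = btr_copy_externally_stored_field_prefix(
+		buf, sizeof buf, 0, data, local_len);
+@endcode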
+@param[out]	buf		the field, or a prefix of it
+@param[in]	len		length of buf, in bytes
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	data		'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in]	local_len	length of data, in bytes
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+ulint
+btr_copy_externally_stored_field_prefix(
+	byte*		buf,
+	ulint		len,
+	ulint		zip_size,
+	const byte*	data,
+	ulint		local_len);
+
+/** Copies an externally stored field of a record to mem heap.
+The clustered index record must be protected by a lock or a page latch.
+@param[out]	len		length of the whole field
+@param[in]	data		'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	local_len	length of data
+@param[in,out]	heap		mem heap
+@return the whole field copied to heap */
+byte*
+btr_copy_externally_stored_field(
+	ulint*		len,
+	const byte*	data,
+	ulint		zip_size,
+	ulint		local_len,
+	mem_heap_t*	heap);
+
+/** Copies an externally stored field of a record to mem heap.
+@param[in]	rec		record in a clustered index; must be
+protected by a lock or a page latch
+@param[in]	offsets		array returned by rec_get_offsets()
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	no		field number
+@param[out]	len		length of the field
+@param[in,out]	heap		mem heap
+@return the field copied to heap, or NULL if the field is incomplete */
+byte*
+btr_rec_copy_externally_stored_field(
+	const rec_t*	rec,
+	const rec_offs*	offsets,
+	ulint		zip_size,
+	ulint		no,
+	ulint*		len,
+	mem_heap_t*	heap);
+
+/*######################################################################*/
+
+/** In the pessimistic delete, if the page data size drops below this
+limit, merging it to a neighbor is tried */
+#define BTR_CUR_PAGE_COMPRESS_LIMIT(index) \
+	((srv_page_size * (ulint)((index)->merge_threshold)) / 100)
+
+/** A slot in the path array. We store here info on a search path down the
+tree. Each slot contains data on a single level of the tree. */
+struct btr_path_t {
+	/* Assume a page like:
+	records:		(inf, a, b, c, d, sup)
+	index of the record:	0, 1, 2, 3, 4, 5
+	*/
+
+	/** Index of the record where the page cursor stopped on this level
+	(index in alphabetical order). Value ULINT_UNDEFINED denotes array
+	end. In the above example, if the search stopped on record 'c', then
+	nth_rec will be 3. */
+	ulint	nth_rec;
+
+	/** Number of records on the page, not counting inf and sup.
+	In the above example n_recs will be 4. */
+	ulint	n_recs;
+
+	/** Number of the page containing the record. */
+	uint32_t	page_no;
+
+	/** Level of the page. If we later fetch the page under page_no
+	and it is on a different level, then we know that the tree has been
+	reorganized. */
+	ulint	page_level;
+};
+
+#define BTR_PATH_ARRAY_N_SLOTS	250	/*!< size of path array (in slots) */
+
+/** Values for the flag documenting the used search method */
+enum btr_cur_method {
+	BTR_CUR_HASH = 1,	/*!< successful shortcut using
+				the hash index */
+	BTR_CUR_HASH_FAIL,	/*!< failure using hash, success using
+				binary search: the misleading hash
+				reference is stored in the field
+				hash_node, and it might be necessary
+				to update it */
+	BTR_CUR_BINARY,		/*!< success using the binary search */
+	BTR_CUR_INSERT_TO_IBUF,	/*!< performed the intended insert to
+				the insert buffer */
+	BTR_CUR_DEL_MARK_IBUF,	/*!< performed the intended delete
+				mark in the insert/delete buffer */
+	BTR_CUR_DELETE_IBUF,	/*!< performed the intended delete in
+				the insert/delete buffer */
+	BTR_CUR_DELETE_REF	/*!< row_purge_poss_sec() failed */
+};
+
+/** The tree cursor: the definition appears here only for the compiler
+to know struct size! */
+struct btr_cur_t {
+	page_cur_t	page_cur;	/*!< page cursor */
+	purge_node_t*	purge_node;	/*!< purge node, for BTR_DELETE */
+	/*------------------------------*/
+	que_thr_t*	thr;		/*!< this field is only used
+					when search_leaf()
+					is called for an index entry
+					insertion: the calling query
+					thread is passed here to be
+					used in the insert buffer */
+	/*------------------------------*/
+	/** The following fields are used in
+	search_leaf() to pass information: */
+	/* @{ */
+	enum btr_cur_method	flag;	/*!< Search method used */
+	ulint		tree_height;	/*!< Tree height if the search is done
+					for a pessimistic insert or update
+					operation */
+	ulint		up_match;	/*!< If the search mode was PAGE_CUR_LE,
+					the number of matched fields to
+					the first user record to the right of
+					the cursor record after search_leaf();
+					for the mode PAGE_CUR_GE, the matched
+					fields to the first user record AT THE
+					CURSOR or to the right of it;
+					NOTE that the up_match and low_match
+					values may exceed the correct values
+					for comparison to the adjacent user
+					record if that record is on a
+					different leaf page! (See the note in
+					row_ins_duplicate_error_in_clust.) */
+	ulint		up_bytes;	/*!< number of matched bytes to the
+					right at the time cursor positioned;
+					only used internally in searches: not
+					defined after the search */
+	ulint		low_match;	/*!< if search mode was PAGE_CUR_LE,
+					the number of matched fields to the
+					first user record AT THE CURSOR or
+					to the left of it after search_leaf();
+					NOT defined for PAGE_CUR_GE or any
+					other search modes; see also the NOTE
+					in up_match! */
+	ulint		low_bytes;	/*!< number of matched bytes to the
+					left at the time cursor positioned;
+					only used internally in searches: not
+					defined after the search */
+	ulint		n_fields;	/*!< prefix length used in a hash
+					search if hash_node != NULL */
+	ulint		n_bytes;	/*!< hash prefix bytes if hash_node !=
+					NULL */
+	ulint		fold;		/*!< fold value used in the search if
+					flag is BTR_CUR_HASH */
+	/* @} */
+	btr_path_t*	path_arr;	/*!< in estimating the number of
+					rows in range, we store in this array
+					information about the path through
+					the tree */
+	rtr_info_t*	rtr_info;	/*!< rtree search info */
+	btr_cur_t() { memset((void*) this, 0, sizeof *this); }
+
+  dict_index_t *index() const { return page_cur.index; }
+  buf_block_t *block() const { return page_cur.block; }
+
+  /** Open the cursor on the first or last record.
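+  A minimal usage sketch (assuming index and mtr have been set up by the
+  caller):
+  @code
+	btr_cur_t	cur;
+	dberr_t	err = cur.open_leaf(true, index, BTR_SEARCH_LEAF, mtr);
+  @endcode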
+  @param first true=first record, false=last record
+  @param index B-tree
+  @param latch_mode which latches to acquire
+  @param mtr mini-transaction
+  @return error code */
+  dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+                    mtr_t *mtr);
+
+  /** Search the leaf page record corresponding to a key.
+  @param tuple key to search for, with correct n_fields_cmp
+  @param mode search mode; PAGE_CUR_LE for unique prefix or for inserting
+  @param latch_mode latch mode
+  @param mtr mini-transaction
+  @return error code */
+  dberr_t search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+                      btr_latch_mode latch_mode, mtr_t *mtr);
+
+  /** Search the leaf page record corresponding to a key, exclusively latching
+  all sibling pages on the way.
+  @param tuple key to search for, with correct n_fields_cmp
+  @param mode search mode; PAGE_CUR_LE for unique prefix or for inserting
+  @param mtr mini-transaction
+  @return error code */
+  dberr_t pessimistic_search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+                                  mtr_t *mtr);
+
+  /** Open the cursor at a random leaf page record.
+  @param offsets temporary memory for rec_get_offsets()
+  @param heap memory heap for rec_get_offsets()
+  @param mtr mini-transaction
+  @return error code */
+  inline dberr_t open_random_leaf(rec_offs *&offsets, mem_heap_t *& heap,
+                                  mtr_t &mtr);
+};
+
+/** Modify the delete-mark flag of a record.
+@tparam flag	the value of the delete-mark flag
+@param[in,out]	block	buffer block
+@param[in,out]	rec	record on a physical index page
+@param[in,out]	mtr	mini-transaction */
+template<bool flag>
+void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
+	MY_ATTRIBUTE((nonnull));
+
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later. Try this many
+times. */
+#define BTR_CUR_RETRY_DELETE_N_TIMES	100
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later. Sleep this time
+between retries. */
+static const std::chrono::milliseconds BTR_CUR_RETRY_SLEEP_TIME(50);
+
+/** The reference in a field for which data is stored on a different page.
+The reference is at the end of the 'locally' stored part of the field.
+'Locally' means storage in the index record.
+We store locally a long enough prefix of each column so that we can determine
+the ordering parts of each index record without looking into the externally
+stored part. */
+/*-------------------------------------- @{ */
+#define BTR_EXTERN_SPACE_ID		0U	/*!< space id where stored */
+#define BTR_EXTERN_PAGE_NO		4U	/*!< page no where stored */
+#define BTR_EXTERN_OFFSET		8U	/*!< offset of BLOB header
+						on that page */
+#define BTR_EXTERN_LEN			12U	/*!< 8 bytes containing the
+						length of the externally
+						stored part of the BLOB.
+						The 2 highest bits are
+						reserved for the flags below. */
+/*-------------------------------------- @} */
+/* #define BTR_EXTERN_FIELD_REF_SIZE	20	// moved to btr0types.h */
+
+/** The most significant bit of BTR_EXTERN_LEN (i.e., the most
+significant bit of the byte at smallest address) is set to 1 if this
+field does not 'own' the externally stored field; only the owner field
+is allowed to free the field in purge! */
+#define BTR_EXTERN_OWNER_FLAG		128U
+/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the
+second most significant bit of the byte at smallest address) is 1 then
+it means that the externally stored field was inherited from an
+earlier version of the row.
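+
+A minimal sketch of testing these flags (assuming field_ref points to the
+20-byte reference described above):
+@code
+	byte	flag_byte = mach_read_from_1(field_ref + BTR_EXTERN_LEN);
+	bool	owned = !(flag_byte & BTR_EXTERN_OWNER_FLAG);
+	bool	inherited = !!(flag_byte & BTR_EXTERN_INHERITED_FLAG);
+@endcode
+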
In rollback we are not allowed to free an +inherited external field. */ +#define BTR_EXTERN_INHERITED_FLAG 64U + +#ifdef BTR_CUR_HASH_ADAPT +/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */ +extern ib_counter_t btr_cur_n_non_sea; +/** Old value of btr_cur_n_non_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +extern ulint btr_cur_n_non_sea_old; +/** Number of successful adaptive hash index lookups in +btr_cur_t::search_leaf(). */ +extern ib_counter_t btr_cur_n_sea; +/** Old value of btr_cur_n_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +extern ulint btr_cur_n_sea_old; +#endif /* BTR_CUR_HASH_ADAPT */ + +#ifdef UNIV_DEBUG +/* Flag to limit optimistic insert records */ +extern uint btr_cur_limit_optimistic_insert_debug; +#endif /* UNIV_DEBUG */ + +#include "btr0cur.inl" + +#endif diff --git a/storage/innobase/include/btr0cur.inl b/storage/innobase/include/btr0cur.inl new file mode 100644 index 00000000..955cf342 --- /dev/null +++ b/storage/innobase/include/btr0cur.inl @@ -0,0 +1,170 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0cur.ic +The index tree cursor + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0btr.h" + +#ifdef UNIV_DEBUG +# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)\ +if (btr_cur_limit_optimistic_insert_debug > 1\ + && (NREC) >= btr_cur_limit_optimistic_insert_debug) {\ + CODE;\ +} +#else +# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE) +#endif /* UNIV_DEBUG */ + +/*********************************************************//** +Returns the compressed page on which the tree cursor is positioned. +@return pointer to compressed page, or NULL if the page is not compressed */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(buf_block_get_page_zip(btr_cur_get_block(cursor))); +} + +/*********************************************************//** +Returns the page of a tree cursor. +@return pointer to page */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(page_align(page_cur_get_rec(&(cursor->page_cur)))); +} + +/*********************************************************//** +Positions a tree cursor at a given record. 
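+For example (a sketch; rec and block must refer to a record in index):
+@code
+	btr_cur_t	cursor;
+	btr_cur_position(index, rec, block, &cursor);
+@endcode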
*/ +UNIV_INLINE +void +btr_cur_position( +/*=============*/ + dict_index_t* index, /*!< in: index */ + rec_t* rec, /*!< in: record in tree */ + buf_block_t* block, /*!< in: buffer block of rec */ + btr_cur_t* cursor) /*!< out: cursor */ +{ + page_cur_position(rec, block, btr_cur_get_page_cur(cursor)); + cursor->page_cur.index = index; +} + +/*********************************************************************//** +Checks if compressing an index page where a btr cursor is placed makes +sense. +@return TRUE if compression is recommended */ +UNIV_INLINE +ibool +btr_cur_compress_recommendation( +/*============================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page; + + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2U, + return(FALSE)); + + if (!page_has_siblings(page) + || page_get_data_size(page) + < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) { + + /* The page fillfactor has dropped below a predefined + minimum value OR the level in the B-tree contains just + one page: we recommend compression if this is not the + root page. */ + + return cursor->index()->page + != btr_cur_get_block(cursor)->page.id().page_no(); + } + + return(FALSE); +} + +/*********************************************************************//** +Checks if the record on which the cursor is placed can be deleted without +making tree compression necessary (or, recommended). +@return TRUE if can be deleted without recommended compression */ +UNIV_INLINE +ibool +btr_cur_can_delete_without_compress( +/*================================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + ulint rec_size,/*!< in: rec_get_size(btr_cur_get_rec(cursor))*/ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + if (!page_has_siblings(page) || page_get_n_recs(page) < 2 + || page_get_data_size(page) - rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) { + + /* The page fillfactor will drop below a predefined + minimum value, OR the level in the B-tree contains just + one page, OR the page will become empty: we recommend + compression if this is not the root page. */ + + return cursor->index()->page + == btr_cur_get_block(cursor)->page.id().page_no(); + } + + return(TRUE); +} + +/*******************************************************************//** +Determine if an operation on off-page columns is an update. +@return TRUE if op != BTR_STORE_INSERT */ +UNIV_INLINE +ibool +btr_blob_op_is_update( +/*==================*/ + enum blob_op op) /*!< in: operation */ +{ + switch (op) { + case BTR_STORE_INSERT: + case BTR_STORE_INSERT_BULK: + return(FALSE); + case BTR_STORE_INSERT_UPDATE: + case BTR_STORE_UPDATE: + return(TRUE); + } + + ut_ad(0); + return(FALSE); +} diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h new file mode 100644 index 00000000..0523829b --- /dev/null +++ b/storage/innobase/include/btr0defragment.h @@ -0,0 +1,65 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. +Copyright (C) 2014, 2021, MariaDB Corporation. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef btr0defragment_h
+#define btr0defragment_h
+
+#include "btr0pcur.h"
+
+/* Max number of pages to consider at once during defragmentation. */
+#define BTR_DEFRAGMENT_MAX_N_PAGES	32
+
+/** stats in btr_defragment */
+extern Atomic_counter<ulint> btr_defragment_compression_failures;
+extern Atomic_counter<ulint> btr_defragment_failures;
+extern Atomic_counter<ulint> btr_defragment_count;
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init(void);
+/******************************************************************//**
+Shutdown defragmentation. */
+void
+btr_defragment_shutdown();
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. */
+bool
+btr_defragment_find_index(
+	dict_index_t*	index);	/*!< Index to find. */
+/** Defragment an index.
+@param pcur  persistent cursor
+@param thd   current session, for checking thd_killed()
+@return whether the operation was interrupted */
+bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd);
+/******************************************************************//**
+When a table is dropped, this function is called to mark a table as removed in
+btr_defragment_wq. The difference between this function and the remove_index
+function is that this one will not NULL the event. */
+void
+btr_defragment_remove_table(
+	dict_table_t*	table);	/*!< Table to be removed. */
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent
+storage. */
+void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index);
+
+/* Stop defragmentation. */
+void btr_defragment_end();
+extern bool btr_defragment_active;
+#endif
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
new file mode 100644
index 00000000..c66a3bfa
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.h
@@ -0,0 +1,459 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.h
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "buf0block_hint.h"
+#include "btr0btr.h"
+#include "gis0rtree.h"
+
+/* Relative positions for a stored cursor position */
+enum btr_pcur_pos_t {
+	BTR_PCUR_ON		= 1,
+	BTR_PCUR_BEFORE		= 2,
+	BTR_PCUR_AFTER		= 3,
+/* Note that if the tree is not empty, btr_pcur_store_position does not
+use the following, but only uses the above three alternatives, where the
+position is stored relative to a specific record: this makes implementation
+of a scroll cursor easier */
+	BTR_PCUR_BEFORE_FIRST_IN_TREE	= 4,	/* in an empty tree */
+	BTR_PCUR_AFTER_LAST_IN_TREE	= 5	/* in an empty tree */
+};
+
+/**************************************************************//**
+Resets a persistent cursor object, freeing ::old_rec_buf if it is
+allocated and resetting the other members to their initial values. */
+void
+btr_pcur_reset(
+/*===========*/
+	btr_pcur_t*	cursor);/*!< in, out: persistent cursor */
+
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+	btr_pcur_t*	pcur_receive,	/*!< in: pcur which will receive the
+					position info */
+	btr_pcur_t*	pcur_donate);	/*!< in: pcur from which the info is
+					copied */
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+	btr_pcur_t*	pcur);	/*!< in: persistent cursor */
+
+/** Opens a persistent cursor to an index tree without initializing the
+cursor.
+@param tuple      tuple on which search done
+@param mode       PAGE_CUR_L, ...; NOTE that if the search is made using a
+                  unique prefix of a record, mode should be PAGE_CUR_LE, not
+                  PAGE_CUR_GE, as the latter may end up on the previous page of
+                  the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param cursor     memory buffer for persistent cursor
+@param mtr        mini-transaction
+@return DB_SUCCESS on success or error code otherwise. */
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+                                   btr_latch_mode latch_mode,
+                                   btr_pcur_t *cursor, mtr_t *mtr);
+
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+	const btr_pcur_t*	cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the left if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+	const btr_pcur_t*	cursor); /*!< in: persistent cursor */
+
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by mtr_t::commit(). */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+	btr_pcur_t*	cursor);	/*!< in: persistent cursor */
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+void
+btr_pcur_store_position(
+/*====================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of the cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+	btr_pcur_t*	pcur,	/*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr to commit */
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used for both cursors before
+calling this, if restoration of the cursors is wanted later.
+@param[in]	pcur		persistent cursor
+@param[in]	sec_pcur	secondary index persistent cursor
+@param[in]	mtr		mtr to commit */
+UNIV_INLINE
+void
+btr_pcurs_commit_specify_mtr(
+	btr_pcur_t*	pcur,
+	btr_pcur_t*	sec_pcur,
+	mtr_t*		mtr);
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
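+
+A minimal sketch of one backward step (the cursor is assumed to be
+positioned and latched):
+@code
+	if (!btr_pcur_move_to_prev(cursor, mtr)) {
+		// no records are left; the cursor is before first in tree
+	}
+@endcode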
+@return true if the cursor was not before first in tree */
+bool
+btr_pcur_move_to_prev(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page.
+Releases the latch on the current page, and buffer-unfixes it.
+Note that there must not be modifications on the current page,
+as then the x-latch can be released only in mtr_commit. */
+dberr_t
+btr_pcur_move_to_next_page(
+/*=======================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; must be on the
+				last record of the current page */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
+#define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
+#define btr_pcur_get_page(cursor) btr_pcur_get_block(cursor)->page.frame
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor);
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor);
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor);/*!< in/out: persistent cursor */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor);/*!< in/out: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_before_first_on_page(
+/*===============================*/
+	btr_pcur_t*	cursor); /*!< in/out: persistent cursor */
+
+/** Position state of persistent B-tree cursor. */
+enum pcur_pos_t {
+	/** The persistent cursor is not positioned. */
+	BTR_PCUR_NOT_POSITIONED = 0,
+	/** The persistent cursor was previously positioned.
+	TODO: currently, the state can be BTR_PCUR_IS_POSITIONED,
+	though it really should be BTR_PCUR_WAS_POSITIONED,
+	because we have no obligation to commit the cursor with
+	mtr; similarly latch_mode may be out of date. This can
+	lead to problems if btr_pcur is not used the right way;
+	all current code should be ok. */
+	BTR_PCUR_WAS_POSITIONED,
+	/** The persistent cursor is positioned by optimistic get to the same
+	record as it was positioned at. Not used for rel_pos == BTR_PCUR_ON.
+	It may need adjustment depending on previous/current search direction
+	and rel_pos. */
+	BTR_PCUR_IS_POSITIONED_OPTIMISTIC,
+	/** The persistent cursor is positioned by index search.
+	Or optimistic get for rel_pos == BTR_PCUR_ON. */
+	BTR_PCUR_IS_POSITIONED
+};
+
+/* The persistent B-tree cursor structure. This is used mainly for SQL
+selects, updates, and deletes. */
+
+struct btr_pcur_t
+{
+  /** Return value of restore_position() */
+  enum restore_status {
+    /** cursor position is on user rec and points on the record with
+    the same field values as in the stored record */
+    SAME_ALL,
+    /** cursor position is on user rec and points on the record with
+    the same unique field values as in the stored record */
+    SAME_UNIQ,
+    /** cursor position is not on user rec or points on a record
+    whose unique field values differ from those in the stored record */
+    NOT_SAME,
+    /** the index tree is corrupted */
+    CORRUPTED
+  };
+  /** a B-tree cursor */
+  btr_cur_t btr_cur;
+  /** @see BTR_PCUR_WAS_POSITIONED
+  BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, BTR_MODIFY_TREE or BTR_NO_LATCHES,
+  depending on the latching state of the page and tree where the cursor
+  is positioned; BTR_NO_LATCHES means that the cursor is not currently
+  positioned:
+  we say then that the cursor is detached; it can be restored to
+  attached if the old position was stored in old_rec */
+  btr_latch_mode latch_mode= BTR_NO_LATCHES;
+  /** if cursor position is stored, contains an initial segment of the
+  latest record cursor was positioned either on, before or after */
+  rec_t *old_rec= nullptr;
+  /** btr_cur.index()->n_core_fields when old_rec was copied */
+  uint16 old_n_core_fields= 0;
+  /** number of fields in old_rec */
+  uint16 old_n_fields= 0;
+  /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on
+  whether cursor was on, before, or after the old_rec record */
+  btr_pcur_pos_t rel_pos= btr_pcur_pos_t(0);
+  /** buffer block when the position was stored */
+  buf::Block_hint block_when_stored;
+  /** the modify clock value of the buffer block when the cursor position
+  was stored */
+  ib_uint64_t modify_clock= 0;
+  /** btr_pcur_store_position() and restore_position() state. */
+  enum pcur_pos_t pos_state= BTR_PCUR_NOT_POSITIONED;
+  page_cur_mode_t search_mode= PAGE_CUR_UNSUPP;
+  /** the transaction, if we know it; otherwise this field is not defined;
+  can ONLY BE USED in error prints in fatal assertion failures! */
+  trx_t *trx_if_known= nullptr;
+  /** a dynamically allocated buffer for old_rec */
+  byte *old_rec_buf= nullptr;
+  /** old_rec_buf size if old_rec_buf is not NULL */
+  ulint buf_size= 0;
+
+  /** Return the index of this persistent cursor */
+  dict_index_t *index() const { return(btr_cur.index()); }
+  MY_ATTRIBUTE((nonnull, warn_unused_result))
+  /** Restores the stored position of a persistent cursor buffer-fixing
+  the page and obtaining the specified latches. If the cursor position
+  was saved when the
+  (1) cursor was positioned on a user record: this function restores the
+  position to the last record LESS OR EQUAL to the stored record;
+  (2) cursor was positioned on a page infimum record: restores the
+  position to the last record LESS than the user record which was the
+  successor of the page infimum;
+  (3) cursor was positioned on the page supremum: restores to the first
+  record GREATER than the user record which was the predecessor of the
+  supremum.
+  (4) cursor was positioned before the first or after the last in an
+  empty tree: restores to before first or after the last in the tree.
+  @param latch_mode BTR_SEARCH_LEAF, ...
+  @param mtr mini-transaction
+  @retval SAME_ALL cursor position is on user rec and points on
+  the record with the same field values as in the stored record,
+  @retval SAME_UNIQ cursor position is on user rec and points on the
+  record with the same unique field values as in the stored record,
+  @retval NOT_SAME cursor position is not on user rec or points on a
+  record whose unique field values differ from those in the stored record
+  @retval CORRUPTED if the index is corrupted */
+  restore_status restore_position(btr_latch_mode latch_mode, mtr_t *mtr);
+
+  /** Open the cursor on the first or last record.
+  @param first true=first record, false=last record
+  @param index B-tree
+  @param latch_mode which latches to acquire
+  @param mtr mini-transaction
+  @return error code */
+  dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+                    mtr_t *mtr)
+  {
+    this->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+    search_mode= first ? PAGE_CUR_G : PAGE_CUR_L;
+    pos_state= BTR_PCUR_IS_POSITIONED;
+    old_rec= nullptr;
+
+    return btr_cur.open_leaf(first, index, this->latch_mode, mtr);
+  }
+};
+
+inline buf_block_t *btr_pcur_get_block(btr_pcur_t *cursor)
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  return cursor->btr_cur.page_cur.block;
+}
+
+inline const buf_block_t *btr_pcur_get_block(const btr_pcur_t *cursor)
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  return cursor->btr_cur.page_cur.block;
+}
+
+inline rec_t *btr_pcur_get_rec(const btr_pcur_t *cursor)
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+  return cursor->btr_cur.page_cur.rec;
+}
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. */
+inline
+dberr_t
+btr_pcur_open(
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_LE, ... */
+	btr_latch_mode	latch_mode,/*!< in: BTR_SEARCH_LEAF, ...
*/ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + cursor->search_mode= mode; + cursor->pos_state= BTR_PCUR_IS_POSITIONED; + cursor->trx_if_known= nullptr; + return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr); +} + +/** Open a cursor on the first user record satisfying the search condition; +in case of no match, after the last index record. */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +inline +dberr_t +btr_pcur_open_on_user_rec( + const dtuple_t* tuple, /*!< in: tuple on which search done */ + btr_latch_mode latch_mode, /*!< in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent + cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF); + if (dberr_t err= + btr_pcur_open(tuple, PAGE_CUR_GE, latch_mode, cursor, mtr)) + return err; + if (!btr_pcur_is_after_last_on_page(cursor) || + btr_pcur_is_after_last_in_tree(cursor)) + return DB_SUCCESS; + if (dberr_t err= btr_pcur_move_to_next_page(cursor, mtr)) + return err; + return btr_pcur_move_to_next_on_page(cursor) ? DB_SUCCESS : DB_CORRUPTION; +} + +#include "btr0pcur.inl" diff --git a/storage/innobase/include/btr0pcur.inl b/storage/innobase/include/btr0pcur.inl new file mode 100644 index 00000000..b827d70d --- /dev/null +++ b/storage/innobase/include/btr0pcur.inl @@ -0,0 +1,372 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0pcur.ic +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + + +/*********************************************************//** +Gets the rel_pos field for a cursor whose position has been stored. +@return BTR_PCUR_ON, ... */ +UNIV_INLINE +ulint +btr_pcur_get_rel_pos( +/*=================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor); + ut_ad(cursor->old_rec); + ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED + || cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(cursor->rel_pos); +} + +/**************************************************************//** +Gets the up_match value for a pcur after a search. 
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	const btr_cur_t*	btr_cursor;
+
+	ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+	      || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+	btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+	ut_ad(btr_cursor->up_match != ULINT_UNDEFINED);
+
+	return(btr_cursor->up_match);
+}
+
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the left if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	const btr_cur_t*	btr_cursor;
+
+	ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+	      || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+	btr_cursor = btr_pcur_get_btr_cur(cursor);
+	ut_ad(btr_cursor->low_match != ULINT_UNDEFINED);
+
+	return(btr_cursor->low_match);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	return !btr_pcur_is_before_first_on_page(cursor) &&
+	  !btr_pcur_is_after_last_on_page(cursor);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return !page_has_prev(btr_pcur_get_page(cursor))
+	  && page_cur_is_before_first(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return !page_has_next(btr_pcur_get_page(cursor))
+	  && page_cur_is_after_last(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor)	/*!< in/out: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	cursor->old_rec = nullptr;
+	return page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor)	/*!< in/out: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	cursor->old_rec = nullptr;
+
+	return page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	cursor->old_rec = nullptr;
+loop:
+	if (btr_pcur_is_after_last_on_page(cursor)) {
+		if (btr_pcur_is_after_last_in_tree(cursor)
+		    || btr_pcur_move_to_next_page(cursor, mtr) != DB_SUCCESS) {
+			return(FALSE);
+		}
+	} else if (UNIV_UNLIKELY(!btr_pcur_move_to_next_on_page(cursor))) {
+		return false;
+	}
+
+	if (btr_pcur_is_on_user_rec(cursor)) {
+
+		return(TRUE);
+	}
+
+	goto loop;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+  cursor->old_rec= nullptr;
+
+  if (btr_pcur_is_after_last_on_page(cursor))
+    return !btr_pcur_is_after_last_in_tree(cursor) &&
+      btr_pcur_move_to_next_page(cursor, mtr) == DB_SUCCESS;
+  else
+    return !!btr_pcur_move_to_next_on_page(cursor);
+}
+
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of the cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+	btr_pcur_t*	pcur,	/*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr to commit */
+{
+	ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+	pcur->latch_mode = BTR_NO_LATCHES;
+
+	mtr_commit(mtr);
+
+	pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used for both cursors before
+calling this, if restoration of the cursors is wanted later.
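+
+A minimal usage sketch (both positions are stored first so that the
+cursors can be restored in a later mini-transaction):
+@code
+	btr_pcur_store_position(pcur, mtr);
+	btr_pcur_store_position(sec_pcur, mtr);
+	btr_pcurs_commit_specify_mtr(pcur, sec_pcur, mtr);
+	// ... later:
+	mtr->start();
+	pcur->restore_position(BTR_SEARCH_LEAF, mtr);
+@endcode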
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursors become detached.
+Function btr_pcur_store_position should be used for both cursors before
+calling this, if restoration of the cursors is wanted later.
+@param[in]	pcur		persistent cursor
+@param[in]	sec_pcur	secondary index persistent cursor
+@param[in]	mtr		mtr to commit */
+UNIV_INLINE
+void
+btr_pcurs_commit_specify_mtr(
+	btr_pcur_t*	pcur,
+	btr_pcur_t*	sec_pcur,
+	mtr_t*		mtr)
+{
+	ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(sec_pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+	pcur->latch_mode = BTR_NO_LATCHES;
+	sec_pcur->latch_mode = BTR_NO_LATCHES;
+
+	mtr_commit(mtr);
+
+	pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+	sec_pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Sets the old_rec_buf, old_rec and rtr_info fields to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+	btr_pcur_t*	pcur)	/*!< in: persistent cursor */
+{
+	pcur->old_rec_buf = NULL;
+	pcur->old_rec = NULL;
+
+	pcur->btr_cur.rtr_info = NULL;
+}
+
+/** Opens a persistent cursor to an index tree without initializing the
+cursor.
+@param tuple      tuple on which the search is done
+@param mode       search mode; NOTE that if the search is made using a
+                  unique prefix of a record, mode should be PAGE_CUR_LE, not
+                  PAGE_CUR_GE, as the latter may end up on the previous page of
+                  the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param cursor     memory buffer for persistent cursor
+@param mtr        mini-transaction
+@return DB_SUCCESS on success or error code otherwise. */
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+                                   btr_latch_mode latch_mode,
+                                   btr_pcur_t *cursor, mtr_t *mtr)
+{
+  cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode);
+  cursor->search_mode= mode;
+  cursor->pos_state= BTR_PCUR_IS_POSITIONED;
+  cursor->trx_if_known= nullptr;
+  return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr);
+}
+
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by mtr_t::commit(). */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+	btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	ut_free(cursor->old_rec_buf);
+
+	if (cursor->btr_cur.rtr_info)
+		rtr_clean_rtr_info(cursor->btr_cur.rtr_info, true);
+
+	cursor->btr_cur.rtr_info= nullptr;
+	cursor->old_rec = nullptr;
+	cursor->old_rec_buf = nullptr;
+	cursor->btr_cur.page_cur.rec = nullptr;
+	cursor->btr_cur.page_cur.block = nullptr;
+
+	cursor->latch_mode = BTR_NO_LATCHES;
+	cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+
+	cursor->trx_if_known = nullptr;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page.
*/ +UNIV_INLINE +void +btr_pcur_move_before_first_on_page( +/*===============================*/ + btr_pcur_t* cursor) /*!< in/out: persistent cursor */ +{ + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_set_before_first(btr_pcur_get_block(cursor), + btr_pcur_get_page_cur(cursor)); + + cursor->old_rec = nullptr; +} diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h new file mode 100644 index 00000000..b75cad10 --- /dev/null +++ b/storage/innobase/include/btr0sea.h @@ -0,0 +1,403 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0sea.h +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#ifndef btr0sea_h +#define btr0sea_h + +#include "dict0dict.h" +#ifdef BTR_CUR_HASH_ADAPT +#include "ha0ha.h" +#include "srw_lock.h" + +#ifdef UNIV_PFS_RWLOCK +extern mysql_pfs_key_t btr_search_latch_key; +#endif /* UNIV_PFS_RWLOCK */ + +#define btr_search_sys_create() btr_search_sys.create() +#define btr_search_sys_free() btr_search_sys.free() + +/** Disable the adaptive hash search system and empty the index. */ +void btr_search_disable(); + +/** Enable the adaptive hash search system. +@param resize whether buf_pool_t::resize() is the caller */ +void btr_search_enable(bool resize= false); + +/*********************************************************************//** +Updates the search info. */ +UNIV_INLINE +void +btr_search_info_update( +/*===================*/ + dict_index_t* index, /*!< in: index of the cursor */ + btr_cur_t* cursor);/*!< in: cursor which was just positioned */ + +/** Tries to guess the right search position based on the hash search info +of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, +and the function returns TRUE, then cursor->up_match and cursor->low_match +both have sensible values. +@param[in,out] index index +@param[in,out] info index search info +@param[in] tuple logical record +@param[in] mode PAGE_CUR_L, .... +@param[in] latch_mode BTR_SEARCH_LEAF, ... +@param[out] cursor tree cursor +@param[in] mtr mini-transaction +@return whether the search succeeded */ +bool +btr_search_guess_on_hash( + dict_index_t* index, + btr_search_t* info, + const dtuple_t* tuple, + ulint mode, + ulint latch_mode, + btr_cur_t* cursor, + mtr_t* mtr); + +/** Move or delete hash entries for moved records, usually in a page split. +If new_block is already hashed, then any hash index for block is dropped. 
+If new_block is not hashed, and block is hashed, then a new hash index is +built to new_block with the same parameters as block. +@param[in,out] new_block destination page +@param[in,out] block source page (subject to deletion later) */ +void +btr_search_move_or_delete_hash_entries( + buf_block_t* new_block, + buf_block_t* block); + +/** Drop any adaptive hash index entries that point to an index page. +@param[in,out] block block containing index page, s- or x-latched, or an + index page for which we know that + block->buf_fix_count == 0 or it is an index page which + has already been removed from the buf_pool.page_hash + i.e.: it is in state BUF_BLOCK_REMOVE_HASH +@param[in] garbage_collect drop ahi only if the index is marked + as freed */ +void btr_search_drop_page_hash_index(buf_block_t* block, + bool garbage_collect); + +/** Drop possible adaptive hash index entries when a page is evicted +from the buffer pool or freed in a file, or the index is being dropped. +@param[in] page_id page id */ +void btr_search_drop_page_hash_when_freed(const page_id_t page_id); + +/** Updates the page hash index when a single record is inserted on a page. +@param[in] cursor cursor which was positioned to the place to insert + using btr_cur_search_, and the new record has been + inserted next to the cursor. +@param[in] ahi_latch the adaptive hash index latch */ +void btr_search_update_hash_node_on_insert(btr_cur_t *cursor, + srw_spin_lock *ahi_latch); + +/** Updates the page hash index when a single record is inserted on a page. +@param[in,out] cursor cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor +@param[in] ahi_latch the adaptive hash index latch */ +void btr_search_update_hash_on_insert(btr_cur_t *cursor, + srw_spin_lock *ahi_latch); + +/** Updates the page hash index when a single record is deleted from a page. +@param[in] cursor cursor which was positioned on the record to delete + using btr_cur_search_, the record is not yet deleted.*/ +void btr_search_update_hash_on_delete(btr_cur_t *cursor); + +/** Validates the search system. +@param thd connection, for checking if CHECK TABLE has been killed +@return true if ok */ +bool btr_search_validate(THD *thd); + +/** Lock all search latches in exclusive mode. */ +static inline void btr_search_x_lock_all(); + +/** Unlock all search latches from exclusive mode. */ +static inline void btr_search_x_unlock_all(); + +/** Lock all search latches in shared mode. */ +static inline void btr_search_s_lock_all(); + +/** Unlock all search latches from shared mode. */ +static inline void btr_search_s_unlock_all(); + +# ifdef UNIV_DEBUG +/** @return if the index is marked as freed */ +bool btr_search_check_marked_free_index(const buf_block_t *block); +# endif /* UNIV_DEBUG */ +#else /* BTR_CUR_HASH_ADAPT */ +# define btr_search_sys_create() +# define btr_search_sys_free() +# define btr_search_drop_page_hash_index(block, garbage_collect) +# define btr_search_s_lock_all(index) +# define btr_search_s_unlock_all(index) +# define btr_search_info_update(index, cursor) +# define btr_search_move_or_delete_hash_entries(new_block, block) +# define btr_search_update_hash_on_insert(cursor, ahi_latch) +# define btr_search_update_hash_on_delete(cursor) +# ifdef UNIV_DEBUG +# define btr_search_check_marked_free_index(block) +# endif /* UNIV_DEBUG */ +#endif /* BTR_CUR_HASH_ADAPT */ + +#ifdef BTR_CUR_ADAPT +/** Create and initialize search info. 
+@param[in,out]	heap		heap where created
+@return own: search info struct */
+static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** @return the search info of an index */
+static inline btr_search_t* btr_search_get_info(dict_index_t* index)
+{
+	return(index->search_info);
+}
+#endif /* BTR_CUR_ADAPT */
+
+/** The search info struct in an index */
+struct btr_search_t{
+	/* @{ The following fields are not protected by any latch.
+	Unfortunately, this means that they must be aligned to
+	the machine word, i.e., they cannot be turned into bit-fields. */
+	buf_block_t* root_guess;/*!< the root page frame when it was last time
+				fetched, or NULL */
+#ifdef BTR_CUR_HASH_ADAPT
+	ulint	hash_analysis;	/*!< when this exceeds
+				BTR_SEARCH_HASH_ANALYSIS, the hash
+				analysis starts; this is reset if no
+				success noticed */
+	ibool	last_hash_succ;	/*!< TRUE if the last search would have
+				succeeded, or did succeed, using the hash
+				index; NOTE that the value here is not exact:
+				it is not calculated for every search, and the
+				calculation itself is not always accurate! */
+	ulint	n_hash_potential;
+				/*!< number of consecutive searches
+				which would have succeeded, or did succeed,
+				using the hash index;
+				the range is 0 .. BTR_SEARCH_BUILD_LIMIT + 5 */
+	/* @} */
+	ulint	ref_count;	/*!< Number of blocks in this index tree
+				that have search index built
+				i.e. block->index points to this index.
+				Protected by search latch except
+				when during initialization in
+				btr_search_info_create(). */
+
+	/*---------------------- @{ */
+	uint16_t n_fields;	/*!< recommended prefix length for hash search:
+				number of full fields */
+	uint16_t n_bytes;	/*!< recommended prefix: number of bytes in
+				an incomplete field
+				@see BTR_PAGE_MAX_REC_SIZE */
+	bool	left_side;	/*!< true or false, depending on whether
+				the leftmost record of several records with
+				the same prefix should be indexed in the
+				hash index */
+	/*---------------------- @} */
+#ifdef UNIV_SEARCH_PERF_STAT
+	ulint	n_hash_succ;	/*!< number of successful hash searches thus
+				far */
+	ulint	n_hash_fail;	/*!< number of failed hash searches */
+	ulint	n_patt_succ;	/*!< number of successful pattern searches thus
+				far */
+	ulint	n_searches;	/*!< number of searches */
+#endif /* UNIV_SEARCH_PERF_STAT */
+#endif /* BTR_CUR_HASH_ADAPT */
+#ifdef UNIV_DEBUG
+	ulint	magic_n;	/*!< magic number @see BTR_SEARCH_MAGIC_N */
+/** value of btr_search_t::magic_n, used in assertions */
+# define BTR_SEARCH_MAGIC_N	1112765
+#endif /* UNIV_DEBUG */
+};
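+
+/* An illustrative sketch (not upstream code) of how the recommendation in
+btr_search_t is consumed: the adaptive hash index hashes only the recommended
+record prefix. It assumes the historical dtuple_fold() interface
+(tuple, n_fields, n_bytes, index_id) that the partition comment below refers
+to:
+@code
+	const btr_search_t*	info = btr_search_get_info(index);
+	// fold info->n_fields complete fields plus info->n_bytes bytes
+	// of the next field of the search tuple
+	ulint	fold = dtuple_fold(tuple, info->n_fields, info->n_bytes,
+				   index->id);
+	// 'fold' selects a cell in one btr_search_sys_t partition (below)
+@endcode */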
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** The hash index system */
+struct btr_search_sys_t
+{
+  /** Partition of the hash table */
+  struct partition
+  {
+    /** latch protecting table */
+    srw_spin_lock latch;
+    /** mapping of dtuple_fold() to rec_t* in buf_block_t::frame */
+    hash_table_t table;
+    /** memory heap for table */
+    mem_heap_t *heap;
+
+#ifdef _MSC_VER
+#pragma warning(push)
+// nonstandard extension - zero sized array, if perfschema is not compiled
+#pragma warning(disable : 4200)
+#endif
+
+    char pad[(CPU_LEVEL1_DCACHE_LINESIZE - sizeof latch -
+              sizeof table - sizeof heap) &
+             (CPU_LEVEL1_DCACHE_LINESIZE - 1)];
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+    void init()
+    {
+      memset((void*) this, 0, sizeof *this);
+      latch.SRW_LOCK_INIT(btr_search_latch_key);
+    }
+
+    void alloc(ulint hash_size)
+    {
+      table.create(hash_size);
+      heap= mem_heap_create_typed(std::min<ulong>(4096,
+                                                  MEM_MAX_ALLOC_IN_BUF / 2
+                                                  - MEM_BLOCK_HEADER_SIZE
+                                                  - MEM_SPACE_NEEDED(0)),
+                                  MEM_HEAP_FOR_BTR_SEARCH);
+    }
+
+    void clear()
+    {
+      mem_heap_free(heap);
+      heap= nullptr;
+      ut_free(table.array);
+    }
+
+    void free()
+    {
+      latch.destroy();
+      if (heap)
+        clear();
+    }
+  };
+
+  /** Partitions of the adaptive hash index */
+  partition *parts;
+
+  /** Get an adaptive hash index partition */
+  partition *get_part(index_id_t id, ulint space_id) const
+  {
+    return parts + ut_fold_ulint_pair(ulint(id), space_id) % btr_ahi_parts;
+  }
+
+  /** Get an adaptive hash index partition */
+  partition *get_part(const dict_index_t &index) const
+  {
+    ut_ad(!index.table->space ||
+          index.table->space->id == index.table->space_id);
+    return get_part(ulint(index.id), index.table->space_id);
+  }
+
+  /** Get the search latch for the adaptive hash index partition */
+  srw_spin_lock *get_latch(const dict_index_t &index) const
+  { return &get_part(index)->latch; }
+
+  /** Create and initialize at startup */
+  void create()
+  {
+    parts= static_cast<partition*>(ut_malloc(btr_ahi_parts * sizeof *parts,
+                                             mem_key_ahi));
+    for (ulong i= 0; i < btr_ahi_parts; ++i)
+      parts[i].init();
+    if (btr_search_enabled)
+      btr_search_enable();
+  }
+
+  void alloc(ulint hash_size)
+  {
+    hash_size/= btr_ahi_parts;
+    for (ulong i= 0; i < btr_ahi_parts; ++i)
+      parts[i].alloc(hash_size);
+  }
+
+  /** Clear when disabling the adaptive hash index */
+  void clear() { for (ulong i= 0; i < btr_ahi_parts; ++i) parts[i].clear(); }
+
+  /** Free at shutdown */
+  void free()
+  {
+    if (parts)
+    {
+      for (ulong i= 0; i < btr_ahi_parts; ++i)
+        parts[i].free();
+      ut_free(parts);
+      parts= nullptr;
+    }
+  }
+};
+
+/** The adaptive hash index */
+extern btr_search_sys_t btr_search_sys;
+
+/** @return number of leaf pages pointed to by the adaptive hash index */
+TRANSACTIONAL_INLINE inline ulint dict_index_t::n_ahi_pages() const
+{
+  if (!btr_search_enabled)
+    return 0;
+  srw_spin_lock *latch= &btr_search_sys.get_part(*this)->latch;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+  if (xbegin())
+  {
+    if (latch->is_locked())
+      xabort();
+    ulint ref_count= search_info->ref_count;
+    xend();
+    return ref_count;
+  }
+#endif
+  latch->rd_lock(SRW_LOCK_CALL);
+  ulint ref_count= search_info->ref_count;
+  latch->rd_unlock();
+  return ref_count;
+}
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+extern ulint btr_search_n_succ;
+/** Number of failed adaptive hash index lookups */
+extern ulint btr_search_n_hash_fail;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** After change in n_fields or n_bytes in info, this many rounds are waited
+before starting the hash analysis again: this is to save CPU time when there
+is no hope in building a hash index. */
+#define BTR_SEARCH_HASH_ANALYSIS	17
+
+/** Limit of consecutive searches for trying a search shortcut on the search
+pattern */
+#define BTR_SEARCH_ON_PATTERN_LIMIT	3
+
+/** Limit of consecutive searches for trying a search shortcut using
+the hash index */
+#define BTR_SEARCH_ON_HASH_LIMIT	3
+
+/** We do this many searches before trying to keep the search latch
+over calls from MySQL. If we notice someone waiting for the latch, we
+again set this much timeout. This is to reduce contention. */
+#define BTR_SEA_TIMEOUT			10000
+#endif /* BTR_CUR_HASH_ADAPT */
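+
+/* An illustrative sketch (not upstream code) of how a lookup picks its
+adaptive hash index partition and latch via btr_search_sys_t above; each
+(index id, space id) pair is folded onto one of the btr_ahi_parts
+partitions, so unrelated indexes contend on different latches:
+@code
+	btr_search_sys_t::partition*	part= btr_search_sys.get_part(*index);
+	part->latch.rd_lock(SRW_LOCK_CALL);
+	// ... probe part->table with the fold value of the search tuple ...
+	part->latch.rd_unlock();
+@endcode */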
+
+#include "btr0sea.inl"
+
+#endif
diff --git a/storage/innobase/include/btr0sea.inl b/storage/innobase/include/btr0sea.inl
new file mode 100644
index 00000000..5a8d6480
--- /dev/null
+++ b/storage/innobase/include/btr0sea.inl
@@ -0,0 +1,117 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.ic
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "dict0mem.h"
+#include "btr0cur.h"
+#include "buf0buf.h"
+
+/** Create and initialize search info.
+@param[in,out]	heap		heap where created
+@return own: search info struct */
+static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
+{
+	btr_search_t*	info = static_cast<btr_search_t*>(
+		mem_heap_zalloc(heap, sizeof(btr_search_t)));
+	ut_d(info->magic_n = BTR_SEARCH_MAGIC_N);
+#ifdef BTR_CUR_HASH_ADAPT
+	info->n_fields = 1;
+	info->left_side = TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+	return(info);
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Updates the search info.
+@param[in,out]	info	search info
+@param[in,out]	cursor	cursor which was just positioned */
+void btr_search_info_update_slow(btr_search_t *info, btr_cur_t *cursor);
+
+/*********************************************************************//**
+Updates the search info. */
+static inline
+void
+btr_search_info_update(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index of the cursor */
+	btr_cur_t*	cursor)	/*!< in: cursor which was just positioned */
+{
+	ut_ad(!index->is_spatial());
+	ut_ad(!index->table->is_temporary());
+
+	if (!btr_search_enabled) {
+		return;
+	}
+
+	btr_search_t*	info;
+	info = btr_search_get_info(index);
+
+	info->hash_analysis++;
+
+	if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) {
+
+		/* Do nothing */
+
+		return;
+
+	}
+
+	ut_ad(cursor->flag != BTR_CUR_HASH);
+
+	btr_search_info_update_slow(info, cursor);
+}
+
+/** Lock all search latches in exclusive mode. */
+static inline void btr_search_x_lock_all()
+{
+	for (ulint i = 0; i < btr_ahi_parts; ++i) {
+		btr_search_sys.parts[i].latch.wr_lock(SRW_LOCK_CALL);
+	}
+}
+
+/** Unlock all search latches from exclusive mode. */
+static inline void btr_search_x_unlock_all()
+{
+	for (ulint i = 0; i < btr_ahi_parts; ++i) {
+		btr_search_sys.parts[i].latch.wr_unlock();
+	}
+}
+
+/** Lock all search latches in shared mode. */
+static inline void btr_search_s_lock_all()
+{
+	for (ulint i = 0; i < btr_ahi_parts; ++i) {
+		btr_search_sys.parts[i].latch.rd_lock(SRW_LOCK_CALL);
+	}
+}
+
+/** Unlock all search latches from shared mode. */
+static inline void btr_search_s_unlock_all()
+{
+	for (ulint i = 0; i < btr_ahi_parts; ++i) {
+		btr_search_sys.parts[i].latch.rd_unlock();
+	}
+}
+#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
new file mode 100644
index 00000000..fc829e78
--- /dev/null
+++ b/storage/innobase/include/btr0types.h
@@ -0,0 +1,154 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0types.h
+The index tree general types
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#pragma once
+
+#include "page0types.h"
+#include "rem0types.h"
+
+/** Persistent cursor */
+struct btr_pcur_t;
+/** B-tree cursor */
+struct btr_cur_t;
+/** B-tree search information for the adaptive hash index */
+struct btr_search_t;
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Whether the adaptive hash index search system is enabled.
+The search system is protected by an array of latches. */
+extern char btr_search_enabled;
+
+/** Number of adaptive hash index partitions. */
+extern ulong btr_ahi_parts;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/** The size of a reference to data stored on a different page.
+The reference is stored at the end of the prefix of the field
+in the index record. */
+#define FIELD_REF_SIZE			20U
+#define BTR_EXTERN_FIELD_REF_SIZE	FIELD_REF_SIZE
+
+/** If the data do not exceed this size, they are stored locally. */
+#define BTR_EXTERN_LOCAL_STORED_MAX_SIZE	\
+	(BTR_EXTERN_FIELD_REF_SIZE * 2)
+
+/** Latching modes for btr_cur_t::search_leaf(). */
+enum btr_latch_mode {
+	/** Search a record on a leaf page and S-latch it. */
+	BTR_SEARCH_LEAF = RW_S_LATCH,
+	/** (Prepare to) modify a record on a leaf page and X-latch it. */
+	BTR_MODIFY_LEAF = RW_X_LATCH,
+	/** U-latch root and X-latch a leaf page */
+	BTR_MODIFY_ROOT_AND_LEAF = RW_SX_LATCH,
+	/** Obtain no latches. */
+	BTR_NO_LATCHES = RW_NO_LATCH,
+	/** Search the previous record.
+	Used in btr_pcur_move_backward_from_page(). */
+	BTR_SEARCH_PREV = 4 | BTR_SEARCH_LEAF,
+	/** Modify the previous record.
+	Used in btr_pcur_move_backward_from_page() and ibuf_insert(). */
+	BTR_MODIFY_PREV = 4 | BTR_MODIFY_LEAF,
+	/** Start modifying the entire B-tree. */
+	BTR_MODIFY_TREE = 8 | BTR_MODIFY_LEAF,
+	/** Continue modifying the entire R-tree.
+	Only used by rtr_search_to_nth_level(). */
+	BTR_CONT_MODIFY_TREE = 4 | BTR_MODIFY_TREE,
+
+	/* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually
+	exclusive. */
+	/** The search tuple will be inserted to the secondary index
+	at the searched position. When the leaf page is not in the
+	buffer pool, try to use the change buffer. */
+	BTR_INSERT = 64,
+
+	/** Try to delete mark a secondary index leaf page record at
+	the searched position using the change buffer when the page is
+	not in the buffer pool. */
+	BTR_DELETE_MARK = 128,
+
+	/** Try to purge the record using the change buffer when the
+	secondary index leaf page is not in the buffer pool. */
+	BTR_DELETE = BTR_INSERT | BTR_DELETE_MARK,
+
+	/** The caller is already holding dict_index_t::lock S-latch. */
+	BTR_ALREADY_S_LATCHED = 256,
+	/** Search and S-latch a leaf page, assuming that the
+	dict_index_t::lock S-latch is being held. */
+	BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF
+	| BTR_ALREADY_S_LATCHED,
+	/** Search and X-latch a leaf page, assuming that the
+	dict_index_t::lock is being held in non-exclusive mode. */
+	BTR_MODIFY_LEAF_ALREADY_LATCHED = BTR_MODIFY_LEAF
+	| BTR_ALREADY_S_LATCHED,
+	/** Attempt to modify records in an x-latched tree. */
+	BTR_MODIFY_TREE_ALREADY_LATCHED = BTR_MODIFY_TREE
+	| BTR_ALREADY_S_LATCHED,
+	/** U-latch root and X-latch a leaf page, assuming that
+	dict_index_t::lock is being held in U mode. */
+	BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED = BTR_MODIFY_ROOT_AND_LEAF
+	| BTR_ALREADY_S_LATCHED,
+
+	/** Attempt to delete-mark a secondary index record. */
+	BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK,
+	/** Attempt to delete-mark a secondary index record
+	while holding the dict_index_t::lock S-latch. */
+	BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF
+	| BTR_ALREADY_S_LATCHED,
+	/** Attempt to purge a secondary index record. */
+	BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE,
+	/** Attempt to purge a secondary index record
+	while holding the dict_index_t::lock S-latch. */
+	BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF
+	| BTR_ALREADY_S_LATCHED,
+
+	/** In the case of BTR_MODIFY_TREE, the caller specifies
+	the intention to delete record only. It is used to optimize
+	block->lock range.*/
+	BTR_LATCH_FOR_DELETE = 512,
+
+	/** In the case of BTR_MODIFY_TREE, the caller specifies
+	the intention to insert record only. It is used to optimize
+	block->lock range.*/
+	BTR_LATCH_FOR_INSERT = 1024,
+
+	/** Attempt to delete a record in the tree. */
+	BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+	/** Attempt to delete a record in an x-latched tree. */
+	BTR_PURGE_TREE_ALREADY_LATCHED = BTR_PURGE_TREE
+	| BTR_ALREADY_S_LATCHED,
+
+	/** Attempt to insert a record into the tree. */
+	BTR_INSERT_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT,
+
+	/** This flag ORed to BTR_INSERT says that we can ignore possible
+	UNIQUE definition on secondary indexes when we decide if we can use
+	the insert buffer to speed up inserts */
+	BTR_IGNORE_SEC_UNIQUE = 2048,
+	/** Rollback in spatial index */
+	BTR_RTREE_UNDO_INS = 4096,
+	/** Try to delete mark a spatial index record */
+	BTR_RTREE_DELETE_MARK = 8192
+};
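+
+/* A worked example (illustrative only) of how these latch-mode bits compose:
+@code
+	BTR_PURGE_LEAF == BTR_MODIFY_LEAF | BTR_DELETE
+	               == RW_X_LATCH | BTR_INSERT | BTR_DELETE_MARK
+@endcode
+The low bits carry the base latching mode, and helpers such as
+BTR_LATCH_MODE_WITHOUT_INTENTION() (used in btr0pcur.inl above) strip the
+intention flags to recover it. */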
diff --git a/storage/innobase/include/buf0block_hint.h b/storage/innobase/include/buf0block_hint.h
new file mode 100644
index 00000000..d4fee7c1
--- /dev/null
+++ b/storage/innobase/include/buf0block_hint.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License, version 2.0, as published by the
+Free Software Foundation.
+
+This program is also distributed with certain software (including but not
+limited to OpenSSL) that is licensed under separate terms, as designated in a
+particular file or component or in included license documentation. The authors
+of MySQL hereby grant you an additional permission to link the program and
+your derivative works with the separately licensed software that they have
+included with MySQL.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
+for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+#pragma once
+#include "buf0buf.h"
+
+namespace buf {
+class Block_hint {
+public:
+  /** Stores the pointer to the block, which is currently buffer-fixed.
+  @param block a pointer to a buffer-fixed block to be stored */
+  inline void store(buf_block_t *block)
+  {
+    ut_ad(block->page.buf_fix_count());
+    m_block= block;
+    m_page_id= block->page.id();
+  }
+
+  /** Clears currently stored pointer. */
+  inline void clear() { m_block= nullptr; }
+
+  /** Invoke f on m_block (which may be null)
+  @param f The function to be executed. It will be passed the pointer.
+  If you wish to use the block pointer subsequently,
+  you need to ensure you buffer-fix it before returning from f.
+  @return the return value of f
+  */
+  template <typename F>
+  bool run_with_hint(const F &f)
+  {
+    buffer_fix_block_if_still_valid();
+    /* m_block could be changed during f() call, so we use local
+    variable to remember which block we need to unfix */
+    buf_block_t *block= m_block;
+    bool res= f(block);
+    if (block)
+      block->page.unfix();
+    return res;
+  }
+
+  buf_block_t *block() const { return m_block; }
+
+ private:
+  /** The block pointer stored by store(). */
+  buf_block_t *m_block= nullptr;
+  /** If m_block is non-null, the m_block->page.id at time it was stored. */
+  page_id_t m_page_id{0, 0};
+
+  /** A helper function which checks if m_block is not a dangling pointer and
+  still points to block with page with m_page_id and if so, buffer-fixes it,
+  otherwise clear()s it */
+  void buffer_fix_block_if_still_valid();
+};
+} // namespace buf
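+
+/* An illustrative sketch (not upstream code) of the intended use of
+buf::Block_hint: cache a pointer to a buffer-fixed block and revalidate it
+on the next access, falling back to a normal lookup when it went stale:
+@code
+	buf::Block_hint	hint;
+	hint.store(block);		// block is buffer-fixed here
+	// ... later, after the block may have been evicted ...
+	hint.run_with_hint([&](buf_block_t *b) {
+		if (!b)
+			return false;	// stale hint: do a regular page get
+		// b is buffer-fixed for the duration of this callback
+		return true;
+	});
+@endcode */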
diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h
new file mode 100644
index 00000000..bb999420
--- /dev/null
+++ b/storage/innobase/include/buf0buddy.h
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.h
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifndef buf0buddy_h
+#define buf0buddy_h
+
+#include "buf0types.h"
+
+/**
+@param[in]	size	block size in bytes
+@return index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+inline
+ulint
+buf_buddy_get_slot(ulint size)
+{
+	ulint	i;
+	ulint	s;
+
+	ut_ad(ut_is_2pow(size));
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+	ut_ad(size <= srv_page_size);
+
+	for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
+	}
+	ut_ad(i <= BUF_BUDDY_SIZES);
+	return i;
+}
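+
+/* A worked example (illustrative only), assuming the smallest buddy block
+size BUF_BUDDY_LOW is 1024 bytes: the loop above doubles s until it reaches
+the requested size, so
+@code
+	buf_buddy_get_slot(1024) == 0	// s: 1024
+	buf_buddy_get_slot(2048) == 1	// s: 1024 -> 2048
+	buf_buddy_get_slot(4096) == 2	// s: 1024 -> 2048 -> 4096
+@endcode
+and the result indexes buf_pool.zip_free[], the per-size free list of the
+binary buddy allocator. */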
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param i      index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru    assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+byte *buf_buddy_alloc_low(ulint i, bool *lru) MY_ATTRIBUTE((malloc));
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param size   compressed page size in bytes
+@param lru    assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+inline byte *buf_buddy_alloc(ulint size, bool *lru= nullptr)
+{
+  return buf_buddy_alloc_low(buf_buddy_get_slot(size), lru);
+}
+
+/** Deallocate a block.
+@param[in]	buf	block to be freed, must not be pointed to
+			by the buffer pool
+@param[in]	i	index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i);
+
+/** Deallocate a block.
+@param[in]	buf	block to be freed, must not be pointed to
+			by the buffer pool
+@param[in]	size	block size in bytes */
+inline void buf_buddy_free(void* buf, ulint size)
+{
+	buf_buddy_free_low(buf, buf_buddy_get_slot(size));
+}
+
+/** Try to reallocate a block.
+@param[in]	buf	block to be reallocated, must be pointed
+to by the buffer pool
+@param[in]	size	block size, up to srv_page_size
+@retval false	if failed because of no free blocks. */
+bool buf_buddy_realloc(void* buf, ulint size);
+
+/** Combine all pairs of free buddies. */
+void buf_buddy_condense_free();
+#endif /* buf0buddy_h */
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
new file mode 100644
index 00000000..332b2039
--- /dev/null
+++ b/storage/innobase/include/buf0buf.h
@@ -0,0 +1,2190 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.h
+The database buffer pool high-level routines
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+/** Magic value to use instead of checksums when they are disabled */
+#define BUF_NO_CHECKSUM_MAGIC	0xDEADBEEFUL
+
+#include "fil0fil.h"
+#include "mtr0types.h"
+#include "span.h"
+#include "assume_aligned.h"
+#include "buf0types.h"
+#ifndef UNIV_INNOCHECKSUM
+#include "ut0byte.h"
+#include "page0types.h"
+#include "log0log.h"
+#include "srv0srv.h"
+#include "transactional_lock_guard.h"
+#include <ostream>
+
+/** @name Modes for buf_page_get_gen */
+/* @{ */
+#define BUF_GET			10	/*!< get always */
+#define BUF_GET_IF_IN_POOL	11	/*!< get if in pool */
+#define BUF_PEEK_IF_IN_POOL	12	/*!< get if in pool, do not make
+					the block young in the LRU list */
+#define BUF_GET_IF_IN_POOL_OR_WATCH	15
+					/*!< Get the page only if it's in the
+					buffer pool, if not then set a watch
+					on the page. */
+#define BUF_GET_POSSIBLY_FREED	16
+					/*!< Like BUF_GET, but do not mind
+					if the file page has been freed. */
+/* @} */
+
+/** If LRU list of a buf_pool is less than this size then LRU eviction
+should not happen. This is because when we do LRU flushing we also put
+the blocks on free list. If LRU list is very small then we can end up
+in thrashing. */
+#define BUF_LRU_MIN_LEN		256
+
+/** This structure defines information we will fetch from each buffer pool.
It +will be used to print table IO stats */ +struct buf_pool_info_t +{ + /* General buffer pool info */ + ulint pool_size; /*!< Buffer Pool size in pages */ + ulint lru_len; /*!< Length of buf_pool.LRU */ + ulint old_lru_len; /*!< buf_pool.LRU_old_len */ + ulint free_list_len; /*!< Length of buf_pool.free list */ + ulint flush_list_len; /*!< Length of buf_pool.flush_list */ + ulint n_pend_unzip; /*!< buf_pool.n_pend_unzip, pages + pending decompress */ + ulint n_pend_reads; /*!< os_aio_pending_reads() */ + ulint n_pending_flush_lru; /*!< Pages pending flush in LRU */ + ulint n_pending_flush_list; /*!< Pages pending flush in FLUSH + LIST */ + ulint n_pages_made_young; /*!< number of pages made young */ + ulint n_pages_not_made_young; /*!< number of pages not made young */ + ulint n_pages_read; /*!< buf_pool.n_pages_read */ + ulint n_pages_created; /*!< buf_pool.n_pages_created */ + ulint n_pages_written; /*!< buf_pool.n_pages_written */ + ulint n_page_gets; /*!< buf_pool.n_page_gets */ + ulint n_ra_pages_read_rnd; /*!< buf_pool.n_ra_pages_read_rnd, + number of pages readahead */ + ulint n_ra_pages_read; /*!< buf_pool.n_ra_pages_read, number + of pages readahead */ + ulint n_ra_pages_evicted; /*!< buf_pool.n_ra_pages_evicted, + number of readahead pages evicted + without access */ + ulint n_page_get_delta; /*!< num of buffer pool page gets since + last printout */ + + /* Buffer pool access stats */ + double page_made_young_rate; /*!< page made young rate in pages + per second */ + double page_not_made_young_rate;/*!< page not made young rate + in pages per second */ + double pages_read_rate; /*!< num of pages read per second */ + double pages_created_rate; /*!< num of pages create per second */ + double pages_written_rate; /*!< num of pages written per second */ + ulint page_read_delta; /*!< num of pages read since last + printout */ + ulint young_making_delta; /*!< num of pages made young since + last printout */ + ulint not_young_making_delta; /*!< num of pages not make young since + last printout */ + + /* Statistics about read ahead algorithm. */ + double pages_readahead_rnd_rate;/*!< random readahead rate in pages per + second */ + double pages_readahead_rate; /*!< readahead rate in pages per + second */ + double pages_evicted_rate; /*!< rate of readahead page evicted + without access, in pages per second */ + + /* Stats about LRU eviction */ + ulint unzip_lru_len; /*!< length of buf_pool.unzip_LRU + list */ + /* Counters for LRU policy */ + ulint io_sum; /*!< buf_LRU_stat_sum.io */ + ulint io_cur; /*!< buf_LRU_stat_cur.io, num of IO + for current interval */ + ulint unzip_sum; /*!< buf_LRU_stat_sum.unzip */ + ulint unzip_cur; /*!< buf_LRU_stat_cur.unzip, num + pages decompressed in current + interval */ +}; +#endif /* !UNIV_INNOCHECKSUM */ + +/** Print the given page_id_t object. +@param[in,out] out the output stream +@param[in] page_id the page_id_t object to be printed +@return the output stream */ +std::ostream& +operator<<( + std::ostream& out, + const page_id_t page_id); + +#ifndef UNIV_INNOCHECKSUM +# define buf_pool_get_curr_size() srv_buf_pool_curr_size + +/** Allocate a buffer block. +@return own: the allocated block, state()==MEMORY */ +inline buf_block_t *buf_block_alloc(); +/********************************************************************//** +Frees a buffer block which does not contain a file page. 
*/ +UNIV_INLINE +void +buf_block_free( +/*===========*/ + buf_block_t* block); /*!< in, own: block to be freed */ + +#define buf_page_get(ID, SIZE, LA, MTR) \ + buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, MTR) + +/** Try to acquire a page latch. +@param rw_latch RW_S_LATCH or RW_X_LATCH +@param block guessed block +@param modify_clock expected value of block->modify_clock +@param mtr mini-transaction +@return whether the latch was acquired (the page is an allocated file page) */ +bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block, + uint64_t modify_clock, mtr_t *mtr); + +/** Try to S-latch a page. +Suitable for using when holding the lock_sys latches (as it avoids deadlock). +@param[in] page_id page identifier +@param[in,out] mtr mini-transaction +@return the block +@retval nullptr if an S-latch cannot be granted immediately */ +buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr); + +/** Get read access to a compressed page (usually of type +FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2). +The page must be released with unfix(). +NOTE: the page is not protected by any latch. Mutual exclusion has to +be implemented at a higher level. In other words, all possible +accesses to a given page through this function must be protected by +the same set of mutexes or latches. +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size in bytes +@return pointer to the block, s-latched */ +buf_page_t *buf_page_get_zip(const page_id_t page_id, ulint zip_size); + +/** Get access to a database page. Buffered redo log may be applied. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in,out] mtr mini-transaction +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge while +reading the pages from file. +@return pointer to the block or NULL */ +buf_block_t* +buf_page_get_gen( + const page_id_t page_id, + ulint zip_size, + ulint rw_latch, + buf_block_t* guess, + ulint mode, + mtr_t* mtr, + dberr_t* err = NULL, + bool allow_ibuf_merge = false) + MY_ATTRIBUTE((nonnull(6), warn_unused_result)); + +/** This is the low level function used to get access to a database page. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in,out] mtr mini-transaction, or NULL if a + block with page_id is to be evicted +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge to happen +while reading the page from file +then it makes sure that it does merging of change buffer changes while +reading the page from file. +@return pointer to the block or NULL */ +buf_block_t* +buf_page_get_low( + const page_id_t page_id, + ulint zip_size, + ulint rw_latch, + buf_block_t* guess, + ulint mode, + mtr_t* mtr, + dberr_t* err, + bool allow_ibuf_merge); + +/** Initialize a page in the buffer pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform to a block a state transition NOT_USED => LRU +(the other is buf_page_get_low()). 
+@param[in,out]	space		space object
+@param[in]	offset		page number within the tablespace
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	mtr		mini-transaction
+@param[in,out]	free_block	pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create(fil_space_t *space, uint32_t offset,
+                ulint zip_size, mtr_t *mtr, buf_block_t *free_block);
+
+/** Initialize a page in buffer pool while initializing the
+deferred tablespace
+@param space_id		space identifier
+@param zip_size		ROW_FORMAT=COMPRESSED page size or 0
+@param mtr		mini-transaction
+@param free_block	pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create_deferred(uint32_t space_id, ulint zip_size, mtr_t *mtr,
+                         buf_block_t *free_block);
+
+/** Move a block to the start of the LRU list. */
+void buf_page_make_young(buf_page_t *bpage);
+/** Mark the page status as FREED for the given tablespace and page number.
+@param[in,out]	space	tablespace
+@param[in]	page	page number
+@param[in,out]	mtr	mini-transaction */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr);
+
+/** Determine if a block is still close enough to the MRU end of the LRU list
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+Note that this is for heuristics only and does not reserve buffer pool
+mutex.
+@param[in]	bpage		buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage);
+
+/** Determine if a block should be moved to the start of the LRU list if
+there is danger of dropping from the buffer pool.
+@param[in]	bpage		buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage);
+
+/** Move a page to the start of the buffer pool LRU list if it is too old.
+@param[in,out]	bpage		buffer pool page */
+inline void buf_page_make_young_if_needed(buf_page_t *bpage)
+{
+	if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage))) {
+		buf_page_make_young(bpage);
+	}
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must (1) own the
+buf_pool.mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+	buf_block_t*	block);	/*!< in: block */
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+	buf_block_t*	block);	/*!< in: block */
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Check if a buffer is all zeroes.
+@param[in]	buf	data to check
+@return whether the buffer is all zeroes */
+bool buf_is_zeroes(st_::span<const byte> buf);
+
+/** Check if a page is corrupt.
+@param check_lsn	whether FIL_PAGE_LSN should be checked
+@param read_buf		database page
+@param fsp_flags	contents of FIL_SPACE_FLAGS
+@return whether the page is corrupted */
+bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf,
+                           uint32_t fsp_flags)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Read the key version from the page. In full crc32 format,
+the key version is stored in bytes 0-3. In other formats, it is
+stored at byte offset 26 (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION).
+@param[in] read_buf database page +@param[in] fsp_flags tablespace flags +@return key version of the page. */ +inline uint32_t buf_page_get_key_version(const byte* read_buf, + uint32_t fsp_flags) +{ + static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "compatibility"); + return fil_space_t::full_crc32(fsp_flags) + ? mach_read_from_4(my_assume_aligned<4>(read_buf)) + : mach_read_from_4(my_assume_aligned<2> + (read_buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)); +} + +/** Read the compression info from the page. In full crc32 format, +compression info is at MSB of page type. In other format, it is +stored in page type. +@param[in] read_buf database page +@param[in] fsp_flags tablespace flags +@return true if page is compressed. */ +inline bool buf_page_is_compressed(const byte* read_buf, uint32_t fsp_flags) +{ + uint16_t page_type= fil_page_get_type(read_buf); + return fil_space_t::full_crc32(fsp_flags) + ? !!(page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) + : page_type == FIL_PAGE_PAGE_COMPRESSED; +} + +/** Get the compressed or uncompressed size of a full_crc32 page. +@param[in] buf page_compressed or uncompressed page +@param[out] comp whether the page could be compressed +@param[out] cr whether the page could be corrupted +@return the payload size in the file page */ +inline uint buf_page_full_crc32_size(const byte* buf, bool* comp, bool* cr) +{ + uint t = fil_page_get_type(buf); + uint page_size = uint(srv_page_size); + + if (!(t & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)) { + return page_size; + } + + t &= ~(1U << FIL_PAGE_COMPRESS_FCRC32_MARKER); + t <<= 8; + + if (t < page_size) { + page_size = t; + if (comp) { + *comp = true; + } + } else if (cr) { + *cr = true; + } + + return page_size; +} + +#ifndef UNIV_INNOCHECKSUM +/** Dump a page to stderr. +@param[in] read_buf database page +@param[in] zip_size compressed page size, or 0 */ +void buf_page_print(const byte* read_buf, ulint zip_size = 0) + ATTRIBUTE_COLD __attribute__((nonnull)); +/********************************************************************//** +Decompress a block. +@return TRUE if successful */ +ibool +buf_zip_decompress( +/*===============*/ + buf_block_t* block, /*!< in/out: block */ + ibool check); /*!< in: TRUE=verify the page checksum */ + +#ifdef UNIV_DEBUG +/** @return the number of latched pages in the buffer pool */ +ulint buf_get_latched_pages_number(); +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Prints info of the buffer i/o. */ +void +buf_print_io( +/*=========*/ + FILE* file); /*!< in: file where to print */ +/** Collect buffer pool metadata. +@param[out] pool_info buffer pool metadata */ +void buf_stats_get_pool_info(buf_pool_info_t *pool_info); + +/** Refresh the statistics used to print per-second averages. */ +void buf_refresh_io_stats(); + +/** Invalidate all pages in the buffer pool. +All pages must be in a replaceable state (not modified or latched). */ +void buf_pool_invalidate(); + +/*======================================================================== +--------------------------- LOWER LEVEL ROUTINES ------------------------- +=========================================================================*/ + +#define buf_block_get_frame(block) (block)->page.frame + +/*********************************************************************//** +Gets the compressed page descriptor corresponding to an uncompressed page +if applicable. */ +#define buf_block_get_page_zip(block) \ + (UNIV_LIKELY_NULL((block)->page.zip.data) ? 
&(block)->page.zip : NULL)
+#define is_buf_block_get_page_zip(block) \
+	UNIV_LIKELY_NULL((block)->page.zip.data)
+
+/** Monitor the buffer page read/write activity, and increment corresponding
+counter value in MONITOR_MODULE_BUF_PAGE.
+@param bpage	buffer page whose read or write was completed
+@param read	true=read, false=write */
+ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read);
+
+/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit,
+if needed.
+@param[in]	size	size in bytes
+@return aligned size */
+ulint
+buf_pool_size_align(
+	ulint	size);
+
+/** Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if the tablespace contains crypt data
+metadata.
+@param page		page frame
+@param fsp_flags	contents of FSP_SPACE_FLAGS
+@return whether the page is encrypted and valid */
+bool buf_page_verify_crypt_checksum(const byte *page, uint32_t fsp_flags);
+
+/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
+@param[in,out]	page	page to update
+@param[in]	size	compressed page size */
+void buf_flush_update_zip_checksum(buf_frame_t* page, ulint size);
+
+/** @brief The temporary memory structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+class buf_tmp_buffer_t
+{
+  /** whether this slot is reserved */
+  std::atomic<bool> reserved;
+public:
+  /** For encryption, the data needs to be copied to a separate buffer
+  before it's encrypted&written. The buffer block itself can be replaced
+  while a write of crypt_buf to file is in progress. */
+  byte *crypt_buf;
+  /** buffer for fil_page_compress(), for flushing page_compressed pages */
+  byte *comp_buf;
+  /** pointer to resulting buffer after encryption or compression;
+  not separately allocated memory */
+  byte *out_buf;
+
+  /** Release the slot */
+  void release() { reserved.store(false, std::memory_order_relaxed); }
+
+  /** Acquire the slot
+  @return whether the slot was acquired */
+  bool acquire() { return !reserved.exchange(true, std::memory_order_relaxed);}
+
+  /** Allocate a buffer for encryption, decryption or decompression. */
+  void allocate()
+  {
+    if (!crypt_buf)
+      crypt_buf= static_cast<byte*>
+        (aligned_malloc(srv_page_size, srv_page_size));
+  }
+};
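+
+/* An illustrative sketch (not upstream code) of the buf_tmp_buffer_t slot
+protocol above, assuming the caller scans some array of slots for a free
+one:
+@code
+	buf_tmp_buffer_t*	slot= ...;	// some candidate slot
+	if (slot->acquire()) {			// atomically reserve it
+		slot->allocate();		// lazily allocate crypt_buf
+		// ... encrypt or compress a page; the result ends up
+		// in slot->out_buf ...
+		slot->release();		// make the slot reusable
+	}
+@endcode */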
+
+/** The common buffer control block structure
+for compressed and uncompressed frames */
+
+class buf_pool_t;
+
+class buf_page_t
+{
+  friend buf_pool_t;
+  friend buf_block_t;
+
+  /** @name General fields */
+  /* @{ */
+
+public: // FIXME: fix fil_iterate()
+  /** Page id. Protected by buf_pool.page_hash.lock_get() when
+  the page is in buf_pool.page_hash. */
+  page_id_t id_;
+  /** buf_pool.page_hash link; protected by buf_pool.page_hash.lock_get() */
+  buf_page_t *hash;
+private:
+  /** log sequence number of the START of the log entry written of the
+  oldest modification to this block which has not yet been written
+  to the data file;
+
+  0 if no modifications are pending;
+  1 if no modifications are pending, but the block is in buf_pool.flush_list;
+  2 if modifications are pending, but the block is not in buf_pool.flush_list
+  (because id().space() is the temporary tablespace). */
+  Atomic_relaxed<lsn_t> oldest_modification_;
+
+public:
+  /** state() of unused block (in buf_pool.free list) */
+  static constexpr uint32_t NOT_USED= 0;
+  /** state() of block allocated as general-purpose memory */
+  static constexpr uint32_t MEMORY= 1;
+  /** state() of block that is being freed */
+  static constexpr uint32_t REMOVE_HASH= 2;
+  /** smallest state() of a buffer page that is freed in the tablespace */
+  static constexpr uint32_t FREED= 3;
+  /** smallest state() for a block that belongs to buf_pool.LRU */
+  static constexpr uint32_t UNFIXED= 1U << 29;
+  /** smallest state() of a block for which buffered changes may exist */
+  static constexpr uint32_t IBUF_EXIST= 2U << 29;
+  /** smallest state() of a (re)initialized page (no doublewrite needed) */
+  static constexpr uint32_t REINIT= 3U << 29;
+  /** smallest state() for an io-fixed block */
+  static constexpr uint32_t READ_FIX= 4U << 29;
+  /** smallest state() for a write-fixed block */
+  static constexpr uint32_t WRITE_FIX= 5U << 29;
+  /** smallest state() for a write-fixed block with buffered changes */
+  static constexpr uint32_t WRITE_FIX_IBUF= 6U << 29;
+  /** smallest state() for a write-fixed block (no doublewrite was used) */
+  static constexpr uint32_t WRITE_FIX_REINIT= 7U << 29;
+  /** buf_pool.LRU status mask in state() */
+  static constexpr uint32_t LRU_MASK= 7U << 29;
+
+  /** lock covering the contents of frame */
+  block_lock lock;
+  /** pointer to aligned, uncompressed page frame of innodb_page_size */
+  byte *frame;
+  /* @} */
+  /** ROW_FORMAT=COMPRESSED page; zip.data (but not the data it points to)
+  is also protected by buf_pool.mutex;
+  !frame && !zip.data means an active buf_pool.watch */
+  page_zip_des_t zip;
+#ifdef UNIV_DEBUG
+  /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */
+  bool in_zip_hash;
+  /** whether this->LRU is in buf_pool.LRU (in_file());
+  protected by buf_pool.mutex */
+  bool in_LRU_list;
+  /** whether this is in buf_pool.page_hash (in_file());
+  protected by buf_pool.mutex */
+  bool in_page_hash;
+  /** whether this->list is in buf_pool.free (state() == NOT_USED);
+  protected by buf_pool.flush_list_mutex */
+  bool in_free_list;
+#endif /* UNIV_DEBUG */
+  /** list member in one of the lists of buf_pool; protected by
+  buf_pool.mutex or buf_pool.flush_list_mutex
+
+  state() == NOT_USED: buf_pool.free or buf_pool.withdraw
+
+  in_file() && oldest_modification():
+  buf_pool.flush_list (protected by buf_pool.flush_list_mutex)
+
+  The contents are undefined if in_file() && !oldest_modification(),
+  or if state() == MEMORY or state() == REMOVE_HASH. */
+  UT_LIST_NODE_T(buf_page_t) list;
+
+  /** @name LRU replacement algorithm fields.
+  Protected by buf_pool.mutex. */
+  /* @{ */
+
+  UT_LIST_NODE_T(buf_page_t) LRU;
+					/*!< node of the LRU list */
+	unsigned	old:1;		/*!< TRUE if the block is in the old
+					blocks in buf_pool.LRU_old */
+	unsigned	freed_page_clock:31;/*!< the value of
+					buf_pool.freed_page_clock
+					when this block was the last
+					time put to the head of the
+					LRU list; a thread is allowed
+					to read this for heuristic
+					purposes without holding any
+					mutex or latch */
+	/* @} */
+	Atomic_counter<unsigned> access_time;	/*!< time of first access, or
+					0 if the block was never accessed
+					in the buffer pool.
+
+					For state() == MEMORY
+					blocks, this field can be repurposed
+					for something else.
+
+					When this field counts log records
+					and bytes allocated for recv_sys.pages,
+					the field is protected by
+					recv_sys_t::mutex.
*/ + buf_page_t() : id_{0} + { + static_assert(NOT_USED == 0, "compatibility"); + memset((void*) this, 0, sizeof *this); + } + + buf_page_t(const buf_page_t &b) : + id_(b.id_), hash(b.hash), + oldest_modification_(b.oldest_modification_), + lock() /* not copied */, + frame(b.frame), zip(b.zip), +#ifdef UNIV_DEBUG + in_zip_hash(b.in_zip_hash), in_LRU_list(b.in_LRU_list), + in_page_hash(b.in_page_hash), in_free_list(b.in_free_list), +#endif /* UNIV_DEBUG */ + list(b.list), LRU(b.LRU), old(b.old), freed_page_clock(b.freed_page_clock), + access_time(b.access_time) + { + lock.init(); + } + + /** Initialize some more fields */ + void init(uint32_t state, page_id_t id) + { + ut_ad(state < REMOVE_HASH || state >= UNFIXED); + id_= id; + zip.fix= state; + oldest_modification_= 0; + lock.init(); + ut_d(in_zip_hash= false); + ut_d(in_free_list= false); + ut_d(in_LRU_list= false); + ut_d(in_page_hash= false); + old= 0; + freed_page_clock= 0; + access_time= 0; + } + + void set_os_unused() + { + MEM_NOACCESS(frame, srv_page_size); +#ifdef MADV_FREE + madvise(frame, srv_page_size, MADV_FREE); +#endif + } + + void set_os_used() const + { + MEM_MAKE_ADDRESSABLE(frame, srv_page_size); + } +public: + const page_id_t &id() const { return id_; } + uint32_t state() const { return zip.fix; } + uint32_t buf_fix_count() const + { + uint32_t f= state(); + ut_ad(f >= FREED); + return f < UNFIXED ? (f - FREED) : (~LRU_MASK & f); + } + /** @return whether this block is read or write fixed; + read_complete() or write_complete() will always release + the io-fix before releasing U-lock or X-lock */ + bool is_io_fixed() const + { const auto s= state(); ut_ad(s >= FREED); return s >= READ_FIX; } + /** @return whether this block is write fixed; + write_complete() will always release the write-fix before releasing U-lock */ + bool is_write_fixed() const { return state() >= WRITE_FIX; } + /** @return whether this block is read fixed; this should never hold + when a thread is holding the block lock in any mode */ + bool is_read_fixed() const { return is_io_fixed() && !is_write_fixed(); } + + /** @return if this belongs to buf_pool.unzip_LRU */ + bool belongs_to_unzip_LRU() const + { return UNIV_LIKELY_NULL(zip.data) && frame; } + + bool is_freed() const + { const auto s= state(); ut_ad(s >= FREED); return s < UNFIXED; } + bool is_ibuf_exist() const + { + const auto s= state(); + ut_ad(s >= UNFIXED); + ut_ad(s < READ_FIX); + return (s & LRU_MASK) == IBUF_EXIST; + } + bool is_reinit() const { return !(~state() & REINIT); } + + void set_reinit(uint32_t prev_state) + { + ut_ad(prev_state < READ_FIX); + ut_d(const auto s=) zip.fix.fetch_add(REINIT - prev_state); + ut_ad(s > prev_state); + ut_ad(s < prev_state + UNFIXED); + } + + void set_ibuf_exist() + { + ut_ad(lock.is_write_locked()); + ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0)); + const auto s= state(); + ut_ad(s >= UNFIXED); + ut_ad(s < READ_FIX); + ut_ad(s < IBUF_EXIST || s >= REINIT); + zip.fix.fetch_add(IBUF_EXIST - (LRU_MASK & s)); + } + void clear_ibuf_exist() + { + ut_ad(lock.is_write_locked()); + ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0)); + ut_d(const auto s=) zip.fix.fetch_sub(IBUF_EXIST - UNFIXED); + ut_ad(s >= IBUF_EXIST); + ut_ad(s < REINIT); + } + + uint32_t read_unfix(uint32_t s) + { + ut_ad(lock.is_write_locked()); + ut_ad(s == UNFIXED + 1 || s == IBUF_EXIST + 1 || s == REINIT + 1); + uint32_t old_state= zip.fix.fetch_add(s - READ_FIX); + ut_ad(old_state >= READ_FIX); + ut_ad(old_state < WRITE_FIX); + return old_state + (s - READ_FIX); + } + + 
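+
+  /* An illustrative note (not upstream code) on the state word manipulated
+  by the members above and below: the three most significant bits of state()
+  hold the buf_pool.LRU status and the low 29 bits count buffer-fixes. For a
+  page in the ordinary UNFIXED range:
+  @code
+	uint32_t s= bpage.state();
+	uint32_t status= s & buf_page_t::LRU_MASK;	// e.g. UNFIXED
+	uint32_t fixes= s & ~buf_page_t::LRU_MASK;	// buffer-fix count
+	// fix() adds 1 and unfix() subtracts 1, leaving the status bits alone
+  @endcode */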
void set_freed(uint32_t prev_state, uint32_t count= 0) + { + ut_ad(lock.is_write_locked()); + ut_ad(prev_state >= UNFIXED); + ut_ad(prev_state < READ_FIX); + ut_d(auto s=) zip.fix.fetch_sub((prev_state & LRU_MASK) - FREED - count); + ut_ad(!((prev_state ^ s) & LRU_MASK)); + } + + inline void set_state(uint32_t s); + inline void set_corrupt_id(); + + /** @return the log sequence number of the oldest pending modification + @retval 0 if the block is being removed from (or not in) buf_pool.flush_list + @retval 1 if the block is in buf_pool.flush_list but not modified + @retval 2 if the block belongs to the temporary tablespace and + has unwritten changes */ + lsn_t oldest_modification() const { return oldest_modification_; } + /** @return the log sequence number of the oldest pending modification, + @retval 0 if the block is definitely not in buf_pool.flush_list + @retval 1 if the block is in buf_pool.flush_list but not modified + @retval 2 if the block belongs to the temporary tablespace and + has unwritten changes */ + lsn_t oldest_modification_acquire() const + { return oldest_modification_.load(std::memory_order_acquire); } + /** Set oldest_modification when adding to buf_pool.flush_list */ + inline void set_oldest_modification(lsn_t lsn); + /** Clear oldest_modification after removing from buf_pool.flush_list */ + inline void clear_oldest_modification(); + /** Reset the oldest_modification when marking a persistent page freed */ + void reset_oldest_modification() + { + ut_ad(oldest_modification() > 2); + oldest_modification_.store(1, std::memory_order_release); + } + + /** Complete a read of a page. + @param node data file + @return whether the operation succeeded + @retval DB_PAGE_CORRUPTED if the checksum fails + @retval DB_DECRYPTION_FAILED if the page cannot be decrypted + @retval DB_FAIL if the page contains the wrong ID */ + dberr_t read_complete(const fil_node_t &node); + + /** Note that a block is no longer dirty, while not removing + it from buf_pool.flush_list + @param temporary whether the page belongs to the temporary tablespace + @param error whether an error may have occurred while writing */ + inline void write_complete(bool temporary, bool error); + + /** Write a flushable page to a file or free a freeable block. + @param evict whether to evict the page on write completion + @param space tablespace + @return whether a page write was initiated and buf_pool.mutex released */ + bool flush(bool evict, fil_space_t *space); + + /** Notify that a page in a temporary tablespace has been modified. */ + void set_temp_modified() + { + ut_ad(fsp_is_system_temporary(id().space())); + ut_ad(in_file()); + ut_ad((oldest_modification() | 2) == 2); + oldest_modification_= 2; + } + + /** Prepare to release a file page to buf_pool.free. 
*/
+  void free_file_page()
+  {
+    ut_ad((zip.fix.fetch_sub(REMOVE_HASH - MEMORY)) == REMOVE_HASH);
+    /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+    ut_d(oldest_modification_= 0;)
+    id_= page_id_t(~0ULL);
+  }
+
+  void fix_on_recovery()
+  {
+    ut_d(const auto f=) zip.fix.fetch_sub(READ_FIX - UNFIXED - 1);
+    ut_ad(f >= READ_FIX);
+    ut_ad(f < WRITE_FIX);
+  }
+
+  uint32_t fix(uint32_t count= 1)
+  {
+    ut_ad(count);
+    ut_ad(count < IBUF_EXIST);
+    uint32_t f= zip.fix.fetch_add(count);
+    ut_ad(f >= FREED);
+    ut_ad(!((f ^ (f + 1)) & LRU_MASK));
+    return f;
+  }
+
+  uint32_t unfix()
+  {
+    uint32_t f= zip.fix.fetch_sub(1);
+    ut_ad(f > FREED);
+    ut_ad(!((f ^ (f - 1)) & LRU_MASK));
+    return f - 1;
+  }
+
+  /** @return the physical size, in bytes */
+  ulint physical_size() const
+  {
+    return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : srv_page_size;
+  }
+
+  /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
+  @retval 0 if not compressed */
+  ulint zip_size() const
+  {
+    return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0;
+  }
+
+  /** @return the byte offset of the page within a file */
+  os_offset_t physical_offset() const
+  {
+    os_offset_t o= id().page_no();
+    return zip.ssize
+      ? o << (zip.ssize + (UNIV_ZIP_SIZE_SHIFT_MIN - 1))
+      : o << srv_page_size_shift;
+  }
+
+  /** @return whether the block is mapped to a data file */
+  bool in_file() const { return state() >= FREED; }
+
+  /** @return whether the block can be relocated in memory.
+  The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
+  inline bool can_relocate() const;
+  /** @return whether the block has been flagged old in buf_pool.LRU */
+  inline bool is_old() const;
+  /** Set whether a block is old in buf_pool.LRU */
+  inline void set_old(bool old);
+  /** Flag a page accessed in buf_pool
+  @return whether this is not the first access */
+  bool set_accessed()
+  {
+    if (is_accessed()) return true;
+    access_time= static_cast<uint32_t>(ut_time_ms());
+    return false;
+  }
+  /** @return ut_time_ms() at the time of first access of a block in buf_pool
+  @retval 0 if not accessed */
+  unsigned is_accessed() const { ut_ad(in_file()); return access_time; }
+};
+
+/** The buffer control block structure */
+
+struct buf_block_t{
+
+  /** @name General fields */
+  /* @{ */
+
+  buf_page_t page;      /*!< page information; this must
+                        be the first field, so that
+                        buf_pool.page_hash can point
+                        to buf_page_t or buf_block_t */
+#ifdef UNIV_DEBUG
+  /** whether page.list is in buf_pool.withdraw
+  ((state() == NOT_USED)) and the buffer pool is being shrunk;
+  protected by buf_pool.mutex */
+  bool in_withdraw_list;
+  /** whether unzip_LRU is in buf_pool.unzip_LRU
+  (in_file() && frame && zip.data);
+  protected by buf_pool.mutex */
+  bool in_unzip_LRU_list;
+#endif
+  /** member of buf_pool.unzip_LRU (if belongs_to_unzip_LRU()) */
+  UT_LIST_NODE_T(buf_block_t) unzip_LRU;
+  /* @} */
+  /** @name Optimistic search field */
+  /* @{ */
+
+  ib_uint64_t modify_clock; /*!< this clock is incremented every
+                            time a pointer to a record on the
+                            page may become obsolete; this is
+                            used in the optimistic cursor
+                            positioning: if the modify clock has
+                            not changed, we know that the pointer
+                            is still valid; this field may be
+                            changed if the thread (1) owns the
+                            pool mutex and the page is not
+                            bufferfixed, or (2) the thread has an
+                            x-latch on the block */
+  /* @} */
+#ifdef BTR_CUR_HASH_ADAPT
+  /** @name Hash search fields (unprotected)
+  NOTE that these fields are NOT protected by any semaphore!
*/
+  /* @{ */
+
+  volatile uint16_t n_bytes;  /*!< recommended prefix length for hash
+                              search: number of bytes in
+                              an incomplete last field */
+  volatile uint16_t n_fields; /*!< recommended prefix length for hash
+                              search: number of full fields */
+  uint16_t n_hash_helps;      /*!< counter which controls building
+                              of a new hash index for the page */
+  volatile bool left_side;    /*!< true or false, depending on
+                              whether the leftmost record of several
+                              records with the same prefix should be
+                              indexed in the hash index */
+  /* @} */
+
+  /** @name Hash search fields
+  These 5 fields may only be modified when:
+  we are holding the appropriate x-latch in btr_search_latches[], and
+  one of the following holds:
+  (1) in_file(), and we are holding lock in any mode, or
+  (2) !is_read_fixed()&&(state()>=UNFIXED||state()==REMOVE_HASH).
+
+  An exception to this is when we init or create a page
+  in the buffer pool in buf0buf.cc.
+
+  Another exception for buf_pool_t::clear_hash_index() is that
+  assigning block->index = NULL (and block->n_pointers = 0)
+  is allowed whenever all AHI latches are exclusively locked.
+
+  Another exception is that ha_insert_for_fold() may
+  decrement n_pointers without holding the appropriate latch
+  in btr_search_latches[]. Thus, n_pointers must be
+  protected by atomic memory access.
+
+  This implies that the fields may be read without race
+  condition whenever any of the following hold:
+  - the btr_search_sys.partition[].latch is being held, or
+  - state() == NOT_USED || state() == MEMORY,
+  and holding some latch prevents the state from changing to that.
+
+  Some use of assert_block_ahi_empty() or assert_block_ahi_valid()
+  is prone to race conditions while buf_pool_t::clear_hash_index() is
+  executing (the adaptive hash index is being disabled). Such use
+  is explicitly commented. */
+
+  /* @{ */
+
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  Atomic_counter<ulint>
+    n_pointers;               /*!< used in debugging: the number of
+                              pointers in the adaptive hash index
+                              pointing to this frame */
+# define assert_block_ahi_empty(block) \
+  ut_a((block)->n_pointers == 0)
+# define assert_block_ahi_empty_on_init(block) do { \
+  MEM_MAKE_DEFINED(&(block)->n_pointers, sizeof (block)->n_pointers); \
+  assert_block_ahi_empty(block); \
+} while (0)
+# define assert_block_ahi_valid(block) \
+  ut_a((block)->index || (block)->n_pointers == 0)
+# else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+# define assert_block_ahi_empty(block) /* nothing */
+# define assert_block_ahi_empty_on_init(block) /* nothing */
+# define assert_block_ahi_valid(block) /* nothing */
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+  unsigned curr_n_fields:10;/*!< prefix length for hash indexing:
+                            number of full fields */
+  unsigned curr_n_bytes:15;/*!< number of bytes in hash
+                           indexing */
+  unsigned curr_left_side:1;/*!< TRUE or FALSE in hash indexing */
+  dict_index_t* index;      /*!< Index for which the
+                            adaptive hash index has been
+                            created, or NULL if the page
+                            does not exist in the
+                            index. Note that it does not
+                            guarantee that the index is
+                            complete, though: there may
+                            have been hash collisions,
+                            record deletions, etc.
*/
+  /* @} */
+#else /* BTR_CUR_HASH_ADAPT */
+# define assert_block_ahi_empty(block) /* nothing */
+# define assert_block_ahi_empty_on_init(block) /* nothing */
+# define assert_block_ahi_valid(block) /* nothing */
+#endif /* BTR_CUR_HASH_ADAPT */
+  void fix() { page.fix(); }
+  uint32_t unfix() { return page.unfix(); }
+
+  /** @return the physical size, in bytes */
+  ulint physical_size() const { return page.physical_size(); }
+
+  /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
+  @retval 0 if not compressed */
+  ulint zip_size() const { return page.zip_size(); }
+
+  /** Initialize the block.
+  @param page_id page identifier
+  @param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+  @param state initial state() */
+  void initialise(const page_id_t page_id, ulint zip_size, uint32_t state);
+};
+
+/**********************************************************************//**
+Compute the hash fold value for blocks in buf_pool.zip_hash. */
+/* @{ */
+#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift)
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->page.frame)
+#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
+/* @} */
+
+/** A "Hazard Pointer" class used to iterate over buf_pool.LRU or
+buf_pool.flush_list. A hazard pointer is a buf_page_t pointer
+which we intend to iterate over next and we want it to remain valid
+even after we release the mutex that protects the list. */
+class HazardPointer
+{
+public:
+  virtual ~HazardPointer() = default;
+
+  /** @return current value */
+  buf_page_t *get() const { mysql_mutex_assert_owner(m_mutex); return m_hp; }
+
+  /** Set current value
+  @param bpage buffer block to be set as hp */
+  void set(buf_page_t *bpage)
+  {
+    mysql_mutex_assert_owner(m_mutex);
+    ut_ad(!bpage || bpage->in_file());
+    m_hp= bpage;
+  }
+
+  /** Checks if a bpage is the hp
+  @param bpage buffer block to be compared
+  @return true if it is hp */
+  bool is_hp(const buf_page_t *bpage) const
+  { mysql_mutex_assert_owner(m_mutex); return bpage == m_hp; }
+
+  /** Adjust the value of hp. This happens when some
+  other thread working on the same list attempts to
+  remove the hp from the list. */
+  virtual void adjust(const buf_page_t*) = 0;
+
+#ifdef UNIV_DEBUG
+  /** mutex that protects access to the m_hp. */
+  const mysql_mutex_t *m_mutex= nullptr;
+#endif /* UNIV_DEBUG */
+
+protected:
+  /** hazard pointer */
+  buf_page_t *m_hp= nullptr;
+};
+
+/** Class implementing buf_pool.flush_list hazard pointer */
+class FlushHp : public HazardPointer
+{
+public:
+  ~FlushHp() override = default;
+
+  /** Adjust the value of hp. This happens when some
+  other thread working on the same list attempts to
+  remove the hp from the list.
+  @param bpage buffer block to be compared */
+  MY_ATTRIBUTE((nonnull))
+  void adjust(const buf_page_t *bpage) override
+  {
+    /* We only support reverse traversal for now. */
+    if (is_hp(bpage))
+      m_hp= UT_LIST_GET_PREV(list, m_hp);
+
+    ut_ad(!m_hp || m_hp->oldest_modification());
+  }
+};
+
+/** Class implementing buf_pool.LRU hazard pointer */
+class LRUHp : public HazardPointer {
+public:
+  ~LRUHp() override = default;
+
+  /** Adjust the value of hp. This happens when some
+  other thread working on the same list attempts to
+  remove the hp from the list.
+  @param bpage buffer block to be compared */
+  MY_ATTRIBUTE((nonnull))
+  void adjust(const buf_page_t *bpage) override
+  {
+    /** We only support reverse traversal for now.
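+    That is, list scans run from the tail towards the head; when the
+    block that the hazard pointer refers to is removed, stepping to its
+    predecessor keeps the scan position valid (editorial note: the same
+    convention applies to FlushHp::adjust() above).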
*/
+    if (is_hp(bpage))
+      m_hp= UT_LIST_GET_PREV(LRU, m_hp);
+
+    ut_ad(!m_hp || m_hp->in_LRU_list);
+  }
+};
+
+/** Special purpose iterators to be used when scanning the LRU list.
+The idea is that when one thread finishes the scan, it leaves the
+iterator in that position and another thread can resume the scan from
+there */
+class LRUItr : public LRUHp {
+public:
+  ~LRUItr() override = default;
+
+  /** Select from where to start a scan. If we have scanned
+  too deep into the LRU list, it resets the value to the tail
+  of the LRU list.
+  @return buf_page_t from where to start the scan. */
+  inline buf_page_t *start();
+};
+
+/** Struct that is embedded in the free zip blocks */
+struct buf_buddy_free_t {
+	union {
+		ulint	size;	/*!< size of the block */
+		byte	bytes[FIL_PAGE_DATA];
+			/*!< stamp[FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID]
+			== BUF_BUDDY_FREE_STAMP denotes a free
+			block. If the space_id field of buddy
+			block != BUF_BUDDY_FREE_STAMP, the block
+			is not in any zip_free list. If the
+			space_id is BUF_BUDDY_FREE_STAMP then
+			stamp[0] will contain the
+			buddy block size. */
+	} stamp;
+
+	buf_page_t	bpage;	/*!< Embedded bpage descriptor */
+	UT_LIST_NODE_T(buf_buddy_free_t) list;
+			/*!< Node of zip_free list */
+};
+
+/** @brief The buffer pool statistics structure;
+protected by buf_pool.mutex unless otherwise noted. */
+struct buf_pool_stat_t{
+	/** Initialize the counters */
+	void init() { memset((void*) this, 0, sizeof *this); }
+
+	ib_counter_t<ulint, 64>	n_page_gets;
+				/*!< number of page gets performed;
+				also successful searches through
+				the adaptive hash index are
+				counted as page gets;
+				NOT protected by buf_pool.mutex */
+	ulint	n_pages_read;	/*!< number of read operations */
+	ulint	n_pages_written;/*!< number of write operations */
+	ulint	n_pages_created;/*!< number of pages created
+				in the pool with no read */
+	ulint	n_ra_pages_read_rnd;/*!< number of pages read in
+				as part of random read ahead */
+	ulint	n_ra_pages_read;/*!< number of pages read in
+				as part of read ahead */
+	ulint	n_ra_pages_evicted;/*!< number of read ahead
+				pages that are evicted without
+				being accessed */
+	ulint	n_pages_made_young; /*!< number of pages made young, in
+				buf_page_make_young() */
+	ulint	n_pages_not_made_young; /*!< number of pages not made
+				young because the first access
+				was not long enough ago, in
+				buf_page_peek_if_too_old() */
+	/** number of waits for eviction */
+	ulint	LRU_waits;
+	ulint	LRU_bytes;	/*!< LRU size in bytes */
+};
+
+/** Statistics of buddy blocks of a given size. */
+struct buf_buddy_stat_t {
+	/** Number of blocks allocated from the buddy system. */
+	ulint		used;
+	/** Number of blocks relocated by the buddy system. */
+	ib_uint64_t	relocated;
+	/** Total duration of block relocations, in microseconds. */
+	ib_uint64_t	relocated_usec;
+};
+
+/** The buffer pool */
+class buf_pool_t
+{
+  /** A chunk of buffers */
+  struct chunk_t
+  {
+    /** number of elements in blocks[] */
+    size_t size;
+    /** memory allocated for the page frames */
+    unsigned char *mem;
+    /** descriptor of mem */
+    ut_new_pfx_t mem_pfx;
+    /** array of buffer control blocks */
+    buf_block_t *blocks;
+
+    /** Map of first page frame address to chunks[] */
+    using map= std::map<const byte*, chunk_t*, std::less<const byte*>,
+                        ut_allocator<std::pair<const byte* const,chunk_t*>>>;
+    /** Chunk map that may be under construction by buf_resize_thread() */
+    static map *map_reg;
+    /** Current chunk map for lookup only */
+    static map *map_ref;
+
+    /** @return the memory size, in bytes
*/
+    size_t mem_size() const { return mem_pfx.m_size; }
+
+    /** Register the chunk */
+    void reg() { map_reg->emplace(map::value_type(blocks->page.frame, this)); }
+
+    /** Allocate a chunk of buffer frames.
+    @param bytes requested size
+    @return whether the allocation succeeded */
+    inline bool create(size_t bytes);
+
+#ifdef UNIV_DEBUG
+    /** Find a block that points to a ROW_FORMAT=COMPRESSED page
+    @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+    @return the block
+    @retval nullptr if not found */
+    const buf_block_t *contains_zip(const void *data) const
+    {
+      const buf_block_t *block= blocks;
+      for (auto i= size; i--; block++)
+        if (block->page.zip.data == data)
+          return block;
+      return nullptr;
+    }
+
+    /** Check that all blocks are in a replaceable state.
+    @return address of a non-free block
+    @retval nullptr if all freed */
+    inline const buf_block_t *not_freed() const;
+#endif /* UNIV_DEBUG */
+  };
+public:
+  /** Hash cell chain in page_hash_table */
+  struct hash_chain
+  {
+    /** pointer to the first block */
+    buf_page_t *first;
+  };
+private:
+  /** Withdraw blocks from the buffer pool until meeting withdraw_target.
+  @return whether retry is needed */
+  inline bool withdraw_blocks();
+
+  /** Determine if a pointer belongs to a buf_block_t. It can be a pointer to
+  the buf_block_t itself or a member of it.
+  @param ptr a pointer that will not be dereferenced
+  @return whether the ptr belongs to a buf_block_t struct */
+  bool is_block_field(const void *ptr) const
+  {
+    const chunk_t *chunk= chunks;
+    const chunk_t *const echunk= chunk + ut_min(n_chunks, n_chunks_new);
+
+    /* TODO: protect chunks with a mutex (the older pointer will
+    currently remain during resize()) */
+    for (; chunk < echunk; chunk++)
+      if (ptr >= reinterpret_cast<const void*>(chunk->blocks) &&
+          ptr < reinterpret_cast<const void*>(chunk->blocks + chunk->size))
+        return true;
+    return false;
+  }
+
+  /** Try to reallocate a control block.
+  @param block control block to reallocate
+  @return whether the reallocation succeeded */
+  inline bool realloc(buf_block_t *block);
+
+public:
+  bool is_initialised() const { return chunks != nullptr; }
+
+  /** Create the buffer pool.
+  @return whether the creation failed */
+  bool create();
+
+  /** Clean up after successful create() */
+  void close();
+
+  /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
+  inline void resize();
+
+  /** @return whether resize() is in progress */
+  bool resize_in_progress() const
+  {
+    return UNIV_UNLIKELY(resizing.load(std::memory_order_relaxed));
+  }
+
+  /** @return the current size in blocks */
+  size_t get_n_pages() const
+  {
+    ut_ad(is_initialised());
+    size_t size= 0;
+    for (auto j= ut_min(n_chunks_new, n_chunks); j--; )
+      size+= chunks[j].size;
+    return size;
+  }
+
+  /** Determine whether a frame is intended to be withdrawn during resize().
+  @param ptr pointer within a buf_page_t::frame
+  @return whether the frame will be withdrawn */
+  bool will_be_withdrawn(const byte *ptr) const
+  {
+    ut_ad(n_chunks_new < n_chunks);
+#ifdef SAFE_MUTEX
+    if (resize_in_progress())
+      mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+
+    for (const chunk_t *chunk= chunks + n_chunks_new,
+         * const echunk= chunks + n_chunks;
+         chunk != echunk; chunk++)
+      if (ptr >= chunk->blocks->page.frame &&
+          ptr < (chunk->blocks + chunk->size - 1)->page.frame + srv_page_size)
+        return true;
+    return false;
+  }
+
+  /** Determine whether a block is intended to be withdrawn during resize().
+  @param bpage buffer pool block
+  @return whether the frame will be withdrawn */
+  bool will_be_withdrawn(const buf_page_t &bpage) const
+  {
+    ut_ad(n_chunks_new < n_chunks);
+#ifdef SAFE_MUTEX
+    if (resize_in_progress())
+      mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+
+    for (const chunk_t *chunk= chunks + n_chunks_new,
+         * const echunk= chunks + n_chunks;
+         chunk != echunk; chunk++)
+      if (&bpage >= &chunk->blocks->page &&
+          &bpage < &chunk->blocks[chunk->size].page)
+        return true;
+    return false;
+  }
+
+  /** Release and evict a corrupted page.
+  @param bpage x-latched page that was found corrupted
+  @param state expected current state of the page */
+  ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage, uint32_t state);
+
+  /** Release a memory block to the buffer pool. */
+  ATTRIBUTE_COLD void free_block(buf_block_t *block);
+
+#ifdef UNIV_DEBUG
+  /** Find a block that points to a ROW_FORMAT=COMPRESSED page
+  @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+  @return the block
+  @retval nullptr if not found */
+  const buf_block_t *contains_zip(const void *data) const
+  {
+    mysql_mutex_assert_owner(&mutex);
+    for (const chunk_t *chunk= chunks, * const end= chunks + n_chunks;
+         chunk != end; chunk++)
+      if (const buf_block_t *block= chunk->contains_zip(data))
+        return block;
+    return nullptr;
+  }
+
+  /** Assert that all buffer pool pages are in a replaceable state */
+  void assert_all_freed();
+#endif /* UNIV_DEBUG */
+
+#ifdef BTR_CUR_HASH_ADAPT
+  /** Clear the adaptive hash index on all pages in the buffer pool. */
+  inline void clear_hash_index();
+
+  /** Get a buffer block from an adaptive hash index pointer.
+  This function does not return if the block is not identified.
+  @param ptr pointer to within a page frame
+  @return pointer to block, never NULL */
+  inline buf_block_t *block_from_ahi(const byte *ptr) const;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+  /**
+  @return the smallest oldest_modification lsn for any page
+  @retval empty_lsn if all modified persistent pages have been flushed */
+  lsn_t get_oldest_modification(lsn_t empty_lsn)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    while (buf_page_t *bpage= UT_LIST_GET_LAST(flush_list))
+    {
+      ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+      lsn_t lsn= bpage->oldest_modification();
+      if (lsn != 1)
+      {
+        ut_ad(lsn > 2);
+        return lsn;
+      }
+      delete_from_flush_list(bpage);
+    }
+    return empty_lsn;
+  }
+
+  /** Determine if a buffer block was created by chunk_t::create().
+  @param block block descriptor (not dereferenced)
+  @return whether block has been created by chunk_t::create() */
+  bool is_uncompressed(const buf_block_t *block) const
+  {
+    return is_block_field(reinterpret_cast<const void*>(block));
+  }
+
+public:
+  /** @return whether the buffer pool contains a page
+  @tparam allow_watch whether to allow watch_is_sentinel()
+  @param page_id page identifier
+  @param chain hash table chain for page_id.fold() */
+  template<bool allow_watch= false>
+  TRANSACTIONAL_INLINE
+  bool page_hash_contains(const page_id_t page_id, hash_chain &chain)
+  {
+    transactional_shared_lock_guard<page_hash_latch> g
+      {page_hash.lock_get(chain)};
+    buf_page_t *bpage= page_hash.get(page_id, chain);
+    if (bpage >= &watch[0] && bpage < &watch[UT_ARR_SIZE(watch)])
+    {
+      ut_ad(!bpage->in_zip_hash);
+      ut_ad(!bpage->zip.data);
+      if (!allow_watch)
+        bpage= nullptr;
+    }
+    return bpage;
+  }
+
+  /** Determine if a block is a sentinel for a buffer pool watch.
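+  An illustrative sketch (editorial, not upstream code) of the watch
+  lifecycle that these sentinels support, using only watch_set(),
+  watch_occurred() and watch_unset() as declared below:
+  @code
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+  if (!buf_pool.watch_set(id, chain))
+  {
+    // ... the page may be read into the buffer pool meanwhile ...
+    const bool read_in= buf_pool.watch_occurred(id); // hypothetical use
+    buf_pool.watch_unset(id, chain);
+  }
+  @endcode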
+  @param bpage page descriptor
+  @return whether bpage is a sentinel for a buffer pool watch */
+  bool watch_is_sentinel(const buf_page_t &bpage)
+  {
+#ifdef SAFE_MUTEX
+    DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
+                page_hash.lock_get(page_hash.cell_get(bpage.id().fold())).
+                is_locked());
+#endif /* SAFE_MUTEX */
+    ut_ad(bpage.in_file());
+    if (&bpage < &watch[0] || &bpage >= &watch[array_elements(watch)])
+      return false;
+    ut_ad(!bpage.in_zip_hash);
+    ut_ad(!bpage.zip.data);
+    return true;
+  }
+
+  /** Check if a watched page has been read.
+  This may only be called after !watch_set() and before invoking watch_unset().
+  @param id page identifier
+  @return whether the page was read to the buffer pool */
+  TRANSACTIONAL_INLINE
+  bool watch_occurred(const page_id_t id)
+  {
+    hash_chain &chain= page_hash.cell_get(id.fold());
+    transactional_shared_lock_guard<page_hash_latch> g
+      {page_hash.lock_get(chain)};
+    /* The page must exist because watch_set() increments buf_fix_count. */
+    return !watch_is_sentinel(*page_hash.get(id, chain));
+  }
+
+  /** Register a watch for a page identifier.
+  @param id page identifier
+  @param chain page_hash.cell_get(id.fold())
+  @return a buffer page corresponding to id
+  @retval nullptr if the block was not present in page_hash */
+  buf_page_t *watch_set(const page_id_t id, hash_chain &chain);
+
+  /** Stop watching whether a page has been read in.
+  watch_set(id) must have returned nullptr before.
+  @param id page identifier
+  @param chain unlocked hash table chain */
+  void watch_unset(const page_id_t id, hash_chain &chain);
+
+  /** Remove the sentinel block for the watch before replacing it with a
+  real block. watch_unset() or watch_occurred() will notice
+  that the block has been replaced with the real block.
+  @param w sentinel
+  @param chain locked hash table chain
+  @return w->state() */
+  inline uint32_t watch_remove(buf_page_t *w, hash_chain &chain);
+
+  /** @return whether less than 1/4 of the buffer pool is available */
+  TPOOL_SUPPRESS_TSAN
+  bool running_out() const
+  {
+    return !recv_recovery_is_on() &&
+      UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) <
+      n_chunks_new / 4 * chunks->size;
+  }
+
+  /** @return whether the buffer pool has run out */
+  TPOOL_SUPPRESS_TSAN
+  bool ran_out() const
+  { return UNIV_UNLIKELY(!try_LRU_scan || !UT_LIST_GET_LEN(free)); }
+
+  /** @return whether the buffer pool is shrinking */
+  inline bool is_shrinking() const
+  {
+    return n_chunks_new < n_chunks;
+  }
+
+#ifdef UNIV_DEBUG
+  /** Validate the buffer pool. */
+  void validate();
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+  /** Write information of the buf_pool to the error log. */
+  void print();
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+  /** Remove a block from the LRU list.
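+  For illustration (editorial sketch, not upstream code; assumes bpage is
+  a valid LRU-resident page and that the caller holds buf_pool.mutex):
+  @code
+  mysql_mutex_lock(&buf_pool.mutex);
+  buf_page_t *prev= buf_pool.LRU_remove(bpage);
+  // ... free or relink bpage; a tail-first scan may continue from prev ...
+  mysql_mutex_unlock(&buf_pool.mutex);
+  @endcode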
+  @return the predecessor in the LRU list */
+  buf_page_t *LRU_remove(buf_page_t *bpage)
+  {
+    mysql_mutex_assert_owner(&mutex);
+    ut_ad(bpage->in_LRU_list);
+    ut_ad(bpage->in_page_hash);
+    ut_ad(!bpage->in_zip_hash);
+    ut_ad(bpage->in_file());
+    lru_hp.adjust(bpage);
+    lru_scan_itr.adjust(bpage);
+    ut_d(bpage->in_LRU_list= false);
+    buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
+    UT_LIST_REMOVE(LRU, bpage);
+    return prev;
+  }
+
+  /** Number of pages to read ahead */
+  static constexpr uint32_t READ_AHEAD_PAGES= 64;
+
+  /** Buffer pool mutex */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+  /** current statistics; protected by mutex */
+  buf_pool_stat_t stat;
+  /** old statistics; protected by mutex */
+  buf_pool_stat_t old_stat;
+
+  /** @name General fields */
+  /* @{ */
+  ulint curr_pool_size;  /*!< Current pool size in bytes */
+  ulint LRU_old_ratio;   /*!< Reserve this much of the buffer
+                         pool for "old" blocks */
+#ifdef UNIV_DEBUG
+  ulint buddy_n_frames;  /*!< Number of frames allocated from
+                         the buffer pool to the buddy system */
+  ulint mutex_exit_forbidden; /*!< Forbid release mutex */
+#endif
+  ut_allocator<unsigned char> allocator; /*!< Allocator used for
+                         allocating memory for the "chunks"
+                         member. */
+  ulint n_chunks;        /*!< number of buffer pool chunks */
+  ulint n_chunks_new;    /*!< new number of buffer pool chunks.
+                         both n_chunks{,new} are protected under
+                         mutex */
+  chunk_t* chunks;       /*!< buffer pool chunks */
+  chunk_t* chunks_old;   /*!< old buffer pool chunks to be freed
+                         after resizing buffer pool */
+  /** current pool size in pages */
+  Atomic_counter<ulint> curr_size;
+  /** read-ahead request size in pages */
+  Atomic_counter<uint32_t> read_ahead_area;
+
+  /** Hash table with singly-linked overflow lists */
+  struct page_hash_table
+  {
+    static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "less than 64 bytes");
+    static_assert(!(CPU_LEVEL1_DCACHE_LINESIZE & 63),
+                  "not a multiple of 64 bytes");
+
+    /** Number of array[] elements per page_hash_latch.
+    Must be one less than a power of 2. */
+    static constexpr size_t ELEMENTS_PER_LATCH= 64 / sizeof(void*) - 1;
+    static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+      ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
+
+    /** number of payload elements in array[] */
+    Atomic_relaxed<ulint> n_cells;
+    /** the hash table, with pad(n_cells) elements, aligned to L1 cache size */
+    hash_chain *array;
+
+    /** Create the hash table.
+    @param n the lower bound of n_cells */
+    void create(ulint n);
+
+    /** Free the hash table.
*/
+    void free() { aligned_free(array); array= nullptr; }
+
+    /** @return the index of an array element */
+    ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); }
+    /** @return raw array index converted to padded index */
+    static ulint pad(ulint h)
+    {
+      ulint latches= h / ELEMENTS_PER_LATCH;
+      ulint empty_slots= latches * EMPTY_SLOTS_PER_LATCH;
+      return 1 + latches + empty_slots + h;
+    }
+  private:
+    /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+    static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+    /** @return the index of an array element */
+    static ulint calc_hash(ulint fold, ulint n_cells)
+    {
+      return pad(hash(fold, n_cells));
+    }
+  public:
+    /** @return the latch covering a hash table chain */
+    static page_hash_latch &lock_get(hash_chain &chain)
+    {
+      static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
+                    "must be one less than a power of 2");
+      const size_t addr= reinterpret_cast<size_t>(&chain);
+      ut_ad(addr & (ELEMENTS_PER_LATCH * sizeof chain));
+      return *reinterpret_cast<page_hash_latch*>
+        (addr & ~(ELEMENTS_PER_LATCH * sizeof chain));
+    }
+
+    /** Get a hash table slot. */
+    hash_chain &cell_get(ulint fold) const
+    { return array[calc_hash(fold, n_cells)]; }
+
+    /** Append a block descriptor to a hash bucket chain. */
+    void append(hash_chain &chain, buf_page_t *bpage)
+    {
+      ut_ad(!bpage->in_page_hash);
+      ut_ad(!bpage->hash);
+      ut_d(bpage->in_page_hash= true);
+      buf_page_t **prev= &chain.first;
+      while (*prev)
+      {
+        ut_ad((*prev)->in_page_hash);
+        prev= &(*prev)->hash;
+      }
+      *prev= bpage;
+    }
+
+    /** Remove a block descriptor from a hash bucket chain. */
+    void remove(hash_chain &chain, buf_page_t *bpage)
+    {
+      ut_ad(bpage->in_page_hash);
+      buf_page_t **prev= &chain.first;
+      while (*prev != bpage)
+      {
+        ut_ad((*prev)->in_page_hash);
+        prev= &(*prev)->hash;
+      }
+      *prev= bpage->hash;
+      ut_d(bpage->in_page_hash= false);
+      bpage->hash= nullptr;
+    }
+
+    /** Replace a block descriptor with another. */
+    void replace(hash_chain &chain, buf_page_t *old, buf_page_t *bpage)
+    {
+      ut_ad(old->in_page_hash);
+      ut_ad(bpage->in_page_hash);
+      ut_d(old->in_page_hash= false);
+      ut_ad(bpage->hash == old->hash);
+      old->hash= nullptr;
+      buf_page_t **prev= &chain.first;
+      while (*prev != old)
+      {
+        ut_ad((*prev)->in_page_hash);
+        prev= &(*prev)->hash;
+      }
+      *prev= bpage;
+    }
+
+    /** Look up a page in a hash bucket chain. */
+    inline buf_page_t *get(const page_id_t id, const hash_chain &chain) const;
+
+    /** Exclusively acquire all latches */
+    inline void write_lock_all();
+
+    /** Release all latches */
+    inline void write_unlock_all();
+  };
+
+  /** Hash table of file pages (buf_page_t::in_file() holds),
+  indexed by page_id_t. Protected by both mutex and page_hash.lock_get().
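+  An illustrative shared-latched lookup (editorial sketch; it mirrors the
+  pattern used by page_hash_contains() and watch_occurred() above; the
+  identifiers space_id and page_no are hypothetical):
+  @code
+  const page_id_t id{space_id, page_no};
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+  transactional_shared_lock_guard<page_hash_latch> g
+    {buf_pool.page_hash.lock_get(chain)};
+  buf_page_t *bpage= buf_pool.page_hash.get(id, chain);
+  @endcode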
*/
+  page_hash_table page_hash;
+
+  /** map of block->frame to buf_block_t blocks that belong
+  to buf_buddy_alloc(); protected by buf_pool.mutex */
+  hash_table_t zip_hash;
+  Atomic_counter<ulint>
+        n_pend_unzip;   /*!< number of pending decompressions */
+
+  time_t last_printout_time;
+                        /*!< when buf_print_io was
+                        last called */
+  buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+                        /*!< Statistics of buddy system,
+                        indexed by block size */
+
+  /* @} */
+
+  /** number of index page splits */
+  Atomic_counter<ulint> pages_split;
+
+  /** @name Page flushing algorithm fields */
+  /* @{ */
+
+  /** mutex protecting flush_list, buf_page_t::set_oldest_modification()
+  and buf_page_t::list pointers when !oldest_modification() */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
+  /** "hazard pointer" for flush_list scans; protected by flush_list_mutex */
+  FlushHp flush_hp;
+  /** flush_list size in bytes; protected by flush_list_mutex */
+  ulint flush_list_bytes;
+  /** possibly modified persistent pages (a subset of LRU);
+  os_aio_pending_writes() is approximately COUNT(is_write_fixed()) */
+  UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+  /** number of blocks ever added to flush_list;
+  sometimes protected by flush_list_mutex */
+  size_t flush_list_requests;
+
+  TPOOL_SUPPRESS_TSAN void add_flush_list_requests(size_t size)
+  { ut_ad(size); flush_list_requests+= size; }
+private:
+  static constexpr unsigned PAGE_CLEANER_IDLE= 1;
+  static constexpr unsigned FLUSH_LIST_ACTIVE= 2;
+  static constexpr unsigned LRU_FLUSH= 4;
+
+  /** Number of pending LRU flush * LRU_FLUSH +
+  PAGE_CLEANER_IDLE + FLUSH_LIST_ACTIVE flags */
+  unsigned page_cleaner_status;
+  /** track server activity count for signaling idle flushing */
+  ulint last_activity_count;
+public:
+  /** signalled to wake up the page_cleaner; protected by flush_list_mutex */
+  pthread_cond_t do_flush_list;
+  /** broadcast when !n_flush(); protected by flush_list_mutex */
+  pthread_cond_t done_flush_LRU;
+  /** broadcast when a batch completes; protected by flush_list_mutex */
+  pthread_cond_t done_flush_list;
+
+  /** @return number of pending LRU flush */
+  unsigned n_flush() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    return page_cleaner_status / LRU_FLUSH;
+  }
+
+  /** Increment the number of pending LRU flush */
+  inline void n_flush_inc();
+
+  /** Decrement the number of pending LRU flush */
+  inline void n_flush_dec();
+
+  /** Decrement the number of pending LRU flush
+  while holding flush_list_mutex */
+  inline void n_flush_dec_holding_mutex();
+
+  /** @return whether flush_list flushing is active */
+  bool flush_list_active() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    return page_cleaner_status & FLUSH_LIST_ACTIVE;
+  }
+
+  void flush_list_set_active()
+  {
+    ut_ad(!flush_list_active());
+    page_cleaner_status+= FLUSH_LIST_ACTIVE;
+  }
+  void flush_list_set_inactive()
+  {
+    ut_ad(flush_list_active());
+    page_cleaner_status-= FLUSH_LIST_ACTIVE;
+  }
+
+  /** @return whether the page cleaner must sleep due to being idle */
+  bool page_cleaner_idle() const noexcept
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    return page_cleaner_status & PAGE_CLEANER_IDLE;
+  }
+
+  /** @return whether the page cleaner may be initiating writes */
+  bool page_cleaner_active() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    static_assert(PAGE_CLEANER_IDLE == 1, "efficiency");
+    return page_cleaner_status > PAGE_CLEANER_IDLE;
+  }
+
+  /** Wake up the page cleaner if needed.
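+  For illustration (editorial sketch, not upstream code): a caller that
+  has just added dirty pages might nudge an idle cleaner; this assumes the
+  caller holds flush_list_mutex, as page_cleaner_idle() requires:
+  @code
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  if (buf_pool.page_cleaner_idle())
+    buf_pool.page_cleaner_wakeup();
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  @endcode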
+  @param for_LRU whether to wake up for LRU eviction */
+  void page_cleaner_wakeup(bool for_LRU= false);
+
+  /** Register whether an explicit wakeup of the page cleaner is needed */
+  void page_cleaner_set_idle(bool deep_sleep)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    page_cleaner_status= (page_cleaner_status & ~PAGE_CLEANER_IDLE) |
+      (PAGE_CLEANER_IDLE * deep_sleep);
+  }
+
+  /** Update server last activity count */
+  void update_last_activity_count(ulint activity_count)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    last_activity_count= activity_count;
+  }
+
+  unsigned freed_page_clock;/*!< a sequence number used
+                        to count the number of buffer
+                        blocks removed from the end of
+                        the LRU list; NOTE that this
+                        counter may wrap around at 4
+                        billion! A thread is allowed
+                        to read this for heuristic
+                        purposes without holding any
+                        mutex or latch */
+  /** Cleared when buf_LRU_get_free_block() fails.
+  Set whenever the free list grows, along with a broadcast of done_free.
+  Protected by buf_pool.mutex. */
+  Atomic_relaxed<bool> try_LRU_scan;
+  /* @} */
+
+  /** @name LRU replacement algorithm fields */
+  /* @{ */
+
+  UT_LIST_BASE_NODE_T(buf_page_t) free;
+                        /*!< base node of the free
+                        block list */
+  /** broadcast each time when the free list grows or try_LRU_scan is set;
+  protected by mutex */
+  pthread_cond_t done_free;
+
+  UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
+                        /*!< base node of the withdraw
+                        block list. It is only used while
+                        the buffer pool is being shrunk;
+                        the withdrawn blocks will be
+                        removed, not reused */
+
+  ulint withdraw_target;/*!< target length of withdraw
+                        block list, when withdrawing */
+
+  /** "hazard pointer" used during scan of LRU while doing
+  LRU list batch. Protected by buf_pool_t::mutex. */
+  LRUHp lru_hp;
+
+  /** Iterator used to scan the LRU list when searching for
+  a replaceable victim. Protected by buf_pool_t::mutex. */
+  LRUItr lru_scan_itr;
+
+  UT_LIST_BASE_NODE_T(buf_page_t) LRU;
+                        /*!< base node of the LRU list */
+
+  buf_page_t* LRU_old;  /*!< pointer to the about
+                        LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+                        oldest blocks in the LRU list;
+                        NULL if LRU length less than
+                        BUF_LRU_OLD_MIN_LEN;
+                        NOTE: when LRU_old != NULL, its length
+                        should always equal LRU_old_len */
+  ulint LRU_old_len;    /*!< length of the LRU list from
+                        the block to which LRU_old points
+                        onward, including that block;
+                        see buf0lru.cc for the restrictions
+                        on this value; 0 if LRU_old == NULL;
+                        NOTE: LRU_old_len must be adjusted
+                        whenever LRU_old shrinks or grows! */
+
+  UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
+                        /*!< base node of the
+                        unzip_LRU list */
+
+  /* @} */
+  /** free ROW_FORMAT=COMPRESSED page frames */
+  UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX];
+#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
+# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
+#endif
+
+  /** Sentinels to detect if pages are read into the buffer pool while
+  a delete-buffering operation is pending. Protected by mutex. */
+  buf_page_t watch[innodb_purge_threads_MAX + 1];
+  /** Reserve a buffer. */
+  buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); }
+
+  /** Remove a block from flush_list.
+  @param bpage buffer pool page */
+  void delete_from_flush_list(buf_page_t *bpage) noexcept;
+
+  /** Prepare to insert a modified block into flush_list.
+  @param lsn start LSN of the mini-transaction
+  @return insert position for insert_into_flush_list() */
+  inline buf_page_t *prepare_insert_into_flush_list(lsn_t lsn) noexcept;
+
+  /** Insert a modified block into the flush list.
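+  Illustrative call sequence for the two-step interface (editorial sketch;
+  it assumes the caller serializes on flush_list_mutex, and start_lsn and
+  block stand for a hypothetical mini-transaction's values):
+  @code
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  buf_page_t *prev= buf_pool.prepare_insert_into_flush_list(start_lsn);
+  buf_pool.insert_into_flush_list(prev, block, start_lsn);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  @endcode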
+  @param prev insert position (from prepare_insert_into_flush_list())
+  @param block modified block
+  @param lsn start LSN of the mini-transaction that modified the block */
+  inline void insert_into_flush_list(buf_page_t *prev, buf_block_t *block,
+                                     lsn_t lsn) noexcept;
+
+  /** Free a page whose underlying file page has been freed. */
+  ATTRIBUTE_COLD void release_freed_page(buf_page_t *bpage) noexcept;
+
+private:
+  /** Temporary memory for page_compressed and encrypted I/O */
+  struct io_buf_t
+  {
+    /** number of elements in slots[] */
+    ulint n_slots;
+    /** array of slots */
+    buf_tmp_buffer_t *slots;
+
+    void create(ulint n_slots);
+
+    void close();
+
+    /** Reserve a buffer */
+    buf_tmp_buffer_t *reserve();
+  } io_buf;
+
+  /** whether resize() is in the critical path */
+  std::atomic<bool> resizing;
+};
+
+/** The InnoDB buffer pool */
+extern buf_pool_t buf_pool;
+
+inline buf_page_t *buf_pool_t::page_hash_table::get(const page_id_t id,
+                                                    const hash_chain &chain)
+  const
+{
+#ifdef SAFE_MUTEX
+  DBUG_ASSERT(mysql_mutex_is_owner(&buf_pool.mutex) ||
+              lock_get(const_cast<hash_chain&>(chain)).is_locked());
+#endif /* SAFE_MUTEX */
+  for (buf_page_t *bpage= chain.first; bpage; bpage= bpage->hash)
+  {
+    ut_ad(bpage->in_page_hash);
+    ut_ad(bpage->in_file());
+    if (bpage->id() == id)
+      return bpage;
+  }
+  return nullptr;
+}
+
+#ifdef SUX_LOCK_GENERIC
+inline void page_hash_latch::lock_shared()
+{
+  mysql_mutex_assert_not_owner(&buf_pool.mutex);
+  if (!read_trylock())
+    read_lock_wait();
+}
+
+inline void page_hash_latch::lock()
+{
+  if (!write_trylock())
+    write_lock_wait();
+}
+#endif /* SUX_LOCK_GENERIC */
+
+inline void buf_page_t::set_state(uint32_t s)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(s <= REMOVE_HASH || s >= UNFIXED);
+  ut_ad(s < WRITE_FIX);
+  ut_ad(s <= READ_FIX || zip.fix == READ_FIX);
+  zip.fix= s;
+}
+
+inline void buf_page_t::set_corrupt_id()
+{
+#ifdef UNIV_DEBUG
+  switch (oldest_modification()) {
+  case 0:
+    break;
+  case 2:
+    ut_ad(fsp_is_system_temporary(id().space()));
+    /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+    ut_d(oldest_modification_= 0;)
+    break;
+  default:
+    ut_ad("block is dirty" == 0);
+  }
+  const auto f= state();
+  if (f != REMOVE_HASH)
+  {
+    ut_ad(f >= UNFIXED);
+    ut_ad(buf_pool.page_hash.lock_get(buf_pool.page_hash.cell_get(id_.fold())).
+          is_write_locked());
+  }
+#endif
+  id_.set_corrupted();
+}
+
+/** Set oldest_modification when adding to buf_pool.flush_list */
+inline void buf_page_t::set_oldest_modification(lsn_t lsn)
+{
+  mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+  ut_ad(oldest_modification() <= 1);
+  ut_ad(lsn > 2);
+  oldest_modification_= lsn;
+}
+
+/** Clear oldest_modification after removing from buf_pool.flush_list */
+inline void buf_page_t::clear_oldest_modification()
+{
+#ifdef SAFE_MUTEX
+  if (oldest_modification() != 2)
+    mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+#endif /* SAFE_MUTEX */
+  ut_d(const auto s= state());
+  ut_ad(s >= REMOVE_HASH);
+  ut_ad(oldest_modification());
+  ut_ad(!list.prev);
+  ut_ad(!list.next);
+  /* We must use release memory order to guarantee that callers of
+  oldest_modification_acquire() will observe the block as
+  being detached from buf_pool.flush_list, after reading the value 0. */
+  oldest_modification_.store(0, std::memory_order_release);
+}
+
+/** @return whether the block can be relocated in memory.
+The block can be dirty, but it must not be I/O-fixed or bufferfixed.
*/
+inline bool buf_page_t::can_relocate() const
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  const auto f= state();
+  ut_ad(f >= FREED);
+  ut_ad(in_LRU_list);
+  return (f == FREED || (f < READ_FIX && !(f & ~LRU_MASK))) &&
+    !lock.is_locked_or_waiting();
+}
+
+/** @return whether the block has been flagged old in buf_pool.LRU */
+inline bool buf_page_t::is_old() const
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(in_file());
+  ut_ad(in_LRU_list);
+  return old;
+}
+
+/** Set whether a block is old in buf_pool.LRU */
+inline void buf_page_t::set_old(bool old)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(in_LRU_list);
+
+#ifdef UNIV_LRU_DEBUG
+  ut_a((buf_pool.LRU_old_len == 0) == (buf_pool.LRU_old == nullptr));
+  /* If a block is flagged "old", the LRU_old list must exist. */
+  ut_a(!old || buf_pool.LRU_old);
+
+  if (UT_LIST_GET_PREV(LRU, this) && UT_LIST_GET_NEXT(LRU, this))
+  {
+    const buf_page_t *prev= UT_LIST_GET_PREV(LRU, this);
+    const buf_page_t *next = UT_LIST_GET_NEXT(LRU, this);
+    if (prev->old == next->old)
+      ut_a(prev->old == old);
+    else
+    {
+      ut_a(!prev->old);
+      ut_a(buf_pool.LRU_old == (old ? this : next));
+    }
+  }
+#endif /* UNIV_LRU_DEBUG */
+
+  this->old= old;
+}
+
+#ifdef UNIV_DEBUG
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() do { \
+  mysql_mutex_assert_owner(&buf_pool.mutex); \
+  buf_pool.mutex_exit_forbidden++; \
+} while (0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() do { \
+  mysql_mutex_assert_owner(&buf_pool.mutex); \
+  ut_ad(buf_pool.mutex_exit_forbidden--); \
+} while (0)
+#else
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() ((void) 0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() ((void) 0)
+#endif
+
+/**********************************************************************
+Let us list the consistency conditions for different control block states.
+
+NOT_USED: is in free list, not LRU, not flush_list, nor page_hash
+MEMORY: is not in any of free, LRU, flush_list, page_hash
+in_file(): is not in free list, is in LRU list, id() is defined,
+	is in page_hash (not necessarily if is_read_fixed())
+
+	is in buf_pool.flush_list, if and only
+	if oldest_modification == 1 || oldest_modification > 2
+
+	(1) if is_write_fixed(): is u-locked
+	(2) if is_read_fixed(): is x-locked
+
+State transitions:
+
+NOT_USED => MEMORY
+MEMORY => NOT_USED
+MEMORY => UNFIXED
+UNFIXED => in_file()
+in_file() => UNFIXED or FREED
+UNFIXED or FREED => REMOVE_HASH
+REMOVE_HASH => NOT_USED (if and only if !oldest_modification())
+*/
+
+/** Select from where to start a scan. If we have scanned
+too deep into the LRU list, it resets the value to the tail
+of the LRU list.
+@return buf_page_t from where to start the scan. */
+inline buf_page_t *LRUItr::start()
+{
+  mysql_mutex_assert_owner(m_mutex);
+
+  if (!m_hp || m_hp->old)
+    m_hp= UT_LIST_GET_LAST(buf_pool.LRU);
+
+  return m_hp;
+}
+
+#ifdef UNIV_DEBUG
+/** Functor to validate the LRU list. */
+struct CheckInLRUList {
+  void operator()(const buf_page_t* elem) const
+  {
+    ut_a(elem->in_LRU_list);
+  }
+
+  static void validate()
+  {
+    ut_list_validate(buf_pool.LRU, CheckInLRUList());
+  }
+};
+
+/** Functor to validate the free list.
*/
+struct CheckInFreeList {
+  void operator()(const buf_page_t* elem) const
+  {
+    ut_a(elem->in_free_list);
+  }
+
+  static void validate()
+  {
+    ut_list_validate(buf_pool.free, CheckInFreeList());
+  }
+};
+
+struct CheckUnzipLRUAndLRUList {
+  void operator()(const buf_block_t* elem) const
+  {
+    ut_a(elem->page.in_LRU_list);
+    ut_a(elem->in_unzip_LRU_list);
+  }
+
+  static void validate()
+  {
+    ut_list_validate(buf_pool.unzip_LRU,
+                     CheckUnzipLRUAndLRUList());
+  }
+};
+#endif /* UNIV_DEBUG */
+
+#include "buf0buf.inl"
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl
new file mode 100644
index 00000000..b3158cf1
--- /dev/null
+++ b/storage/innobase/include/buf0buf.inl
@@ -0,0 +1,132 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.inl
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+
+/** Determine if a block is still close enough to the MRU end of the LRU list,
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+The page must be either buffer-fixed, or its page hash must be locked.
+@param[in]	bpage		buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage)
+{
+	/* FIXME: bpage->freed_page_clock is 31 bits */
+	return((buf_pool.freed_page_clock & ((1UL << 31) - 1))
+	       < (bpage->freed_page_clock
+		  + (buf_pool.curr_size
+		     * (BUF_LRU_OLD_RATIO_DIV - buf_pool.LRU_old_ratio)
+		     / (BUF_LRU_OLD_RATIO_DIV * 4))));
+}
+
+/** Determine if a block should be moved to the start of the LRU list if
+there is danger of dropping from the buffer pool.
+@param[in]	bpage		buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage)
+{
+	if (buf_pool.freed_page_clock == 0) {
+		/* If eviction has not started yet, do not update the
+		statistics or move blocks in the LRU list.  This is
+		either the warm-up phase or an in-memory workload. */
+		return(FALSE);
+	} else if (buf_LRU_old_threshold_ms && bpage->old) {
+		uint32_t access_time = bpage->is_accessed();
+
+		/* It is possible that the below comparison returns an
+		unexpected result. 2^32 milliseconds pass in about 50 days,
+		so if the difference between ut_time_ms() and access_time
+		is e.g. 50 days + 15 ms, then the below will behave as if
+		it is 15 ms.
This is known and fixing it would require
+		increasing buf_page_t::access_time from 32 to 64 bits. */
+		if (access_time
+		    && ((ib_uint32_t) (ut_time_ms() - access_time))
+		    >= buf_LRU_old_threshold_ms) {
+			return(TRUE);
+		}
+
+		buf_pool.stat.n_pages_not_made_young++;
+		return false;
+	} else {
+		return !buf_page_peek_if_young(bpage);
+	}
+}
+
+/** Allocate a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+inline buf_block_t *buf_block_alloc()
+{
+	return buf_LRU_get_free_block(false);
+}
+
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+	buf_block_t*	block)	/*!< in, own: block to be freed */
+{
+	mysql_mutex_lock(&buf_pool.mutex);
+	buf_LRU_block_free_non_file_page(block);
+	mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must (1) own the
+buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+	buf_block_t*	block)	/*!< in: block */
+{
+#ifdef SAFE_MUTEX
+	ut_ad((mysql_mutex_is_owner(&buf_pool.mutex)
+	       && !block->page.buf_fix_count())
+	      || block->page.lock.have_u_or_x());
+#else /* SAFE_MUTEX */
+	ut_ad(!block->page.buf_fix_count() || block->page.lock.have_u_or_x());
+#endif /* SAFE_MUTEX */
+	assert_block_ahi_valid(block);
+
+	block->modify_clock++;
+}
+
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+	buf_block_t*	block)	/*!< in: block */
+{
+	ut_ad(block->page.lock.have_any());
+	return(block->modify_clock);
+}
diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h
new file mode 100644
index 00000000..d9f03177
--- /dev/null
+++ b/storage/innobase/include/buf0checksum.h
@@ -0,0 +1,57 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0checksum.h
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#pragma once
+#include "buf0types.h"
+
+/** Calculate the CRC32 checksum of a page.
The value is stored to the page +when it is written to a file and also checked for a match when reading from +the file. Note that we must be careful to calculate the same value on all +architectures. +@param[in] page buffer page (srv_page_size bytes) +@return CRC-32C */ +uint32_t buf_calc_page_crc32(const byte* page); + +#ifndef UNIV_INNOCHECKSUM +/** Calculate a checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. +@param[in] page file page (srv_page_size bytes) +@return checksum */ +uint32_t +buf_calc_page_new_checksum(const byte* page); + +/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that +the checksum only looked at the first few bytes of the page. +This calculates that old checksum. +NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! +@param[in] page file page (srv_page_size bytes) +@return checksum */ +uint32_t +buf_calc_page_old_checksum(const byte* page); +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h new file mode 100644 index 00000000..9932b0e5 --- /dev/null +++ b/storage/innobase/include/buf0dblwr.h @@ -0,0 +1,164 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0dblwr.h +Doublewrite buffer module + +Created 2011/12/19 Inaam Rana +*******************************************************/ + +#pragma once + +#include "os0file.h" +#include "buf0types.h" + +/** Doublewrite control struct */ +class buf_dblwr_t +{ + struct element + { + /** asynchronous write request */ + IORequest request; + /** payload size in bytes */ + size_t size; + }; + + struct slot + { + /** first free position in write_buf measured in units of + * srv_page_size */ + ulint first_free; + /** number of slots reserved for the current write batch */ + ulint reserved; + /** the doublewrite buffer, aligned to srv_page_size */ + byte* write_buf; + /** buffer blocks to be written via write_buf */ + element* buf_block_arr; + }; + + /** the page number of the first doublewrite block (block_size() pages) */ + page_id_t block1{0, 0}; + /** the page number of the second doublewrite block (block_size() pages) */ + page_id_t block2{0, 0}; + + /** mutex protecting the data members below */ + mysql_mutex_t mutex; + /** condition variable for !batch_running */ + pthread_cond_t cond; + /** whether a batch is being written from the doublewrite buffer */ + bool batch_running; + /** number of expected flush_buffered_writes_completed() calls */ + unsigned flushing_buffered_writes; + /** number of flush_buffered_writes_completed() calls */ + ulint writes_completed; + /** number of pages written by flush_buffered_writes_completed() */ + ulint pages_written; + + slot slots[2]; + slot *active_slot; + + /** Initialise the persistent storage of the doublewrite buffer. + @param header doublewrite page header in the TRX_SYS page */ + inline void init(const byte *header); + + /** Flush possible buffered writes to persistent storage. */ + bool flush_buffered_writes(const ulint size); + +public: + /** Initialise the doublewrite buffer data structures. */ + void init(); + /** Create or restore the doublewrite buffer in the TRX_SYS page. + @return whether the operation succeeded */ + bool create(); + /** Free the doublewrite buffer. */ + void close(); + + /** Acquire the mutex */ + void lock() { mysql_mutex_lock(&mutex); } + /** @return the number of completed batches */ + ulint batches() const + { mysql_mutex_assert_owner(&mutex); return writes_completed; } + /** @return the number of final pages written */ + ulint written() const + { mysql_mutex_assert_owner(&mutex); return pages_written; } + /** Release the mutex */ + void unlock() { mysql_mutex_unlock(&mutex); } + + /** Initialize the doublewrite buffer memory structure on recovery. + If we are upgrading from a version before MySQL 4.1, then this + function performs the necessary update operations to support + innodb_file_per_table. If we are in a crash recovery, this function + loads the pages from double write buffer into memory. + @param file File handle + @param path Path name of file + @return DB_SUCCESS or error code */ + dberr_t init_or_load_pages(pfs_os_file_t file, const char *path); + + /** Process and remove the double write buffer pages for all tablespaces. */ + void recover(); + + /** Update the doublewrite buffer on data page write completion. 
*/
+  void write_completed();
+  /** Flush possible buffered writes to persistent storage.
+  It is very important to call this function after a batch of writes has been
+  posted, and also when we may have to wait for a page latch!
+  Otherwise a deadlock of threads can occur. */
+  void flush_buffered_writes();
+  /** Update the doublewrite buffer on write batch completion
+  @param request the completed batch write request */
+  void flush_buffered_writes_completed(const IORequest &request);
+
+  /** Size of the doublewrite block in pages */
+  uint32_t block_size() const { return FSP_EXTENT_SIZE; }
+
+  /** Schedule a page write. If the doublewrite memory buffer is full,
+  flush_buffered_writes() will be invoked to make space.
+  @param request asynchronous write request
+  @param size payload size in bytes */
+  void add_to_batch(const IORequest &request, size_t size);
+
+  /** Determine whether the doublewrite buffer has been created */
+  bool is_created() const
+  { return UNIV_LIKELY(block1 != page_id_t(0, 0)); }
+
+  /** @return whether a page identifier is part of the doublewrite buffer */
+  bool is_inside(const page_id_t id) const
+  {
+    if (!is_created())
+      return false;
+    ut_ad(block1 < block2);
+    if (id < block1)
+      return false;
+    const uint32_t size= block_size();
+    return id < block1 + size || (id >= block2 && id < block2 + size);
+  }
+
+  /** Wait for flush_buffered_writes() to be fully completed */
+  void wait_flush_buffered_writes()
+  {
+    mysql_mutex_lock(&mutex);
+    while (batch_running)
+      my_cond_wait(&cond, &mutex.m_mutex);
+    mysql_mutex_unlock(&mutex);
+  }
+};
+
+/** The doublewrite buffer */
+extern buf_dblwr_t buf_dblwr;
diff --git a/storage/innobase/include/buf0dump.h b/storage/innobase/include/buf0dump.h
new file mode 100644
index 00000000..48586900
--- /dev/null
+++ b/storage/innobase/include/buf0dump.h
@@ -0,0 +1,44 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dump.h
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0dump_h
+#define buf0dump_h
+
+/** Start the buffer pool dump/load task and instruct it to start a dump. */
+void buf_dump_start();
+/** Start the buffer pool dump/load task and instruct it to start a load. */
+void buf_load_start();
+
+/** Abort a currently running buffer pool load.
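+(Editorial illustration, not part of the upstream patch: these entry points
+are driven by the server layer, roughly
+    buf_dump_start();  // e.g. SET GLOBAL innodb_buffer_pool_dump_now=ON
+    buf_load_start();  // e.g. SET GLOBAL innodb_buffer_pool_load_now=ON
+    buf_load_abort();  // e.g. SET GLOBAL innodb_buffer_pool_load_abort=ON
+the system-variable mapping shown is an assumption for illustration.)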
*/
+void buf_load_abort();
+
+/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set. */
+void buf_load_at_startup();
+
+/** Wait for currently running loads/dumps to finish. */
+void buf_load_dump_end();
+
+#endif /* buf0dump_h */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
new file mode 100644
index 00000000..0cce514b
--- /dev/null
+++ b/storage/innobase/include/buf0flu.h
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.h
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "ut0byte.h"
+#include "log0log.h"
+#include "buf0buf.h"
+
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_pool.stat.n_pages_written. */
+extern ulint buf_lru_flush_page_count;
+/** Number of pages freed without flushing. Protected by buf_pool.mutex. */
+extern ulint buf_lru_freed_page_count;
+
+/** Flag indicating if the page_cleaner is in active state. */
+extern Atomic_relaxed<bool> buf_page_cleaner_is_active;
+
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@param id tablespace identifier */
+void buf_flush_remove_pages(uint32_t id);
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage. */
+ATTRIBUTE_COLD
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+	buf_page_t*	bpage,	/*!< in/out: control block being moved */
+	buf_page_t*	dpage);	/*!< in/out: destination block */
+
+/** Complete write of a file page from buf_pool.
+@param request write request
+@param error whether the write may have failed */
+void buf_page_write_complete(const IORequest &request, bool error);
+
+/** Assign the full crc32 checksum for non-compressed page.
+@param[in,out]	page	page to be updated */
+void buf_flush_assign_full_crc32_checksum(byte* page);
+
+/** Initialize a page for writing to the tablespace.
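+(Editorial illustration, not part of the upstream patch: a writer would
+typically prepare the frame just before submitting the I/O, e.g.
+    buf_flush_init_for_writing(block, block->page.frame, nullptr,
+                               space->full_crc32());
+where space->full_crc32() is assumed to report whether the tablespace
+uses the full_crc32 checksum format.)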
+@param[in]	block		buffer block; NULL if bypassing the buffer pool
+@param[in,out]	page		page frame
+@param[in,out]	page_zip_	compressed page, or NULL if uncompressed
+@param[in]	use_full_checksum	whether tablespace uses full checksum */
+void
+buf_flush_init_for_writing(
+	const buf_block_t*	block,
+	byte*			page,
+	void*			page_zip_,
+	bool			use_full_checksum);
+
+/** Try to flush dirty pages that belong to a given tablespace.
+@param space tablespace
+@param n_flushed number of pages written
+@return whether the flush for some pages might not have been initiated */
+bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Write out dirty blocks from buf_pool.LRU,
+and move clean blocks to buf_pool.free.
+The caller must invoke buf_dblwr.flush_buffered_writes()
+after releasing buf_pool.mutex.
+@param max_n wished maximum number of blocks flushed
+@param evict whether to evict pages after flushing
+@return evict ? number of processed pages : number of pages written
+@retval 0 if a buf_pool.LRU batch is already running */
+ulint buf_flush_LRU(ulint max_n, bool evict);
+
+/** Wait until an LRU flush batch ends. */
+void buf_flush_wait_LRU_batch_end();
+/** Wait until all persistent pages are flushed up to a limit.
+@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
+ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn);
+/** Initiate more eager page flushing if the log checkpoint age is too old.
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious);
+
+/** Initialize page_cleaner. */
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init();
+
+/** Flush the buffer pool on shutdown. */
+ATTRIBUTE_COLD void buf_flush_buffer_pool();
+
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+void buf_flush_validate();
+#endif /* UNIV_DEBUG */
+
+/** Synchronously flush dirty blocks during recv_sys_t::apply().
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync_batch(lsn_t lsn);
+
+/** Synchronously flush dirty blocks.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync();
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
new file mode 100644
index 00000000..aec08e77
--- /dev/null
+++ b/storage/innobase/include/buf0lru.h
@@ -0,0 +1,193 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.h
+The database buffer pool LRU replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "buf0types.h"
+#include "hash0hash.h"
+
+// Forward declaration
+struct trx_t;
+struct fil_space_t;
+
+/** Flush this many pages in buf_LRU_get_free_block() */
+extern size_t innodb_lru_flush_size;
+
+/*#######################################################################
+These are low-level functions
+#########################################################################*/
+
+/** Minimum LRU list length for which the LRU_old pointer is defined */
+#define BUF_LRU_OLD_MIN_LEN	512	/* 8 megabytes of 16k pages */
+
+/** Try to free a block. If bpage is a descriptor of a compressed-only
+ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
+The caller must hold buf_pool.mutex.
+@param bpage block to be freed
+@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page
+@retval true if freed and buf_pool.mutex may have been temporarily released
+@retval false if the page was not freed */
+bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
+  MY_ATTRIBUTE((nonnull));
+
+/** Try to free a replaceable block.
+@param limit maximum number of blocks to scan
+@return true if found and freed */
+bool buf_LRU_scan_and_free_block(ulint limit= ULINT_UNDEFINED);
+
+/** @return a buffer block from the buf_pool.free list
+@retval NULL if the free list is empty */
+buf_block_t* buf_LRU_get_free_only();
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in LRU scan
+we put it to free list to be used.
+* iteration 0:
+  * get a block from the buf_pool.free list, success:done
+  * if buf_pool.try_LRU_scan is set
+    * scan LRU up to 100 pages to free a clean block
+    * success:retry the free list
+  * flush up to innodb_lru_flush_size LRU blocks to data files
+    (until UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth)
+    * on buf_page_write_complete() the blocks will be put on the
+      buf_pool.free list
+    * success: retry the free list
+* subsequent iterations: same as iteration 0 except:
+  * scan whole LRU list
+  * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t* buf_LRU_get_free_block(bool have_mutex)
+  MY_ATTRIBUTE((malloc,warn_unused_result));
+
+/** @return whether the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list */
+bool buf_LRU_evict_from_unzip_LRU();
+
+/** Puts a block back to the free list.
+@param[in]	block	block; not containing a file page */
+void
+buf_LRU_block_free_non_file_page(buf_block_t* block);
+/******************************************************************//**
+Adds a block to the LRU list.
Please make sure that the page_size is +already set when invoking the function, so that we can get correct +page_size from the buffer page when adding a block into LRU */ +void +buf_LRU_add_block( +/*==============*/ + buf_page_t* bpage, /*!< in: control block */ + bool old); /*!< in: true if should be put to the old + blocks in the LRU list, else put to the + start; if the LRU list is very short, added to + the start regardless of this parameter */ +/******************************************************************//** +Adds a block to the LRU list of decompressed zip pages. */ +void +buf_unzip_LRU_add_block( +/*====================*/ + buf_block_t* block, /*!< in: control block */ + ibool old); /*!< in: TRUE if should be put to the end + of the list, else put to the start */ + +/** Update buf_pool.LRU_old_ratio. +@param[in] old_pct Reserve this percentage of + the buffer pool for "old" blocks +@param[in] adjust true=adjust the LRU list; + false=just assign buf_pool.LRU_old_ratio + during the initialization of InnoDB +@return updated old_pct */ +uint buf_LRU_old_ratio_update(uint old_pct, bool adjust); +/********************************************************************//** +Update the historical stats that we are collecting for LRU eviction +policy at the end of each interval. */ +void +buf_LRU_stat_update(); + +#ifdef UNIV_DEBUG +/** Validate the LRU list. */ +void buf_LRU_validate(); +#endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG +/** Dump the LRU list to stderr. */ +void buf_LRU_print(); +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ + +/** @name Heuristics for detecting index scan @{ */ +/** The denominator of buf_pool.LRU_old_ratio. */ +#define BUF_LRU_OLD_RATIO_DIV 1024 +/** Maximum value of buf_pool.LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_pool.LRU_old_ratio_update */ +#define BUF_LRU_OLD_RATIO_MAX BUF_LRU_OLD_RATIO_DIV +/** Minimum value of buf_pool.LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_pool.LRU_old_ratio_update +The minimum must exceed +(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */ +#define BUF_LRU_OLD_RATIO_MIN 51 + +#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX +# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX" +#endif +#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV +# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV" +#endif + +/** Move blocks to "new" LRU list only if the first access was at +least this many milliseconds ago. Not protected by any mutex or latch. */ +extern uint buf_LRU_old_threshold_ms; +/* @} */ + +/** @brief Statistics for selecting the LRU list for eviction. + +These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O +and page_zip_decompress() operations. Based on the statistics we decide +if we want to evict from buf_pool.unzip_LRU or buf_pool.LRU. */ +struct buf_LRU_stat_t +{ + ulint io; /**< Counter of buffer pool I/O operations. */ + ulint unzip; /**< Counter of page_zip_decompress operations. */ +}; + +/** Current operation counters. Not protected by any mutex. +Cleared by buf_LRU_stat_update(). */ +extern buf_LRU_stat_t buf_LRU_stat_cur; + +/** Running sum of past values of buf_LRU_stat_cur. +Updated by buf_LRU_stat_update(). Protected by buf_pool.mutex. */ +extern buf_LRU_stat_t buf_LRU_stat_sum; + +/********************************************************************//** +Increments the I/O counter in buf_LRU_stat_cur. 
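+(Editorial sketch, not part of the upstream patch: these counters feed the
+unzip_LRU eviction heuristic, roughly
+    buf_LRU_stat_inc_io();     // after a buffer pool I/O operation
+    buf_LRU_stat_inc_unzip();  // after a page_zip_decompress()
+    buf_LRU_stat_update();     // per interval: fold _cur into _sum
+so that buf_LRU_evict_from_unzip_LRU() can weigh recent decompression
+activity against I/O when choosing a victim list.)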
*/ +#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++ +/********************************************************************//** +Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */ +#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++ diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h new file mode 100644 index 00000000..3dd085dd --- /dev/null +++ b/storage/innobase/include/buf0rea.h @@ -0,0 +1,120 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0rea.h +The database buffer read + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0rea_h +#define buf0rea_h + +#include "buf0buf.h" + +/** High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@param page_id page id +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@retval DB_SUCCESS if the page was read and is not corrupted +@retval DB_SUCCESS_LOCKED_REC if the page was not read +@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted +@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but +after decryption normal page checksum does not match. +@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ +dberr_t buf_read_page(const page_id_t page_id, ulint zip_size); + +/** High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@param[in,out] space tablespace +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 */ +void buf_read_page_background(fil_space_t *space, const page_id_t page_id, + ulint zip_size) + MY_ATTRIBUTE((nonnull)); + +/** Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! 
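+(Editorial illustration, not part of the upstream patch: a page-fetch path
+would issue
+    buf_read_ahead_random(page_id, zip_size, false);
+only at a point where it holds no page latches, for the reason above.)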
NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@param[in]	page_id		page id of a page which the current thread
+wants to access
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	ibuf		whether we are inside ibuf routine
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+ulint
+buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf);
+
+/** Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens if these are not initialized to any
+sensible value? No problem: before applying read-ahead we check that the
+area to read is within the span of the space; if it is not, read-ahead is
+not applied. An uninitialized value may result in a useless read operation,
+but only with very low probability.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@param[in]	page_id		page id; see NOTE 3 above
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	ibuf		whether we are inside ibuf routine
+@return number of page read requests issued */
+ulint
+buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf);
+
+/** Schedule a page for recovery.
+@param space    tablespace
+@param page_id  page identifier
+@param recs     log records
+@param init     page initialization, or nullptr if the page needs to be read */
+void buf_read_recover(fil_space_t *space, const page_id_t page_id,
+                      page_recv_t &recs, recv_init *init);
+
+/** @name Modes used in read-ahead @{ */
+/** read only pages belonging to the insert buffer tree */
+#define BUF_READ_IBUF_PAGES_ONLY	131
+/** read any page */
+#define BUF_READ_ANY_PAGE		132
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
new file mode 100644
index 00000000..6c13f5ee
--- /dev/null
+++ b/storage/innobase/include/buf0types.h
@@ -0,0 +1,235 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0types.h
+The database buffer pool global types for the directory
+
+Created 11/17/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+
+/** Buffer page (uncompressed or compressed) */
+class buf_page_t;
+/** Buffer block for which an uncompressed page exists */
+struct buf_block_t;
+/** Buffer pool statistics struct */
+struct buf_pool_stat_t;
+/** Buffer pool buddy statistics struct */
+struct buf_buddy_stat_t;
+
+/** A buffer frame. @see page_t */
+typedef	byte	buf_frame_t;
+
+/** Alternatives for srv_checksum_algorithm, which can be changed by
+setting innodb_checksum_algorithm */
+enum srv_checksum_algorithm_t {
+	/** Write crc32; allow full_crc32,crc32,innodb,none when reading */
+	SRV_CHECKSUM_ALGORITHM_CRC32,
+	/** Write crc32; allow full_crc32,crc32 when reading */
+	SRV_CHECKSUM_ALGORITHM_STRICT_CRC32,
+	/** For new files, always compute CRC-32C for the whole page.
+	For old files, allow crc32, innodb or none when reading. */
+	SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
+	/** For new files, always compute CRC-32C for the whole page.
+	For old files, allow crc32 when reading. */
+	SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+};
+
+inline bool is_checksum_strict(srv_checksum_algorithm_t algo)
+{
+  return algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32;
+}
+
+inline bool is_checksum_strict(ulint algo)
+{
+  return algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32;
+}
+
+/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
+/* @{ */
+/** Zip shift value for the smallest page size */
+#define BUF_BUDDY_LOW_SHIFT	UNIV_ZIP_SIZE_SHIFT_MIN
+
+/** Smallest buddy page size */
+#define BUF_BUDDY_LOW		(1U << BUF_BUDDY_LOW_SHIFT)
+
+/** Actual number of buddy sizes based on current page size */
+#define BUF_BUDDY_SIZES		(srv_page_size_shift - BUF_BUDDY_LOW_SHIFT)
+
+/** Maximum number of buddy sizes based on the max page size */
+#define BUF_BUDDY_SIZES_MAX	(UNIV_PAGE_SIZE_SHIFT_MAX	\
+				- BUF_BUDDY_LOW_SHIFT)
+
+/** twice the maximum block size of the buddy system;
+the underlying memory is aligned by this amount:
+this must be equal to srv_page_size */
+#define BUF_BUDDY_HIGH	(BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
+/* @} */
+
+/** Page identifier. */
+class page_id_t
+{
+public:
+  /** Constructor from (space, page_no).
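+  For example (editorial note): page_id_t(5, 42).raw() is
+  (uint64_t{5} << 32 | 42), so identifiers compare first by tablespace id
+  and then by page number, matching the operators below.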
+  @param space tablespace id
+  @param page_no page number */
+  constexpr page_id_t(uint32_t space, uint32_t page_no) :
+    m_id(uint64_t{space} << 32 | page_no) {}
+
+  constexpr page_id_t(uint64_t id) : m_id(id) {}
+  constexpr bool operator==(const page_id_t& rhs) const
+  { return m_id == rhs.m_id; }
+  constexpr bool operator!=(const page_id_t& rhs) const
+  { return m_id != rhs.m_id; }
+  constexpr bool operator<(const page_id_t& rhs) const
+  { return m_id < rhs.m_id; }
+  constexpr bool operator>(const page_id_t& rhs) const
+  { return m_id > rhs.m_id; }
+  constexpr bool operator<=(const page_id_t& rhs) const
+  { return m_id <= rhs.m_id; }
+  constexpr bool operator>=(const page_id_t& rhs) const
+  { return m_id >= rhs.m_id; }
+  page_id_t &operator--() { ut_ad(page_no()); m_id--; return *this; }
+  page_id_t &operator++()
+  {
+    ut_ad(page_no() < 0xFFFFFFFFU);
+    m_id++;
+    return *this;
+  }
+  page_id_t operator-(uint32_t i) const
+  {
+    ut_ad(page_no() >= i);
+    return page_id_t(m_id - i);
+  }
+  page_id_t operator+(uint32_t i) const
+  {
+    ut_ad(page_no() < ~i);
+    return page_id_t(m_id + i);
+  }
+
+  /** Retrieve the tablespace id.
+  @return tablespace id */
+  constexpr uint32_t space() const { return static_cast<uint32_t>(m_id >> 32); }
+
+  /** Retrieve the page number.
+  @return page number */
+  constexpr uint32_t page_no() const { return static_cast<uint32_t>(m_id); }
+
+  /** Retrieve the fold value.
+  @return fold value */
+  constexpr ulint fold() const
+  { return (ulint{space()} << 20) + space() + page_no(); }
+
+  /** Reset the page number only.
+  @param[in]	page_no	page number */
+  void set_page_no(uint32_t page_no)
+  {
+    m_id= (m_id & ~uint64_t{0} << 32) | page_no;
+  }
+
+  constexpr ulonglong raw() const { return m_id; }
+
+  /** Flag the page identifier as corrupted. */
+  void set_corrupted() { m_id= ~0ULL; }
+
+  /** @return whether the page identifier belongs to a corrupted page */
+  constexpr bool is_corrupted() const { return m_id == ~0ULL; }
+
+private:
+  /** The page identifier */
+  uint64_t m_id;
+};
+
+/** A 64KiB buffer of NUL bytes, for use in assertions and checks,
+and dummy default values of instantly dropped columns.
+Initially, BLOB field references are set to NUL bytes, in
+dtuple_convert_big_rec().
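+(Editorial illustration, not part of the upstream patch: a typical check
+compares a not-yet-written BLOB pointer against this buffer, e.g.
+    !memcmp(ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)
+where ref points at the 20-byte external field reference.)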
*/ +extern const byte *field_ref_zero; + +#ifndef UNIV_INNOCHECKSUM + +/** Latch types */ +enum rw_lock_type_t +{ + RW_S_LATCH= 1 << 0, + RW_X_LATCH= 1 << 1, + RW_SX_LATCH= 1 << 2, + RW_NO_LATCH= 1 << 3 +}; + +#include "sux_lock.h" + +#ifdef SUX_LOCK_GENERIC +class page_hash_latch : private rw_lock +{ + /** Wait for a shared lock */ + void read_lock_wait(); + /** Wait for an exclusive lock */ + void write_lock_wait(); +public: + /** Acquire a shared lock */ + inline void lock_shared(); + /** Acquire an exclusive lock */ + inline void lock(); + + /** @return whether an exclusive lock is being held by any thread */ + bool is_write_locked() const { return rw_lock::is_write_locked(); } + + /** @return whether any lock is being held by any thread */ + bool is_locked() const { return rw_lock::is_locked(); } + /** @return whether any lock is being held or waited for by any thread */ + bool is_locked_or_waiting() const { return rw_lock::is_locked_or_waiting(); } + + /** Release a shared lock */ + void unlock_shared() { read_unlock(); } + /** Release an exclusive lock */ + void unlock() { write_unlock(); } +}; +#elif defined _WIN32 || SIZEOF_SIZE_T >= 8 +class page_hash_latch +{ + srw_spin_lock_low lk; +public: + void lock_shared() { lk.rd_lock(); } + void unlock_shared() { lk.rd_unlock(); } + void lock() { lk.wr_lock(); } + void unlock() { lk.wr_unlock(); } + bool is_write_locked() const { return lk.is_write_locked(); } + bool is_locked() const { return lk.is_locked(); } + bool is_locked_or_waiting() const { return lk.is_locked_or_waiting(); } +}; +#else +class page_hash_latch +{ + srw_spin_mutex lk; +public: + void lock_shared() { lock(); } + void unlock_shared() { unlock(); } + void lock() { lk.wr_lock(); } + void unlock() { lk.wr_unlock(); } + bool is_locked() const { return lk.is_locked(); } + bool is_write_locked() const { return is_locked(); } + bool is_locked_or_waiting() const { return is_locked(); } +}; +#endif + +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h new file mode 100644 index 00000000..a5356e0d --- /dev/null +++ b/storage/innobase/include/data0data.h @@ -0,0 +1,704 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.h
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0data_h
+#define data0data_h
+
+#include "data0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+#include "btr0types.h"
+#include <vector>
+
+#include <ostream>
+
+/** Storage for overflow data in a big record, that is, a clustered
+index record which needs external storage of data fields */
+struct big_rec_t;
+struct upd_t;
+
+/** Dummy variable to catch access to uninitialized fields. In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+ut_d(extern byte data_error);
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+	dfield_t*	field,	/*!< in: SQL data field */
+	const dtype_t*	type);	/*!< in: pointer to data type struct */
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+	dfield_t*	field,	/*!< in: field */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+	MY_ATTRIBUTE((nonnull));
+
+/** Gets spatial status for "external storage"
+@param[in,out]	field		field */
+UNIV_INLINE
+spatial_status_t
+dfield_get_spatial_status(
+	const dfield_t*	field);
+
+/** Sets spatial status for "external storage"
+@param[in,out]	field		field
+@param[in]	spatial_status	spatial status */
+UNIV_INLINE
+void
+dfield_set_spatial_status(
+	dfield_t*	field,
+	spatial_status_t spatial_status);
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+	dfield_t*	field,	/*!< in: field */
+	const void*	data,	/*!< in: data */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+	MY_ATTRIBUTE((nonnull(1)));
+/*********************************************************************//**
+Writes an MBR (minimum bounding rectangle) to a field. */
+UNIV_INLINE
+void
+dfield_write_mbr(
+/*=============*/
+	dfield_t*	field,	/*!< in: field */
+	const double*	mbr)	/*!< in: data */
+	MY_ATTRIBUTE((nonnull(1)));
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+	dfield_t*	field)	/*!< in/out: field */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+	byte*	data,	/*!< in: pointer to a buffer of size len */
+	ulint	len)	/*!< in: SQL null size in bytes */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Copies the data and len fields.
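+This is a shallow copy: the pointer, length and flags are copied and the
+bytes stay shared. (Editorial note, not part of the upstream patch: combine
+with dfield_dup() when the copy must own its data, e.g.
+    dfield_copy_data(dst, src);
+    dfield_dup(dst, heap);  // dst now points at a heap-allocated copy
+)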
*/
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+	dfield_t*	field1,	/*!< out: field to copy to */
+	const dfield_t*	field2);	/*!< in: field to copy from */
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+	dfield_t*	field1,	/*!< out: field to copy to */
+	const dfield_t*	field2)	/*!< in: field to copy from */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+	dfield_t*	field,	/*!< in/out: data field */
+	mem_heap_t*	heap)	/*!< in: memory heap where allocated */
+	MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Tests if two data fields are equal.
+If len==0, tests the data length and content for equality.
+If len>0, tests the first len bytes of the content for equality.
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+	const dfield_t*	field1,	/*!< in: field */
+	const dfield_t*	field2,	/*!< in: field */
+	ulint		len)	/*!< in: maximum prefix to compare,
+				or 0 to compare the whole field length */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+	const dfield_t*	field,	/*!< in: field */
+	ulint		len,	/*!< in: data length or UNIV_SQL_NULL */
+	const byte*	data)	/*!< in: data */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		info_bits)	/*!< in: info bits */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		n_fields_cmp)	/*!< in: number of fields used in
+					comparisons in rem0cmp.* */
+	MY_ATTRIBUTE((nonnull));
+
+/* Estimate the number of bytes that are going to be allocated when
+creating a new dtuple_t object */
+#define DTUPLE_EST_ALLOC(n_fields)	\
+	(sizeof(dtuple_t) + (n_fields) * sizeof(dfield_t))
+
+/** Creates a data tuple from an already allocated chunk of memory.
+The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields).
+The default value for number of fields used in record comparisons
+for this tuple is n_fields.
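+(Editorial illustration, not part of the upstream patch: this permits
+building a tuple in stack or otherwise preallocated memory, e.g.
+    byte buf[DTUPLE_EST_ALLOC(2)];
+    dtuple_t* tuple = dtuple_create_from_mem(buf, sizeof buf, 2, 0);
+)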
+@param[in,out]	buf		buffer to use
+@param[in]	buf_size	buffer size
+@param[in]	n_fields	number of fields
+@param[in]	n_v_fields	number of virtual column fields
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+	void*	buf,
+	ulint	buf_size,
+	ulint	n_fields,
+	ulint	n_v_fields)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+	mem_heap_t*	heap,	/*!< in: memory heap where the tuple
+				is created, DTUPLE_EST_ALLOC(n_fields)
+				bytes will be allocated from this heap */
+	ulint		n_fields)/*!< in: number of fields */
+	MY_ATTRIBUTE((nonnull, malloc));
+
+/** Initialize the virtual field data in a dtuple_t
+@param[in,out]	vrow	dtuple contains the virtual fields */
+UNIV_INLINE void dtuple_init_v_fld(dtuple_t* vrow);
+
+/** Duplicate the virtual field data in a dtuple_t
+@param[in,out]	vrow	dtuple contains the virtual fields
+@param[in]	heap	heap memory to use */
+UNIV_INLINE void dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap);
+
+/** Creates a data tuple with possible virtual columns to a memory heap.
+@param[in]	heap		memory heap where the tuple is created
+@param[in]	n_fields	number of fields
+@param[in]	n_v_fields	number of virtual column fields
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_with_vcol(
+	mem_heap_t*	heap,
+	ulint		n_fields,
+	ulint		n_v_fields);
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you later want to set it smaller, you can use this. */
+void
+dtuple_set_n_fields(
+/*================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		n_fields)	/*!< in: number of fields */
+	MY_ATTRIBUTE((nonnull));
+/** Copies a data tuple's virtual fields to another. This is a shallow copy.
+@param[in,out]	d_tuple		destination tuple
+@param[in]	s_tuple		source tuple */
+UNIV_INLINE
+void
+dtuple_copy_v_fields(
+	dtuple_t*	d_tuple,
+	const dtuple_t*	s_tuple);
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+	const dtuple_t*	tuple,	/*!< in: tuple to copy from */
+	mem_heap_t*	heap)	/*!< in: memory heap
+				where the tuple is created */
+	MY_ATTRIBUTE((nonnull, malloc));
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted.
+@return sum of data lens */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+	const dtuple_t*	tuple,	/*!< in: typed data tuple */
+	ulint		comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull));
+/** Fold a prefix given as the number of fields of a tuple.
+@param[in]	tuple		index record
+@param[in]	n_fields	number of complete fields to fold
+@param[in]	n_bytes		number of bytes to fold in the last field
+@param[in]	tree_id		index tree ID
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+	const dtuple_t*	tuple,
+	ulint		n_fields,
+	ulint		n_bytes,
+	index_id_t	tree_id)
+	MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************************//**
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+	dtuple_t*	tuple,	/*!< in: data tuple */
+	ulint		n)	/*!< in: number of fields to set */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+	const dtuple_t*	tuple)	/*!< in: dtuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dfield_check_typed(
+/*===============*/
+	const dfield_t*	field)	/*!< in: data field */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dtuple_check_typed(
+/*===============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e.,
+all fields must have been set.
+@return TRUE if ok */
+ibool
+dtuple_validate(
+/*============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+void
+dfield_print(
+/*=========*/
+	const dfield_t*	dfield)	/*!< in: dfield */
+	MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. Also the hex string
+is printed if a string contains non-printable characters. */
+void
+dfield_print_also_hex(
+/*==================*/
+	const dfield_t*	dfield)	/*!< in: dfield */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+void
+dtuple_print(
+/*=========*/
+	FILE*		f,	/*!< in: output stream */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull));
+
+/** Print the contents of a tuple.
+@param[out]	o	output stream
+@param[in]	field	array of data fields
+@param[in]	n	number of data fields */
+void
+dfield_print(
+	std::ostream&	o,
+	const dfield_t*	field,
+	ulint		n);
+/** Print the contents of a tuple.
+@param[out]	o	output stream
+@param[in]	tuple	data tuple */
+void
+dtuple_print(
+	std::ostream&	o,
+	const dtuple_t*	tuple);
+
+/** Print the contents of a tuple.
+@param[out] o output stream +@param[in] tuple data tuple */ +inline +std::ostream& +operator<<(std::ostream& o, const dtuple_t& tuple) +{ + dtuple_print(o, &tuple); + return(o); +} + +/**************************************************************//** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. +@return own: created big record vector, NULL if we are not able to +shorten the entry enough, i.e., if there are too many fixed-length or +short fields in entry or the index is clustered */ +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + dict_index_t* index, /*!< in: index */ + upd_t* upd, /*!< in/out: update vector */ + dtuple_t* entry, /*!< in/out: index entry */ + ulint* n_ext) /*!< in/out: number of + externally stored columns */ + MY_ATTRIBUTE((malloc, warn_unused_result)); +/**************************************************************//** +Puts back to entry the data stored in vector. Note that to ensure the +fields in entry can accommodate the data, vector must have been created +from entry with dtuple_convert_big_rec. */ +void +dtuple_convert_back_big_rec( +/*========================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: entry whose data was put to vector */ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ + MY_ATTRIBUTE((nonnull)); +/**************************************************************//** +Frees the memory in a big rec vector. */ +UNIV_INLINE +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ + MY_ATTRIBUTE((nonnull)); + +/*######################################################################*/ + +/** Structure for an SQL data field */ +struct dfield_t{ + void* data; /*!< pointer to data */ + unsigned ext:1; /*!< TRUE=externally stored, FALSE=local */ + unsigned spatial_status:2; + /*!< spatial status of externally stored field + in undo log for purge */ + unsigned len; /*!< data length; UNIV_SQL_NULL if SQL null */ + dtype_t type; /*!< type of data */ + + /** Create a deep copy of this object. 
+ @param[in,out] heap memory heap in which the clone will be created + @return the cloned object */ + dfield_t* clone(mem_heap_t* heap) const; + + /** @return system field indicates history row */ + bool vers_history_row() const + { + ut_ad(type.vers_sys_end()); + if (type.mtype == DATA_FIXBINARY) { + ut_ad(len == sizeof timestamp_max_bytes); + return 0 != memcmp(data, timestamp_max_bytes, len); + } else { + ut_ad(type.mtype == DATA_INT); + ut_ad(len == sizeof trx_id_max_bytes); + return 0 != memcmp(data, trx_id_max_bytes, len); + } + ut_ad(0); + return false; + } +}; + +/** Structure for an SQL data tuple of fields (logical record) */ +struct dtuple_t { + ulint info_bits; /*!< info bits of an index record: + the default is 0; this field is used + if an index record is built from + a data tuple */ + ulint n_fields; /*!< number of fields in dtuple */ + ulint n_fields_cmp; /*!< number of fields which should + be used in comparison services + of rem0cmp.*; the index search + is performed by comparing only these + fields, others are ignored; the + default value in dtuple creation is + the same value as n_fields */ + dfield_t* fields; /*!< fields */ + ulint n_v_fields; /*!< number of virtual fields */ + dfield_t* v_fields; /*!< fields on virtual column */ +#ifdef UNIV_DEBUG + ulint magic_n; /*!< magic number, used in + debug assertions */ +/** Value of dtuple_t::magic_n */ +# define DATA_TUPLE_MAGIC_N 65478679 +#endif /* UNIV_DEBUG */ + + /** Trim the tail of an index tuple before insert or update. + After instant ADD COLUMN, if the last fields of a clustered index tuple + match the default values that were explicitly specified or implied + during ADD COLUMN, there will be no need to store them. + NOTE: A page latch in the index must be held, so that the index + may not lose 'instantness' before the trimmed tuple has been + inserted or updated. + @param[in] index index possibly with instantly added columns */ + void trim(const dict_index_t& index); + + bool vers_history_row() const + { + for (ulint i = 0; i < n_fields; i++) { + const dfield_t* field = &fields[i]; + if (field->type.vers_sys_end()) { + return field->vers_history_row(); + } + } + return false; + } + + /** + @param info_bits the info_bits of a data tuple + @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + static bool is_alter_metadata(ulint info_bits) + { + return UNIV_UNLIKELY(info_bits == REC_INFO_METADATA_ALTER); + } + + /** + @param info_bits the info_bits of a data tuple + @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + static bool is_metadata(ulint info_bits) + { + return UNIV_UNLIKELY((info_bits & ~REC_INFO_DELETED_FLAG) + == REC_INFO_METADATA_ADD); + } + + /** @return whether this is a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + bool is_alter_metadata() const { return is_alter_metadata(info_bits); } + + /** @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + bool is_metadata() const { return is_metadata(info_bits); } + + /** Copy type information from index fields. 
+	@param index	index field to be copied */
+	inline void copy_field_types(const dict_index_t &index);
+};
+
+inline ulint dtuple_get_n_fields(const dtuple_t* tuple)
+{ return tuple->n_fields; }
+inline dtype_t* dfield_get_type(dfield_t* field) { return &field->type; }
+inline const dtype_t* dfield_get_type(const dfield_t* field)
+{ return &field->type; }
+inline void* dfield_get_data(dfield_t* field)
+{
+  ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+  return field->data;
+}
+inline const void* dfield_get_data(const dfield_t* field)
+{
+  ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+  return field->data;
+}
+inline ulint dfield_get_len(const dfield_t* field) {
+  ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+  ut_ad(field->len != UNIV_SQL_DEFAULT);
+  return field->len;
+}
+inline bool dfield_is_null(const dfield_t* field)
+{ return field->len == UNIV_SQL_NULL; }
+/** @return whether a column is to be stored off-page */
+inline bool dfield_is_ext(const dfield_t* field)
+{
+  ut_ad(!field->ext || field->len >= BTR_EXTERN_FIELD_REF_SIZE);
+  return static_cast<bool>(field->ext);
+}
+/** Set the "external storage" flag */
+inline void dfield_set_ext(dfield_t* field) { field->ext = 1; }
+
+/** Gets number of virtual fields in a data tuple.
+@param[in]	tuple	dtuple to check
+@return number of fields */
+inline ulint
+dtuple_get_n_v_fields(const dtuple_t* tuple) { return tuple->n_v_fields; }
+
+inline const dfield_t* dtuple_get_nth_field(const dtuple_t* tuple, ulint n)
+{
+  ut_ad(n < tuple->n_fields);
+  return &tuple->fields[n];
+}
+inline dfield_t* dtuple_get_nth_field(dtuple_t* tuple, ulint n)
+{
+  ut_ad(n < tuple->n_fields);
+  return &tuple->fields[n];
+}
+
+/** Get a virtual column in a table row or an extended clustered index record.
+@param[in]	tuple	tuple
+@param[in]	n	the nth virtual field to get
+@return nth virtual field */
+inline const dfield_t* dtuple_get_nth_v_field(const dtuple_t* tuple, ulint n)
+{
+  ut_ad(n < tuple->n_v_fields);
+  return &tuple->v_fields[n];
+}
+/** Get a virtual column in a table row or an extended clustered index record.
+@param[in]	tuple	tuple
+@param[in]	n	the nth virtual field to get
+@return nth virtual field */
+inline dfield_t* dtuple_get_nth_v_field(dtuple_t* tuple, ulint n)
+{
+  ut_ad(n < tuple->n_v_fields);
+  return &tuple->v_fields[n];
+}
+
+/** A slot for a field in a big rec vector */
+struct big_rec_field_t {
+
+	/** Constructor.
+	@param[in]	field_no_	the field number
+	@param[in]	len_		the data length
+	@param[in]	data_		the data */
+	big_rec_field_t(ulint field_no_, ulint len_, const void* data_)
+		: field_no(field_no_),
+		  len(len_),
+		  data(data_)
+	{}
+
+	ulint		field_no;	/*!< field number in record */
+	ulint		len;		/*!< stored data length, in bytes */
+	const void*	data;		/*!< stored data */
+};
+
+/** Storage format for overflow data in a big record, that is, a
+clustered index record which needs external storage of data fields */
+struct big_rec_t {
+	mem_heap_t*	heap;		/*!< memory heap from which
+					allocated */
+	const ulint	capacity;	/*!< fields array size */
+	ulint		n_fields;	/*!< number of stored fields */
+	big_rec_field_t*fields;		/*!< stored fields */
+
+	/** Constructor.
+	@param[in]	max	the capacity of the array of fields.
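+	(Editorial illustration, not part of the upstream patch: callers
+	normally go through big_rec_t::alloc(), which sizes both the object
+	and its fields array from one heap, e.g.
+	    big_rec_t* vec = big_rec_t::alloc(heap, 3);
+	    vec->append(big_rec_field_t(0, len, data));
+	with heap, len and data supplied by the caller.)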
*/
+	explicit big_rec_t(const ulint max)
+		: heap(0),
+		  capacity(max),
+		  n_fields(0),
+		  fields(0)
+	{}
+
+	/** Append one big_rec_field_t object to the end of array of fields */
+	void append(const big_rec_field_t& field)
+	{
+		ut_ad(n_fields < capacity);
+		fields[n_fields] = field;
+		n_fields++;
+	}
+
+	/** Allocate a big_rec_t object in the given memory heap, and for
+	storing n_fld number of fields.
+	@param[in]	heap	memory heap in which this object is allocated
+	@param[in]	n_fld	maximum number of fields that can be stored in
+			this object
+	@return the allocated object */
+	static big_rec_t* alloc(
+		mem_heap_t*	heap,
+		ulint		n_fld);
+};
+
+#include "data0data.inl"
+
+#endif
diff --git a/storage/innobase/include/data0data.inl b/storage/innobase/include/data0data.inl
new file mode 100644
index 00000000..2d1bf5a2
--- /dev/null
+++ b/storage/innobase/include/data0data.inl
@@ -0,0 +1,633 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.inl
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0rnd.h"
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+	dfield_t*	field,	/*!< in: SQL data field */
+	const dtype_t*	type)	/*!< in: pointer to data type struct */
+{
+	ut_ad(field != NULL);
+	ut_ad(type != NULL);
+
+	field->type = *type;
+}
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+	dfield_t*	field,	/*!< in: field */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+{
+	ut_ad(len != UNIV_SQL_DEFAULT);
+	field->ext = 0;
+	field->len = static_cast<unsigned>(len);
+}
+
+/** Gets spatial status for "external storage"
+@param[in,out]	field		field */
+UNIV_INLINE
+spatial_status_t
+dfield_get_spatial_status(
+	const dfield_t*	field)
+{
+	ut_ad(dfield_is_ext(field));
+
+	return(static_cast<spatial_status_t>(field->spatial_status));
+}
+
+/** Sets spatial status for "external storage"
+@param[in,out]	field		field
+@param[in]	spatial_status	spatial status */
+UNIV_INLINE
+void
+dfield_set_spatial_status(
+	dfield_t*	field,
+	spatial_status_t	spatial_status)
+{
+	field->spatial_status = spatial_status & 3;
+	ut_ad(dfield_get_spatial_status(field) == spatial_status);
+}
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field.
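+(Editorial illustration, not part of the upstream patch:
+    dfield_set_data(field, "abc", 3);             // assign a value
+    dfield_set_data(field, NULL, UNIV_SQL_NULL);  // same as dfield_set_null()
+)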
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ field->data = (void*) data;
+ field->ext = 0;
+ field->len = static_cast<unsigned>(len);
+}
+
+/*********************************************************************//**
+Writes a minimum bounding rectangle (MBR) to a data field. */
+UNIV_INLINE
+void
+dfield_write_mbr(
+/*=============*/
+ dfield_t* field, /*!< in: field */
+ const double* mbr) /*!< in: data */
+{
+ MEM_CHECK_DEFINED(mbr, sizeof *mbr);
+ field->ext = 0;
+
+ for (unsigned i = 0; i < SPDIMS * 2; i++) {
+ mach_double_write(static_cast<byte*>(field->data)
+ + i * sizeof(double), mbr[i]);
+ }
+
+ field->len = DATA_MBR_LEN;
+}
+
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field) /*!< in/out: field */
+{
+ dfield_set_data(field, NULL, UNIV_SQL_NULL);
+}
+
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ ut_ad(field1 != NULL);
+ ut_ad(field2 != NULL);
+
+ field1->data = field2->data;
+ field1->len = field2->len;
+ field1->ext = field2->ext;
+ field1->spatial_status = field2->spatial_status;
+}
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ *field1 = *field2;
+}
+
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap) /*!< in: memory heap where allocated */
+{
+ if (!dfield_is_null(field)) {
+ MEM_CHECK_DEFINED(field->data, field->len);
+ field->data = mem_heap_dup(heap, field->data, field->len);
+ }
+}
+
+/*********************************************************************//**
+Tests if two data fields are equal.
+If len==0, tests the data length and content for equality.
+If len>0, tests the first len bytes of the content for equality.
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2, /*!< in: field */
+ ulint len) /*!< in: maximum prefix to compare,
+ or 0 to compare the whole field length */
+{
+ ulint len2 = len;
+
+ if (field1->len == UNIV_SQL_NULL || len == 0 || field1->len < len) {
+ len = field1->len;
+ }
+
+ if (field2->len == UNIV_SQL_NULL || len2 == 0 || field2->len < len2) {
+ len2 = field2->len;
+ }
+
+ return(len == len2
+ && (len == UNIV_SQL_NULL
+ || !memcmp(field1->data, field2->data, len)));
+}
+
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */ +UNIV_INLINE +ibool +dfield_data_is_binary_equal( +/*========================*/ + const dfield_t* field, /*!< in: field */ + ulint len, /*!< in: data length or UNIV_SQL_NULL */ + const byte* data) /*!< in: data */ +{ + ut_ad(len != UNIV_SQL_DEFAULT); + return(len == dfield_get_len(field) + && (!len || len == UNIV_SQL_NULL + || !memcmp(dfield_get_data(field), data, len))); +} + +/*********************************************************************//** +Gets info bits in a data tuple. +@return info bits */ +UNIV_INLINE +ulint +dtuple_get_info_bits( +/*=================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + return(tuple->info_bits); +} + +/*********************************************************************//** +Sets info bits in a data tuple. */ +UNIV_INLINE +void +dtuple_set_info_bits( +/*=================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint info_bits) /*!< in: info bits */ +{ + tuple->info_bits = info_bits; +} + +/*********************************************************************//** +Gets number of fields used in record comparisons. +@return number of fields used in comparisons in rem0cmp.* */ +UNIV_INLINE +ulint +dtuple_get_n_fields_cmp( +/*====================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + return(tuple->n_fields_cmp); +} + +/*********************************************************************//** +Sets number of fields used in record comparisons. */ +UNIV_INLINE +void +dtuple_set_n_fields_cmp( +/*====================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields_cmp) /*!< in: number of fields used in + comparisons in rem0cmp.* */ +{ + ut_ad(n_fields_cmp <= tuple->n_fields); + tuple->n_fields_cmp = n_fields_cmp; +} + +/** Creates a data tuple from an already allocated chunk of memory. +The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields). +The default value for number of fields used in record comparisons +for this tuple is n_fields. 
+@param[in,out] buf buffer to use
+@param[in] buf_size buffer size
+@param[in] n_fields number of fields
+@param[in] n_v_fields number of fields on virtual columns
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+ void* buf,
+ ulint buf_size,
+ ulint n_fields,
+ ulint n_v_fields)
+{
+ dtuple_t* tuple;
+ ulint n_t_fields = n_fields + n_v_fields;
+
+ ut_a(buf_size >= DTUPLE_EST_ALLOC(n_t_fields));
+
+ tuple = (dtuple_t*) buf;
+ tuple->info_bits = 0;
+ tuple->n_fields = n_fields;
+ tuple->n_v_fields = n_v_fields;
+ tuple->n_fields_cmp = n_fields;
+ tuple->fields = (dfield_t*) &tuple[1];
+ if (n_v_fields > 0) {
+ tuple->v_fields = &tuple->fields[n_fields];
+ } else {
+ tuple->v_fields = NULL;
+ }
+
+#ifdef UNIV_DEBUG
+ tuple->magic_n = DATA_TUPLE_MAGIC_N;
+
+ { /* In the debug version, initialize fields to an error value */
+ ulint i;
+
+ for (i = 0; i < n_t_fields; i++) {
+ dfield_t* field;
+
+ if (i >= n_fields) {
+ field = dtuple_get_nth_v_field(
+ tuple, i - n_fields);
+ } else {
+ field = dtuple_get_nth_field(tuple, i);
+ }
+
+ dfield_set_len(field, UNIV_SQL_NULL);
+ field->data = &data_error;
+ dfield_get_type(field)->mtype = DATA_ERROR;
+ dfield_get_type(field)->prtype = DATA_ERROR;
+ }
+ }
+#endif
+ MEM_CHECK_ADDRESSABLE(tuple->fields, n_t_fields
+ * sizeof *tuple->fields);
+ MEM_UNDEFINED(tuple->fields, n_t_fields * sizeof *tuple->fields);
+ return(tuple);
+}
+
+/** Duplicate the virtual field data in a dtuple_t
+@param[in,out] vrow dtuple contains the virtual fields
+@param[in,out] heap heap memory to use */
+UNIV_INLINE
+void
+dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap)
+{
+ for (ulint i = 0; i < vrow->n_v_fields; i++) {
+ dfield_t* dfield = dtuple_get_nth_v_field(vrow, i);
+ dfield_dup(dfield, heap);
+ }
+}
+
+/** Initialize the virtual field data in a dtuple_t
+@param[in,out] vrow dtuple contains the virtual fields */
+UNIV_INLINE
+void
+dtuple_init_v_fld(dtuple_t* vrow)
+{
+ for (ulint i = 0; i < vrow->n_v_fields; i++) {
+ dfield_t* dfield = dtuple_get_nth_v_field(vrow, i);
+ dfield_get_type(dfield)->mtype = DATA_MISSING;
+ dfield_set_len(dfield, UNIV_SQL_NULL);
+ }
+}
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created, DTUPLE_EST_ALLOC(n_fields)
+ bytes will be allocated from this heap */
+ ulint n_fields) /*!< in: number of fields */
+{
+ return(dtuple_create_with_vcol(heap, n_fields, 0));
+}
+
+/** Creates a data tuple with virtual columns to a memory heap.
+@param[in] heap memory heap where the tuple is created
+@param[in] n_fields number of fields
+@param[in] n_v_fields number of fields on virtual col
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_with_vcol(
+ mem_heap_t* heap,
+ ulint n_fields,
+ ulint n_v_fields)
+{
+ void* buf;
+ ulint buf_size;
+ dtuple_t* tuple;
+
+ ut_ad(heap);
+
+ buf_size = DTUPLE_EST_ALLOC(n_fields + n_v_fields);
+ buf = mem_heap_alloc(heap, buf_size);
+
+ tuple = dtuple_create_from_mem(buf, buf_size, n_fields, n_v_fields);
+
+ return(tuple);
+}
+
+/** Copies a data tuple's virtual fields to another.
This is a shallow copy; +@param[in,out] d_tuple destination tuple +@param[in] s_tuple source tuple */ +UNIV_INLINE +void +dtuple_copy_v_fields( + dtuple_t* d_tuple, + const dtuple_t* s_tuple) +{ + + ulint n_v_fields = dtuple_get_n_v_fields(d_tuple); + ut_ad(n_v_fields == dtuple_get_n_v_fields(s_tuple)); + + for (ulint i = 0; i < n_v_fields; i++) { + dfield_copy(dtuple_get_nth_v_field(d_tuple, i), + dtuple_get_nth_v_field(s_tuple, i)); + } +} + +/*********************************************************************//** +Copies a data tuple to another. This is a shallow copy; if a deep copy +is desired, dfield_dup() will have to be invoked on each field. +@return own: copy of tuple */ +UNIV_INLINE +dtuple_t* +dtuple_copy( +/*========*/ + const dtuple_t* tuple, /*!< in: tuple to copy from */ + mem_heap_t* heap) /*!< in: memory heap + where the tuple is created */ +{ + ulint n_fields = dtuple_get_n_fields(tuple); + ulint n_v_fields = dtuple_get_n_v_fields(tuple); + dtuple_t* new_tuple = dtuple_create_with_vcol( + heap, n_fields, n_v_fields); + ulint i; + + for (i = 0; i < n_fields; i++) { + dfield_copy(dtuple_get_nth_field(new_tuple, i), + dtuple_get_nth_field(tuple, i)); + } + + for (i = 0; i < n_v_fields; i++) { + dfield_copy(dtuple_get_nth_v_field(new_tuple, i), + dtuple_get_nth_v_field(tuple, i)); + } + + return(new_tuple); +} + +/**********************************************************//** +The following function returns the sum of data lengths of a tuple. The space +occupied by the field structs or the tuple struct is not counted. Neither +is possible space in externally stored parts of the field. +@return sum of data lengths */ +UNIV_INLINE +ulint +dtuple_get_data_size( +/*=================*/ + const dtuple_t* tuple, /*!< in: typed data tuple */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + const dfield_t* field; + ulint n_fields; + ulint len; + ulint i; + ulint sum = 0; + + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + n_fields = tuple->n_fields; + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + len = dfield_get_len(field); + + if (len == UNIV_SQL_NULL) { + len = dtype_get_sql_null_size(dfield_get_type(field), + comp); + } + + sum += len; + } + + return(sum); +} + +/*********************************************************************//** +Computes the number of externally stored fields in a data tuple. +@return number of externally stored fields */ +UNIV_INLINE +ulint +dtuple_get_n_ext( +/*=============*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ulint n_ext = 0; + ulint n_fields = tuple->n_fields; + ulint i; + + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + for (i = 0; i < n_fields; i++) { + n_ext += dtuple_get_nth_field(tuple, i)->ext; + } + + return(n_ext); +} + +/*******************************************************************//** +Sets types of fields binary in a tuple. */ +UNIV_INLINE +void +dtuple_set_types_binary( +/*====================*/ + dtuple_t* tuple, /*!< in: data tuple */ + ulint n) /*!< in: number of fields to set */ +{ + dtype_t* dfield_type; + ulint i; + + for (i = 0; i < n; i++) { + dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i)); + dtype_set(dfield_type, DATA_BINARY, 0, 0); + } +} + +/** Fold a prefix given as the number of fields of a tuple. 
+@param[in] tuple index record
+@param[in] n_fields number of complete fields to fold
+@param[in] n_bytes number of bytes to fold in the last field
+@param[in] tree_id index tree ID
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+ const dtuple_t* tuple,
+ ulint n_fields,
+ ulint n_bytes,
+ index_id_t tree_id)
+{
+ const dfield_t* field;
+ ulint i;
+ const byte* data;
+ ulint len;
+ ulint fold;
+
+ ut_ad(tuple);
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple));
+
+ fold = ut_fold_ull(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
+
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len) /*!< in: SQL null size in bytes */
+{
+ memset(data, 0, len);
+}
+
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: dtuple */
+{
+ ulint n;
+ ulint i;
+
+ n = dtuple_get_n_fields(tuple);
+
+ for (i = 0; i < n; i++) {
+ if (dfield_is_null(dtuple_get_nth_field(tuple, i))) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+{
+ mem_heap_free(vector->heap);
+}
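dtuple_fold() above is the kind of fold the adaptive hash index code uses: it seeds the hash with the index tree id, folds n_fields complete fields, then at most n_bytes of the following field, and lets SQL NULL fields contribute nothing. A standalone sketch of that shape; the mixing functions here are made-up stand-ins for ut_fold_ull()/ut_fold_ulint_pair()/ut_fold_binary() from ut0rnd.h:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Toy mixers; the constants are not InnoDB's.
    inline uint64_t fold_pair(uint64_t a, uint64_t b)
    {
        return (a ^ b) * 0x9e3779b97f4a7c15ULL + a;
    }
    inline uint64_t fold_bytes(const unsigned char* p, size_t n)
    {
        uint64_t h = 0;
        while (n--) h = fold_pair(h, *p++);
        return h;
    }

    struct toy_field { const unsigned char* data; size_t len; bool is_null; };

    // Same shape as dtuple_fold(): seed with the tree id, fold n_fields whole
    // fields, then at most n_bytes of the next field (which must exist when
    // n_bytes > 0); NULL fields fold nothing.
    uint64_t toy_tuple_fold(const std::vector<toy_field>& f, size_t n_fields,
                            size_t n_bytes, uint64_t tree_id)
    {
        uint64_t fold = fold_pair(tree_id, 0);
        for (size_t i = 0; i < n_fields; i++)
            if (!f[i].is_null)
                fold = fold_pair(fold, fold_bytes(f[i].data, f[i].len));
        if (n_bytes > 0 && !f[n_fields].is_null)
            fold = fold_pair(fold, fold_bytes(f[n_fields].data,
                                              std::min(n_bytes, f[n_fields].len)));
        return fold;
    }
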
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
new file mode 100644
index 00000000..3d63ddb7
--- /dev/null
+++ b/storage/innobase/include/data0type.h
@@ -0,0 +1,591 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.h
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+
+/** Special length indicating a missing instantly added column */
+#define UNIV_SQL_DEFAULT (UNIV_SQL_NULL - 1)
+
+/** @return whether a length is actually stored in a field */
+#define len_is_stored(len) (len != UNIV_SQL_NULL && len != UNIV_SQL_DEFAULT)
+
+extern ulint data_mysql_default_charset_coll;
+#define DATA_MYSQL_BINARY_CHARSET_COLL 63
+
+/* SQL data type struct */
+struct dtype_t;
+
+/** SQL Like operator comparison types */
+enum ib_like_t {
+ IB_LIKE_EXACT, /**< e.g. STRING */
+ IB_LIKE_PREFIX /**< e.g., STRING% */
+};
+
+/*-------------------------------------------*/
+/* The 'MAIN TYPE' of a column */
+#define DATA_MISSING 0 /* missing column */
+#define DATA_VARCHAR 1 /* character varying of the
+ latin1_swedish_ci charset-collation; note
+ that the MySQL format for this, DATA_BINARY,
+ DATA_VARMYSQL, is also affected by whether the
+ 'precise type' contains
+ DATA_MYSQL_TRUE_VARCHAR */
+#define DATA_CHAR 2 /* fixed length character of the
+ latin1_swedish_ci charset-collation */
+#define DATA_FIXBINARY 3 /* binary string of fixed length */
+#define DATA_BINARY 4 /* binary string */
+#define DATA_BLOB 5 /* binary large object, or a TEXT type;
+ if prtype & DATA_BINARY_TYPE == 0, then this is
+ actually a TEXT column (or a BLOB created
+ with < 4.0.14; since column prefix indexes
+ came only in 4.0.14, the missing flag in BLOBs
+ created before that does not cause any harm) */
+#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */
+#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */
+#define DATA_SYS 8 /* system column */
+
+/* Data types >= DATA_FLOAT must be compared using the whole field, not as
+binary strings */
+
+#define DATA_FLOAT 9
+#define DATA_DOUBLE 10
+#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */
+#define DATA_VARMYSQL 12 /* any charset varying length char */
+#define DATA_MYSQL 13 /* any charset fixed length char */
+ /* NOTE that 4.1.1 used DATA_MYSQL and
+ DATA_VARMYSQL for all character sets, and the
+ charset-collation for tables created with it
+ can also be latin1_swedish_ci */
+
+/* DATA_GEOMETRY includes all standard geometry datatypes as described in
+OGC standard (point, line_string, polygon, multi_point, multi_polygon,
+multi_line_string, geometry_collection, geometry).
+Currently, geometry data is stored in the standard Well-Known Binary (WKB)
+format (http://www.opengeospatial.org/standards/sfa).
+We use BLOB as the underlying datatype. */
+#define DATA_GEOMETRY 14 /* geometry datatype of variable length */
+#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size()
+ requires the values are <= 63 */
+
+#define DATA_MTYPE_CURRENT_MIN DATA_VARCHAR /* minimum value of mtype */
+#define DATA_MTYPE_CURRENT_MAX DATA_GEOMETRY /* maximum value of mtype */
+/*-------------------------------------------*/
+/* The 'PRECISE TYPE' of a column */
+/*
+Tables created by a MySQL user have the following convention:
+
+- In the least significant byte in the precise type we store the MySQL type
+code (not applicable for system columns).
+
+- In the second least significant byte we OR flags DATA_NOT_NULL,
+DATA_UNSIGNED, DATA_BINARY_TYPE.
+
+- In the third least significant byte of the precise type of string types we
+store the MySQL charset-collation code. In DATA_BLOB columns created with
+< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there
+are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no
+problem, though.
+
+Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the
+precise type, since the charset was always the default charset of the MySQL
+installation. If the stored charset code is 0 in the system table SYS_COLUMNS
+of InnoDB, that means that the default charset of this MySQL installation
+should be used.
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
+
+#define DATA_ENGLISH 4 /* English language character string: this
+ is a relic from pre-MySQL time and only used
+ for InnoDB's own system tables */
+#define DATA_ERROR 111 /* another relic from pre-MySQL time */
+
+#define DATA_MYSQL_TYPE_MASK 255U/* AND with this mask to extract the MySQL
+ type from the precise type */
+#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3
+ format true VARCHAR */
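The byte layout described above can be read back mechanically. The mask values below are the ones defined in this header (DATA_MYSQL_TYPE_MASK, DATA_NOT_NULL, DATA_UNSIGNED, DATA_BINARY_TYPE, CHAR_COLL_MASK); the struct and function names are illustrative only:

    #include <cstdint>

    struct prtype_fields {
        uint32_t mysql_type;   // least significant byte
        uint32_t charset_coll; // 15 bits starting at bit 16
        bool     not_null, is_unsigned, binary;
    };

    inline prtype_fields decode_prtype(uint32_t prtype)
    {
        return {
            prtype & 255U,           // DATA_MYSQL_TYPE_MASK
            (prtype >> 16) & 32767U, // CHAR_COLL_MASK
            (prtype & 256U) != 0,    // DATA_NOT_NULL
            (prtype & 512U) != 0,    // DATA_UNSIGNED
            (prtype & 1024U) != 0,   // DATA_BINARY_TYPE
        };
    }
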
All codes must +be less than 256 */ +#define DATA_ROW_ID 0 /* row id: a 48-bit integer */ +#define DATA_ROW_ID_LEN 6 /* stored length for row id */ + +#define DATA_TRX_ID 1 /* transaction id: 6 bytes */ +#define DATA_TRX_ID_LEN 6 + +#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */ +#define DATA_ROLL_PTR_LEN 7 + +#define DATA_N_SYS_COLS 3 /* number of system columns defined above */ + +#define DATA_FTS_DOC_ID 3 /* Used as FTS DOC ID column */ + +#define DATA_SYS_PRTYPE_MASK 0xFU /* mask to extract the above from prtype */ + +/* Flags ORed to the precise data type */ +#define DATA_NOT_NULL 256U /* this is ORed to the precise type when + the column is declared as NOT NULL */ +#define DATA_UNSIGNED 512U /* this id ORed to the precise type when + we have an unsigned integer type */ +#define DATA_BINARY_TYPE 1024U /* if the data type is a binary character + string, this is ORed to the precise type: + this only holds for tables created with + >= MySQL-4.0.14 */ +/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1. + In earlier versions this was set for some + BLOB columns. +*/ +#define DATA_GIS_MBR 2048U /* Used as GIS MBR column */ +/** the size of a GIS maximum bounding rectangle */ +constexpr uint8_t DATA_MBR_LEN= uint8_t(SPDIMS * 2 * sizeof(double)); + +#define DATA_LONG_TRUE_VARCHAR 4096U /* this is ORed to the precise data + type when the column is true VARCHAR where + MySQL uses 2 bytes to store the data len; + for shorter VARCHARs MySQL uses only 1 byte */ +#define DATA_VIRTUAL 8192U /* Virtual column */ + +/** System Versioning */ +#define DATA_VERS_START 16384U /* start system field */ +#define DATA_VERS_END 32768U /* end system field */ +/** system-versioned user data column */ +#define DATA_VERSIONED (DATA_VERS_START|DATA_VERS_END) + +/*-------------------------------------------*/ + +/* This many bytes we need to store the type information affecting the +alphabetical order for a single field and decide the storage size of an +SQL null*/ +#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4 +/* In the >= 4.1.x storage format we add 2 bytes more so that we can also +store the charset-collation number; one byte is left unused, though */ +#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6 + +/* Maximum multi-byte character length in bytes, plus 1 */ +#define DATA_MBMAX 8 + +/* For checking if mtype is GEOMETRY datatype */ +#define DATA_GEOMETRY_MTYPE(mtype) ((mtype) == DATA_GEOMETRY) + +/* For checking if mtype is BLOB or GEOMETRY, since we use BLOB as +the underlying datatype of GEOMETRY data. */ +#define DATA_LARGE_MTYPE(mtype) ((mtype) == DATA_BLOB \ + || (mtype) == DATA_GEOMETRY) + +/* For checking if data type is big length data type. */ +#define DATA_BIG_LEN_MTYPE(len, mtype) ((len) > 255 || DATA_LARGE_MTYPE(mtype)) + +/* For checking if the column is a big length column. */ +#define DATA_BIG_COL(col) DATA_BIG_LEN_MTYPE((col)->len, (col)->mtype) + +/* For checking if data type is large binary data type. */ +#define DATA_LARGE_BINARY(mtype,prtype) ((mtype) == DATA_GEOMETRY || \ + ((mtype) == DATA_BLOB && !((prtype) & DATA_BINARY_TYPE))) + +/* We now support 15 bits (up to 32767) collation number */ +#define MAX_CHAR_COLL_NUM 32767 + +/* Mask to get the Charset Collation number (0x7fff) */ +#define CHAR_COLL_MASK MAX_CHAR_COLL_NUM + +/*********************************************************************//** +Gets the MySQL type code from a dtype. +@return MySQL type code; this is NOT an InnoDB type code! 
*/ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + const dtype_t* type); /*!< in: type struct */ +/*********************************************************************//** +Determine how many bytes the first n characters of the given string occupy. +If the string is shorter than n characters, returns the number of bytes +the characters in the string occupy. +@return length of the prefix, in bytes */ +ulint +dtype_get_at_most_n_mbchars( +/*========================*/ + ulint prtype, /*!< in: precise type */ + ulint mbminlen, /*!< in: minimum length of + a multi-byte character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of + a multi-byte character, in bytes */ + ulint prefix_len, /*!< in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /*!< in: length of str (in bytes) */ + const char* str); /*!< in: the string whose prefix + length is being determined */ +/** @return whether main type is a string type */ +inline bool dtype_is_string_type(ulint mtype) +{ + return mtype <= DATA_BLOB + || mtype == DATA_MYSQL || mtype == DATA_VARMYSQL; +} + +/** @return whether a type is a binary string type */ +inline bool dtype_is_binary_string_type(ulint mtype, ulint prtype) +{ + /* Note that for tables created before MySQL 4.0.14, + we do not know if a DATA_BLOB column is a BLOB or a TEXT column. + For those DATA_BLOB columns we return false. */ + + return mtype == DATA_FIXBINARY || mtype == DATA_BINARY + || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE)); +} + +/** @return whether a type is a non-binary string type */ +inline bool dtype_is_non_binary_string_type(ulint mtype, ulint prtype) +{ + return dtype_is_string_type(mtype) + && !dtype_is_binary_string_type(mtype, prtype); +} + +/*********************************************************************//** +Sets a data type structure. */ +UNIV_INLINE +void +dtype_set( +/*======*/ + dtype_t* type, /*!< in: type struct to init */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint len); /*!< in: precision of type */ +/*********************************************************************//** +Copies a data type structure. */ +UNIV_INLINE +void +dtype_copy( +/*=======*/ + dtype_t* type1, /*!< in: type struct to copy to */ + const dtype_t* type2); /*!< in: type struct to copy from */ +/*********************************************************************//** +Gets the SQL main data type. +@return SQL main data type */ +UNIV_INLINE +ulint +dtype_get_mtype( +/*============*/ + const dtype_t* type); /*!< in: data type */ +/*********************************************************************//** +Gets the precise data type. +@return precise data type */ +UNIV_INLINE +ulint +dtype_get_prtype( +/*=============*/ + const dtype_t* type); /*!< in: data type */ + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +void +dtype_get_mblen( +/*============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type (and collation) */ + unsigned* mbminlen, /*!< out: minimum length of a + multi-byte character */ + unsigned* mbmaxlen); /*!< out: maximum length of a + multi-byte character */ +/** +Get the charset-collation code for string types. 
+@param prtype InnoDB precise type
+@return charset-collation code */
+inline uint16_t dtype_get_charset_coll(ulint prtype)
+{
+ return static_cast<uint16_t>(prtype >> 16) & CHAR_COLL_MASK;
+}
+
+/** Form a precise type from the < 4.1.2 format precise type plus the
+charset-collation code.
+@param[in] old_prtype MySQL type code and the flags
+ DATA_BINARY_TYPE etc.
+@param[in] charset_coll character-set collation code
+@return precise type, including the charset-collation code */
+UNIV_INLINE
+uint32_t
+dtype_form_prtype(ulint old_prtype, ulint charset_coll)
+{
+ ut_ad(old_prtype < 256 * 256);
+ ut_ad(charset_coll <= MAX_CHAR_COLL_NUM);
+ return(uint32_t(old_prtype + (charset_coll << 16)));
+}
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return whether a subset of UTF-8 */
+UNIV_INLINE
+bool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype);/*!< in: precise data type */
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type); /*!< in: data type */
+
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a
+ multibyte character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multibyte character, in bytes */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a character */
+ ulint mbmaxlen); /*!< in: maximum length of a character */
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len); /*!< in: length */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + const dtype_t* type, /*!< in: type */ + ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ + +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. */ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf); /*!< in: buffer for the stored order info */ +/**********************************************************************//** +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /*!< in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + bytes where we store the info */ + const dtype_t* type, /*!< in: type struct */ + ulint prefix_len);/*!< in: prefix length to + replace type->len, or 0 */ +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_read_for_order_and_null_size( +/*===================================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf); /*!< in: buffer for stored type order info */ + +/*********************************************************************//** +Validates a data type structure. +@return TRUE if ok */ +ibool +dtype_validate( +/*===========*/ + const dtype_t* type); /*!< in: type struct to validate */ +#ifdef UNIV_DEBUG +/** Print a data type structure. +@param[in] type data type */ +void +dtype_print( + const dtype_t* type); +#endif /* UNIV_DEBUG */ + +struct dict_col_t; + +/* Structure for an SQL data type. +If you add fields to this structure, be sure to initialize them everywhere. 
+This structure is initialized in the following functions: +dtype_set() +dtype_read_for_order_and_null_size() +dtype_new_read_for_order_and_null_size() +sym_tab_add_null_lit() */ + +struct dtype_t{ + unsigned prtype:32; /*!< precise type; MySQL data + type, charset code, flags to + indicate nullability, + signedness, whether this is a + binary string, whether this is + a true VARCHAR where MySQL + uses 2 bytes to store the length */ + unsigned mtype:8; /*!< main data type */ + + /* the remaining fields do not affect alphabetical ordering: */ + + unsigned len:16; /*!< length; for MySQL data this + is field->pack_length(), + except that for a >= 5.0.3 + type true VARCHAR this is the + maximum byte length of the + string data (in addition to + the string, MySQL uses 1 or 2 + bytes to store the string length) */ + unsigned mbminlen:3; /*!< minimum length of a character, + in bytes */ + unsigned mbmaxlen:3; /*!< maximum length of a character, + in bytes */ + + /** @return whether this is system versioned user field */ + bool is_versioned() const { return !(~prtype & DATA_VERSIONED); } + /** @return whether this is the system field start */ + bool vers_sys_start() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_START; + } + /** @return whether this is the system field end */ + bool vers_sys_end() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_END; + } + + /** Set the type of the BLOB in the hidden metadata record. */ + void metadata_blob_init() + { + prtype = DATA_NOT_NULL; + mtype = DATA_BLOB; + len = 0; + mbminlen = 0; + mbmaxlen = 0; + } + + /** Copy the type information from a column. + @param col column type to be copied */ + void assign(const dict_col_t &col); +}; + +/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */ +extern const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + +/** Info bit denoting the predefined minimum record: this bit is set +if and only if the record is the first user record on a non-leaf +B-tree page that is the leftmost page on its level +(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */ +#define REC_INFO_MIN_REC_FLAG 0x10UL +/** The delete-mark flag in info bits */ +#define REC_INFO_DELETED_FLAG 0x20UL + +/** Record status values for ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED */ +enum rec_comp_status_t { + /** User record (PAGE_LEVEL=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_ORDINARY = 0, + /** Node pointer record (PAGE_LEVEL>=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_NODE_PTR = 1, + /** The page infimum pseudo-record (heap=PAGE_HEAP_NO_INFIMUM) */ + REC_STATUS_INFIMUM = 2, + /** The page supremum pseudo-record (heap=PAGE_HEAP_NO_SUPREMUM) */ + REC_STATUS_SUPREMUM = 3, + /** Clustered index record that has been inserted or updated + after instant ADD COLUMN (more than dict_index_t::n_core_fields) */ + REC_STATUS_INSTANT = 4 +}; + +/** The dtuple_t::info_bits of the hidden metadata of instant ADD COLUMN. +@see rec_is_metadata() +@see rec_is_alter_metadata() */ +static const byte REC_INFO_METADATA_ADD + = REC_INFO_MIN_REC_FLAG | REC_STATUS_INSTANT; + +/** The dtuple_t::info_bits of the hidden metadata of instant ALTER TABLE. 
+@see rec_is_metadata() */ +static const byte REC_INFO_METADATA_ALTER + = REC_INFO_METADATA_ADD | REC_INFO_DELETED_FLAG; + +#include "data0type.inl" diff --git a/storage/innobase/include/data0type.inl b/storage/innobase/include/data0type.inl new file mode 100644 index 00000000..329cee5d --- /dev/null +++ b/storage/innobase/include/data0type.inl @@ -0,0 +1,487 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/data0type.ic +Data types + +Created 1/16/1996 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#include "ha_prototypes.h" + +/*********************************************************************//** +Determines if a MySQL string type is a subset of UTF-8. This function +may return false negatives, in case further character-set collation +codes are introduced in MySQL later. +@return whether a subset of UTF-8 */ +UNIV_INLINE +bool +dtype_is_utf8( +/*==========*/ + ulint prtype) /*!< in: precise data type */ +{ + /* These codes have been copied from strings/ctype-extra.c + and strings/ctype-utf8.c. */ + switch (dtype_get_charset_coll(prtype)) { + case 11: /* ascii_general_ci */ + case 65: /* ascii_bin */ + case 33: /* utf8_general_ci */ + case 83: /* utf8_bin */ + case 254: /* utf8_general_cs */ + return true; + } + + return false; +} + +/*********************************************************************//** +Gets the MySQL type code from a dtype. +@return MySQL type code; this is NOT an InnoDB type code! */ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + const dtype_t* type) /*!< in: type struct */ +{ + return(type->prtype & 0xFFUL); +} + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_set_mblen( +/*============*/ + dtype_t* type) /*!< in/out: type */ +{ + unsigned mbminlen, mbmaxlen; + + dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen); + type->mbminlen = mbminlen & 7; + type->mbmaxlen = mbmaxlen & 7; + + ut_ad(dtype_validate(type)); +} + +/*********************************************************************//** +Sets a data type structure. 
*/
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision of type */
+{
+ ut_ad(type);
+ ut_ad(mtype <= DATA_MTYPE_MAX);
+
+ type->mtype = static_cast<byte>(mtype);
+ type->prtype = static_cast<unsigned>(prtype);
+ type->len = static_cast<uint16_t>(len);
+
+ dtype_set_mblen(type);
+}
+
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2) /*!< in: type struct to copy from */
+{
+ *type1 = *type2;
+
+ ut_ad(dtype_validate(type1));
+}
+
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->mtype);
+}
+
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->prtype);
+}
+
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->len);
+}
+
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ return type->mbminlen;
+}
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ return type->mbmaxlen;
+}
+
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len)/*!< in: prefix length to
+ replace type->len, or 0 */
+{
+ compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ ulint len;
+
+ ut_ad(type);
+ ut_ad(type->mtype >= DATA_VARCHAR);
+ ut_ad(type->mtype <= DATA_MTYPE_MAX);
+
+ buf[0] = (byte)(type->mtype & 0xFFUL);
+
+ if (type->prtype & DATA_BINARY_TYPE) {
+ buf[0] |= 128;
+ }
+
+ /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) {
+ buf[0] |= 64;
+ }
+ */
+
+ buf[1] = (byte)(type->prtype & 0xFFUL);
+
+ len = prefix_len ? prefix_len : type->len;
+
+ mach_write_to_2(buf + 2, len & 0xFFFFUL);
+
+ ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM);
+ mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
+
+ if (type->prtype & DATA_NOT_NULL) {
+ buf[4] |= 128;
+ }
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the < 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+ compile_time_assert(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ type->prtype = dtype_form_prtype(type->prtype,
+ data_mysql_default_charset_coll);
+ dtype_set_mblen(type);
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+ compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ if (buf[4] & 128) {
+ type->prtype |= DATA_NOT_NULL;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ ulint charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK;
+
+ if (dtype_is_string_type(type->mtype)) {
+ ut_a(charset_coll <= MAX_CHAR_COLL_NUM);
+
+ if (charset_coll == 0) {
+ /* This insert buffer record was inserted with MySQL
+ version < 4.1.2, and the charset-collation code was not
+ explicitly stored to dtype->prtype at that time. It
+ must be the default charset-collation of this MySQL
+ installation. */
+
+ charset_coll = data_mysql_default_charset_coll;
+ }
+
+ type->prtype = dtype_form_prtype(type->prtype, charset_coll);
+ }
+ dtype_set_mblen(type);
+}
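The store/read pair above round-trips a type through the 6-byte DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE format. A standalone sketch of the same layout, with stand-ins for mach_write_to_2()/mach_read_from_2() (big-endian, as in InnoDB); the function names are illustrative:

    #include <cassert>
    #include <cstdint>

    inline void put2(unsigned char* b, uint32_t v)
    { b[0] = (unsigned char)(v >> 8); b[1] = (unsigned char)v; }
    inline uint32_t get2(const unsigned char* b)
    { return (uint32_t(b[0]) << 8) | b[1]; }

    // The 6-byte layout used by dtype_new_store_for_order_and_null_size():
    //  byte 0: mtype, with bit 7 doubling as the DATA_BINARY_TYPE flag
    //  byte 1: low byte of prtype
    //  bytes 2-3: length (or prefix length)
    //  bytes 4-5: charset-collation (<= 32767), bit 7 of byte 4 = DATA_NOT_NULL
    void toy_store(unsigned char* buf, uint32_t mtype, bool binary,
                   bool not_null, unsigned char prtype_low,
                   uint16_t len, uint16_t coll)
    {
        buf[0] = (unsigned char)mtype | (binary ? 0x80 : 0);
        buf[1] = prtype_low;
        put2(buf + 2, len);
        put2(buf + 4, coll); // coll <= 32767 leaves bit 7 of buf[4] free
        if (not_null) buf[4] |= 0x80;
    }

    int main()
    {
        unsigned char buf[6];
        toy_store(buf, 12 /* DATA_VARMYSQL */, false, true, 15, 255, 33);
        assert((buf[0] & 63) == 12);            // mtype, read back as above
        assert(get2(buf + 2) == 255);           // length
        assert((get2(buf + 4) & 32767U) == 33); // collation under CHAR_COLL_MASK
        assert(buf[4] & 0x80);                  // NOT NULL flag
    }
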
+
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a
+ multibyte character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multibyte character, in bytes */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ /* fall through */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return static_cast<unsigned>(len);
+ case DATA_MYSQL:
+ if (prtype & DATA_BINARY_TYPE) {
+ return static_cast<unsigned>(len);
+ } else if (!comp) {
+ return static_cast<unsigned>(len);
+ } else {
+ if (mbminlen == mbmaxlen) {
+ return static_cast<unsigned>(len);
+ }
+ }
+ /* Treat as variable-length. */
+ /* fall through */
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_GEOMETRY:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a character */
+ ulint mbmaxlen) /*!< in: maximum length of a character */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ /* fall through */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return static_cast<unsigned>(len);
+ case DATA_MYSQL:
+ if (prtype & DATA_BINARY_TYPE) {
+ return static_cast<unsigned>(len);
+ } else {
+ if (mbminlen == mbmaxlen) {
+ return static_cast<unsigned>(len);
+ }
+
+ /* this is a variable-length character set */
+ ut_a(mbminlen > 0);
+ ut_a(mbmaxlen > mbminlen);
+ ut_a(len % mbmaxlen == 0);
+ return static_cast<unsigned>(
+ len * mbminlen / mbmaxlen);
+ }
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_GEOMETRY:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */ +UNIV_INLINE +ulint +dtype_get_max_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint len) /*!< in: length */ +{ + switch (mtype) { + case DATA_SYS: + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_MYSQL: + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + return(len); + case DATA_GEOMETRY: + case DATA_BLOB: + break; + default: + ut_error; + } + + return(ULINT_MAX); +} + +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + const dtype_t* type, /*!< in: type */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len, + type->mbminlen, type->mbmaxlen, comp)); +} diff --git a/storage/innobase/include/data0types.h b/storage/innobase/include/data0types.h new file mode 100644 index 00000000..bcd6b8bc --- /dev/null +++ b/storage/innobase/include/data0types.h @@ -0,0 +1,36 @@ +/***************************************************************************** + +Copyright (c) 2000, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/data0types.h +Some type definitions + +Created 9/21/2000 Heikki Tuuri +*************************************************************************/ + +#ifndef data0types_h +#define data0types_h + +/* SQL data field struct */ +struct dfield_t; + +/* SQL data tuple struct */ +struct dtuple_t; + +#endif + diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h new file mode 100644 index 00000000..64182aab --- /dev/null +++ b/storage/innobase/include/db0err.h @@ -0,0 +1,170 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/db0err.h
+Global error codes for the database
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+/* Do not include univ.i because univ.i includes this. */
+
+enum dberr_t {
+ DB_SUCCESS,
+
+ DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new
+ explicit record lock was created */
+
+ /* The following are error codes */
+ DB_ERROR = 11,
+ DB_INTERRUPTED,
+ DB_OUT_OF_MEMORY,
+ DB_OUT_OF_FILE_SPACE,
+ DB_LOCK_WAIT,
+ DB_DEADLOCK,
+ DB_ROLLBACK,
+ DB_DUPLICATE_KEY,
+ DB_MISSING_HISTORY, /*!< required history data has been
+ deleted due to lack of space in
+ rollback segment */
+ DB_CLUSTER_NOT_FOUND = 30,
+ DB_TABLE_NOT_FOUND,
+ DB_TOO_BIG_RECORD, /*!< a record in an index would not fit
+ on a compressed page, or it would
+ become bigger than 1/2 free space in
+ an uncompressed page frame */
+ DB_LOCK_WAIT_TIMEOUT, /*!< lock wait lasted too long */
+ DB_NO_REFERENCED_ROW, /*!< referenced key value not found
+ for a foreign key in an insert or
+ update of a row */
+ DB_ROW_IS_REFERENCED, /*!< cannot delete or update a row
+ because it contains a key value
+ which is referenced */
+ DB_CANNOT_ADD_CONSTRAINT, /*!< adding a foreign key constraint
+ to a table failed */
+ DB_CORRUPTION, /*!< data structure corruption
+ noticed */
+ DB_CANNOT_DROP_CONSTRAINT, /*!< dropping a foreign key constraint
+ from a table failed */
+ DB_NO_SAVEPOINT, /*!< no savepoint exists with the given
+ name */
+ DB_TABLESPACE_EXISTS, /*!< we cannot create a new single-table
+ tablespace because a file of the same
+ name already exists */
+ DB_TABLESPACE_DELETED, /*!< tablespace was deleted or is
+ being dropped right now */
+ DB_TABLESPACE_NOT_FOUND, /*!< tablespace was not found */
+
+ while (id >= sys_id)
+ {
+ if (!row_id.compare_exchange_strong(sys_id, id))
+ continue;
+ if (!(id % ROW_ID_WRITE_MARGIN))
+ dict_hdr_flush_row_id(id);
+ break;
+ }
+}
+
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+inline void dict_sys_write_row_id(byte *field, row_id_t row_id)
+{
+ static_assert(DATA_ROW_ID_LEN == 6, "compatibility");
+ mach_write_to_6(field, row_id);
+}
+
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_boot(void)
+/*===========*/
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*****************************************************************//**
+Creates and initializes the data dictionary at the server bootstrap.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_create(void)
+/*=============*/
+ MY_ATTRIBUTE((warn_unused_result));
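The compare_exchange_strong() loop in the hunk above is a lock-free "raise to at least id" update on the dictionary row-id counter; only the thread that wins the exchange performs the follow-up flush. A standalone sketch of the same retry shape (the wrapper name here is illustrative, since the enclosing declaration is not shown in this hunk):

    #include <atomic>
    #include <cstdint>

    std::atomic<uint64_t> row_id{0};

    // Raise the shared counter to at least `id`; exactly one winning thread
    // performs the once-only side effect (in InnoDB, persisting the value
    // to the dictionary header).
    void raise_row_id(uint64_t id)
    {
        uint64_t sys_id = row_id.load();
        while (id >= sys_id) {
            if (!row_id.compare_exchange_strong(sys_id, id))
                continue; // lost the race; sys_id now holds the fresh value
            // winner: id is published; do the follow-up action here
            break;
        }
    }
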
+
+/*********************************************************************//**
+Check if a table id belongs to system table.
+@return true if the table id belongs to a system table. */
+inline bool dict_is_sys_table(table_id_t id) { return id < DICT_HDR_FIRST_ID; }
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+/* The ids for the basic system tables and their indexes */
+#define DICT_TABLES_ID 1
+#define DICT_COLUMNS_ID 2
+#define DICT_INDEXES_ID dict_index_t::DICT_INDEXES_ID /* 3 */
+#define DICT_FIELDS_ID 4
+/* The following is a secondary index on SYS_TABLES */
+#define DICT_TABLE_IDS_ID 5
+
+/* The offset of the dictionary header on the page */
+#define DICT_HDR FSEG_PAGE_DATA
+
+/*-------------------------------------------------------------*/
+/* Dictionary header offsets */
+#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */
+#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */
+#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */
+#define DICT_HDR_MAX_SPACE_ID 24 /* The latest assigned space id, or 0 */
+#define DICT_HDR_MIX_ID_LOW 28 /* Obsolete, always DICT_HDR_FIRST_ID */
+#define DICT_HDR_TABLES 32 /* Root of SYS_TABLES clust index */
+#define DICT_HDR_TABLE_IDS 36 /* Root of SYS_TABLE_IDS sec index */
+#define DICT_HDR_COLUMNS 40 /* Root of SYS_COLUMNS clust index */
+#define DICT_HDR_INDEXES 44 /* Root of SYS_INDEXES clust index */
+#define DICT_HDR_FIELDS 48 /* Root of SYS_FIELDS clust index */
+
+#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace
+ segment into which the dictionary
+ header is created */
+/*-------------------------------------------------------------*/
+
+/* The columns in SYS_TABLES */
+enum dict_col_sys_tables_enum {
+ DICT_COL__SYS_TABLES__NAME = 0,
+ DICT_COL__SYS_TABLES__ID = 1,
+ DICT_COL__SYS_TABLES__N_COLS = 2,
+ DICT_COL__SYS_TABLES__TYPE = 3,
+ DICT_COL__SYS_TABLES__MIX_ID = 4,
+ DICT_COL__SYS_TABLES__MIX_LEN = 5,
+ DICT_COL__SYS_TABLES__CLUSTER_ID = 6,
+ DICT_COL__SYS_TABLES__SPACE = 7,
+ DICT_NUM_COLS__SYS_TABLES = 8
+};
+/* The field numbers in the SYS_TABLES clustered index */
+enum dict_fld_sys_tables_enum {
+ DICT_FLD__SYS_TABLES__NAME = 0,
+ DICT_FLD__SYS_TABLES__DB_TRX_ID = 1,
+ DICT_FLD__SYS_TABLES__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_TABLES__ID = 3,
+ DICT_FLD__SYS_TABLES__N_COLS = 4,
+ DICT_FLD__SYS_TABLES__TYPE = 5,
+ DICT_FLD__SYS_TABLES__MIX_ID = 6,
+ DICT_FLD__SYS_TABLES__MIX_LEN = 7,
+ DICT_FLD__SYS_TABLES__CLUSTER_ID = 8,
+ DICT_FLD__SYS_TABLES__SPACE = 9,
+ DICT_NUM_FIELDS__SYS_TABLES = 10
+};
+/* The field numbers in the SYS_TABLE_IDS index */
+enum dict_fld_sys_table_ids_enum {
+ DICT_FLD__SYS_TABLE_IDS__ID = 0,
+ DICT_FLD__SYS_TABLE_IDS__NAME = 1,
+ DICT_NUM_FIELDS__SYS_TABLE_IDS = 2
+};
+/* The columns in SYS_COLUMNS */
+enum dict_col_sys_columns_enum {
+ DICT_COL__SYS_COLUMNS__TABLE_ID = 0,
+ DICT_COL__SYS_COLUMNS__POS = 1,
+ DICT_COL__SYS_COLUMNS__NAME = 2,
+ DICT_COL__SYS_COLUMNS__MTYPE = 3,
+ DICT_COL__SYS_COLUMNS__PRTYPE = 4,
+ DICT_COL__SYS_COLUMNS__LEN = 5,
+ DICT_COL__SYS_COLUMNS__PREC = 6,
+ DICT_NUM_COLS__SYS_COLUMNS = 7
+};
+/* The field numbers in the SYS_COLUMNS clustered index */
+enum dict_fld_sys_columns_enum {
+ DICT_FLD__SYS_COLUMNS__TABLE_ID = 0,
+ DICT_FLD__SYS_COLUMNS__POS = 1,
+ DICT_FLD__SYS_COLUMNS__DB_TRX_ID = 2,
+ DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_COLUMNS__NAME = 4,
+ DICT_FLD__SYS_COLUMNS__MTYPE = 5,
+ DICT_FLD__SYS_COLUMNS__PRTYPE = 6,
+ DICT_FLD__SYS_COLUMNS__LEN = 7,
+ DICT_FLD__SYS_COLUMNS__PREC = 8,
+ DICT_NUM_FIELDS__SYS_COLUMNS = 9
+};
+/* The columns in SYS_INDEXES */
+enum
+/* The columns in SYS_INDEXES */ +enum dict_col_sys_indexes_enum { + DICT_COL__SYS_INDEXES__TABLE_ID = 0, + DICT_COL__SYS_INDEXES__ID = 1, + DICT_COL__SYS_INDEXES__NAME = 2, + DICT_COL__SYS_INDEXES__N_FIELDS = 3, + DICT_COL__SYS_INDEXES__TYPE = 4, + DICT_COL__SYS_INDEXES__SPACE = 5, + DICT_COL__SYS_INDEXES__PAGE_NO = 6, + DICT_COL__SYS_INDEXES__MERGE_THRESHOLD = 7, + DICT_NUM_COLS__SYS_INDEXES = 8 +}; +/* The field numbers in the SYS_INDEXES clustered index */ +enum dict_fld_sys_indexes_enum { + DICT_FLD__SYS_INDEXES__TABLE_ID = 0, + DICT_FLD__SYS_INDEXES__ID = 1, + DICT_FLD__SYS_INDEXES__DB_TRX_ID = 2, + DICT_FLD__SYS_INDEXES__DB_ROLL_PTR = 3, + DICT_FLD__SYS_INDEXES__NAME = 4, + DICT_FLD__SYS_INDEXES__N_FIELDS = 5, + DICT_FLD__SYS_INDEXES__TYPE = 6, + DICT_FLD__SYS_INDEXES__SPACE = 7, + DICT_FLD__SYS_INDEXES__PAGE_NO = 8, + DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD = 9, + DICT_NUM_FIELDS__SYS_INDEXES = 10 +}; +/* The columns in SYS_FIELDS */ +enum dict_col_sys_fields_enum { + DICT_COL__SYS_FIELDS__INDEX_ID = 0, + DICT_COL__SYS_FIELDS__POS = 1, + DICT_COL__SYS_FIELDS__COL_NAME = 2, + DICT_NUM_COLS__SYS_FIELDS = 3 +}; +/* The field numbers in the SYS_FIELDS clustered index */ +enum dict_fld_sys_fields_enum { + DICT_FLD__SYS_FIELDS__INDEX_ID = 0, + DICT_FLD__SYS_FIELDS__POS = 1, + DICT_FLD__SYS_FIELDS__DB_TRX_ID = 2, + DICT_FLD__SYS_FIELDS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_FIELDS__COL_NAME = 4, + DICT_NUM_FIELDS__SYS_FIELDS = 5 +}; +/* The columns in SYS_FOREIGN */ +enum dict_col_sys_foreign_enum { + DICT_COL__SYS_FOREIGN__ID = 0, + DICT_COL__SYS_FOREIGN__FOR_NAME = 1, + DICT_COL__SYS_FOREIGN__REF_NAME = 2, + DICT_COL__SYS_FOREIGN__N_COLS = 3, + DICT_NUM_COLS__SYS_FOREIGN = 4 +}; +/* The field numbers in the SYS_FOREIGN clustered index */ +enum dict_fld_sys_foreign_enum { + DICT_FLD__SYS_FOREIGN__ID = 0, + DICT_FLD__SYS_FOREIGN__DB_TRX_ID = 1, + DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR = 2, + DICT_FLD__SYS_FOREIGN__FOR_NAME = 3, + DICT_FLD__SYS_FOREIGN__REF_NAME = 4, + DICT_FLD__SYS_FOREIGN__N_COLS = 5, + DICT_NUM_FIELDS__SYS_FOREIGN = 6 +}; +/* The field numbers in the SYS_FOREIGN_FOR_NAME secondary index */ +enum dict_fld_sys_foreign_for_name_enum { + DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME = 0, + DICT_FLD__SYS_FOREIGN_FOR_NAME__ID = 1, + DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME = 2 +}; +/* The columns in SYS_FOREIGN_COLS */ +enum dict_col_sys_foreign_cols_enum { + DICT_COL__SYS_FOREIGN_COLS__ID = 0, + DICT_COL__SYS_FOREIGN_COLS__POS = 1, + DICT_COL__SYS_FOREIGN_COLS__FOR_COL_NAME = 2, + DICT_COL__SYS_FOREIGN_COLS__REF_COL_NAME = 3, + DICT_NUM_COLS__SYS_FOREIGN_COLS = 4 +}; +/* The field numbers in the SYS_FOREIGN_COLS clustered index */ +enum dict_fld_sys_foreign_cols_enum { + DICT_FLD__SYS_FOREIGN_COLS__ID = 0, + DICT_FLD__SYS_FOREIGN_COLS__POS = 1, + DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID = 2, + DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME = 4, + DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME = 5, + DICT_NUM_FIELDS__SYS_FOREIGN_COLS = 6 +}; +/* The columns in SYS_VIRTUAL */ +enum dict_col_sys_virtual_enum { + DICT_COL__SYS_VIRTUAL__TABLE_ID = 0, + DICT_COL__SYS_VIRTUAL__POS = 1, + DICT_COL__SYS_VIRTUAL__BASE_POS = 2, + DICT_NUM_COLS__SYS_VIRTUAL = 3 +}; +/* The field numbers in the SYS_VIRTUAL clustered index */ +enum dict_fld_sys_virtual_enum { + DICT_FLD__SYS_VIRTUAL__TABLE_ID = 0, + DICT_FLD__SYS_VIRTUAL__POS = 1, + DICT_FLD__SYS_VIRTUAL__BASE_POS = 2, + DICT_FLD__SYS_VIRTUAL__DB_TRX_ID = 3, + DICT_FLD__SYS_VIRTUAL__DB_ROLL_PTR = 4, + DICT_NUM_FIELDS__SYS_VIRTUAL = 5 +}; + +/* A number of the columns
above occur in multiple tables. These are the +lengths of those fields. */ +#define DICT_FLD_LEN_SPACE 4 +#define DICT_FLD_LEN_FLAGS 4 + +#endif diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h new file mode 100644 index 00000000..c40df12b --- /dev/null +++ b/storage/innobase/include/dict0crea.h @@ -0,0 +1,277 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0crea.h +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0crea_h +#define dict0crea_h + +#include "dict0dict.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" +#include "fil0crypt.h" + +/*********************************************************************//** +Creates a table create graph. +@return own: table create node */ +tab_node_t* +tab_create_graph_create( +/*====================*/ + dict_table_t* table, /*!< in: table to create, built as + a memory data structure */ + mem_heap_t* heap); /*!< in: heap where created */ + +/** Creates an index create graph. +@param[in] index index to create, built as a memory data structure +@param[in] table table name +@param[in,out] heap heap where created +@param[in] mode encryption mode (for creating a table) +@param[in] key_id encryption key identifier (for creating a table) +@param[in] add_v new virtual columns added in the same clause with + add index +@return own: index create node */ +ind_node_t* +ind_create_graph_create( + dict_index_t* index, + const char* table, + mem_heap_t* heap, + fil_encryption_t mode, + uint32_t key_id, + const dict_add_v_col_t* add_v = NULL); + +/***********************************************************//** +Creates a table. This is a high-level function used in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +dict_create_table_step( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ + +/***********************************************************//** +Creates an index. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +dict_create_index_step( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ + +/***************************************************************//** +Builds an index definition but doesn't update sys_table.
*/ +void +dict_build_index_def( +/*=================*/ + const dict_table_t* table, /*!< in: table */ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx); /*!< in/out: InnoDB transaction + handle */ +/***************************************************************//** +Creates an index tree for the index if it is not a member of a cluster. +Don't update SYSTEM TABLES. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +dict_create_index_tree( +/*===================*/ + dict_index_t* index, /*!< in/out: index */ + const trx_t* trx); /*!< in: InnoDB transaction handle */ + +/** Drop the index tree associated with a row in SYS_INDEXES table. +@param[in,out] pcur persistent cursor on rec +@param[in,out] trx dictionary transaction +@param[in,out] mtr mini-transaction +@return tablespace ID to drop (if this is the clustered index) +@retval 0 if no tablespace is to be dropped */ +uint32_t dict_drop_index_tree(btr_pcur_t *pcur, trx_t *trx, mtr_t *mtr) + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); + +/***************************************************************//** +Creates an index tree for the index if it is not a member of a cluster. +Don't update SYSTEM TABLES. +@return error code */ +dberr_t +dict_create_index_tree_in_mem( +/*==========================*/ + dict_index_t* index, /*!< in/out: index */ + const trx_t* trx); /*!< in: InnoDB transaction handle */ + +/********************************************************************//** +Generate a foreign key constraint name when it was not named by the user. +A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER, +where the numbers start from 1, and are given locally for this table, that is, +the number is not global, as it used to be before MySQL 4.0.18. */ +UNIV_INLINE +dberr_t +dict_create_add_foreign_id( +/*=======================*/ + ulint* id_nr, /*!< in/out: number to use in id + generation; incremented if used */ + const char* name, /*!< in: table name */ + dict_foreign_t* foreign); /*!< in/out: foreign key */ + +/** Adds the given set of foreign key objects to the dictionary tables +in the database. This function does not modify the dictionary cache. The +caller must ensure that all foreign key objects contain a valid constraint +name in foreign->id. +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@param[in,out] trx transaction +@return error code or DB_SUCCESS */ +dberr_t +dict_create_add_foreigns_to_dictionary( +/*===================================*/ + const dict_foreign_set& local_fk_set, + const dict_table_t* table, + trx_t* trx) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Check if a foreign constraint is on columns serving as base columns +of any stored column. This is to prevent creating SET NULL or CASCADE +constraint on such columns. +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@return true if yes, otherwise false */ +bool +dict_foreigns_has_s_base_col( + const dict_foreign_set& local_fk_set, + const dict_table_t* table); + +/********************************************************************//** +Add a foreign key definition to the data dictionary tables.
+@return error code or DB_SUCCESS */ +dberr_t +dict_create_add_foreign_to_dictionary( +/*==================================*/ + const char* name, /*!< in: table name */ + const dict_foreign_t* foreign,/*!< in: foreign key */ + trx_t* trx) /*!< in/out: dictionary transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/* Table create node structure */ +struct tab_node_t{ + que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */ + dict_table_t* table; /*!< table to create, built as a + memory data structure with + dict_mem_... functions */ + ins_node_t* tab_def; /*!< child node which does the insert of + the table definition; the row to be + inserted is built by the parent node */ + ins_node_t* col_def; /*!< child node which does the inserts + of the column definitions; the row to + be inserted is built by the parent + node */ + ins_node_t* v_col_def; /*!< child node which does the inserts + of the sys_virtual row definitions; + the row to be inserted is built by + the parent node */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + ulint col_no; /*!< next column definition to insert */ + ulint base_col_no; /*!< next base column to insert */ + mem_heap_t* heap; /*!< memory heap used as auxiliary + storage */ +}; + +/* Table create node states */ +#define TABLE_BUILD_TABLE_DEF 1 +#define TABLE_BUILD_COL_DEF 2 +#define TABLE_BUILD_V_COL_DEF 3 +#define TABLE_ADD_TO_CACHE 4 +#define TABLE_COMPLETED 5 + +/* Index create node struct */ + +struct ind_node_t{ + que_common_t common; /*!< node type: QUE_NODE_INDEX_CREATE */ + dict_index_t* index; /*!< index to create, built as a + memory data structure with + dict_mem_... functions */ + const char* table_name; /*!< table name */ + ins_node_t* ind_def; /*!< child node which does the insert of + the index definition; the row to be + inserted is built by the parent node */ + ins_node_t* field_def; /*!< child node which does the inserts + of the field definitions; the row to + be inserted is built by the parent + node */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + uint32_t page_no; /* root page number of the index */ + dtuple_t* ind_row; /* index definition row built */ + ulint field_no; /* next field definition to insert */ + mem_heap_t* heap; /*!< memory heap used as auxiliary + storage */ + uint key_id; /*!< encryption key_id */ + fil_encryption_t mode; /*!< encryption mode */ + const dict_add_v_col_t* + add_v; /*!< new virtual columns that are being + added along with an add index call */ +};
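For illustration, the generated constraint id format described for dict_create_add_foreign_id() (declared earlier in this header) can be sketched as follows; make_ibfk_id() is a hypothetical helper, not part of InnoDB:

#include <cstddef>
#include <cstdio>

static void make_ibfk_id(char *buf, std::size_t size,
                         const char *table_name, unsigned long seq)
{
  /* e.g. table_name "db1/child", seq 1 -> "db1/child_ibfk_1" */
  std::snprintf(buf, size, "%s_ibfk_%lu", table_name, seq);
}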
+ +/** Compose a column number for a virtual column, stored in the "POS" field +of Sys_columns. The column number includes both its virtual column sequence +(the "nth" virtual column) and its actual column position in original table +@param[in] v_pos virtual column sequence +@param[in] col_pos column position in original table definition +@return composed column position number */ +UNIV_INLINE +ulint +dict_create_v_col_pos( + ulint v_pos, + ulint col_pos); + +/** Get the column number for a virtual column (the column position in +original table), stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return column position in original table */ +UNIV_INLINE +ulint +dict_get_v_col_mysql_pos( + ulint pos); + +/** Get a virtual column sequence (the "nth" virtual column) for a +virtual column, stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return virtual column sequence */ +UNIV_INLINE +ulint +dict_get_v_col_pos( + ulint pos); + +/* Index create node states */ +#define INDEX_BUILD_INDEX_DEF 1 +#define INDEX_BUILD_FIELD_DEF 2 +#define INDEX_CREATE_INDEX_TREE 3 +#define INDEX_ADD_TO_CACHE 4 + +#include "dict0crea.inl" + +#endif diff --git a/storage/innobase/include/dict0crea.inl b/storage/innobase/include/dict0crea.inl new file mode 100644 index 00000000..5641206d --- /dev/null +++ b/storage/innobase/include/dict0crea.inl @@ -0,0 +1,136 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0crea.inl +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#include "ha_prototypes.h" + +#include "mem0mem.h" + +/********************************************************************//** +Generate a foreign key constraint name when it was not named by the user. +A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER, +where the numbers start from 1, and are given locally for this table, that is, +the number is not global, as it used to be before MySQL 4.0.18.
*/ +UNIV_INLINE +dberr_t +dict_create_add_foreign_id( +/*=======================*/ + ulint* id_nr, /*!< in/out: number to use in id generation; + incremented if used */ + const char* name, /*!< in: table name */ + dict_foreign_t* foreign)/*!< in/out: foreign key */ +{ + DBUG_ENTER("dict_create_add_foreign_id"); + + if (foreign->id == NULL) { + /* Generate a new constraint id */ + ulint namelen = strlen(name); + char* id = static_cast<char*>( + mem_heap_alloc(foreign->heap, + namelen + 20)); + + if (dict_table_t::is_temporary_name(name)) { + + /* no overflow if number < 1e13 */ + sprintf(id, "%s_ibfk_%lu", name, + (ulong) (*id_nr)++); + } else { + char table_name[MAX_TABLE_NAME_LEN + 21]; + uint errors = 0; + + strncpy(table_name, name, (sizeof table_name) - 1); + table_name[(sizeof table_name) - 1] = '\0'; + + innobase_convert_to_system_charset( + strchr(table_name, '/') + 1, + strchr(name, '/') + 1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + strncpy(table_name, name, + (sizeof table_name) - 1); + table_name[(sizeof table_name) - 1] = '\0'; + } + + /* no overflow if number < 1e13 */ + sprintf(id, "%s_ibfk_%lu", table_name, + (ulong) (*id_nr)++); + + if (innobase_check_identifier_length( + strchr(id,'/') + 1)) { + DBUG_RETURN(DB_IDENTIFIER_TOO_LONG); + } + } + foreign->id = id; + + DBUG_PRINT("dict_create_add_foreign_id", + ("generated foreign id: %s", id)); + } + + DBUG_RETURN(DB_SUCCESS); +} + +/** Compose a column number for a virtual column, stored in the "POS" field +of Sys_columns. The column number includes both its virtual column sequence +(the "nth" virtual column) and its actual column position in original table +@param[in] v_pos virtual column sequence +@param[in] col_pos column position in original table definition +@return composed column position number */ +UNIV_INLINE +ulint +dict_create_v_col_pos( + ulint v_pos, + ulint col_pos) +{ + ut_ad(v_pos <= REC_MAX_N_FIELDS); + ut_ad(col_pos <= REC_MAX_N_FIELDS); + + return(((v_pos + 1) << 16) + col_pos); +} + +/** Get the column number for a virtual column (the column position in +original table), stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return column position in original table */ +UNIV_INLINE +ulint +dict_get_v_col_mysql_pos( + ulint pos) +{ + return(pos & 0xFFFF); +} + +/** Get a virtual column sequence (the "nth" virtual column) for a +virtual column, stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return virtual column sequence */ +UNIV_INLINE +ulint +dict_get_v_col_pos( + ulint pos) +{ + return((pos >> 16) - 1); +}
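A round-trip through the "POS" encoding implemented above, assuming ulint and the three inline functions are in scope:

#include <cassert>

static void pos_encoding_example()
{
  ulint pos = dict_create_v_col_pos(2, 5); /* 3rd virtual column, column 5 */
  assert(pos == ((2 + 1) << 16) + 5);      /* == 196613 */
  assert(dict_get_v_col_mysql_pos(pos) == 5); /* low 16 bits */
  assert(dict_get_v_col_pos(pos) == 2);       /* (pos >> 16) - 1 */
}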
diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h new file mode 100644 index 00000000..679484ad --- /dev/null +++ b/storage/innobase/include/dict0defrag_bg.h @@ -0,0 +1,101 @@ +/***************************************************************************** + +Copyright (c) 2016, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0defrag_bg.h +Code used for background table and index +defragmentation + +Created 25/08/2016 Jan Lindström +*******************************************************/ + +#ifndef dict0defrag_bg_h +#define dict0defrag_bg_h + +#include "dict0types.h" + +/** Indices whose defrag stats need to be saved to persistent storage.*/ +struct defrag_pool_item_t { + table_id_t table_id; + index_id_t index_id; +}; + +/** Allocator type, used by std::vector */ +typedef ut_allocator<defrag_pool_item_t> + defrag_pool_allocator_t; + +/** The multitude of tables to be defragmented - an STL vector */ +typedef std::vector<defrag_pool_item_t, defrag_pool_allocator_t> + defrag_pool_t; + +/** Pool where we store information on which tables are to be processed +by background defragmentation. */ +extern defrag_pool_t defrag_pool; + +/*****************************************************************//** +Initialize the defrag pool, called once during thread initialization. */ +void +dict_defrag_pool_init(void); +/*========================*/ + +/*****************************************************************//** +Free the resources occupied by the defrag pool, called once during +thread de-initialization. */ +void +dict_defrag_pool_deinit(void); +/*==========================*/ + +/*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. */ +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index); /*!< in: table to add */ + +/*****************************************************************//** +Delete a given index from the auto defrag pool. */ +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!
+@return complete table name with database and table name, allocated from +heap memory passed in */ +char* +dict_get_referenced_table( +/*======================*/ + const char* name, /*!< in: foreign key table name */ + const char* database_name, /*!< in: table db name */ + ulint database_name_len,/*!< in: db name length */ + const char* table_name, /*!< in: table name */ + ulint table_name_len, /*!< in: table name length */ + dict_table_t** table, /*!< out: table object or NULL */ + mem_heap_t* heap, /*!< in: heap memory */ + CHARSET_INFO* from_cs); /*!< in: table name charset */ +/*********************************************************************//** +Frees a foreign key struct. */ +void +dict_foreign_free( +/*==============*/ + dict_foreign_t* foreign); /*!< in, own: foreign key struct */ +/*********************************************************************//** +Finds the highest [number] for foreign key constraints of the table. Looks +only at the >= 4.0.18-format id's, which are of the form +databasename/tablename_ibfk_[number]. +@return highest number, 0 if table has no new format foreign key constraints */ +ulint +dict_table_get_highest_foreign_id( +/*==============================*/ + dict_table_t* table); /*!< in: table in the dictionary + memory cache */ +/** Check whether the dict_table_t is a partition. +A partitioned table on the SQL level is composed of InnoDB tables, +where each InnoDB table is a [sub]partition including its secondary indexes +which belongs to the partition. +@param[in] table Table to check. +@return true if the dict_table_t is a partition else false. */ +UNIV_INLINE +bool +dict_table_is_partition(const dict_table_t* table) +{ + /* Check both P and p on all platforms in case it was moved to/from + WIN. */ + return (strstr(table->name.m_name, "#p#") + || strstr(table->name.m_name, "#P#")); +} +/********************************************************************//** +Return the end of table name where we have removed dbname and '/'. +@return table name */ +const char* +dict_remove_db_name( +/*================*/ + const char* name) /*!< in: table name in the form + dbname '/' tablename */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Operation to perform when opening a table */ +enum dict_table_op_t { + /** Expect the tablespace to exist. */ + DICT_TABLE_OP_NORMAL = 0, + /** Drop any orphan indexes after an aborted online index creation */ + DICT_TABLE_OP_DROP_ORPHAN, + /** Silently load the tablespace if it does not exist, + and do not load the definitions of incomplete indexes. */ + DICT_TABLE_OP_LOAD_TABLESPACE, + /** Open the table only if it's in table cache. */ + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED +}; + +/** Acquire MDL shared for the table name. +@tparam trylock whether to use non-blocking operation +@param[in,out] table table object +@param[in,out] thd background thread +@param[out] mdl mdl ticket +@param[in] table_op operation to perform when opening +@return table object after locking MDL shared +@retval NULL if the table is not readable, or if trylock && MDL blocked */ +template +dict_table_t* +dict_acquire_mdl_shared(dict_table_t *table, + THD *thd, + MDL_ticket **mdl, + dict_table_op_t table_op= DICT_TABLE_OP_NORMAL); + +/** Look up a table by numeric identifier. 
+ +/** Look up a table by numeric identifier. +@param[in] table_id table identifier +@param[in] dict_locked data dictionary locked +@param[in] table_op operation to perform when opening +@param[in,out] thd background thread, or NULL to not acquire MDL +@param[out] mdl mdl ticket, or NULL +@return table, NULL if it does not exist */ +dict_table_t* +dict_table_open_on_id(table_id_t table_id, bool dict_locked, + dict_table_op_t table_op, THD *thd= nullptr, + MDL_ticket **mdl= nullptr) + MY_ATTRIBUTE((warn_unused_result)); + +/** Decrement the count of open handles */ +void dict_table_close(dict_table_t *table); + +/** Decrements the count of open handles of a table. +@param[in,out] table table +@param[in] dict_locked whether dict_sys.latch is being held +@param[in] thd thread to release MDL +@param[in] mdl metadata lock or NULL if the thread is a + foreground one. */ +void +dict_table_close( + dict_table_t* table, + bool dict_locked, + THD* thd = NULL, + MDL_ticket* mdl = NULL); + +/*********************************************************************//** +Gets the minimum number of bytes per character. +@return minimum multi-byte char size, in bytes */ +UNIV_INLINE +unsigned +dict_col_get_mbminlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the maximum number of bytes per character. +@return maximum multi-byte char size, in bytes */ +UNIV_INLINE +unsigned +dict_col_get_mbmaxlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column data type. */ +UNIV_INLINE +void +dict_col_copy_type( +/*===============*/ + const dict_col_t* col, /*!< in: column */ + dtype_t* type); /*!< out: data type */ + +/**********************************************************************//** +Determine bytes of column prefix to be stored in the undo log. Please +note that if !dict_table_has_atomic_blobs(table), no prefix +needs to be stored in the undo log. +@return bytes of column prefix to be stored in the undo log */ +UNIV_INLINE +ulint +dict_max_field_len_store_undo( +/*==========================*/ + dict_table_t* table, /*!< in: table */ + const dict_col_t* col) /*!< in: column which index prefix + is based on */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Determine maximum bytes of a virtual column that need to be stored +in the undo log. +@param[in] table dict_table_t for the table +@param[in] col_no virtual column number +@return maximum bytes of virtual column to be stored in the undo log */ +UNIV_INLINE +ulint +dict_max_v_field_len_store_undo( + dict_table_t* table, + ulint col_no); + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Assert that a column and a data type match. +@return TRUE */ +UNIV_INLINE +ibool +dict_col_type_assert_equal( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + const dtype_t* type) /*!< in: data type */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ + +/***********************************************************************//** +Returns the minimum size of the column.
+@return minimum size */ +UNIV_INLINE +unsigned +dict_col_get_min_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the maximum size of the column. +@return maximum size */ +UNIV_INLINE +ulint +dict_col_get_max_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the size of a fixed size column, 0 if not a fixed size column. +@return fixed size, or 0 */ +UNIV_INLINE +unsigned +dict_col_get_fixed_size( +/*====================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +unsigned +dict_col_get_sql_null_size( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column number. +@return col->ind, table column position (starting from 0) */ +UNIV_INLINE +unsigned +dict_col_get_no( +/*============*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column position in the clustered index. */ +UNIV_INLINE +ulint +dict_col_get_clust_pos( +/*===================*/ + const dict_col_t* col, /*!< in: table column */ + const dict_index_t* clust_index) /*!< in: clustered index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Gets the column position in the given index. +@param[in] col table column +@param[in] index index to be searched for column +@return position of column in the given index. */ +UNIV_INLINE +ulint +dict_col_get_index_pos( + const dict_col_t* col, + const dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/****************************************************************//** +If the given column name is reserved for InnoDB system columns, return +TRUE. +@return TRUE if name is reserved */ +ibool +dict_col_name_is_reserved( +/*======================*/ + const char* name) /*!< in: column name */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Unconditionally set the AUTO_INCREMENT counter. +@param[in,out] table table or partition +@param[in] value next available AUTO_INCREMENT value */ +MY_ATTRIBUTE((nonnull)) +UNIV_INLINE +void +dict_table_autoinc_initialize(dict_table_t* table, ib_uint64_t value) +{ + table->autoinc = value; +} + +/** +@param[in] table table or partition +@return the next AUTO_INCREMENT counter value +@retval 0 if AUTO_INCREMENT is not yet initialized */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +UNIV_INLINE +ib_uint64_t +dict_table_autoinc_read(const dict_table_t* table) +{ + return(table->autoinc); +} + +/** Update the AUTO_INCREMENT sequence if the value supplied is greater +than the current value. 
+@param[in,out] table table or partition +@param[in] value AUTO_INCREMENT value that was assigned to a row +@return whether the AUTO_INCREMENT sequence was updated */ +MY_ATTRIBUTE((nonnull)) +UNIV_INLINE +bool +dict_table_autoinc_update_if_greater(dict_table_t* table, ib_uint64_t value) +{ + if (value > table->autoinc) { + + table->autoinc = value; + return(true); + } + + return(false); +} + +/**********************************************************************//** +Adds system columns to a table object. */ +void +dict_table_add_system_columns( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + mem_heap_t* heap) /*!< in: temporary heap */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Renames a table object. +@return DB_SUCCESS or error code */ +dberr_t +dict_table_rename_in_cache( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + span<const char> new_name, /*!< in: new name */ + bool replace_new_file) + /*!< in: whether to replace the + file with the new name + (as part of rolling back TRUNCATE) */ + MY_ATTRIBUTE((nonnull)); + +/** Removes an index from the dictionary cache. +@param[in,out] table table whose index to remove +@param[in,out] index index to remove, this object is destroyed and must not +be accessed by the caller afterwards */ +void +dict_index_remove_from_cache( + dict_table_t* table, + dict_index_t* index); + +/**********************************************************************//** +Change the id of a table object in the dictionary cache. This is used in +DISCARD TABLESPACE. */ +void +dict_table_change_id_in_cache( +/*==========================*/ + dict_table_t* table, /*!< in/out: table object already in cache */ + table_id_t new_id) /*!< in: new id to set */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Removes a foreign constraint struct from the dictionary cache. */ +void +dict_foreign_remove_from_cache( +/*===========================*/ + dict_foreign_t* foreign) /*!< in, own: foreign constraint */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Adds a foreign key constraint object to the dictionary cache. May free +the object if there already is an object with the same identifier in the +cache. At least one of foreign table or referenced table must already be in +the dictionary cache! +@return DB_SUCCESS or error code */ +dberr_t +dict_foreign_add_to_cache( +/*======================*/ + dict_foreign_t* foreign, + /*!< in, own: foreign key constraint */ + const char** col_names, + /*!< in: column names, or NULL to use + foreign->foreign_table->col_names */ + bool check_charsets, + /*!< in: whether to check charset + compatibility */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); +/**********************************************************************//** +Replace the index passed in with another equivalent index in the +foreign key lists of the table.
+@return whether all replacements were found */ +bool +dict_foreign_replace_index( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const dict_index_t* index) /*!< in: index to be replaced */ + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); +/**********************************************************************//** +Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. +@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the +constraint id does not match */ +dberr_t +dict_foreign_parse_drop_constraints( +/*================================*/ + mem_heap_t* heap, /*!< in: heap from which we can + allocate memory */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table, /*!< in: table */ + ulint* n, /*!< out: number of constraints + to drop */ + const char*** constraints_to_drop) /*!< out: id's of the + constraints to drop */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/**********************************************************************//** +Returns a table object and increments its open handle count. +NOTE! This is a high-level function to be used mainly from outside the +'dict' directory. Inside this directory dict_table_get_low +is usually the appropriate function. +@param[in] table_name Table name +@param[in] dict_locked whether dict_sys.latch is being held exclusively +@param[in] ignore_err error to be ignored when loading the table +@return table +@retval nullptr if it does not exist */ +dict_table_t* +dict_table_open_on_name( + const char* table_name, + bool dict_locked, + dict_err_ignore_t ignore_err) + MY_ATTRIBUTE((warn_unused_result)); + +/** Outcome of dict_foreign_find_index() or dict_foreign_qualify_index() */ +enum fkerr_t +{ + /** A backing index was found for a FOREIGN KEY constraint */ + FK_SUCCESS = 0, + /** There is no index that covers the columns in the constraint. */ + FK_INDEX_NOT_FOUND, + /** The index is for a prefix index, not a full column. */ + FK_IS_PREFIX_INDEX, + /** A condition of SET NULL conflicts with a NOT NULL column. */ + FK_COL_NOT_NULL, + /** The column types do not match */ + FK_COLS_NOT_EQUAL +}; + +/*********************************************************************//** +Tries to find an index whose first fields are the columns in the array +in the same order, which is not marked for deletion and is not the same +as types_idx. +@return matching index, NULL if not found */ +dict_index_t* +dict_foreign_find_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error = NULL, /*!< out: error code */ + ulint* err_col_no = NULL, + /*!< out: column number where + error happened */ + dict_index_t** err_index = NULL) + /*!< out: index where error + happened */ + + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); + +/** Returns a virtual column's name. +@param[in] table table object +@param[in] col_nr virtual column number (nth virtual column) +@return column name.
*/ +const char* +dict_table_get_v_col_name( + const dict_table_t* table, + ulint col_nr); + +/** Check if the table has a given column. +@param[in] table table object +@param[in] col_name column name +@param[in] col_nr column number guessed, 0 as default +@return column number if the table has the specified column, +otherwise table->n_def */ +ulint +dict_table_has_column( + const dict_table_t* table, + const char* col_name, + ulint col_nr = 0); + +/**********************************************************************//** +Outputs info on foreign keys of a table. */ +std::string +dict_print_info_on_foreign_keys( +/*============================*/ + ibool create_table_format, /*!< in: if TRUE then print in + a format suitable to be inserted into + a CREATE TABLE, otherwise in the format + of SHOW TABLE STATUS */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table); /*!< in: table */ + +/**********************************************************************//** +Outputs info on a foreign key of a table in a format suitable for +CREATE TABLE. */ +std::string +dict_print_info_on_foreign_key_in_create_format( +/*============================================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + ibool add_newline); /*!< in: whether to add a newline */ + +/*********************************************************************//** +Checks whether an index's first fields are the columns in the array +in the same order, and the index is not marked for deletion and is not +the same as types_idx. +@return whether the index qualifies */ +bool +dict_foreign_qualify_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* index, /*!< in: index to check */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error, /*!< out: error code */ + ulint* err_col_no, + /*!< out: column number where + error happened */ + dict_index_t** err_index) + /*!< out: index where error + happened */ + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the first index on the table (the clustered index). +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_first_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the last index on the table. +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_last_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the next index on the table.
+@return index, NULL if none left */ +UNIV_INLINE +dict_index_t* +dict_table_get_next_index( +/*======================*/ + const dict_index_t* index) /*!< in: index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes) +# define dict_table_get_last_index(table) UT_LIST_GET_LAST((table)->indexes) +# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index) +#endif /* UNIV_DEBUG */ + +#define dict_index_is_clust(index) (index)->is_clust() +#define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust() +#define dict_index_is_unique(index) (index)->is_unique() +#define dict_index_is_spatial(index) (index)->is_spatial() +#define dict_index_is_ibuf(index) (index)->is_ibuf() +#define dict_index_is_sec_or_ibuf(index) !(index)->is_primary() +#define dict_index_has_virtual(index) (index)->has_virtual() + +/** Get all the FTS indexes on a table. +@param[in] table table +@param[out] indexes all FTS indexes on this table +@return number of FTS indexes */ +ulint +dict_table_get_all_fts_indexes( + const dict_table_t* table, + ib_vector_t* indexes); + +/********************************************************************//** +Gets the number of user-defined non-virtual columns in a table in the +dictionary cache. +@return number of user-defined (e.g., not ROW_ID) non-virtual +columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_user_cols( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((warn_unused_result)); +/********************************************************************//** +Gets the number of all non-virtual columns (also system) in a table +in the dictionary cache. +@return number of columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_cols( +/*==================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Gets the number of virtual columns in a table in the dictionary cache. +@param[in] table the table to check +@return number of virtual columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_v_cols( + const dict_table_t* table); + +/** Check if a table has indexed virtual columns +@param[in] table the table to check +@return true if the table has indexed virtual columns */ +UNIV_INLINE +bool +dict_table_has_indexed_v_cols( + const dict_table_t* table); + +/********************************************************************//** +Gets the estimated number of rows in the table. +@return estimated number of rows */ +UNIV_INLINE +ib_uint64_t +dict_table_get_n_rows( +/*==================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((warn_unused_result)); +/********************************************************************//** +Increment the number of rows in the table by one. +Notice that this operation is not protected by any latch; the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_inc( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ + MY_ATTRIBUTE((nonnull));
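The first/next accessors above are typically combined into the following iteration pattern over a table's indexes (a sketch, assuming an open dict_table_t* named table):

for (const dict_index_t *index = dict_table_get_first_index(table);
     index != NULL;
     index = dict_table_get_next_index(index)) {
  /* inspect one index per iteration; the clustered index comes first */
}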
+/********************************************************************//** +Decrement the number of rows in the table by one. +Notice that this operation is not protected by any latch; the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_dec( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ + MY_ATTRIBUTE((nonnull)); + +/** Get nth virtual column +@param[in] table target table +@param[in] col_nr column number in MySQL Table definition +@return dict_v_col_t ptr */ +dict_v_col_t* +dict_table_get_nth_v_col_mysql( + const dict_table_t* table, + ulint col_nr); + +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_nth_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + ulint pos) /*!< in: position of column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Gets the nth virtual column of a table. +@param[in] table table +@param[in] pos position of virtual column +@return pointer to virtual column object */ +UNIV_INLINE +dict_v_col_t* +dict_table_get_nth_v_col( + const dict_table_t* table, + ulint pos); +/********************************************************************//** +Gets the given system column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_sys_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + unsigned sys) /*!< in: DATA_ROW_ID, ... */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +#define dict_table_get_nth_col(table, pos) (&(table)->cols[pos]) +#define dict_table_get_sys_col(table, sys) \ + &(table)->cols[(table)->n_cols + (sys) - DATA_N_SYS_COLS] +/* Gets the nth virtual column */ +#define dict_table_get_nth_v_col(table, pos) (&(table)->v_cols[pos]) +#endif /* UNIV_DEBUG */ +/** Wrapper function. +@see dict_col_t::name() +@param[in] table table +@param[in] col_nr column number in table +@return column name */ +inline +const char* +dict_table_get_col_name(const dict_table_t* table, ulint col_nr) +{ + return(dict_table_get_nth_col(table, col_nr)->name(*table)); +} + +/********************************************************************//** +Gets the given system column number of a table. +@return column number */ +UNIV_INLINE +unsigned +dict_table_get_sys_col_no( +/*======================*/ + const dict_table_t* table, /*!< in: table */ + unsigned sys) /*!< in: DATA_ROW_ID, ... */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/********************************************************************//** +Returns the minimum data size of an index record. +@return minimum data size in bytes */ +UNIV_INLINE +unsigned +dict_index_get_min_size( +/*====================*/ + const dict_index_t* index) /*!< in: index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#define dict_table_is_comp(table) (table)->not_redundant() + +/** Determine if a table uses atomic BLOBs (no locally stored prefix).
+@param[in] table InnoDB table +@return whether BLOBs are atomic */ +inline +bool +dict_table_has_atomic_blobs(const dict_table_t* table) +{ + return(DICT_TF_HAS_ATOMIC_BLOBS(table->flags)); +} + +/** @return potential max length stored inline for externally stored fields */ +inline size_t dict_table_t::get_overflow_field_local_len() const +{ + if (dict_table_has_atomic_blobs(this)) { + /* ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED: do not + store any BLOB prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE; + } + /* up to MySQL 5.1: store a 768-byte prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE + DICT_ANTELOPE_MAX_INDEX_COL_LEN; +} + +/** Set the various values in a dict_table_t::flags pointer. +@param[in,out] flags Pointer to a 4 byte Table Flags +@param[in] format File Format +@param[in] zip_ssize Zip Shift Size +@param[in] use_data_dir Table uses DATA DIRECTORY +@param[in] page_compressed Table uses page compression +@param[in] page_compression_level Page compression level */ +UNIV_INLINE +void +dict_tf_set( + ulint* flags, + rec_format_t format, + ulint zip_ssize, + bool use_data_dir, + bool page_compressed, + ulint page_compression_level); + +/** Convert a 32 bit integer table flags to the 32 bit FSP Flags. +Fsp Flags are written into the tablespace header at the offset +FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field. +The following chart shows the translation of the low order bit. +Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC +dict_table_t::flags | 0 | 1 | 1 | 1 +fil_space_t::flags | 0 | 0 | 1 | 1 +================================================================== +@param[in] table_flags dict_table_t::flags +@return tablespace flags (fil_space_t::flags) */ +inline uint32_t dict_tf_to_fsp_flags(unsigned table_flags) + MY_ATTRIBUTE((const)); + +/** Extract the ROW_FORMAT=COMPRESSED page size from table flags. +@param[in] flags flags +@return ROW_FORMAT=COMPRESSED page size +@retval 0 if not compressed */ +inline ulint dict_tf_get_zip_size(ulint flags) +{ + flags &= DICT_TF_MASK_ZIP_SSIZE; + return flags + ? (UNIV_ZIP_SIZE_MIN >> 1) + << (FSP_FLAGS_GET_ZIP_SSIZE(flags >> DICT_TF_POS_ZIP_SSIZE + << FSP_FLAGS_POS_ZIP_SSIZE)) + : 0; +} + +/********************************************************************//** +Checks if a column is in the ordering columns of the clustered index of a +table. Column prefixes are treated like whole columns. +@return TRUE if the column, or its prefix, is in the clustered key */ +ibool +dict_table_col_in_clustered_key( +/*============================*/ + const dict_table_t* table, /*!< in: table */ + ulint n) /*!< in: column number */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*******************************************************************//** +Check if the table has an FTS index. +@return TRUE if table has an FTS index */ +UNIV_INLINE +ibool +dict_table_has_fts_index( +/*=====================*/ + dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull, warn_unused_result));
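A worked example of the zip-size arithmetic in dict_tf_get_zip_size() above, assuming UNIV_ZIP_SIZE_MIN == 1024 (so UNIV_ZIP_SIZE_MIN >> 1 == 512); zip_size_of_ssize() is a hypothetical helper that skips the flag extraction:

static unsigned long zip_size_of_ssize(unsigned ssize)
{
  /* ssize 0 means the table is not ROW_FORMAT=COMPRESSED;
     otherwise: 1 -> 1024, 2 -> 2048, 3 -> 4096, 4 -> 8192, 5 -> 16384 */
  return ssize ? 512UL << ssize : 0;
}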
+/** Copies types of virtual columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). +@param[in,out] tuple data tuple +@param[in] table table +*/ +void +dict_table_copy_v_types( + dtuple_t* tuple, + const dict_table_t* table); + +/*******************************************************************//** +Copies types of columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). */ +void +dict_table_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull)); +/** Adds an index to the dictionary cache, with possible indexing of newly +added columns. +@param[in,out] index index; NOTE! The index memory + object is freed in this function! +@param[in] page_no root page number of the index +@param[in] add_v virtual columns being added along with ADD INDEX +@return DB_SUCCESS, or DB_CORRUPTION */ +dberr_t +dict_index_add_to_cache( + dict_index_t*& index, + ulint page_no, + const dict_add_v_col_t* add_v = NULL) + MY_ATTRIBUTE((warn_unused_result)); +/********************************************************************//** +Gets the number of fields in the internal representation of an index, +including fields added by the dictionary system. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_fields( +/*====================*/ + const dict_index_t* index) /*!< in: an internal + representation of index (in + the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/********************************************************************//** +Gets the number of fields in the internal representation of an index +that uniquely determine the position of an index entry in the index, if +we do not take multiversioning into account: in the B-tree use the value +returned by dict_index_get_n_unique_in_tree. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique( +/*====================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the number of fields in the internal representation of an index +which uniquely determine the position of an index entry in the index, if +we also take multiversioning into account. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique_in_tree( +/*============================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** The number of fields in the nonleaf page of spatial index, except +the page no field. */ +#define DICT_INDEX_SPATIAL_NODEPTR_SIZE 1 +/** +Gets the number of fields on nonleaf page level in the internal representation +of an index which uniquely determine the position of an index entry in the +index, if we also take multiversioning into account. Note, it doesn't +include page no field. +@param[in] index index +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique_in_tree_nonleaf( + const dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the number of user-defined ordering fields in the index.
In the internal +representation we add the row id to the ordering fields to make all indexes +unique, but this function returns the number of fields the user defined +in the index as ordering fields. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_ordering_defined_by_user( +/*======================================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth field of an index. +@return pointer to field object */ +UNIV_INLINE +dict_field_t* +dict_index_get_nth_field( +/*=====================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of field */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos)) +#endif /* UNIV_DEBUG */ +/********************************************************************//** +Gets pointer to the nth column in an index. +@return column */ +UNIV_INLINE +const dict_col_t* +dict_index_get_nth_col( +/*===================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the column number of the nth field in an index. +@return column number */ +UNIV_INLINE +ulint +dict_index_get_nth_col_no( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for column n in an index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INLINE +ulint +dict_index_get_nth_col_pos( +/*=======================*/ + const dict_index_t* index, /*!< in: index */ + ulint n, /*!< in: column number */ + ulint* prefix_col_pos) /*!< out: col num if prefix */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/** Looks for column n in an index. +@param[in] index index +@param[in] n column number +@param[in] inc_prefix true=consider column prefixes too +@param[in] is_virtual true==virtual column +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +ulint +dict_index_get_nth_col_or_prefix_pos( + const dict_index_t* index, /*!< in: index */ + ulint n, /*!< in: column number */ + bool inc_prefix, /*!< in: TRUE=consider + column prefixes too */ + bool is_virtual, /*!< in: is a virtual column + */ + ulint* prefix_col_pos) /*!< out: col num if prefix + */ + __attribute__((warn_unused_result)); +/********************************************************************//** +Looks for a matching field in an index. The column has to be the same. The +column in index must be complete, or must contain a prefix longer than the +column in index2. That is, we must be able to construct the prefix in index2 +from the prefix in index. 
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index from which to search */
+ const dict_index_t* index2, /*!< in: index */
+ ulint n) /*!< in: field number in index2 */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for the position of column n in the clustered index.
+@return position in internal representation of the clustered index */
+unsigned
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+/** Add a column to an index.
+@param index index
+@param table table
+@param col column
+@param prefix_len column prefix length
+@param descending whether to use descending order */
+void dict_index_add_col(dict_index_t *index, const dict_table_t *table,
+ dict_col_t *col, ulint prefix_len,
+ bool descending= false)
+ MY_ATTRIBUTE((nonnull));
+
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+void
+dict_index_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_index_t* index, /*!< in: index */
+ ulint n_fields) /*!< in: number of
+ field types to copy */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+ index_id_t index_id) /*!< in: index id */
+ MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+ index_id_t index_id) /*!< in: index id */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return TRUE if ok */
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index tree */
+ const dtuple_t* tuple) /*!< in: tuple used in a search */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Whether and when to allow temporary index names */
+enum check_name {
+ /** Require all indexes to be complete. */
+ CHECK_ALL_COMPLETE,
+ /** Allow aborted online index creation. */
+ CHECK_ABORTED_OK,
+ /** Allow partial indexes to exist. */
+ CHECK_PARTIAL_OK
+};
+/**********************************************************************//**
+Check for duplicate index entries in a table [using the index name] */
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+ const dict_table_t* table, /*!< in: Check for dup indexes
+ in this table */
+ enum check_name check) /*!< in: whether and when to allow
+ temporary index names */
+ MY_ATTRIBUTE((nonnull));
+#endif /* UNIV_DEBUG */
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return own: node pointer */
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap, /*!< in: memory heap where pointer
+ created */
+ ulint level) /*!< in: level of rec in tree:
+ 0 means leaf level */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Convert a physical record into a search tuple.
+@param[in] rec index record (not necessarily in an index page)
+@param[in] index index
+@param[in] leaf whether rec is in a leaf page
+@param[in] n_fields number of data fields
+@param[in,out] heap memory heap for allocation
+@return own: data tuple */
+dtuple_t*
+dict_index_build_data_tuple(
+ const rec_t* rec,
+ const dict_index_t* index,
+ bool leaf,
+ ulint n_fields,
+ mem_heap_t* heap)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+uint32_t
+dict_index_get_page(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void);
+/*==============================*/
+
+/* Online index creation @{ */
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+ const dict_index_t* index) /*!< in: secondary index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+ dict_index_t* index, /*!< in/out: index */
+ enum online_index_status status) /*!< in: status */
+ MY_ATTRIBUTE((nonnull));
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return TRUE if same db name */
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+ const char* name1, /*!< in: table name in the form
+ dbname '/' tablename */
+ const char* name2) /*!< in: table name in the form
+ dbname '/' tablename */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Get an index by name.
+@param[in] table the table where to look for the index
+@param[in] name the index name to look for
+@return index, NULL if does not exist */
+dict_index_t*
+dict_table_get_index_on_name(dict_table_t* table, const char* name)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Get an index by name.
+@param[in] table the table where to look for the index
+@param[in] name the index name to look for
+@return index, NULL if does not exist */
+inline
+const dict_index_t*
+dict_table_get_index_on_name(const dict_table_t* table, const char* name)
+{
+ return dict_table_get_index_on_name(const_cast<dict_table_t*>(table),
+ name);
+}
+
+/***************************************************************
+Check whether a column exists in an FTS index. */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+ /* out: ULINT_UNDEFINED if no match else
+ the offset within the vector */
+ ib_vector_t* indexes,/* in: vector containing only FTS indexes */
+ ulint col_no, /* in: col number to search for */
+ bool is_virtual)/*!< in: whether it is a virtual column */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Looks for an index with the given id given a table instance.
+@param[in] table table instance
+@param[in] id index id
+@return index or NULL */
+dict_index_t*
+dict_table_find_index_on_id(
+ const dict_table_t* table,
+ index_id_t id)
+ MY_ATTRIBUTE((nonnull(1)));
+
+/** Maximum number of columns in a foreign key constraint. Note that MySQL
+has a much lower limit on the number of columns allowed in a foreign key
+constraint */
+#define MAX_NUM_FK_COLUMNS 500
+
+/* Buffers for storing detailed information about the latest foreign key
+and unique key errors */
+extern FILE* dict_foreign_err_file;
+extern mysql_mutex_t dict_foreign_err_mutex;
+
+/** InnoDB data dictionary cache */
+class dict_sys_t
+{
+ /** The my_hrtime_coarse().val of the oldest lock_wait() start, or 0 */
+ std::atomic<ulonglong> latch_ex_wait_start;
+
+ /** the rw-latch protecting the data dictionary cache */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_lock latch;
+#ifdef UNIV_DEBUG
+ /** whether latch is being held in exclusive mode (by any thread) */
+ Atomic_relaxed<pthread_t> latch_ex;
+ /** number of S-latch holders */
+ Atomic_counter<uint32_t> latch_readers;
+#endif
+public:
+ /** Indexes of SYS_TABLE[] */
+ enum
+ {
+ SYS_TABLES= 0,
+ SYS_INDEXES,
+ SYS_COLUMNS,
+ SYS_FIELDS,
+ SYS_FOREIGN,
+ SYS_FOREIGN_COLS,
+ SYS_VIRTUAL
+ };
+ /** System table names */
+ static const span<const char> SYS_TABLE[];
+
+ /** all tables (persistent and temporary), hashed by name */
+ hash_table_t table_hash;
+ /** hash table of persistent table IDs */
+ hash_table_t table_id_hash;
+
+ /** the SYS_TABLES table */
+ dict_table_t *sys_tables;
+ /** the SYS_COLUMNS table */
+ dict_table_t *sys_columns;
+ /** the SYS_INDEXES table */
+ dict_table_t *sys_indexes;
+ /** the SYS_FIELDS table */
+ dict_table_t *sys_fields;
+ /** the SYS_FOREIGN table */
+ dict_table_t *sys_foreign;
+ /** the SYS_FOREIGN_COLS table */
+ dict_table_t *sys_foreign_cols;
+ /** the SYS_VIRTUAL table */
+ dict_table_t *sys_virtual;
+
+ /** @return whether all non-hard-coded system tables exist */
+ bool sys_tables_exist() const
+ { return UNIV_LIKELY(sys_foreign && sys_foreign_cols && sys_virtual); }
+
+ /** list of persistent tables that can be evicted */
+ UT_LIST_BASE_NODE_T(dict_table_t) table_LRU;
+ /** list of persistent tables that cannot be evicted */
+ UT_LIST_BASE_NODE_T(dict_table_t) table_non_LRU;
+
+private:
+ bool m_initialised= false;
+ /** the sequence of temporary table IDs */
+ std::atomic<table_id_t> temp_table_id{DICT_HDR_FIRST_ID};
+ /** hash table of temporary table IDs */
+ hash_table_t temp_id_hash;
+ /** the next value of DB_ROW_ID, backed by DICT_HDR_ROW_ID
+ (FIXME: remove this, and move to dict_table_t) */
+ Atomic_relaxed<row_id_t> row_id;
+ /** The synchronization interval of row_id */
+ static constexpr size_t ROW_ID_WRITE_MARGIN= 256;
+public:
+ /** Diagnostic message for exceeding the lock_wait() timeout */
+ static const char fatal_msg[];
+
+ /** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */
+ inline row_id_t get_new_row_id();
+
+ /** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */
+ inline void update_row_id(row_id_t id);
+
+ /** Recover the global DB_ROW_ID sequence on database startup */
+ void recover_row_id(row_id_t id)
+ {
+ row_id= ut_uint64_align_up(id, ROW_ID_WRITE_MARGIN) + ROW_ID_WRITE_MARGIN;
+ }
+
+ /** @return a new temporary table ID */
+ table_id_t acquire_temporary_table_id()
+ {
+ return temp_table_id.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ /** Look up a temporary table.
+ @param id temporary table ID
+ @return temporary table
+ @retval nullptr if the table does not exist
+ (should only happen during the rollback of CREATE...SELECT) */
+ dict_table_t *acquire_temporary_table(table_id_t id)
+ {
+ ut_ad(frozen());
+ dict_table_t *table;
+ ulint fold = ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &temp_id_hash, fold, dict_table_t*, table,
+ ut_ad(table->cached), table->id == id);
+ if (UNIV_LIKELY(table != nullptr))
+ {
+ DBUG_ASSERT(table->is_temporary());
+ DBUG_ASSERT(table->id >= DICT_HDR_FIRST_ID);
+ table->acquire();
+ }
+ return table;
+ }
+
+ /** Look up a persistent table.
+ @param id table ID
+ @return table
+ @retval nullptr if not cached */
+ dict_table_t *find_table(table_id_t id)
+ {
+ ut_ad(frozen());
+ dict_table_t *table;
+ ulint fold= ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &table_id_hash, fold, dict_table_t*, table,
+ ut_ad(table->cached), table->id == id);
+ DBUG_ASSERT(!table || !table->is_temporary());
+ return table;
+ }
+
+ bool is_initialised() const { return m_initialised; }
+
+ /** Initialise the data dictionary cache. */
+ void create();
+
+ /** Close the data dictionary cache on shutdown. */
+ void close();
+
+ /** Resize the hash tables based on the current buffer pool size. */
+ void resize();
+
+ /** Add a table definition to the data dictionary cache */
+ inline void add(dict_table_t* table);
+ /** Remove a table definition from the data dictionary cache.
+ @param[in,out] table cached table definition to be evicted
+ @param[in] lru whether this is part of least-recently-used eviction
+ @param[in] keep whether to keep (not free) the object */
+ void remove(dict_table_t* table, bool lru = false, bool keep = false);
+
+#ifdef UNIV_DEBUG
+ /** Find a table */
+ template<bool in_lru> bool find(const dict_table_t *table)
+ {
+ ut_ad(table);
+ ut_ad(table->can_be_evicted == in_lru);
+ ut_ad(frozen());
+ for (const dict_table_t* t= in_lru ? table_LRU.start : table_non_LRU.start;
+ t; t = UT_LIST_GET_NEXT(table_LRU, t))
+ {
+ if (t == table) return true;
+ ut_ad(t->can_be_evicted == in_lru);
+ }
+ return false;
+ }
+ /** Find a table */
+ bool find(const dict_table_t *table)
+ {
+ return table->can_be_evicted ? find<true>(table) : find<false>(table);
+ }
+#endif
+
+ /** Move a table to the non-LRU list from the LRU list. */
+ void prevent_eviction(dict_table_t *table)
+ {
+ ut_ad(locked());
+ ut_ad(find(table));
+ if (!table->can_be_evicted)
+ return;
+ table->can_be_evicted= false;
+ UT_LIST_REMOVE(table_LRU, table);
+ UT_LIST_ADD_LAST(table_non_LRU, table);
+ }
+
+#ifdef UNIV_DEBUG
+ /** @return whether any thread (not necessarily the current thread)
+ is holding the latch; that is, this check may return false
+ positives */
+ bool frozen() const { return latch_readers || latch_ex; }
+ /** @return whether any thread (not necessarily the current thread)
+ is holding a shared latch */
+ bool frozen_not_locked() const { return latch_readers; }
+ /** @return whether the current thread holds the exclusive latch */
+ bool locked() const { return latch_ex == pthread_self(); }
+#endif
+private:
+ /** Acquire the exclusive latch */
+ ATTRIBUTE_NOINLINE
+ void lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line));
+public:
+ /** @return the my_hrtime_coarse().val of the oldest lock_wait() start,
+ assuming that requests are served on a FIFO basis */
+ ulonglong oldest_wait() const
+ { return latch_ex_wait_start.load(std::memory_order_relaxed); }
+
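+ /* A minimal usage sketch (illustrative only, not part of the upstream
+ header; it assumes a valid table_id_t id is at hand): readers take the
+ shared latch around cache lookups, while DDL takes the exclusive latch
+ via lock()/unlock().
+
+     dict_sys.freeze(SRW_LOCK_CALL);       // shared latch
+     if (dict_table_t *t= dict_sys.find_table(id))
+       t->acquire();                       // pin the table before unlatching
+     dict_sys.unfreeze();
+ */
+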
+ /** Exclusively lock the dictionary cache. */
+ void lock(SRW_LOCK_ARGS(const char *file, unsigned line))
+ {
+ if (latch.wr_lock_try())
+ {
+ ut_ad(!latch_readers);
+ ut_ad(!latch_ex);
+ ut_d(latch_ex= pthread_self());
+ }
+ else
+ lock_wait(SRW_LOCK_ARGS(file, line));
+ }
+
+#ifdef UNIV_PFS_RWLOCK
+ /** Unlock the data dictionary cache. */
+ ATTRIBUTE_NOINLINE void unlock();
+ /** Acquire a shared lock on the dictionary cache. */
+ ATTRIBUTE_NOINLINE void freeze(const char *file, unsigned line);
+ /** Release a shared lock on the dictionary cache. */
+ ATTRIBUTE_NOINLINE void unfreeze();
+#else
+ /** Unlock the data dictionary cache. */
+ void unlock()
+ {
+ ut_ad(latch_ex == pthread_self());
+ ut_ad(!latch_readers);
+ ut_d(latch_ex= 0);
+ latch.wr_unlock();
+ }
+ /** Acquire a shared lock on the dictionary cache. */
+ void freeze()
+ {
+ latch.rd_lock();
+ ut_ad(!latch_ex);
+ ut_d(latch_readers++);
+ }
+ /** Release a shared lock on the dictionary cache. */
+ void unfreeze()
+ {
+ ut_ad(!latch_ex);
+ ut_ad(latch_readers--);
+ latch.rd_unlock();
+ }
+#endif
+
+ /** Estimate the used memory occupied by the data dictionary
+ table and index objects.
+ @return number of bytes occupied */
+ TPOOL_SUPPRESS_TSAN ulint rough_size() const
+ {
+ /* No latch; this is a very crude approximation anyway */
+ ulint size = UT_LIST_GET_LEN(table_LRU) + UT_LIST_GET_LEN(table_non_LRU);
+ size *= sizeof(dict_table_t)
+ + sizeof(dict_index_t) * 2
+ + (sizeof(dict_col_t) + sizeof(dict_field_t)) * 10
+ + sizeof(dict_field_t) * 5 /* total number of key fields */
+ + 200; /* arbitrary, covering names and overhead */
+ size += (table_hash.n_cells + table_id_hash.n_cells
+ + temp_id_hash.n_cells) * sizeof(hash_cell_t);
+ return size;
+ }
+
+ /** Evict unused, unlocked tables from table_LRU.
+ @param half whether to consider half the tables only (instead of all)
+ @return number of tables evicted */
+ ulint evict_table_LRU(bool half);
+
+ /** Look up a table in the dictionary cache.
+ @param name table name
+ @return table handle
+ @retval nullptr if not found */
+ dict_table_t *find_table(const span<const char> &name) const
+ {
+ ut_ad(frozen());
+ for (dict_table_t *table= static_cast<dict_table_t*>
+ (HASH_GET_FIRST(&table_hash, table_hash.calc_hash
+ (my_crc32c(0, name.data(), name.size()))));
+ table; table= table->name_hash)
+ if (strlen(table->name.m_name) == name.size() &&
+ !memcmp(table->name.m_name, name.data(), name.size()))
+ return table;
+ return nullptr;
+ }
+
+ /** Look up or load a table definition
+ @param name table name
+ @param ignore errors to ignore when loading the table definition
+ @return table handle
+ @retval nullptr if not found */
+ dict_table_t *load_table(const span<const char> &name,
+ dict_err_ignore_t ignore= DICT_ERR_IGNORE_NONE);
+
+ /** Attempt to load the system tables on startup
+ @return whether any discrepancy with the expected definition was found */
+ bool load_sys_tables();
+ /** Create or check system tables on startup */
+ dberr_t create_or_check_sys_tables();
+};
+
+/** the data dictionary cache */
+extern dict_sys_t dict_sys;
+
+/*********************************************************************//**
+Converts a database and table name from filesystem encoding
+(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) into two
+strings in UTF-8 encoding (e.g. dцb and aюbØc). The output buffers must be
+at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */
+void
+dict_fs2utf8(
+/*=========*/
+ const char* db_and_table, /*!< in: database and table names,
+ e.g. d@i1b/a@q1b@1Kc */
+ char* db_utf8, /*!< out: database name, e.g. dцb */
+ size_t db_utf8_size, /*!< in: db_utf8 size */
+ char* table_utf8, /*!< out: table name, e.g. aюbØc */
+ size_t table_utf8_size)/*!< in: table_utf8 size */
+ MY_ATTRIBUTE((nonnull));
+
+/** Flag an index corrupted both in the data dictionary cache
+and in the system table SYS_INDEXES.
+@param index index to be flagged as corrupted
+@param ctx context (for error log reporting) */
+void dict_set_corrupted(dict_index_t *index, const char *ctx)
+ ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull));
+
+/** Sets merge_threshold in the SYS_INDEXES table
+@param[in,out] index index
+@param[in] merge_threshold value to set */
+void
+dict_index_set_merge_threshold(
+ dict_index_t* index,
+ ulint merge_threshold);
+
+#ifdef UNIV_DEBUG
+/** Sets merge_threshold for all indexes in the dictionary cache for debug.
+@param[in] merge_threshold_all value to set for all indexes */
+void
+dict_set_merge_threshold_all_debug(
+ uint merge_threshold_all);
+#endif /* UNIV_DEBUG */
+
+/** Validate the table flags.
+@param[in] flags Table flags
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+ ulint flags);
+
+/** Validate both table flags and table flags2 and make sure they
+are compatible.
+@param[in] flags Table flags
+@param[in] flags2 Table flags2
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf2_is_valid(
+ ulint flags,
+ ulint flags2);
+
+/*********************************************************************//**
+This function should be called whenever a page is successfully
+compressed. Updates the compression padding information. */
+void
+dict_index_zip_success(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+This function should be called whenever a page compression attempt
+fails. Updates the compression padding information. */
+void
+dict_index_zip_failure(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Return the optimal page size, for which page will likely compress.
+@return page size beyond which page may not compress */
+ulint
+dict_index_zip_pad_optimal_page_size(
+/*=================================*/
+ dict_index_t* index) /*!< in: index for which page size
+ is requested */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Convert table flag to row format string.
+@return row format name */
+const char*
+dict_tf_to_row_format_string(
+/*=========================*/
+ ulint table_flag); /*!< in: row format setting */
+
+/** Encode the number of columns and the number of virtual columns in one
+4-byte value. This is possible because the number of columns in
+InnoDB is limited to 1017.
+@param[in] n_col number of non-virtual columns
+@param[in] n_v_col number of virtual columns
+@return encoded value */
+UNIV_INLINE
+ulint
+dict_table_encode_n_col(
+ ulint n_col,
+ ulint n_v_col);
+
+/** Decode the number of virtual and non-virtual columns in one 4-byte value.
+@param[in] encoded encoded value
+@param[in,out] n_col number of non-virtual columns
+@param[in,out] n_v_col number of virtual columns */
+UNIV_INLINE
+void
+dict_table_decode_n_col(
+ ulint encoded,
+ ulint* n_col,
+ ulint* n_v_col);
+
+/** Free the virtual column template
+@param[in,out] vc_templ virtual column template */
+UNIV_INLINE
+void
+dict_free_vc_templ(
+ dict_vcol_templ_t* vc_templ);
+
+/** Check whether the table has a virtual index.
+@param[in] table InnoDB table
+@return true if the table has a virtual index, false otherwise. */
+UNIV_INLINE
+bool
+dict_table_have_virtual_index(
+ dict_table_t* table);
+
+#include "dict0dict.inl"
+
+#endif
diff --git a/storage/innobase/include/dict0dict.inl b/storage/innobase/include/dict0dict.inl
new file mode 100644
index 00000000..4cc3eae9
--- /dev/null
+++ b/storage/innobase/include/dict0dict.inl
@@ -0,0 +1,1217 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0dict.inl
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "fsp0sysspace.h"
+#include "dict0pagecompress.h"
+
+/*********************************************************************//**
+Gets the minimum number of bytes per character.
+@return minimum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbminlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return col->mbminlen;
+}
+/*********************************************************************//**
+Gets the maximum number of bytes per character.
+@return maximum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbmaxlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return col->mbmaxlen;
+}
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type) /*!< out: data type */
+{
+ ut_ad(col != NULL);
+ ut_ad(type != NULL);
+
+ type->mtype = col->mtype;
+ type->prtype = col->prtype;
+ type->len = col->len;
+ type->mbminlen = col->mbminlen;
+ type->mbmaxlen = col->mbmaxlen;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(col->mtype == type->mtype);
+ ut_ad(col->prtype == type->prtype);
+ //ut_ad(col->len == type->len);
+ ut_ad(col->mbminlen == type->mbminlen);
+ ut_ad(col->mbmaxlen == type->mbmaxlen);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_min_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen));
+}
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_max_size_low(col->mtype, col->len));
+}
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen, comp));
+}
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+unsigned
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dict_col_get_fixed_size(col, comp));
+}
+
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+unsigned
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(col->ind);
+}
+
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index) /*!< in: clustered index */
+{
+ ulint i;
+
+ ut_ad(dict_index_is_clust(clust_index));
+
+ for (i = 0; i < clust_index->n_def; i++) {
+ const dict_field_t* field = &clust_index->fields[i];
+
+ if (!field->prefix_len && field->col == col) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Gets the column position in the given index.
+@param[in] col table column
+@param[in] index index to be searched for column
+@return position of column in the given index. */
+UNIV_INLINE
+ulint
+dict_col_get_index_pos(
+ const dict_col_t* col,
+ const dict_index_t* index)
+{
+ ulint i;
+
+ for (i = 0; i < index->n_def; i++) {
+ const dict_field_t* field = &index->fields[i];
+
+ if (!field->prefix_len && field->col == col) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(UT_LIST_GET_FIRST(((dict_table_t*) table)->indexes));
+}
+
+/********************************************************************//**
+Gets the last index on the table.
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return(UT_LIST_GET_LAST((const_cast<dict_table_t*>(table))
+ ->indexes));
+}
+
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ return(UT_LIST_GET_NEXT(indexes, (dict_index_t*) index));
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the number of user-defined non-virtual columns in a table in the
+dictionary cache.
+@return number of user-defined (e.g., not ROW_ID) non-virtual
+columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ /* n_cols counts stored columns only. A table may contain
+ virtual columns and no user-specified stored columns at all. */
+ ut_ad(table->n_cols >= DATA_N_SYS_COLS);
+ return unsigned(table->n_cols) - DATA_N_SYS_COLS;
+}
+
+/********************************************************************//**
+Gets the number of all non-virtual columns (also system) in a table
+in the dictionary cache.
+@return number of non-virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return(table->n_cols);
+}
+
+/** Gets the number of virtual columns in a table in the dictionary cache.
+@param[in] table the table to check
+@return number of virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_v_cols(
+ const dict_table_t* table)
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_v_cols);
+}
+
+/** Check if a table has indexed virtual columns
+@param[in] table the table to check
+@return true if the table has indexed virtual columns */
+UNIV_INLINE
+bool
+dict_table_has_indexed_v_cols(
+ const dict_table_t* table)
+{
+
+ for (unsigned i = 0; i < table->n_v_cols; i++) {
+ const dict_v_col_t* col = dict_table_get_nth_v_col(table, i);
+ if (col->m_col.ord_part) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
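+/* Worked example (illustrative only, assuming a hypothetical table): for
+CREATE TABLE t(a INT, b INT), dict_table_get_n_cols() returns 5, because
+the three system columns DB_ROW_ID, DB_TRX_ID and DB_ROLL_PTR are counted
+in n_cols, while dict_table_get_n_user_cols() returns
+5 - DATA_N_SYS_COLS = 2. */
+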
+/********************************************************************//**
+Gets the approximate number of rows in the table.
+@return estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->stat_initialized);
+
+ return(table->stat_n_rows);
+}
+
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch; the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ if (table->stat_initialized) {
+ ib_uint64_t n_rows = table->stat_n_rows;
+ if (n_rows < 0xFFFFFFFFFFFFFFFFULL) {
+ table->stat_n_rows = n_rows + 1;
+ }
+ }
+}
+
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch; the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ if (table->stat_initialized) {
+ ib_uint64_t n_rows = table->stat_n_rows;
+ if (n_rows > 0) {
+ table->stat_n_rows = n_rows - 1;
+ }
+ }
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos) /*!< in: position of column */
+{
+ ut_ad(pos < table->n_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return((dict_col_t*) (table->cols) + pos);
+}
+
+/** Gets the nth virtual column of a table.
+@param[in] table table
+@param[in] pos position of virtual column
+@return pointer to virtual column object */
+UNIV_INLINE
+dict_v_col_t*
+dict_table_get_nth_v_col(
+ const dict_table_t* table,
+ ulint pos)
+{
+ ut_ad(table);
+ ut_ad(pos < table->n_v_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!table->v_cols[pos].m_col.is_added());
+ ut_ad(!table->v_cols[pos].m_col.is_dropped());
+ return &table->v_cols[pos];
+}
+
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+{
+ dict_col_t* col;
+ col = dict_table_get_nth_col(table,
+ dict_table_get_sys_col_no(table, sys));
+ ut_ad(col->mtype == DATA_SYS);
+ ut_ad(col->prtype == (sys | DATA_NOT_NULL));
+
+ return(col);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+unsigned
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+{
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return unsigned(table->n_cols) + (sys - DATA_N_SYS_COLS);
+}
+
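+/* Worked example (illustrative only): with n_cols = 5 as in the earlier
+sketch, dict_table_get_sys_col_no() yields n_cols + (sys - DATA_N_SYS_COLS),
+i.e. positions 2, 3 and 4 for DATA_ROW_ID, DATA_TRX_ID and DATA_ROLL_PTR;
+the system columns always occupy the last DATA_N_SYS_COLS positions. */
+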
+/************************************************************************
+Check if the table has an FTS index. */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+ /* out: TRUE if table has an FTS index */
+ dict_table_t* table) /* in: table */
+{
+ return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS));
+}
+
+/** Validate the flags for tables that are not ROW_FORMAT=REDUNDANT.
+@param[in] flags table flags
+@return whether the flags are valid */
+inline
+bool
+dict_tf_is_valid_not_redundant(ulint flags)
+{
+ const bool atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags);
+
+ ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
+
+ if (!zip_ssize) {
+ /* Not ROW_FORMAT=COMPRESSED */
+ } else if (!atomic_blobs) {
+ /* ROW_FORMAT=COMPRESSED implies ROW_FORMAT=DYNAMIC
+ for the uncompressed page format */
+ return(false);
+ } else if (zip_ssize > PAGE_ZIP_SSIZE_MAX
+ || zip_ssize > srv_page_size_shift
+ || srv_page_size_shift > UNIV_ZIP_SIZE_SHIFT_MAX) {
+ /* KEY_BLOCK_SIZE is out of bounds, or
+ ROW_FORMAT=COMPRESSED is not supported with this
+ innodb_page_size (only up to 16KiB) */
+ return(false);
+ }
+
+ switch (DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags)) {
+ case 0:
+ /* PAGE_COMPRESSION_LEVEL=0 should imply PAGE_COMPRESSED=NO */
+ return(!DICT_TF_GET_PAGE_COMPRESSION(flags));
+ case 1: case 2: case 3: case 4: case 5: case 6: case 7: case 8: case 9:
+ /* PAGE_COMPRESSION_LEVEL requires
+ ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC
+ (not ROW_FORMAT=COMPRESSED or ROW_FORMAT=REDUNDANT)
+ and PAGE_COMPRESSED=YES */
+ return(!zip_ssize && DICT_TF_GET_PAGE_COMPRESSION(flags));
+ default:
+ /* Invalid PAGE_COMPRESSION_LEVEL value */
+ return(false);
+ }
+}
+
+/** Validate the table flags.
+@param[in] flags Table flags
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+ ulint flags)
+{
+ ut_ad(flags < 1U << DICT_TF_BITS);
+ /* The DATA_DIRECTORY flag can be assigned fully independently
+ of all other persistent table flags. */
+ flags &= ~DICT_TF_MASK_DATA_DIR;
+ if (!(flags & 1)) {
+ /* Only ROW_FORMAT=REDUNDANT has 0 in the least significant
+ bit. For ROW_FORMAT=REDUNDANT, only the DATA_DIR flag
+ (which we cleared above) can be set. If any other flags
+ are set, the flags are invalid. */
+ return(flags == 0 || flags == DICT_TF_MASK_NO_ROLLBACK);
+ }
+
+ return(dict_tf_is_valid_not_redundant(flags));
+}
+
+/** Validate both table flags and table flags2 and make sure they
+are compatible.
+@param[in] flags Table flags
+@param[in] flags2 Table flags2
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf2_is_valid(
+ ulint flags,
+ ulint flags2)
+{
+ if (!dict_tf_is_valid(flags)) {
+ return(false);
+ }
+
+ if ((flags2 & DICT_TF2_UNUSED_BIT_MASK) != 0) {
+ return(false);
+ }
+
+ return(true);
+}
+
+/********************************************************************//**
+Determine the record format from dict_table_t::flags.
+The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any
+other row format, DICT_TF_COMPACT will also be set.
+@return record format */
+UNIV_INLINE
+rec_format_t
+dict_tf_get_rec_format(
+/*===================*/
+ ulint flags) /*!< in: dict_table_t::flags */
+{
+ ut_a(dict_tf_is_valid(flags));
+
+ if (!DICT_TF_GET_COMPACT(flags)) {
+ return(REC_FORMAT_REDUNDANT);
+ }
+
+ if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+ return(REC_FORMAT_COMPACT);
+ }
+
+ if (DICT_TF_GET_ZIP_SSIZE(flags)) {
+ return(REC_FORMAT_COMPRESSED);
+ }
+
+ return(REC_FORMAT_DYNAMIC);
+}
+
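+/* For illustration only: ROW_FORMAT=DYNAMIC flags have DICT_TF_COMPACT and
+the ATOMIC_BLOBS bit set but a zero ZIP_SSIZE, so dict_tf_get_rec_format()
+falls through to REC_FORMAT_DYNAMIC; a nonzero ZIP_SSIZE would select
+REC_FORMAT_COMPRESSED instead. */
+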
+/** Set the various values in a dict_table_t::flags pointer.
+@param[in,out] flags Pointer to a 4 byte Table Flags
+@param[in] format File Format
+@param[in] zip_ssize Zip Shift Size
+@param[in] use_data_dir Table uses DATA DIRECTORY
+@param[in] page_compressed Table uses page compression
+@param[in] page_compression_level Page compression level */
+UNIV_INLINE
+void
+dict_tf_set(
+/*========*/
+ ulint* flags,
+ rec_format_t format,
+ ulint zip_ssize,
+ bool use_data_dir,
+ bool page_compressed,
+ ulint page_compression_level)
+{
+ *flags = use_data_dir ? 1 << DICT_TF_POS_DATA_DIR : 0;
+
+ switch (format) {
+ case REC_FORMAT_REDUNDANT:
+ ut_ad(zip_ssize == 0);
+ /* no other options are allowed */
+ ut_ad(!page_compressed);
+ return;
+ case REC_FORMAT_COMPACT:
+ *flags |= DICT_TF_COMPACT;
+ ut_ad(zip_ssize == 0);
+ break;
+ case REC_FORMAT_COMPRESSED:
+ *flags |= DICT_TF_COMPACT
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS)
+ | (zip_ssize << DICT_TF_POS_ZIP_SSIZE);
+ break;
+ case REC_FORMAT_DYNAMIC:
+ *flags |= DICT_TF_COMPACT
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS);
+ ut_ad(zip_ssize == 0);
+ break;
+ }
+
+ if (page_compressed) {
+ *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS)
+ | (1 << DICT_TF_POS_PAGE_COMPRESSION)
+ | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL);
+
+ ut_ad(zip_ssize == 0);
+ ut_ad(dict_tf_get_page_compression(*flags) == TRUE);
+ ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level);
+ }
+}
+
+/** Convert a 32 bit integer table flags to the 32 bit FSP Flags.
+Fsp Flags are written into the tablespace header at the offset
+FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field.
+The following chart shows the translation of the low order bit.
+Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags | 0 | 1 | 1 | 1
+fil_space_t::flags | 0 | 0 | 1 | 1
+==================================================================
+@param[in] table_flags dict_table_t::flags
+@return tablespace flags (fil_space_t::flags) */
+inline uint32_t dict_tf_to_fsp_flags(unsigned table_flags)
+{
+ uint32_t fsp_flags;
+ uint32_t page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(
+ table_flags);
+
+ ut_ad((DICT_TF_GET_PAGE_COMPRESSION(table_flags) == 0)
+ == (page_compression_level == 0));
+
+ DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return UINT32_MAX;);
+
+ /* No ROW_FORMAT=COMPRESSED for innodb_checksum_algorithm=full_crc32 */
+ if ((srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+ || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_FULL_CRC32)
+ && !(table_flags & DICT_TF_MASK_ZIP_SSIZE)) {
+
+ fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER
+ | FSP_FLAGS_FCRC32_PAGE_SSIZE();
+
+ if (page_compression_level) {
+ fsp_flags |= static_cast<uint32_t>(
+ innodb_compression_algorithm)
+ << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+ }
+ } else {
+ /* Adjust bit zero. */
+ fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0;
+
+ /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */
+ fsp_flags |= table_flags
+ & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS);
+
+ fsp_flags |= FSP_FLAGS_PAGE_SSIZE();
+
+ if (page_compression_level) {
+ fsp_flags |= FSP_FLAGS_MASK_PAGE_COMPRESSION;
+ }
+ }
+
+ ut_a(fil_space_t::is_valid_flags(fsp_flags, false));
+
+ if (DICT_TF_HAS_DATA_DIR(table_flags)) {
+ fsp_flags |= 1U << FSP_FLAGS_MEM_DATA_DIR;
+ }
+
+ fsp_flags |= page_compression_level << FSP_FLAGS_MEM_COMPRESSION_LEVEL;
+
+ return(fsp_flags);
+}
+
+/********************************************************************//**
+Convert a 32 bit integer table flags to the 32 bit integer that is written
+to a SYS_TABLES.TYPE field. The following chart shows the translation of
+the low order bit. Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+dict_table_t::flags | 0 | 1 | 1
+SYS_TABLES.TYPE | 1 | 1 | 1
+==================================================================
+@return ulint containing SYS_TABLES.TYPE */
+UNIV_INLINE
+ulint
+dict_tf_to_sys_tables_type(
+/*=======================*/
+ ulint flags) /*!< in: dict_table_t::flags */
+{
+ ulint type;
+
+ ut_a(dict_tf_is_valid(flags));
+
+ /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */
+ type = 1;
+
+ /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION,
+ PAGE_COMPRESSION_LEVEL are the same. */
+ type |= flags & (DICT_TF_MASK_ZIP_SSIZE
+ | DICT_TF_MASK_ATOMIC_BLOBS
+ | DICT_TF_MASK_DATA_DIR
+ | DICT_TF_MASK_PAGE_COMPRESSION
+ | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL
+ | DICT_TF_MASK_NO_ROLLBACK);
+
+ return(type);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ return(index->n_fields);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+ return(index->n_uniq);
+}
+
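+/* Worked example (illustrative only, assuming the standard clustered index
+layout): for CREATE TABLE t(a INT PRIMARY KEY, b INT), the clustered index
+stores (a, DB_TRX_ID, DB_ROLL_PTR, b), so dict_index_get_n_fields() = 4
+while dict_index_get_n_unique() = 1; only the user-defined PRIMARY KEY
+column determines uniqueness. */
+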
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_index_get_n_unique(index));
+ }
+
+ return(dict_index_get_n_fields(index));
+}
+
+/**
+Gets the number of fields on the nonleaf page level in the internal
+representation of an index which uniquely determine the position of an
+index entry in the index, if we also take multiversioning into account.
+Note that it does not include the page number field.
+@param[in] index index
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree_nonleaf(
+ const dict_index_t* index)
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (dict_index_is_spatial(index)) {
+ /* For a spatial index, a non-leaf page has only two
+ fields (mbr + page_no). Excluding the page number
+ field, one field remains. */
+ return(DICT_INDEX_SPATIAL_NODEPTR_SIZE);
+ } else {
+ return(dict_index_get_n_unique_in_tree(index));
+ }
+}
+
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation of clustered indexes we add the row id to the ordering fields
+to make a clustered index unique, but this function returns the number of
+fields the user defined in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ return(index->n_user_defined_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of field */
+{
+ ut_ad(pos < index->n_def);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return((dict_field_t*) (index->fields) + pos);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+{
+ return(field->col);
+}
+
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_field_get_col(dict_index_get_nth_field(index, pos)));
+}
+
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
+}
+
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+{
+ return(dict_index_get_nth_col_or_prefix_pos(index, n, false, false,
+ prefix_col_pos));
+}
+
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+unsigned
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ unsigned n= dict_index_get_n_fields(index);
+ unsigned size= 0;
+
+ while (n--)
+ size+= dict_col_get_min_size(dict_index_get_nth_col(index, n));
+
+ return size;
+}
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+uint32_t
+dict_index_get_page(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->page);
+}
+
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void)
+/*==============================*/
+{
+ return(srv_page_size / 16);
+}
+
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+ const dict_index_t* index) /*!< in: secondary index */
+{
+ enum online_index_status status;
+
+ status = (enum online_index_status) index->online_status;
+
+ /* Without the index->lock protection, the online
+ status can change from ONLINE_INDEX_CREATION to
+ ONLINE_INDEX_COMPLETE (or ONLINE_INDEX_ABORTED) in
+ row_log_apply() once log application is done. So, to make
+ sure the status is ONLINE_INDEX_CREATION or ONLINE_INDEX_COMPLETE,
+ you should always recheck after acquiring index->lock. */
+
+#ifdef UNIV_DEBUG
+ switch (status) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ return(status);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(status);
+}
+
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+ dict_index_t* index, /*!< in/out: index */
+ enum online_index_status status) /*!< in: status */
+{
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(index->lock.have_x());
+
+#ifdef UNIV_DEBUG
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ break;
+ case ONLINE_INDEX_ABORTED:
+ ut_ad(status == ONLINE_INDEX_ABORTED_DROPPED);
+ break;
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ index->online_status = status & 3;
+ ut_ad(dict_index_get_online_status(index) == status);
+}
+
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+#ifdef UNIV_DEBUG
+ if (dict_index_is_clust(index)) {
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_CREATION:
+ return(true);
+ case ONLINE_INDEX_COMPLETE:
+ return(false);
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ break;
+ }
+ ut_ad(0);
+ return(false);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(UNIV_UNLIKELY(dict_index_get_online_status(index)
+ != ONLINE_INDEX_COMPLETE));
+}
+
+/**********************************************************************//**
+Check whether a column exists in an FTS index.
+@return ULINT_UNDEFINED if no match else the offset within the vector */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+ ib_vector_t* indexes,/*!< in: vector containing only FTS indexes */
+ ulint col_no, /*!< in: col number to search for */
+ bool is_virtual) /*!< in: whether it is a virtual column */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(indexes); ++i) {
+ dict_index_t* index;
+
+ index = (dict_index_t*) ib_vector_getp(indexes, i);
+
+ if (index->contains_col_or_prefix(col_no, is_virtual)) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Determine bytes of column prefix to be stored in the undo log. Please
+note that if !dict_table_has_atomic_blobs(table), no prefix
+needs to be stored in the undo log.
+@return bytes of column prefix to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_field_len_store_undo(
+/*==========================*/
+ dict_table_t* table, /*!< in: table */
+ const dict_col_t* col) /*!< in: column which index prefix
+ is based on */
+{
+ if (!dict_table_has_atomic_blobs(table)) {
+ return(0);
+ }
+
+ if (col->max_prefix != 0) {
+ return(col->max_prefix);
+ }
+
+ return(REC_VERSION_56_MAX_INDEX_COL_LEN);
+}
+
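+/* For illustration only: for a ROW_FORMAT=DYNAMIC table (atomic BLOBs)
+with no prefix index on the column, dict_max_field_len_store_undo()
+returns REC_VERSION_56_MAX_INDEX_COL_LEN (3072 bytes); for
+ROW_FORMAT=REDUNDANT or COMPACT it returns 0, because no column prefix
+needs to be stored in the undo log. */
+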
+/** Determine the maximum bytes of a virtual column that need to be stored
+in the undo log.
+@param[in] table dict_table_t for the table
+@param[in] col_no virtual column number
+@return maximum bytes of virtual column to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_v_field_len_store_undo(
+ dict_table_t* table,
+ ulint col_no)
+{
+ const dict_col_t* col
+ = &dict_table_get_nth_v_col(table, col_no)->m_col;
+ ulint max_log_len;
+
+ /* This calculation conforms to the non-virtual column
+ maximum log length calculation:
+ 1) if no atomic BLOBs, up to REC_ANTELOPE_MAX_INDEX_COL_LEN
+ 2) if atomic BLOBs, up to col->max_prefix or
+ REC_VERSION_56_MAX_INDEX_COL_LEN, whichever is less */
+ if (dict_table_has_atomic_blobs(table)) {
+ if (DATA_BIG_COL(col) && col->max_prefix > 0) {
+ max_log_len = col->max_prefix;
+ } else {
+ max_log_len = DICT_MAX_FIELD_LEN_BY_FORMAT(table);
+ }
+ } else {
+ max_log_len = REC_ANTELOPE_MAX_INDEX_COL_LEN;
+ }
+
+ return(max_log_len);
+}
+
+/** Check whether the table resides in a file-per-table tablespace.
+This test does not use table flags2 since some REDUNDANT tables in the
+system tablespace may have garbage in the MIX_LEN field where flags2 is
+stored. These garbage MIX_LEN fields were written before v3.23.52.
+A patch was added to v3.23.52 which initializes the MIX_LEN field to 0.
+Since file-per-table tablespaces were added in 4.1, any SYS_TABLES
+record with a non-zero space ID will have a reliable MIX_LEN field.
+However, this test does not use flags2 from SYS_TABLES.MIX_LEN. Instead,
+assume that if the tablespace is not a predefined system tablespace,
+then it must be file-per-table.
+Also, during ALTER TABLE, the DICT_TF2_USE_FILE_PER_TABLE flag may not be
+set on one of the file-per-table tablespaces.
+This test cannot be done on a table in the process of being created
+because the space_id will be zero until the tablespace is created.
+@param[in] table An existing open table to check
+@return true if this table was created as a file-per-table tablespace. */
+UNIV_INLINE
+bool
+dict_table_is_file_per_table(
+ const dict_table_t* table) /*!< in: table to check */
+{
+ return table->space != fil_system.sys_space
+ && table->space != fil_system.temp_space;
+}
+
+/** Acquire the table handle. */
+inline void dict_table_t::acquire()
+{
+ ut_ad(dict_sys.frozen());
+ n_ref_count++;
+}
+
+/** Release the table handle.
+@return whether the last handle was released */
+inline
+bool
+dict_table_t::release()
+{
+ auto n = n_ref_count--;
+ ut_ad(n > 0);
+ return n == 1;
+}
+
+/** Encode the number of columns and the number of virtual columns in a
+4-byte value. This is possible because the number of columns in
+InnoDB is limited to 1017.
+@param[in] n_col number of non-virtual columns
+@param[in] n_v_col number of virtual columns
+@return encoded value */
+UNIV_INLINE
+ulint
+dict_table_encode_n_col(
+ ulint n_col,
+ ulint n_v_col)
+{
+ return(n_col + (n_v_col << 16));
+}
+
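+/* Worked example (illustrative only): dict_table_encode_n_col(17, 2)
+= 17 + (2 << 16) = 0x20011; dict_table_decode_n_col() below masks off the
+DICT_N_COLS_COMPACT flag bit and then recovers n_v_col = 2 from the high
+16 bits and n_col = 17 from the low 16 bits. */
+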
+@param[in]	encoded	encoded value
+@param[in,out]	n_col	number of non-virtual columns
+@param[in,out]	n_v_col	number of virtual columns */
+UNIV_INLINE
+void
+dict_table_decode_n_col(
+	ulint	encoded,
+	ulint*	n_col,
+	ulint*	n_v_col)
+{
+	ulint	num = encoded & ~DICT_N_COLS_COMPACT;
+	*n_v_col = num >> 16;
+	*n_col = num & 0xFFFF;
+}
+
+/** Free the virtual column template
+@param[in,out]	vc_templ	virtual column template */
+void
+dict_free_vc_templ(
+	dict_vcol_templ_t*	vc_templ)
+{
+	UT_DELETE_ARRAY(vc_templ->default_rec);
+	vc_templ->default_rec = NULL;
+
+	if (vc_templ->vtempl != NULL) {
+		ut_ad(vc_templ->n_v_col > 0);
+		for (ulint i = 0; i < vc_templ->n_col
+		     + vc_templ->n_v_col; i++) {
+			if (vc_templ->vtempl[i] != NULL) {
+				ut_free(vc_templ->vtempl[i]);
+			}
+		}
+		ut_free(vc_templ->vtempl);
+		vc_templ->vtempl = NULL;
+	}
+}
+
+/** Check whether the table has a virtual index.
+@param[in]	table	InnoDB table
+@return true if the table has a virtual index, false otherwise. */
+UNIV_INLINE
+bool
+dict_table_have_virtual_index(
+	dict_table_t*	table)
+{
+	for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(table);
+	     col_no++) {
+		const dict_v_col_t*	col
+			= dict_table_get_nth_v_col(table, col_no);
+
+		if (col->m_col.ord_part) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
new file mode 100644
index 00000000..f7d33d5b
--- /dev/null
+++ b/storage/innobase/include/dict0load.h
@@ -0,0 +1,220 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.h
+Loads database object definitions from the dictionary tables
+into the memory cache
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0load_h
+#define dict0load_h
+
+#include "dict0types.h"
+#include "trx0types.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "btr0types.h"
+
+#include <deque>
+
+/** A stack of table names related through foreign key constraints */
+typedef std::deque<table_name_t, ut_allocator<table_name_t> > dict_names_t;
+
+/** Check each tablespace found in the data dictionary.
+Then look at each table defined in SYS_TABLES that has a space_id > 0
+to find all the file-per-table tablespaces.
+
+In a crash recovery we already have some tablespace objects created from
+processing the REDO log. We will compare the
+space_id information in the data dictionary to what we find in the
+tablespace file. In addition, more validation will be done if recovery
+was needed and force_recovery is not set.
+
+We also scan for the biggest space id and store it in fil_system. */
+void dict_check_tablespaces_and_store_max_id();
+
+/** Make sure the data_file_name is saved in dict_table_t if needed.
+@param[in,out]	table	Table object */
+void dict_get_and_save_data_dir_path(dict_table_t* table);
+
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return table; NULL if table does not exist */
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+	table_id_t	table_id,	/*!< in: table id */
+	dict_err_ignore_t ignore_err);	/*!< in: errors to ignore
+					when loading the table */
+/********************************************************************//**
+This function is called when the database is booted.
+Loads system table index definitions except for the clustered index which
+is added to the dictionary cache at booting before calling this function. */
+void
+dict_load_sys_table(
+/*================*/
+	dict_table_t*	table);	/*!< in: system table */
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or where the table is referenced by a foreign key. Adds these
+constraints to the data dictionary.
+
+The foreign key constraint is loaded only if the referenced table is also
+in the dictionary cache. If the referenced table is not in dictionary
+cache, then it is added to the output parameter (fk_tables).
+
+@return DB_SUCCESS or error code */
+dberr_t
+dict_load_foreigns(
+/*===============*/
+	const char*	table_name,	/*!< in: table name */
+	const char**	col_names,	/*!< in: column names, or NULL
+					to use table->col_names */
+	trx_id_t	trx_id,		/*!< in: DDL transaction id,
+					or 0 to check
+					recursive load of tables
+					chained by FK */
+	bool		check_charsets,	/*!< in: whether to check
+					charset compatibility */
+	dict_err_ignore_t ignore_err,	/*!< in: error to be ignored */
+	dict_names_t&	fk_tables)	/*!< out: stack of table names
+					which must be loaded
+					subsequently to load all the
+					foreign key constraints. */
+	MY_ATTRIBUTE((nonnull(1)));
+
+/********************************************************************//**
+This function opens a system table, and returns the first record.
+@return first record of the system table */
+const rec_t*
+dict_startscan_system(
+/*==================*/
+	btr_pcur_t*	pcur,	/*!< out: persistent cursor to
+				the record */
+	mtr_t*		mtr,	/*!< in: the mini-transaction */
+	dict_table_t*	table);	/*!< in: system table */
+/********************************************************************//**
+This function gets the next system table record as we scan the table.
+@return the record if found, NULL if end of scan. */
+const rec_t*
+dict_getnext_system(
+/*================*/
+	btr_pcur_t*	pcur,	/*!< in/out: persistent cursor
+				to the record */
+	mtr_t*		mtr);	/*!< in: the mini-transaction */
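A hedged editorial usage sketch of the two scan helpers above. It is deliberately simplified: the real callers in storage/innobase/handler/i_s.cc additionally latch the dictionary and commit/restart the mini-transaction between records, and dict_sys.sys_tables is assumed here as the handle of the SYS_TABLES system table.

    mtr_t      mtr;
    btr_pcur_t pcur;

    mtr.start();
    /* iterate over every SYS_TABLES record */
    for (const rec_t* rec = dict_startscan_system(&pcur, &mtr,
                                                  dict_sys.sys_tables);
         rec != NULL;
         rec = dict_getnext_system(&pcur, &mtr)) {
        /* process one record, e.g. with dict_load_table_low() */
    }
    mtr.commit();

+
+/** Load a table definition from a SYS_TABLES record to dict_table_t.
+Do not load any columns or indexes.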
+@param[in,out]	mtr		mini-transaction
+@param[in]	uncommitted	whether to use READ UNCOMMITTED isolation level
+@param[in]	rec		SYS_TABLES record
+@param[out,own]	table		table, or nullptr
+@return error message
+@retval nullptr on success */
+const char *dict_load_table_low(mtr_t *mtr, bool uncommitted,
+                                const rec_t *rec, dict_table_t **table)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+This function parses a SYS_INDEXES record and populates a dict_index_t
+structure with the information from the record. For detailed information
+about SYS_INDEXES fields, please refer to the dict_boot() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_indexes_rec(
+/*=========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_INDEXES rec */
+	dict_index_t*	index,		/*!< out: dict_index_t to be
+					filled */
+	table_id_t*	table_id);	/*!< out: table id */
+/********************************************************************//**
+This function parses a SYS_COLUMNS record and populates a dict_column_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_columns_rec(
+/*=========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_COLUMNS rec */
+	dict_col_t*	column,		/*!< out: dict_col_t to be filled */
+	table_id_t*	table_id,	/*!< out: table id */
+	const char**	col_name,	/*!< out: column name */
+	ulint*		nth_v_col);	/*!< out: if virtual col, this
+					records its sequence number */
+
+/** This function parses a SYS_VIRTUAL record and extracts virtual column
+information
+@param[in]	rec		current SYS_VIRTUAL rec
+@param[in,out]	table_id	table id
+@param[in,out]	pos		virtual column position
+@param[in,out]	base_pos	base column position
+@return error message, or NULL on success */
+const char*
+dict_process_sys_virtual_rec(
+	const rec_t*	rec,
+	table_id_t*	table_id,
+	ulint*		pos,
+	ulint*		base_pos);
+/********************************************************************//**
+This function parses a SYS_FIELDS record and populates a dict_field_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_fields_rec(
+/*========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_FIELDS rec */
+	dict_field_t*	sys_field,	/*!< out: dict_field_t to be
+					filled */
+	ulint*		pos,		/*!< out: Field position */
+	index_id_t*	index_id,	/*!< out: current index id */
+	index_id_t	last_id);	/*!< in: previous index id */
+/********************************************************************//**
+This function parses a SYS_FOREIGN record and populates a dict_foreign_t
+structure with the information from the record. For detailed information
+about SYS_FOREIGN fields, please refer to the dict_load_foreign() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_rec(
+/*=========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_FOREIGN rec */
+	dict_foreign_t*	foreign);	/*!< out: dict_foreign_t to be
+					filled */
+/********************************************************************//**
+This function parses a SYS_FOREIGN_COLS record, extracts the necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_col_rec(
+/*=============================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_FOREIGN_COLS rec */
+	const char**	name,		/*!< out: foreign key constraint name */
+	const char**	for_col_name,	/*!< out: referencing column name */
+	const char**	ref_col_name,	/*!< out: referenced column name
+					in referenced table */
+	ulint*		pos);		/*!< out: column position */
+
+#endif
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
new file mode 100644
index 00000000..fde2a714
--- /dev/null
+++ b/storage/innobase/include/dict0mem.h
@@ -0,0 +1,2649 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0mem.h
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0mem_h
+#define dict0mem_h
+
+#include "dict0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "lock0types.h"
+#include "que0types.h"
+#include "sux_lock.h"
+#include "ut0mem.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+#include "fts0fts.h"
+#include "buf0buf.h"
+#include "mtr0mtr.h"
+#include "gis0type.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "mysql_com.h"
+#include <set>
+#include <algorithm>
+#include <iterator>
+#include <ostream>
+#include <mutex>
+#include <forward_list>
+
+/* Forward declaration. */
+struct ib_rbt_t;
+
+/** Type flags of an index: OR'ing of the flags is allowed to define a
+combination of types */
+/* @{ */
+#define DICT_CLUSTERED	1	/*!< clustered index; for other than
+				auto-generated clustered indexes,
+				also DICT_UNIQUE will be set */
+#define DICT_UNIQUE	2	/*!< unique index */
+#define DICT_IBUF	8	/*!< insert buffer tree */
+#define DICT_CORRUPT	16	/*!< bit to store the corrupted flag
+				in SYS_INDEXES.TYPE */
+#define DICT_FTS	32	/* FTS index; can't be combined with the
+				other flags */
+#define DICT_SPATIAL	64	/* SPATIAL index; can't be combined with the
+				other flags */
+#define DICT_VIRTUAL	128	/* Index on Virtual column */
+
+#define DICT_IT_BITS	8	/*!< number of bits used for
+				SYS_INDEXES.TYPE */
+/* @} */
+
+#if 0 /* not implemented, retained for history */
+/** Types for a table object */
+#define DICT_TABLE_ORDINARY	1	/*!< ordinary table */
+#define DICT_TABLE_CLUSTER_MEMBER	2
+#define DICT_TABLE_CLUSTER	3	/* this means that the table is
+					really a cluster definition */
+#endif
+
+/* Table and tablespace flags are generally not used for the Antelope file
+format except for the low order bit, which is used differently depending on
+where the flags are stored.
+
+==================== Low order flags bit =========================
+                    | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+SYS_TABLES.TYPE     |         1 |       1 |                      1
+dict_table_t::flags |         0 |       1 |                      1
+FSP_SPACE_FLAGS     |         0 |       0 |                      1
+fil_space_t::flags  |         0 |       0 |                      1
+
+Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1)
+and the tablespace flags field was always 0. In the 5.1 plugin, these fields
+were repurposed to identify compressed and dynamic row formats.
+
+The following types and constants describe the flags found in dict_table_t
+and SYS_TABLES.TYPE. Similar flags found in fil_space_t and FSP_SPACE_FLAGS
+are described in fsp0fsp.h. */
+
+/* @{ */
+/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */
+#define DICT_TF_REDUNDANT	0	/*!< Redundant row format. */
+/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */
+#define DICT_TF_COMPACT		1U	/*!< Compact row format. */
+
+/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether
+the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */
+constexpr uint32_t DICT_N_COLS_COMPACT= 1U << 31;
+
+/** Width of the COMPACT flag */
+#define DICT_TF_WIDTH_COMPACT	1
+
+/** Width of the ZIP_SSIZE flag */
+#define DICT_TF_WIDTH_ZIP_SSIZE	4
+
+/** Width of the ATOMIC_BLOBS flag. The ROW_FORMAT=REDUNDANT and
+ROW_FORMAT=COMPACT broke up BLOB and TEXT fields, storing the first 768 bytes
+in the clustered index. ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED
+store the whole blob or text field off-page atomically.
+Secondary indexes are created from this external data using row_ext_t
+to cache the BLOB prefixes. */
+#define DICT_TF_WIDTH_ATOMIC_BLOBS	1
+
+/** If a table is created with the MYSQL option DATA DIRECTORY and
+innodb-file-per-table, an older engine will not be able to find that table.
+This flag prevents older engines from attempting to open the table and
+allows InnoDB to update_create_info() accordingly. */
+#define DICT_TF_WIDTH_DATA_DIR 1
+
+/**
+Width of the page compression flag
+*/
+#define DICT_TF_WIDTH_PAGE_COMPRESSION 1
+#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4
+
+/**
+The NO_ROLLBACK flag (3=yes; the values 1,2 were used for
+ATOMIC_WRITES=ON and ATOMIC_WRITES=OFF between MariaDB 10.1.0 and 10.2.3)
+*/
+#define DICT_TF_WIDTH_NO_ROLLBACK 2
+
+/** Width of all the currently known table flags */
+#define DICT_TF_BITS	(DICT_TF_WIDTH_COMPACT \
+			+ DICT_TF_WIDTH_ZIP_SSIZE \
+			+ DICT_TF_WIDTH_ATOMIC_BLOBS \
+			+ DICT_TF_WIDTH_DATA_DIR \
+			+ DICT_TF_WIDTH_PAGE_COMPRESSION \
+			+ DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \
+			+ DICT_TF_WIDTH_NO_ROLLBACK)
+
+/** Zero relative shift position of the COMPACT field */
+#define DICT_TF_POS_COMPACT		0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define DICT_TF_POS_ZIP_SSIZE		(DICT_TF_POS_COMPACT \
+					+ DICT_TF_WIDTH_COMPACT)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define DICT_TF_POS_ATOMIC_BLOBS	(DICT_TF_POS_ZIP_SSIZE \
+					+ DICT_TF_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the DATA_DIR field */
+#define DICT_TF_POS_DATA_DIR		(DICT_TF_POS_ATOMIC_BLOBS \
+					+ DICT_TF_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define DICT_TF_POS_PAGE_COMPRESSION	(DICT_TF_POS_DATA_DIR \
+					+ DICT_TF_WIDTH_DATA_DIR)
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL	(DICT_TF_POS_PAGE_COMPRESSION \
+					+ DICT_TF_WIDTH_PAGE_COMPRESSION)
+/** Zero relative shift position of the NO_ROLLBACK field */
+#define DICT_TF_POS_NO_ROLLBACK		(DICT_TF_POS_PAGE_COMPRESSION_LEVEL \
+					+ DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)
+#define DICT_TF_POS_UNUSED		(DICT_TF_POS_NO_ROLLBACK \
+					+ DICT_TF_WIDTH_NO_ROLLBACK)
+
+/** Bit mask of the COMPACT field */
+#define DICT_TF_MASK_COMPACT \
+		((~(~0U << DICT_TF_WIDTH_COMPACT)) \
+		<< DICT_TF_POS_COMPACT)
+/** Bit mask of the ZIP_SSIZE field */
+#define DICT_TF_MASK_ZIP_SSIZE \
+		((~(~0U << DICT_TF_WIDTH_ZIP_SSIZE)) \
+		<< DICT_TF_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define DICT_TF_MASK_ATOMIC_BLOBS \
+		((~(~0U << DICT_TF_WIDTH_ATOMIC_BLOBS)) \
+		<< DICT_TF_POS_ATOMIC_BLOBS)
+/** Bit mask of the DATA_DIR field */
+#define DICT_TF_MASK_DATA_DIR \
+		((~(~0U << DICT_TF_WIDTH_DATA_DIR)) \
+		<< DICT_TF_POS_DATA_DIR)
+/** Bit mask of the PAGE_COMPRESSION field */
+#define DICT_TF_MASK_PAGE_COMPRESSION \
+		((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION)) \
+		<< DICT_TF_POS_PAGE_COMPRESSION)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \
+		((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \
+		<< DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+/** Bit mask of the NO_ROLLBACK field */
+#define DICT_TF_MASK_NO_ROLLBACK \
+		((~(~0U << DICT_TF_WIDTH_NO_ROLLBACK)) \
+		<< DICT_TF_POS_NO_ROLLBACK)
+
+/** Return the value of the COMPACT field */
+#define DICT_TF_GET_COMPACT(flags) \
+		((flags & DICT_TF_MASK_COMPACT) \
+		>> DICT_TF_POS_COMPACT)
+/** Return the value of the ZIP_SSIZE field */
+#define DICT_TF_GET_ZIP_SSIZE(flags) \
+		((flags & DICT_TF_MASK_ZIP_SSIZE) \
+		>> DICT_TF_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define DICT_TF_HAS_ATOMIC_BLOBS(flags) \
+		((flags & DICT_TF_MASK_ATOMIC_BLOBS) \
+		>> DICT_TF_POS_ATOMIC_BLOBS)
+/** Return the value of the DATA_DIR field */
+#define DICT_TF_HAS_DATA_DIR(flags) \
+		((flags & DICT_TF_MASK_DATA_DIR) \
+		>> DICT_TF_POS_DATA_DIR)
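As a hedged editorial illustration of this layout: for ROW_FORMAT=COMPRESSED with KEY_BLOCK_SIZE=8, the compressed page size is 8192 = 512 << 4, so zip_ssize is 4 and atomic BLOBs are in use. A sketch that composes such a flags word and reads the fields back with the accessor macros:

    /* compose flags: COMPACT bit, zip_ssize = 4, atomic BLOBs on */
    uint32_t flags = DICT_TF_COMPACT
        | (4U << DICT_TF_POS_ZIP_SSIZE)
        | (1U << DICT_TF_POS_ATOMIC_BLOBS);

    ut_ad(DICT_TF_GET_COMPACT(flags) == 1);
    ut_ad(DICT_TF_GET_ZIP_SSIZE(flags) == 4);
    ut_ad(DICT_TF_HAS_ATOMIC_BLOBS(flags) == 1);

+/** Return the value of the PAGE_COMPRESSION field */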
+#define DICT_TF_GET_PAGE_COMPRESSION(flags) \
+		((flags & DICT_TF_MASK_PAGE_COMPRESSION) \
+		>> DICT_TF_POS_PAGE_COMPRESSION)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \
+		((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \
+		>> DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+
+/* @} */
+
+/** @brief Table Flags set number 2.
+
+These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags
+will be written as 0. The column may contain garbage for tables
+created with old versions of InnoDB that only implemented
+ROW_FORMAT=REDUNDANT. InnoDB engines do not check these flags
+for unknown bits in order to preserve backward compatibility. */
+/* @{ */
+/** Total number of bits in table->flags2. */
+#define DICT_TF2_BITS			7
+#define DICT_TF2_UNUSED_BIT_MASK	(~0U << DICT_TF2_BITS)
+#define DICT_TF2_BIT_MASK		~DICT_TF2_UNUSED_BIT_MASK
+
+/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */
+#define DICT_TF2_TEMPORARY		1U
+
+/** The table has an internal defined DOC ID column */
+#define DICT_TF2_FTS_HAS_DOC_ID		2U
+
+/** The table has an FTS index */
+#define DICT_TF2_FTS			4U
+
+/** Need to add Doc ID column for FTS index build.
+This is a transient bit for index build */
+#define DICT_TF2_FTS_ADD_DOC_ID		8U
+
+/** This bit is used during table creation to indicate that it will
+use its own tablespace instead of the system tablespace. */
+#define DICT_TF2_USE_FILE_PER_TABLE	16U
+
+/** Set when we discard/detach the tablespace */
+#define DICT_TF2_DISCARDED		32U
+
+/** This bit is set if all aux table names (both common tables and
+index tables) of a FTS table are in HEX format. */
+#define DICT_TF2_FTS_AUX_HEX_NAME	64U
+
+/* @} */
+
+#define DICT_TF2_FLAG_SET(table, flag) \
+	(table->flags2 |= (flag))
+
+#define DICT_TF2_FLAG_IS_SET(table, flag) \
+	(table->flags2 & (flag))
+
+#define DICT_TF2_FLAG_UNSET(table, flag) \
+	(table->flags2 &= ~(flag) & ((1U << DICT_TF2_BITS) - 1))
+
+/** Tables can be chained together with foreign key constraints. When
+the parent table is first loaded, we would load all of its descendants.
+This could eventually result in recursive calls and an out-of-stack error.
+DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads;
+when it is exceeded, the child table will not be loaded. It will be loaded
+when the foreign constraint check needs to be run. */
+#define DICT_FK_MAX_RECURSIVE_LOAD	20
+
+/** Similarly, when tables are chained together with foreign key constraints
+with ON DELETE/UPDATE CASCADE clauses, a delete from the parent table could
+result in recursive cascading calls. This defines the maximum number of
+such cascading deletes/updates allowed. When it is exceeded, the delete from
+the parent table will fail, and the user has to drop the excessive foreign
+constraints before proceeding. */
+#define FK_MAX_CASCADE_DEL		15
+
+/****************************************************************//**
+Free a table memory object. */
+void
+dict_mem_table_free(
+/*================*/
+	dict_table_t*	table);	/*!< in: table */
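A small hedged sketch of the flag-manipulation macros above; table is assumed to be a dict_table_t pointer obtained elsewhere:

    /* mark a table as file-per-table during creation, then clear the bit */
    DICT_TF2_FLAG_SET(table, DICT_TF2_USE_FILE_PER_TABLE);
    ut_ad(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE));
    DICT_TF2_FLAG_UNSET(table, DICT_TF2_USE_FILE_PER_TABLE);
    ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE));

+/**********************************************************************//**
+Adds a column definition to a table. */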
+void
+dict_mem_table_add_col(
+/*===================*/
+	dict_table_t*	table,	/*!< in: table */
+	mem_heap_t*	heap,	/*!< in: temporary memory heap, or NULL */
+	const char*	name,	/*!< in: column name, or NULL */
+	ulint		mtype,	/*!< in: main datatype */
+	ulint		prtype,	/*!< in: precise type */
+	ulint		len)	/*!< in: precision */
+	MY_ATTRIBUTE((nonnull(1)));
+/** Adds a virtual column definition to a table.
+@param[in,out]	table		table
+@param[in]	heap		temporary memory heap, or NULL. It is
+				used to store the name while we have not
+				finished adding all columns. When all columns
+				are added, the whole name is copied into
+				memory from table->heap
+@param[in]	name		column name
+@param[in]	mtype		main datatype
+@param[in]	prtype		precise type
+@param[in]	len		length
+@param[in]	pos		position in a table
+@param[in]	num_base	number of base columns
+@return the virtual column definition */
+dict_v_col_t*
+dict_mem_table_add_v_col(
+	dict_table_t*	table,
+	mem_heap_t*	heap,
+	const char*	name,
+	ulint		mtype,
+	ulint		prtype,
+	ulint		len,
+	ulint		pos,
+	ulint		num_base);
+
+/** Adds a stored column definition to a table.
+@param[in]	table		table
+@param[in]	num_base	number of base columns. */
+void
+dict_mem_table_add_s_col(
+	dict_table_t*	table,
+	ulint		num_base);
+
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
+void
+dict_mem_table_col_rename(
+/*======================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ulint		nth_col,/*!< in: column index */
+	const char*	from,	/*!< in: old column name */
+	const char*	to,	/*!< in: new column name */
+	bool		is_virtual);
+				/*!< in: if this is a virtual column */
+/**********************************************************************//**
+This function populates a dict_col_t memory structure with
+supplied information. */
+void
+dict_mem_fill_column_struct(
+/*========================*/
+	dict_col_t*	column,		/*!< out: column struct to be
+					filled */
+	ulint		col_pos,	/*!< in: column position */
+	ulint		mtype,		/*!< in: main data type */
+	ulint		prtype,		/*!< in: precise type */
+	ulint		col_len);	/*!< in: column length */
+/**********************************************************************//**
+This function populates a dict_index_t index memory structure with
+supplied information. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+	dict_index_t*	index,		/*!< out: index to be filled */
+	mem_heap_t*	heap,		/*!< in: memory heap */
+	const char*	index_name,	/*!< in: index name */
+	ulint		type,		/*!< in: DICT_UNIQUE,
+					DICT_CLUSTERED, ... ORed */
+	ulint		n_fields);	/*!< in: number of fields */
+/**********************************************************************//**
+Creates an index memory object.
+@return own: index object */
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+	dict_table_t*	table,		/*!< in: table */
+	const char*	index_name,	/*!< in: index name */
+	ulint		type,		/*!< in: DICT_UNIQUE,
+					DICT_CLUSTERED, ... ORed */
+	ulint		n_fields);	/*!< in: number of fields */
+
+/**********************************************************************//**
+Frees an index memory object. */
+void
+dict_mem_index_free(
+/*================*/
+	dict_index_t*	index);	/*!< in: index */
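A hedged editorial usage sketch of the column helpers above. The table object is assumed to have been created earlier (e.g. with dict_mem_table_create()); passing table->heap for the name allocations follows the pattern of upstream callers, but treat the details as illustrative only:

    dict_mem_table_add_col(table, table->heap, "id",
                           DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 4);
    dict_mem_table_add_col(table, table->heap, "name",
                           DATA_VARCHAR, DATA_NOT_NULL, 100);

+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.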
+@return own: foreign constraint struct */
+dict_foreign_t*
+dict_mem_foreign_create(void);
+/*=========================*/
+
+/**********************************************************************//**
+Sets the foreign_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup
+will point to foreign_table_name. If 2, then another string is
+allocated from the heap and set to lower case. */
+void
+dict_mem_foreign_table_name_lookup_set(
+/*===================================*/
+	dict_foreign_t*	foreign,	/*!< in/out: foreign struct */
+	ibool		do_alloc);	/*!< in: is an alloc needed */
+
+/**********************************************************************//**
+Sets the referenced_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup
+will point to referenced_table_name. If 2, then another string is
+allocated from the heap and set to lower case. */
+void
+dict_mem_referenced_table_name_lookup_set(
+/*======================================*/
+	dict_foreign_t*	foreign,	/*!< in/out: foreign struct */
+	ibool		do_alloc);	/*!< in: is an alloc needed */
+
+/** Fills the dependent virtual columns in a set.
+Reasons for being dependent are:
+1) FK can be present on a base column of virtual columns
+2) FK can be present on a column which is a part of a virtual index
+@param[in,out]	foreign	foreign key information. */
+void
+dict_mem_foreign_fill_vcol_set(
+	dict_foreign_t*	foreign);
+
+/** Fill the virtual column set in each fk constraint present in the table.
+@param[in,out]	table	innodb table object. */
+void
+dict_mem_table_fill_foreign_vcol_set(
+	dict_table_t*	table);
+
+/** Free the vcol_set from all foreign key constraints on the table.
+@param[in,out]	table	innodb table object. */
+void
+dict_mem_table_free_foreign_vcol_set(
+	dict_table_t*	table);
+
+/** Create a temporary tablename like "#sql-ibNNN".
+@param[in]	heap	A memory heap
+@param[in]	dbtab	Table name in the form database/table name
+@param[in]	id	Table id
+@return A unique temporary tablename suitable for InnoDB use */
+char*
+dict_mem_create_temporary_tablename(
+	mem_heap_t*	heap,
+	const char*	dbtab,
+	table_id_t	id);
+
+/** SQL identifier name wrapper for pretty-printing */
+class id_name_t
+{
+public:
+	/** Default constructor */
+	id_name_t()
+		: m_name()
+	{}
+	/** Constructor
+	@param[in]	name	identifier to assign */
+	explicit id_name_t(
+		const char*	name)
+		: m_name(name)
+	{}
+
+	/** Assignment operator
+	@param[in]	name	identifier to assign */
+	id_name_t& operator=(
+		const char*	name)
+	{
+		m_name = name;
+		return(*this);
+	}
+
+	/** Implicit type conversion
+	@return the name */
+	operator const char*() const
+	{
+		return(m_name);
+	}
+
+	/** Explicit type conversion
+	@return the name */
+	const char* operator()() const
+	{
+		return(m_name);
+	}
+
+private:
+	/** The name in internal representation */
+	const char*	m_name;
+};
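A small editorial illustration of the wrapper's two conversion paths:

    id_name_t name("PRIMARY");
    const char* s = name;     /* implicit conversion */
    const char* t = name();   /* explicit conversion */
    ut_ad(s == t);            /* both return the wrapped pointer */

+
+/** Data structure for a column in a table */
+struct dict_col_t{
+	/*----------------------*/
+	/** The following are copied from dtype_t,
+	so that all bit-fields can be packed tightly. */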
+	/* @{ */
+	unsigned	prtype:32;	/*!< precise type; MySQL data
+					type, charset code, flags to
+					indicate nullability,
+					signedness, whether this is a
+					binary string, whether this is
+					a true VARCHAR where MySQL
+					uses 2 bytes to store the length */
+	unsigned	mtype:8;	/*!< main data type */
+
+	/* the remaining fields do not affect alphabetical ordering: */
+
+	unsigned	len:16;		/*!< length; for MySQL data this
+					is field->pack_length(),
+					except that for a >= 5.0.3
+					type true VARCHAR this is the
+					maximum byte length of the
+					string data (in addition to
+					the string, MySQL uses 1 or 2
+					bytes to store the string length) */
+
+	unsigned	mbminlen:3;	/*!< minimum length of a
+					character, in bytes */
+	unsigned	mbmaxlen:3;	/*!< maximum length of a
+					character, in bytes */
+	/*----------------------*/
+	/* End of definitions copied from dtype_t */
+	/* @} */
+
+	unsigned	ind:10;		/*!< table column position
+					(starting from 0) */
+	unsigned	ord_part:1;	/*!< nonzero if this column
+					appears in the ordering fields
+					of an index */
+	unsigned	max_prefix:12;	/*!< maximum index prefix length on
+					this column. Our current max limit is
+					3072 (REC_VERSION_56_MAX_INDEX_COL_LEN)
+					bytes. */
+private:
+	/** Special value of ind for a dropped column */
+	static const unsigned DROPPED = 1023;
+public:
+
+	/** Detach a virtual column from an index.
+	@param index	being-freed index */
+	inline void detach(const dict_index_t &index);
+
+	/** Data for instantly added columns */
+	struct def_t
+	{
+		/** original default value of instantly added column */
+		const void*	data;
+		/** len of data, or UNIV_SQL_DEFAULT if unavailable */
+		ulint		len;
+	} def_val;
+
+	/** Retrieve the column name.
+	@param table	the table of this column */
+	const char *name(const dict_table_t &table) const;
+
+	/** @return whether this is a virtual column */
+	bool is_virtual() const { return prtype & DATA_VIRTUAL; }
+	/** @return whether NULL is an allowed value for this column */
+	bool is_nullable() const { return !(prtype & DATA_NOT_NULL); }
+
+	/** @return whether table of this system field is TRX_ID-based */
+	bool vers_native() const
+	{
+		ut_ad(vers_sys_start() || vers_sys_end());
+		ut_ad(mtype == DATA_INT || mtype == DATA_FIXBINARY);
+		return mtype == DATA_INT;
+	}
+	/** @return whether this user column (not row_start, row_end)
+	has System Versioning property */
+	bool is_versioned() const { return !(~prtype & DATA_VERSIONED); }
+	/** @return whether this is the system version start */
+	bool vers_sys_start() const
+	{
+		return (prtype & DATA_VERSIONED) == DATA_VERS_START;
+	}
+	/** @return whether this is the system version end */
+	bool vers_sys_end() const
+	{
+		return (prtype & DATA_VERSIONED) == DATA_VERS_END;
+	}
+
+	/** @return whether this is an instantly-added column */
+	bool is_added() const
+	{
+		DBUG_ASSERT(def_val.len != UNIV_SQL_DEFAULT || !def_val.data);
+		return def_val.len != UNIV_SQL_DEFAULT;
+	}
+	/** Flag the column instantly dropped */
+	void set_dropped() { ind = DROPPED; }
+	/** Flag the column instantly dropped.
+	@param not_null	whether the column was NOT NULL
+	@param len2	whether the length exceeds 255 bytes
+	@param fixed	the fixed length in bytes, or 0 */
+	void set_dropped(bool not_null, bool len2, unsigned fixed)
+	{
+		DBUG_ASSERT(!len2 || !fixed);
+		prtype= not_null ? DATA_NOT_NULL | DATA_BINARY_TYPE : DATA_BINARY_TYPE;
+		if (fixed)
+		{
+			mtype= DATA_FIXBINARY;
+			len= static_cast<uint16_t>(fixed);
+		}
+		else
+		{
+			mtype= DATA_BINARY;
+			len= len2 ? 65535 : 255;
+		}
+		mbminlen= mbmaxlen= 0;
+		ind= DROPPED;
+		ord_part= 0;
+		max_prefix= 0;
+	}
+	/** @return whether the column was instantly dropped */
+	bool is_dropped() const { return ind == DROPPED; }
+	/** @return whether the column was instantly dropped
+	@param index	the clustered index */
+	inline bool is_dropped(const dict_index_t &index) const;
+
+	/** Get the default value of an instantly-added column.
+	@param[out]	len	value length (in bytes), or UNIV_SQL_NULL
+	@return default value
+	@retval NULL	if the default value is SQL NULL (len=UNIV_SQL_NULL) */
+	const byte *instant_value(ulint *len) const
+	{
+		DBUG_ASSERT(is_added());
+		*len= def_val.len;
+		return static_cast<const byte*>(def_val.data);
+	}
+
+	/** Remove the 'instant ADD' status of the column */
+	void clear_instant()
+	{
+		def_val.len= UNIV_SQL_DEFAULT;
+		def_val.data= NULL;
+	}
+
+	/** @return whether two columns have compatible data type encoding */
+	bool same_type(const dict_col_t &other) const
+	{
+		if (mtype != other.mtype)
+		{
+			/* For latin1_swedish_ci, DATA_CHAR and DATA_VARCHAR
+			will be used instead of DATA_MYSQL and DATA_VARMYSQL.
+			As long as mtype,prtype are being written to InnoDB
+			data dictionary tables, we cannot simplify this. */
+			switch (mtype) {
+			default:
+				return false;
+			case DATA_VARCHAR:
+				if (other.mtype != DATA_VARMYSQL)
+					return false;
+				goto check_encoding;
+			case DATA_VARMYSQL:
+				if (other.mtype != DATA_VARCHAR)
+					return false;
+				goto check_encoding;
+			case DATA_CHAR:
+				if (other.mtype != DATA_MYSQL)
+					return false;
+				goto check_encoding;
+			case DATA_MYSQL:
+				if (other.mtype != DATA_CHAR)
+					return false;
+				goto check_encoding;
+			}
+		}
+		else if (dtype_is_string_type(mtype))
+		{
+		check_encoding:
+			const uint16_t cset= dtype_get_charset_coll(prtype);
+			const uint16_t ocset= dtype_get_charset_coll(other.prtype);
+			return cset == ocset || dict_col_t::same_encoding(cset, ocset);
+		}
+
+		return true;
+	}
+
+	/** @return whether two collation codes have the same character encoding */
+	static bool same_encoding(uint16_t a, uint16_t b);
+
+	/** Determine if the columns have the same format
+	except for is_nullable() and is_versioned().
+	@param other	column to compare to
+	@return whether the columns have the same format */
+	bool same_format(const dict_col_t &other) const
+	{
+		return same_type(other) && len >= other.len &&
+			mbminlen == other.mbminlen && mbmaxlen >= other.mbmaxlen &&
+			!((prtype ^ other.prtype) & ~(DATA_NOT_NULL | DATA_VERSIONED |
+						      CHAR_COLL_MASK << 16 |
+						      DATA_LONG_TRUE_VARCHAR));
+	}
+
+	/** @return whether the column values are comparable by memcmp() */
+	bool is_binary() const { return prtype & DATA_BINARY_TYPE; }
+};
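An editorial sketch of the def_val convention that is_added(), instant_value() and clear_instant() above rely on: UNIV_SQL_DEFAULT in the len field marks a column that was not instantly added.

    dict_col_t col{};
    col.clear_instant();                 /* len = UNIV_SQL_DEFAULT, no data */
    ut_ad(!col.is_added());

    static const byte def_value[4] = {0, 0, 0, 42};
    col.def_val.data = def_value;        /* instantly added column ... */
    col.def_val.len = sizeof def_value;  /* ... with a 4-byte default */
    ut_ad(col.is_added());

    ulint len;
    const byte* v = col.instant_value(&len); /* v == def_value, len == 4 */

+
+/** Index information kept in a list inside the virtual column structure.
+The index id and the virtual column's position in the index will be logged.
+There can be multiple entries for a given index, each with a different
+position. */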
+struct dict_v_idx_t {
+	/** active index on the column */
+	dict_index_t*	index;
+
+	/** position in this index */
+	ulint		nth_field;
+
+	dict_v_idx_t(dict_index_t* index, ulint nth_field)
+		: index(index), nth_field(nth_field) {}
+};
+
+/** Data structure for a virtual column in a table */
+struct dict_v_col_t{
+	/** column structure */
+	dict_col_t		m_col;
+
+	/** array of base column pointers */
+	dict_col_t**		base_col;
+
+	/** number of base columns */
+	unsigned		num_base:10;
+
+	/** column pos in table */
+	unsigned		v_pos:10;
+
+	/** Virtual index list, and column position in the index */
+	std::forward_list<dict_v_idx_t, ut_allocator<dict_v_idx_t> >
+		v_indexes;
+
+	/** Detach the column from an index.
+	@param index	index to be detached from */
+	void detach(const dict_index_t &index)
+	{
+		if (v_indexes.empty()) return;
+		auto i= v_indexes.before_begin();
+		do {
+			auto prev = i++;
+			if (i == v_indexes.end())
+			{
+				return;
+			}
+			if (i->index == &index)
+			{
+				v_indexes.erase_after(prev);
+				return;
+			}
+		}
+		while (i != v_indexes.end());
+	}
+};
+
+/** Data structure for a newly added virtual column in an index.
+It is used only during rollback_inplace_alter_table() of the addition
+of an index that depends on newly added virtual columns. It uses the
+index heap and should be freed when the index is being removed from
+the cache. */
+struct dict_add_v_col_info
+{
+  ulint n_v_col;
+  dict_v_col_t *v_col;
+
+  /** Add the newly added virtual column while rolling back
+  the index which contains new virtual columns
+  @param col	virtual column to be duplicated
+  @param offset	offset where to duplicate virtual column */
+  dict_v_col_t* add_drop_v_col(mem_heap_t *heap, dict_v_col_t *col,
+                               ulint offset)
+  {
+    ut_ad(n_v_col);
+    ut_ad(offset < n_v_col);
+    if (!v_col)
+      v_col= static_cast<dict_v_col_t*>
+        (mem_heap_alloc(heap, n_v_col * sizeof *v_col));
+    new (&v_col[offset]) dict_v_col_t();
+    v_col[offset].m_col= col->m_col;
+    v_col[offset].v_pos= col->v_pos;
+    return &v_col[offset];
+  }
+};
+
+/** Data structure for newly added virtual columns in a table */
+struct dict_add_v_col_t{
+	/** number of new virtual columns */
+	ulint			n_v_col;
+
+	/** column structures */
+	const dict_v_col_t*	v_col;
+
+	/** new col names */
+	const char**		v_col_name;
+};
+
+/** Data structure for a stored column in a table. */
+struct dict_s_col_t {
+	/** Stored column ptr */
+	dict_col_t*	m_col;
+	/** array of base col ptr */
+	dict_col_t**	base_col;
+	/** number of base columns */
+	ulint		num_base;
+	/** column pos in table */
+	ulint		s_pos;
+};
+
+/** List of stored columns for create_table_info_t */
+typedef std::forward_list<dict_s_col_t, ut_allocator<dict_s_col_t> >
+dict_s_col_list;
+
+/** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and
+is the maximum indexed column length (or indexed prefix length) in
+ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. Also, in any format,
+any fixed-length field that is longer than this will be encoded as
+a variable-length field.
+
+It is set to 3*256, so that one can create a column prefix index on
+256 characters of a TEXT or VARCHAR column also in the UTF-8
+charset. In that charset, a character may take at most 3 bytes. This
+constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define DICT_ANTELOPE_MAX_INDEX_COL_LEN	REC_ANTELOPE_MAX_INDEX_COL_LEN
+
+/** Find out the maximum indexed column length by its table format.
+For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum
+field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767).
+For ROW_FORMAT=COMPRESSED and ROW_FORMAT=DYNAMIC, the length could
+be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */
+#define DICT_MAX_FIELD_LEN_BY_FORMAT(table) \
+	(dict_table_has_atomic_blobs(table) \
+	 ? REC_VERSION_56_MAX_INDEX_COL_LEN \
+	 : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
+
+#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) \
+	(DICT_TF_HAS_ATOMIC_BLOBS(flags) \
+	 ? REC_VERSION_56_MAX_INDEX_COL_LEN \
+	 : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
+
+/** Defines the maximum fixed length column size */
+#define DICT_MAX_FIXED_COL_LEN		DICT_ANTELOPE_MAX_INDEX_COL_LEN
+
+#ifdef WITH_WSREP
+#define WSREP_MAX_SUPPORTED_KEY_LENGTH 3500
+#endif /* WITH_WSREP */
+
+/** Data structure for a field in an index */
+struct dict_field_t{
+	dict_col_t*	col;		/*!< pointer to the table column */
+	id_name_t	name;		/*!< name of the column */
+	unsigned	prefix_len:12;	/*!< 0 or the length of the column
+					prefix in bytes in a MySQL index of
+					type, e.g., INDEX (textcol(25));
+					must be smaller than
+					DICT_MAX_FIELD_LEN_BY_FORMAT;
+					NOTE that in the UTF-8 charset, MySQL
+					sets this to (mbmaxlen * the prefix len)
+					in UTF-8 chars */
+	unsigned	fixed_len:10;	/*!< 0 or the fixed length of the
+					column if smaller than
+					DICT_ANTELOPE_MAX_INDEX_COL_LEN */
+	/** 1=DESC, 0=ASC */
+	unsigned	descending:1;
+
+	/** Zero-initialize all fields */
+	dict_field_t() { memset((void*) this, 0, sizeof *this); }
+
+	/** Check whether two index fields are equivalent.
+	@param[in]	other	the other index field
+	@return whether the index fields are equivalent */
+	bool same(const dict_field_t& other) const
+	{
+		return(prefix_len == other.prefix_len
+		       && fixed_len == other.fixed_len);
+	}
+};
+
+/**********************************************************************//**
+PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID
+COMPRESSION FAILURES
+(Note: this is relevant only for compressed indexes)
+GOAL: Avoid compression failures by maintaining information about the
+compressibility of data. If data is not very compressible then leave
+some extra space 'padding' in the uncompressed page making it more
+likely that compression of less than fully packed uncompressed page will
+succeed.
+
+This padding heuristic works by increasing the pad linearly until the
+desired failure rate is reached. A "round" is a fixed number of
+compression operations.
+After each round, the compression failure rate for that round is
+computed. If the failure rate is too high, then padding is incremented
+by a fixed value, otherwise it's left intact.
+If the compression failure rate is lower than the desired rate for a fixed
+number of consecutive rounds, then the padding is decreased by a fixed
+value. This is done to prevent overshooting the padding value,
+and to accommodate the possible change in data compressibility. */
+
+/** Number of zip ops in one round. */
+#define ZIP_PAD_ROUND_LEN		(128)
+
+/** Number of successful rounds after which the padding is decreased */
+#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT	(5)
+
+/** Amount by which padding is increased. */
+#define ZIP_PAD_INCR			(128)
+
+/** Percentage of compression failures that are allowed in a single
+round */
+extern ulong zip_failure_threshold_pct;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+extern ulong zip_pad_max;
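A minimal editorial simulation of the heuristic just described, using the documented constants. It assumes a 16KiB page, a 5 percent failure threshold (the zip_failure_threshold_pct default) and a 50 percent pad cap (zip_pad_max); the per-round failure counts are made up for illustration.

    #include <cstdio>

    int main()
    {
      const unsigned ZIP_PAD_ROUND_LEN = 128, ZIP_PAD_INCR = 128;
      const unsigned ZIP_PAD_SUCCESSFUL_ROUND_LIMIT = 5;
      const unsigned threshold_pct = 5, pad_max_pct = 50, page_size = 16384;

      unsigned pad = 0, ok_rounds = 0;
      /* pretend the first three rounds fail too often, the rest succeed */
      const unsigned failures_per_round[] = {20, 15, 10, 2, 1, 0, 0, 0, 0};

      for (unsigned f : failures_per_round) {
        if (f * 100 > threshold_pct * ZIP_PAD_ROUND_LEN) {
          ok_rounds = 0;                 /* too many failures: pad more */
          if ((pad + ZIP_PAD_INCR) * 100 <= pad_max_pct * page_size)
            pad += ZIP_PAD_INCR;
        } else if (++ok_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT) {
          ok_rounds = 0;                 /* long success streak: pad less */
          if (pad >= ZIP_PAD_INCR)
            pad -= ZIP_PAD_INCR;
        }
        printf("pad=%u bytes\n", pad);
      }
    }

+
+/** Data structure to hold information about how much space in
+an uncompressed page should be left as padding to avoid compression
+failures. This estimate is based on a self-adapting heuristic. */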
+struct zip_pad_info_t {
+	/** Dummy assignment operator for dict_index_t::clone() */
+	zip_pad_info_t &operator=(const zip_pad_info_t&) { return *this; }
+	std::mutex	mutex;	/*!< mutex protecting the info */
+	Atomic_relaxed<ulint>
+			pad;	/*!< number of bytes used as pad */
+	ulint		success;/*!< successful compression ops during
+				current round */
+	ulint		failure;/*!< failed compression ops during
+				current round */
+	ulint		n_rounds;/*!< number of currently successful
+				rounds */
+};
+
+/** Number of samples of data size kept when page compression fails for
+a certain index.*/
+#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE	10
+
+/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
+system clustered index when there is no primary key. */
+const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
+
+/** Data structure for an index. Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_index_create(). */
+struct dict_index_t {
+  /** Columns whose character-set collation is being changed */
+  struct col_info
+  {
+    /** number of columns whose charset-collation is being changed */
+    unsigned n_cols;
+    /** columns with changed charset-collation */
+    dict_col_t *cols;
+
+    /** Add a column with changed collation. */
+    dict_col_t *add(mem_heap_t *heap, const dict_col_t &col, unsigned offset)
+    {
+      ut_ad(offset < n_cols);
+      if (!cols)
+        cols= static_cast<dict_col_t*>
+          (mem_heap_alloc(heap, n_cols * sizeof col));
+      new (&cols[offset]) dict_col_t(col);
+      return &cols[offset];
+    }
+  };
+
+  /** Maximum number of fields */
+  static constexpr unsigned MAX_N_FIELDS= (1U << 10) - 1;
+
+	index_id_t	id;	/*!< id of the index */
+	mem_heap_t*	heap;	/*!< memory heap */
+	id_name_t	name;	/*!< index name */
+	dict_table_t*	table;	/*!< back pointer to table */
+	/** root page number, or FIL_NULL if the index has been detached
+	from storage (DISCARD TABLESPACE or similar),
+	or 1 if the index is in table->freed_indexes */
+	unsigned	page:32;
+	unsigned	merge_threshold:6;
+				/*!< In the pessimistic delete, if the page
+				data size drops below this limit in percent,
+				merging it to a neighbor is tried */
+# define DICT_INDEX_MERGE_THRESHOLD_DEFAULT 50
+	unsigned	type:DICT_IT_BITS;
+				/*!< index type (DICT_CLUSTERED, DICT_UNIQUE,
+				DICT_IBUF, DICT_CORRUPT) */
+#define MAX_KEY_LENGTH_BITS 12
+	unsigned	trx_id_offset:MAX_KEY_LENGTH_BITS;
+				/*!< position of the trx id column
+				in a clustered index record, if the fields
+				before it are known to be of a fixed size,
+				0 otherwise */
+#if (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
+# error (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
+#endif
+	unsigned	n_user_defined_cols:10;
+				/*!< number of columns the user defined to
+				be in the index: in the internal
+				representation we add more columns */
+	unsigned	nulls_equal:1;
+				/*!< if true, SQL NULL == SQL NULL */
+	unsigned	n_uniq:10;/*!< number of fields from the beginning
+				which are enough to determine an index
+				entry uniquely */
+	unsigned	n_def:10;/*!< number of fields defined so far */
+	unsigned	n_fields:10;/*!< number of fields in the index */
+	unsigned	n_nullable:10;/*!< number of nullable fields */
+	unsigned	n_core_fields:10;/*!< number of fields in the index
+				(before the first time of instant add columns) */
+	/** number of bytes of null bits in ROW_FORMAT!=REDUNDANT node pointer
+	records; usually equal to UT_BITS_IN_BYTES(n_nullable), but
+	can be less in clustered indexes with instant ADD COLUMN */
+	unsigned	n_core_null_bytes:8;
+	/** magic value signalling that n_core_null_bytes was not
+	initialized yet */
+	static const unsigned NO_CORE_NULL_BYTES = 0xff;
+	/** The clustered index ID of the hard-coded SYS_INDEXES table. */
+	static const unsigned DICT_INDEXES_ID = 3;
+	unsigned	cached:1;/*!< TRUE if the index object is in the
+				dictionary cache */
+	unsigned	to_be_dropped:1;
+				/*!< TRUE if the index is to be dropped;
+				protected by dict_sys.latch */
+	unsigned	online_status:2;
+				/*!< enum online_index_status.
+				Transitions from ONLINE_INDEX_COMPLETE (to
+				ONLINE_INDEX_CREATION) are protected
+				by dict_sys.latch. Other changes are
+				protected by index->lock. */
+	unsigned	uncommitted:1;
+				/*!< a flag that is set for secondary indexes
+				that have not been committed to the
+				data dictionary yet. Protected by
+				MDL */
+
+#ifdef UNIV_DEBUG
+	/** whether this is a dummy index object */
+	bool		is_dummy;
+	/** whether btr_cur_instant_init() is in progress */
+	bool		in_instant_init;
+	uint32_t	magic_n;/*!< magic number */
+/** Value of dict_index_t::magic_n */
+# define DICT_INDEX_MAGIC_N	76789786
+#endif
+	dict_field_t*	fields;	/*!< array of field descriptions */
+	st_mysql_ftparser*
+			parser;	/*!< fulltext parser plugin */
+
+	/** Indicates whether virtual columns were newly added during
+	ALTER, and stores the columns in case of an ALTER failure.
+	It uses the heap of dict_index_t and should be freed
+	while removing the index from the table. */
+	dict_add_v_col_info* new_vcol_info;
+
+	/** During ALTER TABLE, columns that a being-added index depends on
+	and whose encoding or collation is being changed to something
+	that is compatible with the clustered index.
+	Allocated from dict_index_t::heap.
+
+	@see rollback_inplace_alter_table()
+	@see ha_innobase_inplace_ctx::col_collations */
+	col_info* change_col_info;
+
+	UT_LIST_NODE_T(dict_index_t)
+			indexes;/*!< list of indexes of the table */
+#ifdef BTR_CUR_ADAPT
+	btr_search_t*	search_info;
+			/*!< info used in optimistic searches */
+#endif /* BTR_CUR_ADAPT */
+	row_log_t*	online_log;
+			/*!< the log of modifications
+			during online index creation;
+			valid when online_status is
+			ONLINE_INDEX_CREATION */
+	/*----------------------*/
+	/** Statistics for query optimization */
+	/* @{ */
+	ib_uint64_t*	stat_n_diff_key_vals;
+			/*!< approximate number of different
+			key values for this index, for each
+			n-column prefix where 1 <= n <=
+			dict_get_n_unique(index) (the array is
+			indexed from 0 to n_uniq-1); we
+			periodically calculate new
+			estimates */
+	ib_uint64_t*	stat_n_sample_sizes;
+			/*!< number of pages that were sampled
+			to calculate each of stat_n_diff_key_vals[],
+			e.g. stat_n_sample_sizes[3] pages were sampled
+			to get the number stat_n_diff_key_vals[3]. */
+	ib_uint64_t*	stat_n_non_null_key_vals;
+			/* approximate number of non-null key values
+			for this index, for each column where
+			1 <= n <= dict_get_n_unique(index) (the array
+			is indexed from 0 to n_uniq-1); This
+			is used when innodb_stats_method is
+			"nulls_ignored". */
+	ulint		stat_index_size;
+			/*!< approximate index size in
+			database pages */
+	ulint		stat_n_leaf_pages;
+			/*!< approximate number of leaf pages in the
+			index tree */
+	bool		stats_error_printed;
+			/*!< has a persistent statistics error been
+			printed for this index? */
+	/* @} */
+	/** Statistics for defragmentation; these numbers are estimates and
+	could be very inaccurate at certain times, e.g. right after restart,
+	during defragmentation, etc. */
+	/* @{ */
+	ulint		stat_defrag_modified_counter;
+	ulint		stat_defrag_n_pages_freed;
+			/* number of pages freed by defragmentation. */
+	ulint		stat_defrag_n_page_split;
+			/* number of page splits since last full index
+			defragmentation. */
+	ulint		stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
+			/* data size when compression failure happened
+			the most recent 10 times. */
+	ulint		stat_defrag_sample_next_slot;
+			/* in which slot the next sample should be
+			saved. */
+	/* @} */
+private:
+	/** R-tree split sequence number */
+	Atomic_relaxed<node_seq_t> rtr_ssn;
+public:
+	void set_ssn(node_seq_t ssn) { rtr_ssn= ssn; }
+	node_seq_t assign_ssn() { return rtr_ssn.fetch_add(1) + 1; }
+	node_seq_t ssn() const { return rtr_ssn; }
+
+	rtr_info_track_t*
+			rtr_track;/*!< tracking all R-Tree search cursors */
+	trx_id_t	trx_id; /*!< id of the transaction that created this
+				index, or 0 if the index existed
+				when InnoDB was started up */
+	zip_pad_info_t	zip_pad;/*!< Information about state of
+				compression failures and successes */
+	/** lock protecting the non-leaf index pages */
+	mutable index_lock lock;
+
+	/** Determine if the index has been committed to the
+	data dictionary.
+	@return whether the index definition has been committed */
+	bool is_committed() const
+	{
+		ut_ad(!uncommitted || !(type & DICT_CLUSTERED));
+		return(UNIV_LIKELY(!uncommitted));
+	}
+
+	/** Flag an index committed or uncommitted.
+	@param[in]	committed	whether the index is committed */
+	void set_committed(bool committed)
+	{
+		ut_ad(!to_be_dropped);
+		ut_ad(committed || !(type & DICT_CLUSTERED));
+		ut_ad(!committed || !change_col_info);
+		uncommitted = !committed;
+	}
+
+	/** Notify that the index pages are going to be modified.
+	@param[in,out]	mtr	mini-transaction */
+	inline void set_modified(mtr_t& mtr) const;
+
+	/** @return whether this index is readable
+	@retval true	normally
+	@retval false	if this is a single-table tablespace
+			and the .ibd file is missing, or a
+			page cannot be read or decrypted */
+	inline bool is_readable() const;
+
+	/** @return whether instant ALTER TABLE is in effect */
+	inline bool is_instant() const;
+
+	/** @return whether the index is the primary key index
+	(not the clustered index of the change buffer) */
+	bool is_primary() const
+	{
+		return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF));
+	}
+
+	/** @return whether this is a generated clustered index */
+	bool is_gen_clust() const { return type == DICT_CLUSTERED; }
+
+	/** @return whether this is a clustered index */
+	bool is_clust() const { return type & DICT_CLUSTERED; }
+
+	/** @return whether this is a unique index */
+	bool is_unique() const { return type & DICT_UNIQUE; }
+
+	/** @return whether this is a spatial index */
+	bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); }
+
+	/** @return whether this is the change buffer */
+	bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); }
+
+	/** @return whether this index requires locking */
+	bool has_locking() const { return !is_ibuf(); }
+
+	/** @return whether this is a normal B-tree index
+	(not the change buffer, not SPATIAL or FULLTEXT) */
+	bool is_btree() const {
+		return UNIV_LIKELY(!(type & (DICT_IBUF | DICT_SPATIAL
+					     | DICT_FTS | DICT_CORRUPT)));
+	}
+
+	/** @return whether the index includes virtual columns */
+	bool has_virtual() const { return type & DICT_VIRTUAL; }
+
+	/** @return the position of DB_TRX_ID */
+	uint16_t db_trx_id() const {
+		DBUG_ASSERT(is_primary());
+		DBUG_ASSERT(n_uniq);
+		DBUG_ASSERT(n_uniq <= MAX_REF_PARTS);
+		return n_uniq;
+	}
+	/** @return the position of DB_ROLL_PTR */
+	uint16_t db_roll_ptr() const
+	{
+		return static_cast<uint16_t>(db_trx_id() + 1);
+	}
+
+	/** @return the offset of the metadata BLOB field,
+	or the first user field after the PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR */
+	uint16_t first_user_field() const
+	{
+		return static_cast<uint16_t>(db_trx_id() + 2);
+	}
+
+	/** @return whether the index is corrupted */
+	inline bool is_corrupted() const;
+
+	/** Detach the virtual columns from the index that is to be removed. */
+	void detach_columns()
+	{
+		if (!has_virtual() || !cached)
+			return;
+		for (unsigned i= 0; i < n_fields; i++)
+		{
+			dict_col_t* col= fields[i].col;
+			if (!col || !col->is_virtual())
+				continue;
+			col->detach(*this);
+		}
+	}
+
+	/** Determine how many fields of a given prefix can be set NULL.
+	@param[in]	n_prefix	number of fields in the prefix
+	@return number of fields 0..n_prefix-1 that can be set NULL */
+	unsigned get_n_nullable(ulint n_prefix) const
+	{
+		DBUG_ASSERT(n_prefix > 0);
+		DBUG_ASSERT(n_prefix <= n_fields);
+		unsigned n = n_nullable;
+		for (; n_prefix < n_fields; n_prefix++) {
+			const dict_col_t* col = fields[n_prefix].col;
+			DBUG_ASSERT(!col->is_virtual());
+			n -= col->is_nullable();
+		}
+		DBUG_ASSERT(n < n_def);
+		return n;
+	}
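An editorial illustration of the record layout implied by the three position accessors above: for a clustered index on PRIMARY KEY(a, b), n_uniq is 2, so the fields are laid out as a, b, DB_TRX_ID, DB_ROLL_PTR, and then the remaining user columns (or the metadata BLOB).

    /* assuming a clustered index on PRIMARY KEY(a, b) */
    ut_ad(index->n_uniq == 2);
    ut_ad(index->db_trx_id() == 2);        /* DB_TRX_ID follows the key */
    ut_ad(index->db_roll_ptr() == 3);      /* then DB_ROLL_PTR */
    ut_ad(index->first_user_field() == 4); /* then the first non-key field */

+
+	/** Get the default value of an instantly-added clustered index field.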
+	@param[in]	n	instantly added field position
+	@param[out]	len	value length (in bytes), or UNIV_SQL_NULL
+	@return default value
+	@retval NULL	if the default value is SQL NULL (len=UNIV_SQL_NULL) */
+	const byte* instant_field_value(ulint n, ulint* len) const
+	{
+		DBUG_ASSERT(is_instant() || id == DICT_INDEXES_ID);
+		DBUG_ASSERT(n + (id == DICT_INDEXES_ID) >= n_core_fields);
+		DBUG_ASSERT(n < n_fields);
+		return fields[n].col->instant_value(len);
+	}
+
+	/** Adjust index metadata for instant ADD/DROP/reorder COLUMN.
+	@param[in]	instant	clustered index definition after
+				instant ALTER TABLE */
+	inline void instant_add_field(const dict_index_t& instant);
+	/** Remove instant ADD COLUMN metadata. */
+	inline void clear_instant_add();
+	/** Remove instant ALTER TABLE metadata. */
+	inline void clear_instant_alter();
+
+	/** Construct the metadata record for instant ALTER TABLE.
+	@param[in]	row	dummy or default values for existing columns
+	@param[in,out]	heap	memory heap for allocations
+	@return metadata record */
+	inline dtuple_t*
+	instant_metadata(const dtuple_t& row, mem_heap_t* heap) const;
+
+	/** Check if record in clustered index is historical row.
+	@param[in]	rec	clustered row
+	@param[in]	offsets	offsets
+	@return true if row is historical */
+	bool
+	vers_history_row(const rec_t* rec, const rec_offs* offsets);
+
+	/** Check if record in secondary index is historical row.
+	@param[in]	rec	record in a secondary index
+	@param[out]	history_row	true if row is historical
+	@return true on error */
+	bool
+	vers_history_row(const rec_t* rec, bool &history_row);
+
+	/** Assign the number of new columns to be added as part
+	of the index
+	@param	n_vcol	number of virtual columns to be added */
+	void assign_new_v_col(ulint n_vcol)
+	{
+		new_vcol_info= static_cast<dict_add_v_col_info*>
+			(mem_heap_zalloc(heap, sizeof *new_vcol_info));
+		new_vcol_info->n_v_col= n_vcol;
+	}
+
+	/* @return whether the index has a newly added virtual column */
+	bool has_new_v_col() const { return new_vcol_info; }
+
+	/* @return number of newly added virtual columns */
+	ulint get_new_n_vcol() const
+	{ return new_vcol_info ? new_vcol_info->n_v_col : 0; }
+
+	/** Assign the number of collation change fields as a part of the index
+	@param	n_cols	number of columns whose collation is changing */
+	void init_change_cols(unsigned n_cols)
+	{
+		ut_ad(n_fields > n_cols || type & DICT_FTS);
+		change_col_info= static_cast<col_info*>
+			(mem_heap_zalloc(heap, sizeof(col_info)));
+		change_col_info->n_cols= n_cols;
+	}
+
+	/** Reconstruct the clustered index fields.
+	@return whether metadata is incorrect */
+	inline bool reconstruct_fields();
+
+	/** Check if the index contains a column or a prefix of that column.
+	@param[in]	n		column number
+	@param[in]	is_virtual	whether it is a virtual col
+	@return whether the index contains the column or its prefix */
+	bool contains_col_or_prefix(ulint n, bool is_virtual) const
+	MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef BTR_CUR_HASH_ADAPT
+	/** @return a clone of this */
+	dict_index_t* clone() const;
+	/** Clone this index for lazy dropping of the adaptive hash index.
+	@return this or a clone */
+	dict_index_t* clone_if_needed();
+	/** @return number of leaf pages pointed to by the adaptive hash index */
+	inline ulint n_ahi_pages() const;
+	/** @return whether mark_freed() had been invoked */
+	bool freed() const { return UNIV_UNLIKELY(page == 1); }
+	/** Note that the index is waiting for btr_search_lazy_free() */
+	void set_freed() { ut_ad(!freed()); page= 1; }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	/** @return whether it is forbidden to invoke clear_instant_add() */
+	bool must_avoid_clear_instant_add() const
+	{
+		if (is_instant())
+			for (auto i= this; (i= UT_LIST_GET_NEXT(indexes, i)) != nullptr; )
+				if (i->to_be_dropped /* || i->online_log*/)
+					return true;
+		return false;
+	}
+
+	/** This ad-hoc class is used by record_size_info only. */
+	class record_size_info_t {
+	public:
+		record_size_info_t()
+			: max_leaf_size(0), shortest_size(0), too_big(false),
+			  first_overrun_field_index(SIZE_T_MAX), overrun_size(0)
+		{
+		}
+
+		/** Mark row potentially too big for page and set up first
+		overflow field index. */
+		void set_too_big(size_t field_index)
+		{
+			ut_ad(field_index != SIZE_T_MAX);
+
+			too_big = true;
+			if (first_overrun_field_index > field_index) {
+				first_overrun_field_index = field_index;
+				overrun_size = shortest_size;
+			}
+		}
+
+		/** @return overrun field index or SIZE_T_MAX if nothing
+		overflowed */
+		size_t get_first_overrun_field_index() const
+		{
+			ut_ad(row_is_too_big());
+			ut_ad(first_overrun_field_index != SIZE_T_MAX);
+			return first_overrun_field_index;
+		}
+
+		size_t get_overrun_size() const
+		{
+			ut_ad(row_is_too_big());
+			return overrun_size;
+		}
+
+		bool row_is_too_big() const { return too_big; }
+
+		size_t max_leaf_size;	/** biggest row size this index can
+					produce */
+		size_t shortest_size;	/** shortest size, because it counts
+					everything as stored in overflow
+					pages */
+
+	private:
+		bool too_big;	/** true when the maximum row size this
+				index can produce is bigger than the
+				maximum row size the given page can hold. */
+		size_t first_overrun_field_index;	/** index of the field
+					whose addition made the row overflow
+					the maximum allowed size. Useful for
+					reporting back to the user. */
+		size_t overrun_size;	/** row size at the point of
+					overrun */
+	};
+
+	/** Returns the maximum possible record size for this index (the size
+	of the longest possible row), the shortest size (counting everything
+	as stored in overflow pages), and the index of the first field that
+	made the index record too big to fit on a page. */
+	inline record_size_info_t record_size_info() const;
+
+	/** Clear the index tree and reinitialize the root page, in the
+	rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized.
+	@param thr	query thread
+	@return error code */
+	dberr_t clear(que_thr_t *thr);
+
+	/** Check whether the online log is a dummy value, used to indicate
+	that the table is undergoing active DDL.
+	@retval true if the online log is the dummy value */
+	bool online_log_is_dummy() const
+	{
+		return online_log == reinterpret_cast<const row_log_t*>(this);
+	}
+
+	/** Assign the clustered index online log to the dummy value */
+	void online_log_make_dummy()
+	{
+		online_log= reinterpret_cast<row_log_t*>(this);
+	}
+};
+
+/** Detach a virtual column from an index.
+@param index	being-freed index */
+inline void dict_col_t::detach(const dict_index_t &index)
+{
+	if (is_virtual())
+		reinterpret_cast<dict_v_col_t*>(this)->detach(index);
+}
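A hedged editorial sketch of building an index definition with the helpers declared in this file (dict_mem_index_create() above and dict_mem_index_add_field() below); the table pointer and the column names are assumed to exist, and error handling is omitted:

    dict_index_t* index = dict_mem_index_create(table, "idx_name",
                                                DICT_UNIQUE, 2);
    dict_mem_index_add_field(index, "col1", 0);        /* whole column, ASC */
    dict_mem_index_add_field(index, "col2", 25, true); /* 25-byte prefix, DESC */

+/** Add a field definition to an index.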
+@param index		index
+@param name		pointer to column name
+@param prefix_len	column prefix length, or 0
+@param descending	whether to use descending order */
+inline void dict_mem_index_add_field(dict_index_t *index, const char *name,
+                                     ulint prefix_len, bool descending= false)
+{
+  ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+  dict_field_t &field= index->fields[index->n_def++];
+  field.name= name;
+  field.prefix_len= prefix_len & ((1U << 12) - 1);
+  field.descending= descending;
+}
+
+/** The status of online index creation */
+enum online_index_status {
+	/** the index is complete and ready for access */
+	ONLINE_INDEX_COMPLETE = 0,
+	/** the index is being created, online
+	(allowing concurrent modifications) */
+	ONLINE_INDEX_CREATION,
+	/** secondary index creation was aborted and the index
+	should be dropped as soon as index->table->n_ref_count reaches 0,
+	or online table rebuild was aborted and the clustered index
+	of the original table should soon be restored to
+	ONLINE_INDEX_COMPLETE */
+	ONLINE_INDEX_ABORTED,
+	/** the online index creation was aborted, the index was
+	dropped from the data dictionary and the tablespace, and it
+	should be dropped from the data dictionary cache as soon as
+	index->table->n_ref_count reaches 0. */
+	ONLINE_INDEX_ABORTED_DROPPED
+};
+
+/** Set to store the virtual columns which are affected by Foreign
+key constraint. */
+typedef std::set<dict_v_col_t*, std::less<dict_v_col_t*>,
+		 ut_allocator<dict_v_col_t*> >	dict_vcol_set;
+
+/** Data structure for a foreign key constraint; an example:
+FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */
+struct dict_foreign_t{
+	mem_heap_t*	heap;		/*!< this object is allocated from
+					this memory heap */
+	char*		id;		/*!< id of the constraint as a
+					null-terminated string */
+	unsigned	n_fields:10;	/*!< number of indexes' first fields
+					for which the foreign key
+					constraint is defined: we allow the
+					indexes to contain more fields than
+					mentioned in the constraint, as long
+					as the first fields are as mentioned */
+	unsigned	type:6;		/*!< 0 or DICT_FOREIGN_ON_DELETE_CASCADE
+					or DICT_FOREIGN_ON_DELETE_SET_NULL */
+	char*		foreign_table_name;/*!< foreign table name */
+	char*		foreign_table_name_lookup;
+				/*!< foreign table name used for dict lookup */
+	dict_table_t*	foreign_table;	/*!< table where the foreign key is */
+	const char**	foreign_col_names;/*!< names of the columns in the
+					foreign key */
+	char*		referenced_table_name;/*!< referenced table name */
+	char*		referenced_table_name_lookup;
+				/*!< referenced table name for dict lookup*/
+	dict_table_t*	referenced_table;/*!< table where the referenced key
+					is */
+	const char**	referenced_col_names;/*!< names of the referenced
+					columns in the referenced table */
+	dict_index_t*	foreign_index;	/*!< foreign index; we require that
+					both tables contain explicitly defined
+					indexes for the constraint: InnoDB
+					does not generate new indexes
+					implicitly */
+	dict_index_t*	referenced_index;/*!< referenced index */
+
+	dict_vcol_set*	v_cols;		/*!< set of virtual columns affected
+					by foreign key constraint.
+					*/
+
+	/** Check whether the fulltext index gets affected by
+	foreign key constraint */
+	bool affects_fulltext() const;
+};
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_t& foreign);
+
+struct dict_foreign_print {
+
+	dict_foreign_print(std::ostream& out)
+		: m_out(out)
+	{}
+
+	void operator()(const dict_foreign_t* foreign) {
+		m_out << *foreign;
+	}
+private:
+	std::ostream&	m_out;
+};
+
+/** Compare two dict_foreign_t objects using their ids. Used in the ordering
+of dict_table_t::foreign_set and dict_table_t::referenced_set. It returns
+true if the first argument is considered to go before the second in the
+strict weak ordering it defines, and false otherwise. */
+struct dict_foreign_compare {
+
+	bool operator()(
+		const dict_foreign_t*	lhs,
+		const dict_foreign_t*	rhs) const
+	{
+		return strcmp(lhs->id, rhs->id) < 0;
+	}
+};
+
+/** A function object to find a foreign key with the given index as the
+referenced index. Return the foreign key with matching criteria or NULL */
+struct dict_foreign_with_index {
+
+	dict_foreign_with_index(const dict_index_t* index)
+		: m_index(index)
+	{}
+
+	bool operator()(const dict_foreign_t* foreign) const
+	{
+		return(foreign->referenced_index == m_index);
+	}
+
+	const dict_index_t*	m_index;
+};
+
+/* A function object to check if the foreign constraint is between different
+tables. Returns true if foreign key constraint is between different tables,
+false otherwise. */
+struct dict_foreign_different_tables {
+
+	bool operator()(const dict_foreign_t* foreign) const
+	{
+		return(foreign->foreign_table != foreign->referenced_table);
+	}
+};
+
+/** A function object to check if the foreign key constraint has the same
+name as given. If the full name of the foreign key constraint doesn't match,
+then, check if removing the database name from the foreign key constraint
+matches. Return true if it matches, false otherwise. */
+struct dict_foreign_matches_id {
+
+	dict_foreign_matches_id(const char* id)
+		: m_id(id)
+	{}
+
+	bool operator()(const dict_foreign_t* foreign) const
+	{
+		if (0 == innobase_strcasecmp(foreign->id, m_id)) {
+			return(true);
+		}
+		if (const char* pos = strchr(foreign->id, '/')) {
+			if (0 == innobase_strcasecmp(m_id, pos + 1)) {
+				return(true);
+			}
+		}
+		return(false);
+	}
+
+	const char* m_id;
+};
+
+typedef std::set<
+	dict_foreign_t*,
+	dict_foreign_compare,
+	ut_allocator<dict_foreign_t*> >	dict_foreign_set;
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_set& fk_set);
+
+/** Function object to check if a foreign key object is there
+in the given foreign key set or not. It returns true if the
+foreign key is not found, false otherwise */
+struct dict_foreign_not_exists {
+	dict_foreign_not_exists(const dict_foreign_set& obj_)
+		: m_foreigns(obj_)
+	{}
+
+	/* Return true if the given foreign key is not found */
+	bool operator()(dict_foreign_t* const & foreign) const {
+		return(m_foreigns.find(foreign) == m_foreigns.end());
+	}
+private:
+	const dict_foreign_set&	m_foreigns;
+};
+
+/** Validate the search order in the foreign key set.
+@param[in]	fk_set	the foreign key set to be validated
+@return true if search order is fine in the set, false otherwise. */
+bool
+dict_foreign_set_validate(
+	const dict_foreign_set&	fk_set);
+
+/** Validate the search order in the foreign key sets of the table
+(foreign_set and referenced_set).
+@param[in]	table	table whose foreign key sets are to be validated
+@return true if foreign key sets are fine, false otherwise.
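+An illustrative (hypothetical) debug-build check, assuming a loaded
+dict_table_t& table:
+@code
+ut_ad(dict_foreign_set_validate(table.foreign_set));  // a single set
+ut_ad(dict_foreign_set_validate(table));              // both sets
+@endcode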
+*/
+bool
+dict_foreign_set_validate(
+	const dict_table_t&	table);
+
+/*********************************************************************//**
+Frees a foreign key struct. */
+inline
+void
+dict_foreign_free(
+/*==============*/
+	dict_foreign_t*	foreign)	/*!< in, own: foreign key struct */
+{
+	if (foreign->v_cols != NULL) {
+		UT_DELETE(foreign->v_cols);
+	}
+
+	mem_heap_free(foreign->heap);
+}
+
+/** The destructor will free all the foreign key constraints in the set
+by calling dict_foreign_free() on each of the foreign key constraints.
+This is used to free the allocated memory when a local set goes out
+of scope. */
+struct dict_foreign_set_free {
+
+	dict_foreign_set_free(const dict_foreign_set&	foreign_set)
+		: m_foreign_set(foreign_set)
+	{}
+
+	~dict_foreign_set_free()
+	{
+		std::for_each(m_foreign_set.begin(),
+			      m_foreign_set.end(),
+			      dict_foreign_free);
+	}
+
+	const dict_foreign_set&	m_foreign_set;
+};
+
+/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that
+a foreign key constraint is enforced, therefore RESTRICT just means no flag */
+/* @{ */
+#define DICT_FOREIGN_ON_DELETE_CASCADE	1U	/*!< ON DELETE CASCADE */
+#define DICT_FOREIGN_ON_DELETE_SET_NULL	2U	/*!< ON DELETE SET NULL */
+#define DICT_FOREIGN_ON_UPDATE_CASCADE	4U	/*!< ON UPDATE CASCADE */
+#define DICT_FOREIGN_ON_UPDATE_SET_NULL	8U	/*!< ON UPDATE SET NULL */
+#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16U	/*!< ON DELETE NO ACTION */
+#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32U	/*!< ON UPDATE NO ACTION */
+/* @} */
+
+/** Display an identifier.
+@param[in,out]	s	output stream
+@param[in]	id_name	SQL identifier (other than table name)
+@return the output stream */
+std::ostream&
+operator<<(
+	std::ostream&		s,
+	const id_name_t&	id_name);
+
+/** Display a table name.
+@param[in,out]	s		output stream
+@param[in]	table_name	table name
+@return the output stream */
+std::ostream&
+operator<<(
+	std::ostream&		s,
+	const table_name_t&	table_name);
+
+/** List of locks that different transactions have acquired on a table. This
+list has a list node that is embedded in a nested union/structure. We have to
+generate a specific template for it.
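+An illustrative (hypothetical) traversal of such a list, mirroring
+dict_table_t::has_lock_other_than() further below; n_table_locks is a
+hypothetical counter:
+@code
+for (lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock;
+     lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+	n_table_locks++;  // hypothetical counter
+@endcode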
+*/
+
+typedef ut_list_base<lock_t, ut_list_node<lock_t> lock_table_t::*>
+	table_lock_list_t;
+
+/** mysql template structure defined in row0mysql.cc */
+struct mysql_row_templ_t;
+
+/** Structure defines template related to virtual columns and
+their base columns */
+struct dict_vcol_templ_t {
+	/** number of regular columns */
+	ulint			n_col;
+
+	/** number of virtual columns */
+	ulint			n_v_col;
+
+	/** array of templates for virtual col and their base columns */
+	mysql_row_templ_t**	vtempl;
+
+	/** table's database name */
+	std::string		db_name;
+
+	/** table name */
+	std::string		tb_name;
+
+	/** MySQL record length */
+	ulint			rec_len;
+
+	/** default column value if any */
+	byte*			default_rec;
+
+	/** cached MySQL TABLE object */
+	TABLE*			mysql_table;
+
+	/** when mysql_table was cached */
+	uint64_t		mysql_table_query_id;
+
+	dict_vcol_templ_t() : vtempl(0), mysql_table_query_id(~0ULL) {}
+};
+
+/** Metadata on clustered index fields starting from first_user_field() */
+class field_map_element_t
+{
+	/** Number of bits for representing a column number */
+	static constexpr uint16_t IND_BITS = 10;
+
+	/** Set if the column of the field has been instantly dropped */
+	static constexpr uint16_t DROPPED = 1U << (IND_BITS + 5);
+
+	/** Set if the column was dropped and originally declared NOT NULL */
+	static constexpr uint16_t NOT_NULL = 1U << (IND_BITS + 4);
+
+	/** Column index (if !(data & DROPPED)): table->cols[data & IND],
+	or field length (if (data & DROPPED)):
+	(data & IND) = 0 if variable-length with max_len < 256 bytes;
+	(data & IND) = 1 if variable-length with max_len > 255 bytes;
+	(data & IND) = 1 + L otherwise, with L=fixed length of the column */
+	static constexpr uint16_t IND = (1U << IND_BITS) - 1;
+
+	/** Field metadata */
+	uint16_t data;
+
+	void clear_not_null() { data &= uint16_t(~NOT_NULL); }
+public:
+	bool is_dropped() const { return data & DROPPED; }
+	void set_dropped() { data |= DROPPED; }
+	bool is_not_null() const { return data & NOT_NULL; }
+	void set_not_null() { ut_ad(is_dropped()); data |= NOT_NULL; }
+	uint16_t ind() const { return data & IND; }
+	void set_ind(uint16_t i)
+	{
+		DBUG_ASSERT(i <= IND);
+		DBUG_ASSERT(!ind());
+		data |= i;
+	}
+	field_map_element_t& operator= (uint16_t value)
+	{
+		data = value;
+		return *this;
+	}
+	operator uint16_t() { return data; }
+};
+
+static_assert(sizeof(field_map_element_t) == 2,
+	      "Size mismatch for a persistent data item!");
+
+/** Instantly dropped or reordered columns */
+struct dict_instant_t
+{
+	/** Number of dropped columns */
+	unsigned n_dropped;
+	/** Dropped columns */
+	dict_col_t* dropped;
+	/** Map of clustered index non-PK fields[i - first_user_field()]
+	to table columns */
+	field_map_element_t* field_map;
+};
+
+/** These are used when MySQL FRM and InnoDB data dictionary are
+in inconsistent state. */
+typedef enum {
+	DICT_FRM_CONSISTENT = 0,	/*!< Consistent state */
+	DICT_FRM_NO_PK = 1,		/*!< MySQL has no primary key
+					but InnoDB dictionary has
+					non-generated one. */
+	DICT_NO_PK_FRM_HAS = 2,		/*!< MySQL has primary key but
+					InnoDB dictionary has not. */
+	DICT_FRM_INCONSISTENT_KEYS = 3	/*!< Key count mismatch */
+} dict_frm_t;
+
+/** Data structure for a database table. Most fields will be
+zero-initialized in dict_table_t::create(). */
+struct dict_table_t {
+
+	/** Get reference count.
+	@return current value of n_ref_count */
+	inline uint32_t get_ref_count() const { return n_ref_count; }
+
+	/** Acquire the table handle. */
+	inline void acquire();
+
+	/** Release the table handle.
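+	An illustrative (hypothetical) pin/unpin sequence:
+	@code
+	table->acquire();        // pin the table in the dictionary cache
+	if (table->release()) {  // true when the last handle went away
+		// the table may now be evicted or dropped
+	}
+	@endcode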
+ @return whether the last handle was released */ + inline bool release(); + + /** @return whether the table supports transactions */ + bool no_rollback() const + { + return !(~unsigned(flags) & DICT_TF_MASK_NO_ROLLBACK); + } + /** @return whether this is a temporary table */ + bool is_temporary() const + { + return flags2 & DICT_TF2_TEMPORARY; + } + + /** @return whether the table is not in ROW_FORMAT=REDUNDANT */ + bool not_redundant() const { return flags & DICT_TF_COMPACT; } + + /** @return whether this table is readable + @retval true normally + @retval false if this is a single-table tablespace + and the .ibd file is missing, or a + page cannot be read or decrypted */ + bool is_readable() const + { + ut_ad(file_unreadable || space); + return(UNIV_LIKELY(!file_unreadable)); + } + + /** @return whether the table is accessible */ + bool is_accessible() const + { + return UNIV_LIKELY(is_readable() && !corrupted && space) + && !space->is_stopping(); + } + + /** Check if a table name contains the string "/#sql" + which denotes temporary or intermediate tables in MariaDB. */ + static bool is_temporary_name(const char* name) + { + return strstr(name, "/#sql"); + } + + /** @return whether instant ALTER TABLE is in effect */ + bool is_instant() const + { + return(UT_LIST_GET_FIRST(indexes)->is_instant()); + } + + /** @return whether the table supports instant ALTER TABLE */ + bool supports_instant() const + { + return(!(flags & DICT_TF_MASK_ZIP_SSIZE)); + } + + /** @return the number of instantly dropped columns */ + unsigned n_dropped() const { return instant ? instant->n_dropped : 0; } + + /** Look up an old column. + @param[in] cols the old columns of the table + @param[in] col_map map from old table columns to altered ones + @param[in] n_cols number of old columns + @param[in] i the number of the new column + @return old column + @retval NULL if column i was added to the table */ + static const dict_col_t* find(const dict_col_t* cols, + const ulint* col_map, ulint n_cols, + ulint i) + { + for (ulint o = n_cols; o--; ) { + if (col_map[o] == i) { + return &cols[o]; + } + } + return NULL; + } + + /** Serialise metadata of dropped or reordered columns. + @param[in,out] heap memory heap for allocation + @param[out] field data field with the metadata */ + inline void serialise_columns(mem_heap_t* heap, dfield_t* field) const; + + /** Reconstruct dropped or reordered columns. + @param[in] metadata data from serialise_columns() + @param[in] len length of the metadata, in bytes + @return whether parsing the metadata failed */ + bool deserialise_columns(const byte* metadata, ulint len); + + /** Set is_instant() before instant_column(). + @param[in] old previous table definition + @param[in] col_map map from old.cols[] + and old.v_cols[] to this + @param[out] first_alter_pos 0, or + 1 + first changed column position */ + inline void prepare_instant(const dict_table_t& old, + const ulint* col_map, + unsigned& first_alter_pos); + + /** Adjust table metadata for instant ADD/DROP/reorder COLUMN. + @param[in] table table on which prepare_instant() was invoked + @param[in] col_map mapping from cols[] and v_cols[] to table + @return whether the metadata record must be updated */ + inline bool instant_column(const dict_table_t& table, + const ulint* col_map); + + /** Roll back instant_column(). 
+	@param[in]	old_n_cols		original n_cols
+	@param[in]	old_cols		original cols
+	@param[in]	old_col_names		original col_names
+	@param[in]	old_instant		original instant structure
+	@param[in]	old_fields		original fields
+	@param[in]	old_n_fields		original number of fields
+	@param[in]	old_n_core_fields	original number of core fields
+	@param[in]	old_n_v_cols		original n_v_cols
+	@param[in]	old_v_cols		original v_cols
+	@param[in]	old_v_col_names		original v_col_names
+	@param[in]	col_map			column map */
+	inline void rollback_instant(
+		unsigned	old_n_cols,
+		dict_col_t*	old_cols,
+		const char*	old_col_names,
+		dict_instant_t*	old_instant,
+		dict_field_t*	old_fields,
+		unsigned	old_n_fields,
+		unsigned	old_n_core_fields,
+		unsigned	old_n_v_cols,
+		dict_v_col_t*	old_v_cols,
+		const char*	old_v_col_names,
+		const ulint*	col_map);
+
+	/** Add the table definition to the data dictionary cache */
+	void add_to_cache();
+
+	/** @return whether the table is versioned.
+	It is assumed that both vers_start and vers_end set to 0
+	iff table is not versioned. In any other case,
+	these fields correspond to actual positions in cols[]. */
+	bool versioned() const { return vers_start || vers_end; }
+	bool versioned_by_id() const
+	{
+		return versioned() && cols[vers_start].mtype == DATA_INT;
+	}
+
+	/** For overflow fields returns potential max length stored inline */
+	inline size_t get_overflow_field_local_len() const;
+
+	/** Parse the table file name into table name and database name.
+	@tparam		dict_frozen	whether the caller holds dict_sys.latch
+	@param[in,out]	db_name		database name buffer
+	@param[in,out]	tbl_name	table name buffer
+	@param[out]	db_name_len	database name length
+	@param[out]	tbl_name_len	table name length
+	@return whether the table name is visible to SQL */
+	template<bool dict_frozen= false>
+	bool parse_name(char (&db_name)[NAME_LEN + 1],
+			char (&tbl_name)[NAME_LEN + 1],
+			size_t *db_name_len, size_t *tbl_name_len) const;
+
+	/** Clear the table when rolling back TRX_UNDO_EMPTY
+	@return error code */
+	dberr_t clear(que_thr_t *thr);
+
+#ifdef UNIV_DEBUG
+	/** @return whether the current thread holds the lock_mutex */
+	bool lock_mutex_is_owner() const
+	{ return lock_mutex_owner == pthread_self(); }
+	/** @return whether the current thread holds the stats_mutex (lock_mutex) */
+	bool stats_mutex_is_owner() const
+	{ return lock_mutex_owner == pthread_self(); }
+#endif /* UNIV_DEBUG */
+	void lock_mutex_init() { lock_mutex.init(); }
+	void lock_mutex_destroy() { lock_mutex.destroy(); }
+	/** Acquire lock_mutex */
+	void lock_mutex_lock()
+	{
+		ut_ad(!lock_mutex_is_owner());
+		lock_mutex.wr_lock();
+		ut_ad(!lock_mutex_owner.exchange(pthread_self()));
+	}
+	/** Try to acquire lock_mutex */
+	bool lock_mutex_trylock()
+	{
+		ut_ad(!lock_mutex_is_owner());
+		bool acquired= lock_mutex.wr_lock_try();
+		ut_ad(!acquired
+		      || !lock_mutex_owner.exchange(pthread_self()));
+		return acquired;
+	}
+	/** Release lock_mutex */
+	void lock_mutex_unlock()
+	{
+		ut_ad(lock_mutex_owner.exchange(0) == pthread_self());
+		lock_mutex.wr_unlock();
+	}
+#ifndef SUX_LOCK_GENERIC
+	/** @return whether the lock mutex is held by some thread */
+	bool lock_mutex_is_locked() const noexcept
+	{ return lock_mutex.is_locked(); }
+#endif
+
+	/* stats mutex lock currently defaults to lock_mutex but in the future,
+	there could be a use-case to have separate mutex for stats.
+	extra indirection (through inline so no performance hit) should
+	help simplify code and increase long-term maintainability */
+	void stats_mutex_init() { lock_mutex_init(); }
+	void stats_mutex_destroy() { lock_mutex_destroy(); }
+	void stats_mutex_lock() { lock_mutex_lock(); }
+	void stats_mutex_unlock() { lock_mutex_unlock(); }
+
+	/** Rename the data file.
+	@param new_name	name of the table
+	@param replace	whether to replace the file with the new name
+			(as part of rolling back TRUNCATE) */
+	dberr_t rename_tablespace(span<const char> new_name,
+				  bool replace) const;
+
+private:
+	/** Initialize instant->field_map.
+	@param[in]	table	table definition to copy from */
+	inline void init_instant(const dict_table_t& table);
+public:
+	/** Id of the table. */
+	table_id_t	id;
+	/** dict_sys.id_hash chain node */
+	dict_table_t*	id_hash;
+	/** Table name in name_hash */
+	table_name_t	name;
+	/** dict_sys.name_hash chain node */
+	dict_table_t*	name_hash;
+
+	/** Memory heap */
+	mem_heap_t*	heap;
+
+	/** NULL or the directory path specified by DATA DIRECTORY. */
+	char*		data_dir_path;
+
+	/** The tablespace of the table */
+	fil_space_t*	space;
+	/** Tablespace ID */
+	uint32_t	space_id;
+
+	/** Stores information about:
+	1 row format (redundant or compact),
+	2 compressed page size (zip shift size),
+	3 whether using atomic blobs,
+	4 whether the table has been created with the option DATA DIRECTORY.
+	Use DICT_TF_GET_COMPACT(), DICT_TF_GET_ZIP_SSIZE(),
+	DICT_TF_HAS_ATOMIC_BLOBS() and DICT_TF_HAS_DATA_DIR() to parse this
+	flag. */
+	unsigned	flags:DICT_TF_BITS;
+
+	/** Stores information about:
+	1 whether the table has been created using CREATE TEMPORARY TABLE,
+	2 whether the table has an internally defined DOC ID column,
+	3 whether the table has a FTS index,
+	4 whether the DOC ID column needs to be added to the FTS index,
+	5 whether the table is being created in its own tablespace,
+	6 whether the table has been DISCARDed,
+	7 whether the aux FTS table names are in hex.
+	Use DICT_TF2_FLAG_IS_SET() to parse this flag. */
+	unsigned	flags2:DICT_TF2_BITS;
+
+	/** TRUE if the table is an intermediate table during a copy alter
+	operation, or a partition/subpartition which is required for copying
+	data, so that the undo log can be skipped for row insertion into the
+	table. This variable will be set and unset during extra(), or during
+	the process of altering partitions */
+	unsigned	skip_alter_undo:1;
+
+	/*!< whether this is in a single-table tablespace and the .ibd
+	file is missing or page decryption failed and page is corrupted */
+	unsigned	file_unreadable:1;
+
+	/** TRUE if the table object has been added to the dictionary cache. */
+	unsigned	cached:1;
+
+	/** Number of non-virtual columns defined so far. */
+	unsigned	n_def:10;
+
+	/** Number of non-virtual columns. */
+	unsigned	n_cols:10;
+
+	/** Number of total columns (including virtual and non-virtual). */
+	unsigned	n_t_cols:10;
+
+	/** Number of total columns defined so far. */
+	unsigned	n_t_def:10;
+
+	/** Number of virtual columns defined so far. */
+	unsigned	n_v_def:10;
+
+	/** Number of virtual columns. */
+	unsigned	n_v_cols:10;
+
+	/** 1 + the position of autoinc counter field in clustered
+	index, or 0 if there is no persistent AUTO_INCREMENT column in
+	the table. */
+	unsigned	persistent_autoinc:10;
+
+	/** TRUE if it's not an InnoDB system table or a table that has no FK
+	relationships. */
+	unsigned	can_be_evicted:1;
+
+	/** TRUE if table is corrupted.
+	*/
+	unsigned	corrupted:1;
+
+	/** TRUE if some indexes should be dropped after ONLINE_INDEX_ABORTED
+	or ONLINE_INDEX_ABORTED_DROPPED. */
+	unsigned	drop_aborted:1;
+
+	/** Array of column descriptions. */
+	dict_col_t*	cols;
+
+	/** Array of virtual column descriptions. */
+	dict_v_col_t*	v_cols;
+
+	/** List of stored column descriptions. It is used only for foreign
+	key checks during create table and copy alter operations.
+	During copy alter, the s_cols list is filled during the create table
+	operation and needs to be preserved until the rename table operation.
+	That is the reason s_cols is a part of dict_table_t */
+	dict_s_col_list*	s_cols;
+
+	/** Instantly dropped or reordered columns, or NULL if none */
+	dict_instant_t*	instant;
+
+	/** Column names packed in a character string
+	"name1\0name2\0...nameN\0". Until the string contains n_cols, it will
+	be allocated from a temporary heap. The final string will be allocated
+	from table->heap. */
+	const char*	col_names;
+
+	/** Virtual column names */
+	const char*	v_col_names;
+	unsigned	vers_start:10;
+				/*!< System Versioning: row start col index */
+	unsigned	vers_end:10;
+				/*!< System Versioning: row end col index */
+	bool		is_system_db;
+				/*!< True if the table belongs to a system
+				database (mysql, information_schema or
+				performance_schema) */
+	dict_frm_t	dict_frm_mismatch;
+				/*!< not DICT_FRM_CONSISTENT (0) if the data
+				dictionary information and the
+				MySQL FRM information mismatch. */
+	/** The FTS_DOC_ID_INDEX, or NULL if no fulltext indexes exist */
+	dict_index_t*	fts_doc_id_index;
+
+	/** List of indexes of the table. */
+	UT_LIST_BASE_NODE_T(dict_index_t)	indexes;
+#ifdef BTR_CUR_HASH_ADAPT
+	/** List of detached indexes that are waiting to be freed along with
+	the last adaptive hash index entry.
+	Protected by autoinc_mutex (sic!) */
+	UT_LIST_BASE_NODE_T(dict_index_t)	freed_indexes;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	/** List of foreign key constraints in the table. These refer to
+	columns in other tables. */
+	UT_LIST_BASE_NODE_T(dict_foreign_t)	foreign_list;
+
+	/** List of foreign key constraints which refer to this table. */
+	UT_LIST_BASE_NODE_T(dict_foreign_t)	referenced_list;
+
+	/** Node of the LRU list of tables. */
+	UT_LIST_NODE_T(dict_table_t)	table_LRU;
+
+	/** Maximum recursive level we support when loading tables chained
+	together with FK constraints. If this level is exceeded, we will stop
+	loading child tables into memory along with their parent table. */
+	byte		fk_max_recusive_level;
+
+	/** DDL transaction that last touched the table definition, or 0 if
+	no history is available. This includes possible changes in
+	ha_innobase::prepare_inplace_alter_table() and
+	ha_innobase::commit_inplace_alter_table(). */
+	trx_id_t	def_trx_id;
+
+	/** Last transaction that inserted into an empty table.
+	Updated while holding exclusive table lock and an exclusive
+	latch on the clustered index root page (which must also be
+	an empty leaf page), and an ahi_latch (if btr_search_enabled). */
+	Atomic_relaxed<trx_id_t> bulk_trx_id;
+
+	/** Original table name, for MDL acquisition in purge. Normally,
+	this points to the same as name. When is_temporary_name(name.m_name)
+	holds, this should be a copy of the original table name, allocated
+	from heap. */
+	table_name_t	mdl_name;
+
+	/*!< set of foreign key constraints in the table; these refer to
+	columns in other tables */
+	dict_foreign_set	foreign_set;
+
+	/*!< set of foreign key constraints which refer to this table */
+	dict_foreign_set	referenced_set;
+
+	/** Statistics for query optimization.
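+	For illustration, a hypothetical reader of these members would
+	take the wrapper mutex declared above:
+	@code
+	table->stats_mutex_lock();
+	ib_uint64_t n_rows = table->stat_n_rows;
+	table->stats_mutex_unlock();
+	@endcode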
+	Mostly protected by
+	dict_sys.latch and stats_mutex_lock(). @{ */
+
+	/** TRUE if statistics have been calculated the first time after
+	database startup or table creation. */
+	unsigned	stat_initialized:1;
+
+	/** Timestamp of last recalc of the stats. */
+	time_t		stats_last_recalc;
+
+	/** The two bits below are set in the 'stat_persistent' member. They
+	have the following meaning:
+	1. _ON=0, _OFF=0, no explicit persistent stats setting for this table,
+	the value of the global srv_stats_persistent is used to determine
+	whether the table has persistent stats enabled or not
+	2. _ON=0, _OFF=1, persistent stats are explicitly disabled for this
+	table, regardless of the value of the global srv_stats_persistent
+	3. _ON=1, _OFF=0, persistent stats are explicitly enabled for this
+	table, regardless of the value of the global srv_stats_persistent
+	4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
+	#define DICT_STATS_PERSISTENT_ON	(1 << 1)
+	#define DICT_STATS_PERSISTENT_OFF	(1 << 2)
+
+	/** Indicates whether the table uses persistent stats or not. See
+	DICT_STATS_PERSISTENT_ON and DICT_STATS_PERSISTENT_OFF. */
+	ib_uint32_t	stat_persistent;
+
+	/** The two bits below are set in the 'stats_auto_recalc' member. They
+	have the following meaning:
+	1. _ON=0, _OFF=0, no explicit auto recalc setting for this table, the
+	value of the global srv_stats_auto_recalc is used to
+	determine whether the table has auto recalc enabled or not
+	2. _ON=0, _OFF=1, auto recalc is explicitly disabled for this table,
+	regardless of the value of the global srv_stats_auto_recalc
+	3. _ON=1, _OFF=0, auto recalc is explicitly enabled for this table,
+	regardless of the value of the global srv_stats_auto_recalc
+	4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
+	#define DICT_STATS_AUTO_RECALC_ON	(1 << 1)
+	#define DICT_STATS_AUTO_RECALC_OFF	(1 << 2)
+
+	/** Indicates whether the table uses automatic recalc for persistent
+	stats or not. See DICT_STATS_AUTO_RECALC_ON and
+	DICT_STATS_AUTO_RECALC_OFF. */
+	ib_uint32_t	stats_auto_recalc;
+
+	/** The number of pages to sample for this table during persistent
+	stats estimation. If this is 0, then the value of the global
+	srv_stats_persistent_sample_pages will be used instead. */
+	ulint		stats_sample_pages;
+
+	/** Approximate number of rows in the table. We periodically calculate
+	new estimates. */
+	ib_uint64_t	stat_n_rows;
+
+	/** Approximate clustered index size in database pages. */
+	ulint		stat_clustered_index_size;
+
+	/** Approximate size of other indexes in database pages. */
+	ulint		stat_sum_of_other_index_sizes;
+
+	/** How many rows are modified since last stats recalc. When a row is
+	inserted, updated, or deleted, we add 1 to this number; we calculate
+	new estimates for the table and the indexes if the table has changed
+	too much, see dict_stats_update_if_needed(). The counter is reset
+	to zero at statistics calculation. This counter is not protected by
+	any latch, because this is only used for heuristics. */
+	ib_uint64_t	stat_modified_counter;
+
+	bool		stats_error_printed;
+				/*!< Has the persistent stats error already
+				been printed for this table? */
+	/* @} */
+
+	/** AUTOINC related members. @{ */
+
+	/* The actual collection of tables locked during AUTOINC read/write is
+	kept in trx_t. In order to quickly determine whether a transaction has
+	locked the AUTOINC lock we keep a pointer to the transaction here in
+	the 'autoinc_trx' member.
+	This is to avoid acquiring the
+	lock_sys.latch and scanning the vector in trx_t.
+	When an AUTOINC lock has to wait, the corresponding lock instance is
+	created on the trx lock heap rather than using the pre-allocated
+	instance in autoinc_lock below. */
+
+	/** A buffer for an AUTOINC lock for this table. We allocate the
+	memory here so that individual transactions can get it and release it
+	without a need to allocate space from the lock heap of the trx:
+	otherwise the lock heap would grow rapidly if we do a large insert
+	from a select. */
+	lock_t*		autoinc_lock;
+
+	/** Mutex protecting autoinc and freed_indexes. */
+	srw_spin_mutex	autoinc_mutex;
+private:
+	/** Mutex protecting locks on this table. */
+	srw_spin_mutex	lock_mutex;
+#ifdef UNIV_DEBUG
+	/** The owner of lock_mutex (0 if none) */
+	Atomic_relaxed<pthread_t> lock_mutex_owner{0};
+#endif
+public:
+	/** Autoinc counter value to give to the next inserted row. */
+	uint64_t	autoinc;
+
+	/** The transaction that currently holds the AUTOINC lock on this
+	table. Protected by lock_mutex.
+	The thread that is executing autoinc_trx may read this field without
+	holding a latch, in row_lock_table_autoinc_for_mysql().
+	Only the autoinc_trx thread may clear this field; it cannot be
+	modified on the behalf of a transaction that is being handled by a
+	different thread. */
+	Atomic_relaxed<const trx_t*> autoinc_trx;
+
+	/** Number of granted or pending autoinc_lock on this table. This
+	value is set after acquiring lock_sys.latch but
+	in innodb_autoinc_lock_mode=1 (the default),
+	ha_innobase::innobase_lock_autoinc() will perform a dirty read
+	to determine whether other transactions have acquired the
+	autoinc_lock. */
+	uint32_t	n_waiting_or_granted_auto_inc_locks;
+
+	/* @} */
+
+	/** Number of granted or pending LOCK_S or LOCK_X on the table.
+	Protected by lock_sys.assert_locked(*this). */
+	uint32_t	n_lock_x_or_s;
+
+	/** FTS specific state variables. */
+	fts_t*		fts;
+
+	/** Quiescing states, protected by the dict_index_t::lock. ie. we can
+	only change the state if we acquire all the latches (dict_index_t::lock)
+	in X mode of this table's indexes. */
+	ib_quiesce_t	quiesce;
+
+	/** Count of the number of record locks on this table. We use this to
+	determine whether we can evict the table from the dictionary cache.
+	Modified when lock_sys.is_writer(), or
+	lock_sys.assert_locked(page_id) and trx->mutex_is_owner() hold.
+	@see trx_lock_t::trx_locks */
+	Atomic_counter<ulint> n_rec_locks;
+private:
+	/** Count of how many handles are opened to this table. Dropping of the
+	table is NOT allowed until this count gets to zero. MySQL does NOT
+	itself check the number of open handles at DROP. */
+	Atomic_counter<uint32_t> n_ref_count;
+public:
+	/** List of locks on the table. Protected by
+	lock_sys.assert_locked(lock). */
+	table_lock_list_t locks;
+
+	/** Timestamp of the last modification of this table. */
+	Atomic_relaxed<time_t> update_time;
+	/** Transactions whose view low limit is greater than this number are
+	not allowed to access the MariaDB query cache.
+	@see innobase_query_caching_table_check_low()
+	@see trx_t::commit_tables() */
+	Atomic_relaxed<trx_id_t> query_cache_inv_trx_id;
+
+#ifdef UNIV_DEBUG
+	/** Value of 'magic_n'. */
+	#define DICT_TABLE_MAGIC_N	76333786
+
+	/** Magic number.
+	*/
+	ulint		magic_n;
+#endif /* UNIV_DEBUG */
+	/** mysql_row_templ_t for base columns used to compute the virtual
+	columns */
+	dict_vcol_templ_t*	vc_templ;
+
+	/** @return whether the table has any transaction lock other than
+	those of the given transaction */
+	bool has_lock_other_than(const trx_t *trx) const
+	{
+		for (lock_t *lock= UT_LIST_GET_FIRST(locks); lock;
+		     lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+			if (lock->trx != trx)
+				return true;
+		return false;
+	}
+
+	/** @return whether a DDL operation is in progress on this table */
+	bool is_active_ddl() const
+	{
+		return UT_LIST_GET_FIRST(indexes)->online_log;
+	}
+
+	/** @return whether the name is
+	mysql.innodb_index_stats or mysql.innodb_table_stats */
+	bool is_stats_table() const;
+
+	/** @return number of unique columns in FTS_DOC_ID index */
+	unsigned fts_n_uniq() const { return versioned() ? 2 : 1; }
+
+	/** Create metadata.
+	@param name     table name
+	@param space    tablespace
+	@param n_cols   total number of columns (both virtual and non-virtual)
+	@param n_v_cols number of virtual columns
+	@param flags    table flags
+	@param flags2   table flags2
+	@return newly allocated table object */
+	static dict_table_t *create(const span<const char> &name,
+				    fil_space_t *space,
+				    ulint n_cols, ulint n_v_cols,
+				    ulint flags, ulint flags2);
+
+	/** Check whether the table has any spatial indexes */
+	bool has_spatial_index() const
+	{
+		for (auto i= UT_LIST_GET_FIRST(indexes);
+		     (i= UT_LIST_GET_NEXT(indexes, i)) != nullptr; )
+			if (i->is_spatial())
+				return true;
+		return false;
+	}
+};
+
+inline void dict_index_t::set_modified(mtr_t& mtr) const
+{
+	mtr.set_named_space(table->space);
+}
+
+inline bool table_name_t::is_temporary() const
+{
+	return dict_table_t::is_temporary_name(m_name);
+}
+
+inline bool dict_index_t::is_readable() const
+{ return table->is_readable(); }
+
+inline bool dict_index_t::is_instant() const
+{
+	ut_ad(n_core_fields > 0);
+	ut_ad(n_core_fields <= n_fields || table->n_dropped());
+	ut_ad(n_core_fields == n_fields
+	      || (type & ~(DICT_UNIQUE | DICT_CORRUPT)) == DICT_CLUSTERED);
+	ut_ad(n_core_fields == n_fields || table->supports_instant());
+	ut_ad(n_core_fields == n_fields || !table->is_temporary());
+	ut_ad(!table->instant || !table->is_temporary());
+
+	return n_core_fields != n_fields
+		|| (is_primary() && table->instant);
+}
+
+inline bool dict_index_t::is_corrupted() const
+{
+	return UNIV_UNLIKELY(online_status >= ONLINE_INDEX_ABORTED
+			     || (type & DICT_CORRUPT)
+			     || (table && table->corrupted));
+}
+
+inline void dict_index_t::clear_instant_add()
+{
+	DBUG_ASSERT(is_primary());
+	DBUG_ASSERT(is_instant());
+	DBUG_ASSERT(!table->instant);
+	for (unsigned i= n_core_fields; i < n_fields; i++)
+		fields[i].col->clear_instant();
+	n_core_fields= n_fields;
+	n_core_null_bytes= static_cast<uint8_t>
+		(UT_BITS_IN_BYTES(static_cast<unsigned>(n_nullable)));
+}
+
+inline void dict_index_t::clear_instant_alter()
+{
+	DBUG_ASSERT(is_primary());
+	DBUG_ASSERT(n_fields == n_def);
+
+	if (!table->instant) {
+		if (is_instant()) {
+			clear_instant_add();
+		}
+		return;
+	}
+
+#ifndef DBUG_OFF
+	for (unsigned i = first_user_field(); i--; ) {
+		DBUG_ASSERT(!fields[i].col->is_dropped());
+		DBUG_ASSERT(!fields[i].col->is_nullable());
+	}
+#endif
+	const dict_col_t* ai_col = table->persistent_autoinc
+		? fields[table->persistent_autoinc - 1].col
+		: NULL;
+	dict_field_t* const begin = &fields[first_user_field()];
+	dict_field_t* end = &fields[n_fields];
+
+	for (dict_field_t* d = begin; d < end; ) {
+		/* Move fields for dropped columns to the end.
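+		Each dropped field is swapped past the shrinking 'end'
+		pointer, so that [begin, end) is left holding only the
+		surviving fields; their order is restored by the
+		std::sort() below.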
+		*/
+		if (!d->col->is_dropped()) {
+			d++;
+		} else {
+			if (d->col->is_nullable()) {
+				n_nullable--;
+			}
+
+			std::swap(*d, *--end);
+		}
+	}
+
+	DBUG_ASSERT(&fields[n_fields - table->n_dropped()] == end);
+	n_core_fields = n_fields = n_def
+		= static_cast<unsigned>(end - fields) & MAX_N_FIELDS;
+	n_core_null_bytes = static_cast<uint8_t>(UT_BITS_IN_BYTES(n_nullable));
+	std::sort(begin, end, [](const dict_field_t& a, const dict_field_t& b)
+		  { return a.col->ind < b.col->ind; });
+	table->instant = NULL;
+	if (ai_col) {
+		auto a = std::find_if(fields, end,
+				      [ai_col](const dict_field_t& f)
+				      { return f.col == ai_col; });
+		table->persistent_autoinc = (a == end)
+			? 0
+			: (1 + static_cast<unsigned>(a - fields))
+			& MAX_N_FIELDS;
+	}
+}
+
+/** @return whether the column was instantly dropped
+@param[in] index	the clustered index */
+inline bool dict_col_t::is_dropped(const dict_index_t& index) const
+{
+	DBUG_ASSERT(index.is_primary());
+	DBUG_ASSERT(!is_dropped() == !index.table->instant);
+	DBUG_ASSERT(!is_dropped() || (this >= index.table->instant->dropped
+				      && this < index.table->instant->dropped
+				      + index.table->instant->n_dropped));
+	return is_dropped();
+}
+
+/*******************************************************************//**
+Initialise the table lock list. */
+void
+lock_table_lock_list_init(
+/*======================*/
+	table_lock_list_t*	locks);	/*!< List to initialise */
+
+/** A function object to add the foreign key constraint to the referenced set
+of the referenced table, if it exists in the dictionary cache. */
+struct dict_foreign_add_to_referenced_table {
+	void operator()(dict_foreign_t* foreign) const
+	{
+		if (dict_table_t* table = foreign->referenced_table) {
+			std::pair<dict_foreign_set::iterator, bool> ret
+				= table->referenced_set.insert(foreign);
+			ut_a(ret.second);
+		}
+	}
+};
+
+/** Check whether the col is used in spatial index or regular index.
+@param[in]	col	column to check
+@return spatial status */
+inline
+spatial_status_t
+dict_col_get_spatial_status(
+	const dict_col_t*	col)
+{
+	spatial_status_t	spatial_status = SPATIAL_NONE;
+
+	/* Column is not a part of any index. */
+	if (!col->ord_part) {
+		return(spatial_status);
+	}
+
+	if (DATA_GEOMETRY_MTYPE(col->mtype)) {
+		if (col->max_prefix == 0) {
+			spatial_status = SPATIAL_ONLY;
+		} else {
+			/* Any regular index on a geometry column
+			should have a prefix. */
+			spatial_status = SPATIAL_MIXED;
+		}
+	}
+
+	return(spatial_status);
+}
+
+/** Clear defragmentation summary. */
+inline void dict_stats_empty_defrag_summary(dict_index_t* index)
+{
+	index->stat_defrag_n_pages_freed = 0;
+}
+
+/** Clear defragmentation related index stats. */
+inline void dict_stats_empty_defrag_stats(dict_index_t* index)
+{
+	index->stat_defrag_modified_counter = 0;
+	index->stat_defrag_n_page_split = 0;
+}
+
+#include "dict0mem.inl"
+
+#endif /* dict0mem_h */
diff --git a/storage/innobase/include/dict0mem.inl b/storage/innobase/include/dict0mem.inl
new file mode 100644
index 00000000..d60ee5d9
--- /dev/null
+++ b/storage/innobase/include/dict0mem.inl
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0mem.ic
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "data0type.h"
+#include "dict0mem.h"
+#include "fil0fil.h"
+
+/**********************************************************************//**
+This function populates a dict_index_t index memory structure with
+supplied information. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+	dict_index_t*	index,		/*!< out: index to be filled */
+	mem_heap_t*	heap,		/*!< in: memory heap */
+	const char*	index_name,	/*!< in: index name */
+	ulint		type,		/*!< in: DICT_UNIQUE,
+					DICT_CLUSTERED, ... ORed */
+	ulint		n_fields)	/*!< in: number of fields */
+{
+
+	if (heap) {
+		index->heap = heap;
+		index->name = mem_heap_strdup(heap, index_name);
+		index->fields = (dict_field_t*) mem_heap_alloc(
+			heap, 1 + n_fields * sizeof(dict_field_t));
+	} else {
+		index->name = index_name;
+		index->heap = NULL;
+		index->fields = NULL;
+	}
+
+	index->type = type & ((1U << DICT_IT_BITS) - 1);
+	index->page = FIL_NULL;
+	index->merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+	index->n_fields = static_cast<unsigned>(n_fields)
+		& index->MAX_N_FIELDS;
+	index->n_core_fields = static_cast<unsigned>(n_fields)
+		& index->MAX_N_FIELDS;
+	/* The '1 +' above prevents allocation
+	of an empty mem block */
+	index->nulls_equal = false;
+	ut_d(index->magic_n = DICT_INDEX_MAGIC_N);
+}
diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h
new file mode 100644
index 00000000..f1272dc4
--- /dev/null
+++ b/storage/innobase/include/dict0pagecompress.h
@@ -0,0 +1,61 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.h
+Helper functions for extracting/storing page compression information
+to dictionary.
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef dict0pagecompress_h +#define dict0pagecompress_h + +/********************************************************************//** +Extract the page compression level from table flags. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); +/********************************************************************//** +Extract the page compression flag from table flags +@return page compression flag, or false if not compressed */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*==========================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the page compressed page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((const)); + +#include "dict0pagecompress.inl" + +#endif diff --git a/storage/innobase/include/dict0pagecompress.inl b/storage/innobase/include/dict0pagecompress.inl new file mode 100644 index 00000000..c959f9ca --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.inl @@ -0,0 +1,81 @@ +/***************************************************************************** + +Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.ic +Inline implementation for helper functions for extracting/storing +page compression and atomic writes information to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Extract the page compression level from dict_table_t::flags. +These flags are in memory, so assert that they are valid. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ +{ + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + + ut_ad(page_compression_level <= 9); + + return(page_compression_level); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. 
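+An illustrative (hypothetical) caller, assuming a dict_table_t* table
+whose flags indicate page compression:
+@code
+if (dict_tf_get_page_compression(table->flags)) {
+	ulint level = dict_table_page_compression_level(table);
+	ut_ad(level <= 9);  // zlib-style compression levels
+}
+@endcode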
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ut_ad(table);
+	ut_ad(dict_tf_get_page_compression(table->flags));
+
+	return(dict_tf_get_page_compression_level(table->flags));
+}
+
+/********************************************************************//**
+Extract the page compression flag from table flags.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*=========================*/
+	ulint	flags)	/*!< in: flags */
+{
+	return(DICT_TF_GET_PAGE_COMPRESSION(flags));
+}
+
+/********************************************************************//**
+Check whether the table uses the page compressed page format.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_table_is_page_compressed(
+/*==========================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	return (dict_tf_get_page_compression(table->flags));
+}
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
new file mode 100644
index 00000000..0dc1b984
--- /dev/null
+++ b/storage/innobase/include/dict0stats.h
@@ -0,0 +1,238 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.h
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_h
+#define dict0stats_h
+
+#include "dict0types.h"
+#include "trx0types.h"
+
+enum dict_stats_upd_option_t {
+	DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the
+				statistics using a precise and slow
+				algo and save them to the persistent
+				storage, if the persistent storage is
+				not present then emit a warning and
+				fall back to transient stats */
+	DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics
+				using an imprecise quick algo
+				without saving the results
+				persistently */
+	DICT_STATS_EMPTY_TABLE,	/* Write all zeros (or 1 where it makes sense)
+				into a table and its indexes' statistics
+				members. The resulting stats correspond to an
+				empty table. If the table is using persistent
+				statistics, then they are saved on disk. */
+	DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats
+				from the persistent storage if the in-memory
+				structures have not been initialized yet,
+				otherwise do nothing */
+};
+
+/*********************************************************************//**
+Set the persistent statistics flag for a given table.
This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ + MY_ATTRIBUTE((nonnull)); + +/** @return whether persistent statistics is enabled for a given table */ +UNIV_INLINE +bool +dict_stats_is_persistent_enabled(const dict_table_t* table) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. It will be read from the .frm file upon +first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off); /*!< in: explicitly disabled */ + +/** @return whether auto recalc is enabled for a given table*/ +UNIV_INLINE +bool +dict_stats_auto_recalc_is_enabled(const dict_table_t* table) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. */ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table); /*!< in/out: table */ + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. */ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ + MY_ATTRIBUTE((nonnull)); + +#ifdef WITH_WSREP +/** Update the table modification counter and if necessary, +schedule new estimates for table and index statistics to be calculated. +@param[in,out] table persistent or temporary table +@param[in] trx transaction */ +void dict_stats_update_if_needed(dict_table_t *table, const trx_t &trx) + MY_ATTRIBUTE((nonnull)); +#else +/** Update the table modification counter and if necessary, +schedule new estimates for table and index statistics to be calculated. +@param[in,out] table persistent or temporary table */ +void dict_stats_update_if_needed_func(dict_table_t *table) + MY_ATTRIBUTE((nonnull)); +# define dict_stats_update_if_needed(t,trx) dict_stats_update_if_needed_func(t) +#endif + +/*********************************************************************//** +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. 
+@return DB_* error code or DB_SUCCESS */
+dberr_t
+dict_stats_update(
+/*==============*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_stats_upd_option_t	stats_upd_option);
+				/*!< in: whether to (re) calc
+				the stats or to fetch them from
+				the persistent storage */
+
+/** Execute DELETE FROM mysql.innodb_table_stats
+@param database_name  database name
+@param table_name     table name
+@param trx            transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_table_stats(const char *database_name,
+                                           const char *table_name,
+                                           trx_t *trx)
+  MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name  database name
+@param table_name     table name
+@param trx            transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+                                           const char *table_name,
+                                           trx_t *trx)
+  MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name  database name
+@param table_name     table name
+@param index_name     name of the index
+@param trx            transaction (nullptr=start and commit a new one)
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
                                           const char *table_name,
+                                           const char *index_name, trx_t *trx);
+
+/*********************************************************************//**
+Fetches or calculates new estimates for index statistics. */
+void
+dict_stats_update_for_index(
+/*========================*/
+	dict_index_t*	index)	/*!< in/out: index */
+	MY_ATTRIBUTE((nonnull));
+
+/** Rename a table in InnoDB persistent stats storage.
+@param old_name  old table name
+@param new_name  new table name
+@param trx       transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_table(const char *old_name, const char *new_name,
+                                trx_t *trx);
+/** Rename an index in InnoDB persistent statistics.
+@param db        database name
+@param table     table name
+@param old_name  old index name
+@param new_name  new index name
+@param trx       transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_index(const char *db, const char *table,
+                                const char *old_name, const char *new_name,
+                                trx_t *trx);
+
+/** Delete all persistent statistics for a database.
+@param db   database name
+@param trx  transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete(const char *db, trx_t *trx);
+
+/** Save an individual index's statistic into the persistent statistics
+storage.
+@param[in]	index		index to be updated
+@param[in]	last_update	timestamp of the stat
+@param[in]	stat_name	name of the stat
+@param[in]	stat_value	value of the stat
+@param[in]	sample_size	n pages sampled or NULL
+@param[in]	stat_description	description of the stat
+@param[in,out]	trx		transaction
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_index_stat(
+	dict_index_t*	index,
+	time_t		last_update,
+	const char*	stat_name,
+	ib_uint64_t	stat_value,
+	ib_uint64_t*	sample_size,
+	const char*	stat_description,
+	trx_t*		trx)
+	MY_ATTRIBUTE((nonnull(1, 3, 6, 7)));
+
+/** Report an error if updating table statistics failed because
+.ibd file is missing, table decryption failed or table is corrupted.
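+An illustrative (hypothetical) caller, bailing out before a statistics
+recalculation:
+@code
+if (!table->is_readable() || table->corrupted) {
+	return dict_stats_report_error(table);
+}
+@endcode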
+@param[in,out] table Table +@param[in] defragment true if statistics is for defragment +@retval DB_DECRYPTION_FAILED if decryption of the table failed +@retval DB_TABLESPACE_DELETED if .ibd file is missing +@retval DB_CORRUPTION if table is marked as corrupted */ +dberr_t +dict_stats_report_error(dict_table_t* table, bool defragment = false) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#include "dict0stats.inl" + +#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS +void test_dict_stats_all(); +#endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */ + +#endif /* dict0stats_h */ diff --git a/storage/innobase/include/dict0stats.inl b/storage/innobase/include/dict0stats.inl new file mode 100644 index 00000000..dd516275 --- /dev/null +++ b/storage/innobase/include/dict0stats.inl @@ -0,0 +1,219 @@ +/***************************************************************************** + +Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats.ic +Code used for calculating and manipulating table statistics. + +Created Jan 23, 2012 Vasil Dimov +*******************************************************/ + +#include "dict0dict.h" +#include "srv0srv.h" + +/*********************************************************************//** +Set the persistent statistics flag for a given table. This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ +{ + /* Not allowed to have both flags set, but a CREATE or ALTER + statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would + end up having both set. In this case we clear the OFF flag. */ + if (ps_on && ps_off) { + ps_off = FALSE; + } + + ib_uint32_t stat_persistent = 0; + + if (ps_on) { + stat_persistent |= DICT_STATS_PERSISTENT_ON; + } + + if (ps_off) { + stat_persistent |= DICT_STATS_PERSISTENT_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stat_persistent = stat_persistent; +} + +/** @return whether persistent statistics is enabled for a given table */ +UNIV_INLINE +bool +dict_stats_is_persistent_enabled(const dict_table_t* table) +{ + /* Because of the nature of this check (non-locking) it is possible + that a table becomes: + * PS-disabled immediately after this function has returned TRUE or + * PS-enabled immediately after this function has returned FALSE. 
+ This means that it is possible that we do: + + dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has + just been PS-disabled or + + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has + just been PS-enabled. + This is acceptable. Avoiding this would mean that we would have to + hold dict_sys.latch or stats_mutex_lock() like for accessing the + other ::stat_ members which would be too big performance penalty, + especially when this function is called from + dict_stats_update_if_needed(). */ + + /* we rely on this read to be atomic */ + ib_uint32_t stat_persistent = table->stat_persistent; + + if (stat_persistent & DICT_STATS_PERSISTENT_ON) { + ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF)); + return(true); + } else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) { + return(false); + } else { + return(srv_stats_persistent); + } +} + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. It will be read from the .frm file upon +first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off) /*!< in: explicitly disabled */ +{ + ut_ad(!auto_recalc_on || !auto_recalc_off); + + ib_uint32_t stats_auto_recalc = 0; + + if (auto_recalc_on) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON; + } + + if (auto_recalc_off) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stats_auto_recalc = stats_auto_recalc; +} + +/** @return whether auto recalc is enabled for a given table*/ +UNIV_INLINE +bool +dict_stats_auto_recalc_is_enabled(const dict_table_t* table) +{ + /* we rely on this read to be atomic */ + ib_uint32_t stats_auto_recalc = table->stats_auto_recalc; + + if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) { + ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF)); + return(true); + } else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) { + return(false); + } else { + return(srv_stats_auto_recalc); + } +} + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. */ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(!table->stats_mutex_is_owner()); + + if (table->stat_initialized) { + return; + } + + dict_stats_upd_option_t opt; + + if (dict_stats_is_persistent_enabled(table)) { + opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; + } else { + opt = DICT_STATS_RECALC_TRANSIENT; + } + + dict_stats_update(table, opt); +} + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. 
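Both lookups above follow one pattern: a per-table override held in two bits, with a fall-back to the server-wide default when neither bit is set. A minimal standalone sketch of that resolution (simplified types; srv_default stands in for srv_stats_persistent or srv_stats_auto_recalc):

#include <cstdint>

enum : uint32_t { FLAG_ON = 1, FLAG_OFF = 2 };

/* Resolve a per-table two-bit override against the global default:
   the ON bit wins, the OFF bit disables, and with neither bit set the
   server-wide setting applies. ON and OFF are mutually exclusive, as
   the ut_ad() assertions in the functions above enforce. */
bool is_enabled(uint32_t flags, bool srv_default)
{
    if (flags & FLAG_ON) {
        return true;
    }
    if (flags & FLAG_OFF) {
        return false;
    }
    return srv_default;
}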
*/ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(table->stats_mutex_is_owner()); + ut_ad(table->get_ref_count() == 0); + +#ifdef HAVE_valgrind + if (!table->stat_initialized) { + return; + } + + MEM_UNDEFINED(&table->stat_n_rows, sizeof table->stat_n_rows); + MEM_UNDEFINED(&table->stat_clustered_index_size, + sizeof table->stat_clustered_index_size); + MEM_UNDEFINED(&table->stat_sum_of_other_index_sizes, + sizeof table->stat_sum_of_other_index_sizes); + MEM_UNDEFINED(&table->stat_modified_counter, + sizeof table->stat_modified_counter); + + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + MEM_UNDEFINED( + index->stat_n_diff_key_vals, + index->n_uniq + * sizeof index->stat_n_diff_key_vals[0]); + MEM_UNDEFINED( + index->stat_n_sample_sizes, + index->n_uniq + * sizeof index->stat_n_sample_sizes[0]); + MEM_UNDEFINED( + index->stat_n_non_null_key_vals, + index->n_uniq + * sizeof index->stat_n_non_null_key_vals[0]); + MEM_UNDEFINED( + &index->stat_index_size, + sizeof(index->stat_index_size)); + MEM_UNDEFINED( + &index->stat_n_leaf_pages, + sizeof(index->stat_n_leaf_pages)); + } +#endif /* HAVE_valgrind */ + table->stat_initialized = FALSE; +} diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h new file mode 100644 index 00000000..d9a2f628 --- /dev/null +++ b/storage/innobase/include/dict0stats_bg.h @@ -0,0 +1,59 @@ +/***************************************************************************** + +Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats_bg.h +Code used for background table and index stats gathering. + +Created Apr 26, 2012 Vasil Dimov +*******************************************************/ + +#ifndef dict0stats_bg_h +#define dict0stats_bg_h + +#include "dict0types.h" + +#ifdef HAVE_PSI_INTERFACE +extern mysql_pfs_key_t recalc_pool_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ + +/** Delete a table from the auto recalc pool, and ensure that +no statistics are being updated on it. */ +void dict_stats_recalc_pool_del(table_id_t id, bool have_mdl_exclusive); + +/*****************************************************************//** +Initialize global variables needed for the operation of dict_stats_thread(). +Must be called before dict_stats task is started. */ +void dict_stats_init(); + +/*****************************************************************//** +Free resources allocated by dict_stats_thread_init(), must be called +after dict_stats task has exited. 
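The HAVE_valgrind block in dict_stats_deinit() above marks the cached statistics as undefined rather than zeroing them, so Memcheck flags any read of stale values before they are recomputed. A tiny sketch of the same idea, assuming MEM_UNDEFINED expands to the Valgrind client request on HAVE_valgrind builds:

#include <valgrind/memcheck.h>

struct table_stats { unsigned long long n_rows; };

void poison_stats(table_stats* s)
{
    /* After this call, Memcheck reports a use of uninitialised
       memory if n_rows is read before being recomputed. */
    VALGRIND_MAKE_MEM_UNDEFINED(&s->n_rows, sizeof s->n_rows);
}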
*/ +void dict_stats_deinit(); + +/** Start the dict stats timer. */ +void dict_stats_start(); + +/** Shut down the dict_stats timer. */ +void dict_stats_shutdown(); + +/** Reschedule dict stats timer to run now. */ +void dict_stats_schedule_now(); + +#endif /* dict0stats_bg_h */ diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h new file mode 100644 index 00000000..ec50e8cd --- /dev/null +++ b/storage/innobase/include/dict0types.h @@ -0,0 +1,176 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0types.h +Data dictionary global types + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0types_h +#define dict0types_h + +#include "univ.i" +#include "span.h" +#include + +using st_::span; + +struct dict_col_t; +struct dict_field_t; +struct dict_index_t; +struct dict_table_t; +struct dict_foreign_t; +struct dict_v_col_t; + +struct ind_node_t; +struct tab_node_t; +struct dict_add_v_col_t; + +/* Space id and page no where the dictionary header resides */ +#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ +#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO + +/* The ibuf table and indexes's ID are assigned as the number +DICT_IBUF_ID_MIN plus the space id */ +#define DICT_IBUF_ID_MIN 0xFFFFFFFF00000000ULL + +typedef ib_id_t table_id_t; +typedef ib_id_t index_id_t; + +/** Maximum transaction identifier */ +#define TRX_ID_MAX IB_ID_MAX + +/** The bit pattern corresponding to TRX_ID_MAX */ +extern const byte trx_id_max_bytes[8]; +extern const byte timestamp_max_bytes[7]; + +/** Error to ignore when we load table dictionary into memory. However, +the table and index will be marked as "corrupted", and caller will +be responsible to deal with corrupted table or index. +Note: please define the IGNORE_ERR_* as bits, so their value can +be or-ed together */ +enum dict_err_ignore_t { + DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */ + DICT_ERR_IGNORE_FK_NOKEY = 1, /*!< ignore error if any foreign + key is missing */ + DICT_ERR_IGNORE_INDEX = 2, /*!< ignore corrupted indexes */ + DICT_ERR_IGNORE_RECOVER_LOCK = 4 | DICT_ERR_IGNORE_FK_NOKEY, + /*!< Used when recovering table locks + for resurrected transactions. + Silently load a missing + tablespace, and do not load + incomplete index definitions. 
*/
+ /** ignore all errors above */
+ DICT_ERR_IGNORE_ALL = 7,
+ /** prepare some DDL operation;
+ do not attempt to load tablespace */
+ DICT_ERR_IGNORE_TABLESPACE = 15,
+ /** prepare to drop the table; do not attempt to load tablespace
+ or the metadata */
+ DICT_ERR_IGNORE_DROP = 31
+};
+
+/** Quiescing states for flushing tables to disk. */
+enum ib_quiesce_t {
+ QUIESCE_NONE,
+ QUIESCE_START, /*!< Initialise, prepare to start */
+ QUIESCE_COMPLETE /*!< All done */
+};
+
+/** Prefix for InnoDB internal tables, adopted from sql/table.h */
+#define TEMP_FILE_PREFIX_INNODB "#sql-ib"
+
+/** Table name wrapper for pretty-printing */
+struct table_name_t
+{
+ /** The name in internal representation */
+ char* m_name;
+
+ /** Default constructor */
+ table_name_t() = default;
+ /** Constructor */
+ table_name_t(char* name) : m_name(name) {}
+
+ /** @return the end of the schema name */
+ const char* dbend() const
+ {
+ const char* sep = strchr(m_name, '/');
+ ut_ad(sep);
+ return sep;
+ }
+
+ /** @return the length of the schema name, in bytes */
+ size_t dblen() const { return size_t(dbend() - m_name); }
+
+ /** Determine the filename-safe encoded table name.
+ @return the filename-safe encoded table name */
+ const char* basename() const { return dbend() + 1; }
+
+ /** The start of the table basename suffix for partitioned tables */
+ static const char part_suffix[4];
+
+ /** Determine the partition or subpartition name suffix.
+ @return the partition name
+ @retval NULL if the table is not partitioned */
+ const char* part() const { return strstr(basename(), part_suffix); }
+
+ /** @return whether this is a temporary or intermediate table name */
+ inline bool is_temporary() const;
+};
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/** Dump the change buffer at startup */
+extern my_bool ibuf_dump;
+/** Flag to control insert buffer debugging. */
+extern uint ibuf_debug;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/** Shift for spatial status */
+#define SPATIAL_STATUS_SHIFT 12
+
+/** Mask to encode/decode spatial status. */
+#define SPATIAL_STATUS_MASK (3U << SPATIAL_STATUS_SHIFT)
+
+#if SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN
+# error SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN
+#endif
+
+/** whether a col is used in spatial index or regular index
+Note: the spatial status is part of persistent undo log,
+so we should not modify the values in MySQL 5.7 */
+enum spatial_status_t {
+ /* Unknown status (undo format in 5.7.9) */
+ SPATIAL_UNKNOWN = 0,
+
+ /** Not used in gis index. */
+ SPATIAL_NONE = 1,
+
+ /** Used in both spatial index and regular index. */
+ SPATIAL_MIXED = 2,
+
+ /** Only used in spatial index. */
+ SPATIAL_ONLY = 3
+};
+
+#define TABLE_STATS_NAME "mysql/innodb_table_stats"
+#define INDEX_STATS_NAME "mysql/innodb_index_stats"
+
+#endif
diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h
new file mode 100644
index 00000000..06af4dcc
--- /dev/null
+++ b/storage/innobase/include/dyn0buf.h
@@ -0,0 +1,442 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
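As the note above dict_err_ignore_t requires, the modes are or-able bits and every higher mode is a superset of the lower ones, so a single mask test decides whether an error class may be ignored. A hedged sketch with the enum re-declared standalone:

/* Mirror of the enum above: each larger ignore mode contains the bits
   of the smaller ones (4|1 = 5, ALL = 7, ... DROP = 31), so one
   bitwise AND answers "may this error be ignored in this mode?". */
enum ignore_mode : unsigned {
    IGNORE_NONE       = 0,
    IGNORE_FK_NOKEY   = 1,
    IGNORE_INDEX      = 2,
    IGNORE_RECOVER    = 4 | IGNORE_FK_NOKEY,
    IGNORE_ALL        = 7,
    IGNORE_TABLESPACE = 15,
    IGNORE_DROP       = 31
};

bool may_ignore(ignore_mode mode, ignore_mode err_bits)
{
    return (mode & err_bits) == err_bits;
}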
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dyn0buf.h +The dynamically allocated buffer implementation + +Created 2013-03-16 Sunny Bains +*******************************************************/ + +#ifndef dyn0buf_h +#define dyn0buf_h + +#include "mem0mem.h" +#include "dyn0types.h" +#include "ilist.h" + + +/** Class that manages dynamic buffers. It uses a UT_LIST of +mtr_buf_t::block_t instances. We don't use STL containers in +order to avoid the overhead of heap calls. Using a custom memory +allocator doesn't solve the problem either because we have to get +the memory from somewhere. We can't use the block_t::m_data as the +backend for the custom allocator because we would like the data in +the blocks to be contiguous. */ +class mtr_buf_t { +public: + /** SIZE - sizeof(m_node) + sizeof(m_used) */ + enum { MAX_DATA_SIZE = DYN_ARRAY_DATA_SIZE + - sizeof(ilist_node<>) + sizeof(uint32_t) }; + + class block_t : public ilist_node<> { + public: + + block_t() + { + compile_time_assert(MAX_DATA_SIZE <= (2 << 15)); + init(); + } + + /** + Gets the number of used bytes in a block. + @return number of bytes used */ + ulint used() const + MY_ATTRIBUTE((warn_unused_result)) + { + return(static_cast(m_used & ~DYN_BLOCK_FULL_FLAG)); + } + + /** + Gets pointer to the start of data. + @return pointer to data */ + byte* start() + MY_ATTRIBUTE((warn_unused_result)) + { + return(m_data); + } + + /** + @return start of data - non const version */ + byte* begin() + MY_ATTRIBUTE((warn_unused_result)) + { + return(m_data); + } + + /** + @return end of used data - non const version */ + byte* end() + MY_ATTRIBUTE((warn_unused_result)) + { + return(begin() + m_used); + } + + /** + @return start of data - const version */ + const byte* begin() const + MY_ATTRIBUTE((warn_unused_result)) + { + return(m_data); + } + + /** + @return end of used data - const version */ + const byte* end() const + MY_ATTRIBUTE((warn_unused_result)) + { + return(begin() + m_used); + } + + private: + /** + @return pointer to start of reserved space */ + template + Type push(uint32_t size) + { + Type ptr = reinterpret_cast(end()); + + m_used += size; + ut_ad(m_used <= uint32_t(MAX_DATA_SIZE)); + + return(ptr); + } + + /** + Grow the stack. 
*/ + void close(const byte* ptr) + { + /* Check that it is within bounds */ + ut_ad(ptr >= begin()); + ut_ad(ptr <= begin() + m_buf_end); + + /* We have done the boundary check above */ + m_used = uint32_t(ptr - begin()); + + ut_ad(m_used <= MAX_DATA_SIZE); + ut_d(m_buf_end = 0); + } + + /** + Initialise the block */ + void init() + { + m_used = 0; + ut_d(m_buf_end = 0); + ut_d(m_magic_n = DYN_BLOCK_MAGIC_N); + } + private: +#ifdef UNIV_DEBUG + /** If opened then this is the buffer end offset, else 0 */ + ulint m_buf_end; + + /** Magic number (DYN_BLOCK_MAGIC_N) */ + ulint m_magic_n; +#endif /* UNIV_DEBUG */ + + /** Storage */ + byte m_data[MAX_DATA_SIZE]; + + /** number of data bytes used in this block; + DYN_BLOCK_FULL_FLAG is set when the block becomes full */ + uint32_t m_used; + + friend class mtr_buf_t; + }; + + typedef sized_ilist list_t; + + /** Default constructor */ + mtr_buf_t() + : + m_heap(), + m_size() + { + push_back(&m_first_block); + } + + /** Destructor */ + ~mtr_buf_t() + { + erase(); + } + + /** Reset the buffer vector */ + void erase() + { + if (m_heap != NULL) { + mem_heap_free(m_heap); + m_heap = NULL; + + /* Initialise the list and add the first block. */ + m_list.clear(); + m_list.push_back(m_first_block); + } else { + m_first_block.init(); + ut_ad(m_list.size() == 1); + } + + m_size = 0; + } + + /** + Makes room on top and returns a pointer to a buffer in it. After + copying the elements, the caller must close the buffer using close(). + @param size in bytes of the buffer; MUST be <= MAX_DATA_SIZE! + @return pointer to the buffer */ + byte* open(ulint size) + MY_ATTRIBUTE((warn_unused_result)) + { + ut_ad(size > 0); + ut_ad(size <= MAX_DATA_SIZE); + + block_t* block; + + block = has_space(size) ? back() : add_block(); + + ut_ad(block->m_used <= MAX_DATA_SIZE); + ut_d(block->m_buf_end = block->m_used + size); + + return(block->end()); + } + + /** + Closes the buffer returned by open. + @param ptr end of used space */ + void close(const byte* ptr) + { + ut_ad(!m_list.empty()); + block_t* block = back(); + + m_size -= block->used(); + + block->close(ptr); + + m_size += block->used(); + } + + /** + Makes room on top and returns a pointer to the added element. + The caller must copy the element to the pointer returned. + @param size in bytes of the element + @return pointer to the element */ + template + Type push(uint32_t size) + { + ut_ad(size > 0); + ut_ad(size <= MAX_DATA_SIZE); + + block_t* block; + + block = has_space(size) ? back() : add_block(); + + m_size += size; + + /* See ISO C++03 14.2/4 for why "template" is required. */ + + return(block->template push(size)); + } + + /** + Pushes n bytes. + @param str string to write + @param len string length */ + void push(const byte* ptr, uint32_t len) + { + while (len > 0) { + uint32_t n_copied = std::min(len, + uint32_t(MAX_DATA_SIZE)); + ::memmove(push(n_copied), ptr, n_copied); + + ptr += n_copied; + len -= n_copied; + } + } + + /** + Returns a pointer to an element in the buffer. const version. + @param pos position of element in bytes from start + @return pointer to element */ + template + const Type at(ulint pos) const + { + block_t* block = const_cast( + const_cast(this)->find(pos)); + + return(reinterpret_cast(block->begin() + pos)); + } + + /** + Returns a pointer to an element in the buffer. non const version. 
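The byte-wise push() above is the heart of the buffer: a write larger than the space left in the current block is split into chunks of at most MAX_DATA_SIZE bytes, each landing in the current block or in a freshly added one. A standalone sketch of that chunking (toy types; CAP stands in for MAX_DATA_SIZE):

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <list>

struct toy_buf {
    static constexpr uint32_t CAP = 512;        /* MAX_DATA_SIZE stand-in */
    struct block { uint8_t data[CAP]; uint32_t used = 0; };
    std::list<block> blocks = std::list<block>(1);

    void push(const uint8_t* p, uint32_t len)
    {
        while (len > 0) {
            block& b = blocks.back();
            uint32_t room = CAP - b.used;
            if (room == 0) {                    /* block full: add one */
                blocks.emplace_back();
                continue;
            }
            uint32_t n = std::min(len, room);
            std::memcpy(b.data + b.used, p, n);
            b.used += n;
            p += n;
            len -= n;
        }
    }
};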
+ @param pos position of element in bytes from start
+ @return pointer to element */
+ template <typename Type>
+ Type at(ulint pos)
+ {
+ block_t* block = const_cast<block_t*>(find(pos));
+
+ return(reinterpret_cast<Type>(block->begin() + pos));
+ }
+
+ /**
+ Returns the size of the total stored data.
+ @return data size in bytes */
+ ulint size() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+#ifdef UNIV_DEBUG
+ ulint total_size = 0;
+
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+ total_size += it->used();
+ }
+
+ ut_ad(total_size == m_size);
+#endif /* UNIV_DEBUG */
+ return(m_size);
+ }
+
+ /**
+ Iterate over each block and call the functor.
+ @return false if iteration was terminated. */
+ template <typename Functor>
+ bool for_each_block(const Functor& functor) const
+ {
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+
+ if (!functor(&*it)) {
+ return false;
+ }
+ }
+
+ return(true);
+ }
+
+ /**
+ @return the first block */
+ block_t* front()
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return &m_list.front();
+ }
+
+ /**
+ @return true if m_first_block block was not filled fully */
+ bool is_small() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(m_heap == NULL);
+ }
+
+ /** @return whether the buffer is empty */
+ bool empty() const { return !back()->m_used; }
+
+private:
+ // Disable copying
+ mtr_buf_t(const mtr_buf_t&);
+ mtr_buf_t& operator=(const mtr_buf_t&);
+
+ /**
+ Add the block to the end of the list*/
+ void push_back(block_t* block)
+ {
+ block->init();
+ m_list.push_back(*block);
+ }
+
+ /** @return the last block in the list */
+ block_t* back() const
+ {
+ return &const_cast<block_t&>(m_list.back());
+ }
+
+ /*
+ @return true if request can be fulfilled */
+ bool has_space(ulint size) const
+ {
+ return(back()->m_used + size <= MAX_DATA_SIZE);
+ }
+
+ /*
+ @return true if request can be fulfilled */
+ bool has_space(ulint size)
+ {
+ return(back()->m_used + size <= MAX_DATA_SIZE);
+ }
+
+ /** Find the block that contains the pos.
+ @param pos absolute offset, it is updated to make it relative
+ to the block
+ @return the block containing the pos. */
+ block_t* find(ulint& pos)
+ {
+ ut_ad(!m_list.empty());
+
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+
+ if (pos < it->used()) {
+ ut_ad(it->used() >= pos);
+
+ return &*it;
+ }
+
+ pos -= it->used();
+ }
+
+ return NULL;
+ }
+
+ /**
+ Allocate and add a new block to m_list */
+ block_t* add_block()
+ {
+ block_t* block;
+
+ if (m_heap == NULL) {
+ m_heap = mem_heap_create(sizeof(*block));
+ }
+
+ block = reinterpret_cast<block_t*>(
+ mem_heap_alloc(m_heap, sizeof(*block)));
+
+ push_back(block);
+
+ return(block);
+ }
+
+private:
+ /** Heap to use for memory allocation */
+ mem_heap_t* m_heap;
+
+ /** Allocated blocks */
+ list_t m_list;
+
+ /** Total size used by all blocks */
+ ulint m_size;
+
+ /** The default block, should always be the first element. This
+ is for backwards compatibility and to avoid an extra heap allocation
+ for small REDO log records */
+ block_t m_first_block;
+};
+
+#endif /* dyn0buf_h */
diff --git a/storage/innobase/include/dyn0types.h b/storage/innobase/include/dyn0types.h
new file mode 100644
index 00000000..83d0b0d6
--- /dev/null
+++ b/storage/innobase/include/dyn0types.h
@@ -0,0 +1,39 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
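The find() member above resolves an absolute offset to a block by walking the list and subtracting each block's used size, rebasing the offset as it goes; at() relies on this. A minimal standalone sketch:

#include <cstddef>
#include <list>

struct blk { std::size_t used; /* payload omitted */ };

/* Walk the block list until the absolute offset falls inside a block;
   pos is rebased to a block-relative offset on the way. */
blk* find_block(std::list<blk>& blocks, std::size_t& pos)
{
    for (blk& b : blocks) {
        if (pos < b.used) {
            return &b;
        }
        pos -= b.used;
    }
    return nullptr;        /* offset beyond the stored data */
}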
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dyn0types.h +The dynamically allocated buffer types and constants + +Created 2013-03-16 Sunny Bains +*******************************************************/ + +#ifndef dyn0types_h +#define dyn0types_h + +/** Value of dyn_block_t::magic_n */ +#define DYN_BLOCK_MAGIC_N 375767 + +/** This is the initial 'payload' size of a dynamic array */ +#define DYN_ARRAY_DATA_SIZE 512 + +/** Flag for dyn_block_t::used that indicates a full block */ +#define DYN_BLOCK_FULL_FLAG 0x1000000UL + +#endif /* dyn0types_h */ diff --git a/storage/innobase/include/eval0eval.h b/storage/innobase/include/eval0eval.h new file mode 100644 index 00000000..a3ea0462 --- /dev/null +++ b/storage/innobase/include/eval0eval.h @@ -0,0 +1,109 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0eval.h +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#ifndef eval0eval_h +#define eval0eval_h + +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/*****************************************************************//** +Free the buffer from global dynamic memory for a value of a que_node, +if it has been allocated in the above function. The freeing for pushed +column values is done in sel_col_prefetch_buf_free. */ +void +eval_node_free_val_buf( +/*===================*/ + que_node_t* node); /*!< in: query graph node */ +/*****************************************************************//** +Evaluates a symbol table symbol. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node); /*!< in: symbol table node */ +/*****************************************************************//** +Evaluates an expression. 
*/ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node); /*!< in: expression */ +/*****************************************************************//** +Sets an integer value as the value of an expression node. */ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /*!< in: expression node */ + lint val); /*!< in: value to set */ +/*****************************************************************//** +Gets an integer value from an expression node. +@return integer value */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + que_node_t* node); /*!< in: expression node */ +/*****************************************************************//** +Copies a binary string value as the value of a query graph node. Allocates a +new buffer if necessary. */ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /*!< in: query graph node */ + const byte* str, /*!< in: binary string */ + ulint len); /*!< in: string length or UNIV_SQL_NULL */ +/*****************************************************************//** +Copies a query node value to another node. */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /*!< in: node to copy to */ + que_node_t* node2); /*!< in: node to copy from */ +/*****************************************************************//** +Gets a iboolean value from a query node. +@return iboolean value */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + que_node_t* node); /*!< in: query graph node */ +/*****************************************************************//** +Evaluates a comparison node. +@return the result of the comparison */ +ibool +eval_cmp( +/*=====*/ + func_node_t* cmp_node); /*!< in: comparison node */ + + +#include "eval0eval.inl" + +#endif diff --git a/storage/innobase/include/eval0eval.inl b/storage/innobase/include/eval0eval.inl new file mode 100644 index 00000000..0ea4057f --- /dev/null +++ b/storage/innobase/include/eval0eval.inl @@ -0,0 +1,254 @@ +/***************************************************************************** + +Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0eval.ic +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" +#include "rem0cmp.h" +#include "pars0grm.h" + +/*****************************************************************//** +Evaluates a function node. 
*/ +void +eval_func( +/*======*/ + func_node_t* func_node); /*!< in: function node */ +/*****************************************************************//** +Allocate a buffer from global dynamic memory for a value of a que_node. +NOTE that this memory must be explicitly freed when the query graph is +freed. If the node already has allocated buffer, that buffer is freed +here. NOTE that this is the only function where dynamic memory should be +allocated for a query node val field. +@return pointer to allocated buffer */ +byte* +eval_node_alloc_val_buf( +/*====================*/ + que_node_t* node, /*!< in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size); /*!< in: buffer size */ + + +/*****************************************************************//** +Allocates a new buffer if needed. +@return pointer to buffer */ +UNIV_INLINE +byte* +eval_node_ensure_val_buf( +/*=====================*/ + que_node_t* node, /*!< in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size) /*!< in: buffer size */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + dfield_set_len(dfield, size); + + data = static_cast(dfield_get_data(dfield)); + + if (!data || que_node_get_val_buf_size(node) < size) { + + data = eval_node_alloc_val_buf(node, size); + } + + return(data); +} + +/*****************************************************************//** +Evaluates a symbol table symbol. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node) /*!< in: symbol table node */ +{ + + ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL); + + if (sym_node->indirection) { + /* The symbol table node is an alias for a variable or a + column */ + + dfield_copy_data(que_node_get_val(sym_node), + que_node_get_val(sym_node->indirection)); + } +} + +/*****************************************************************//** +Evaluates an expression. */ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node) /*!< in: expression */ +{ + if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) { + + eval_sym((sym_node_t*) exp_node); + + return; + } + + eval_func(static_cast(exp_node)); +} + +/*****************************************************************//** +Sets an integer value as the value of an expression node. */ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /*!< in: expression node */ + lint val) /*!< in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = static_cast(dfield_get_data(dfield)); + + if (data == NULL) { + data = eval_node_alloc_val_buf(node, 4); + } + + ut_ad(dfield_get_len(dfield) == 4); + + mach_write_to_4(data, (ulint) val); +} + +/*****************************************************************//** +Gets an integer non-SQL null value from an expression node. +@return integer value */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + que_node_t* node) /*!< in: expression node */ +{ + const byte* ptr; + dfield_t* dfield; + + dfield = que_node_get_val(node); + ptr = static_cast(dfield_get_data(dfield)); + + ut_ad(dfield_get_len(dfield) == 4); + + return((int) mach_read_from_4(ptr)); +} + +/*****************************************************************//** +Gets a iboolean value from a query node. 
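eval_node_set_int_val() and eval_node_get_int_val() above round-trip the integer through a fixed 4-byte field using mach_write_to_4() and mach_read_from_4(), which store the most significant byte first. A standalone equivalent of the pair:

#include <cstdint>

/* Big-endian 4-byte store/load, as mach_write_to_4()/mach_read_from_4()
   do for the dfield backing an integer expression node. */
void write4(uint8_t* b, uint32_t v)
{
    b[0] = uint8_t(v >> 24);
    b[1] = uint8_t(v >> 16);
    b[2] = uint8_t(v >> 8);
    b[3] = uint8_t(v);
}

uint32_t read4(const uint8_t* b)
{
    return uint32_t(b[0]) << 24 | uint32_t(b[1]) << 16
         | uint32_t(b[2]) << 8  | uint32_t(b[3]);
}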
+@return iboolean value */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + que_node_t* node) /*!< in: query graph node */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = static_cast(dfield_get_data(dfield)); + + ut_ad(data != NULL); + + return(mach_read_from_1(data)); +} + +/*****************************************************************//** +Sets a iboolean value as the value of a function node. */ +UNIV_INLINE +void +eval_node_set_ibool_val( +/*====================*/ + func_node_t* func_node, /*!< in: function node */ + ibool val) /*!< in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(func_node); + + data = static_cast(dfield_get_data(dfield)); + + if (data == NULL) { + /* Allocate 1 byte to hold the value */ + + data = eval_node_alloc_val_buf(func_node, 1); + } + + ut_ad(dfield_get_len(dfield) == 1); + + mach_write_to_1(data, val); +} + +/*****************************************************************//** +Copies a binary string value as the value of a query graph node. Allocates a +new buffer if necessary. */ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /*!< in: query graph node */ + const byte* str, /*!< in: binary string */ + ulint len) /*!< in: string length or UNIV_SQL_NULL */ +{ + byte* data; + + if (len == UNIV_SQL_NULL) { + dfield_set_len(que_node_get_val(node), len); + + return; + } + + data = eval_node_ensure_val_buf(node, len); + + memcpy(data, str, len); +} + +/*****************************************************************//** +Copies a query node value to another node. */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /*!< in: node to copy to */ + que_node_t* node2) /*!< in: node to copy from */ +{ + dfield_t* dfield2; + + dfield2 = que_node_get_val(node2); + + eval_node_copy_and_alloc_val( + node1, + static_cast(dfield_get_data(dfield2)), + dfield_get_len(dfield2)); +} diff --git a/storage/innobase/include/eval0proc.h b/storage/innobase/include/eval0proc.h new file mode 100644 index 00000000..a93140bf --- /dev/null +++ b/storage/innobase/include/eval0proc.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (c) 1998, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
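eval_node_copy_and_alloc_val() above combines two conventions: a sentinel length (UNIV_SQL_NULL) records SQL NULL without touching the data buffer, and the buffer is grown only when too small. A sketch of the same ensure-then-copy flow, with SIZE_MAX standing in for UNIV_SQL_NULL:

#include <cstdint>
#include <cstring>
#include <vector>

struct value { std::vector<uint8_t> buf; std::size_t len = 0; };

void copy_val(value& v, const uint8_t* str, std::size_t len)
{
    v.len = len;
    if (len == SIZE_MAX) {
        return;                          /* SQL NULL: no data to copy */
    }
    if (v.buf.size() < len) {
        v.buf.resize(len);               /* grow only when needed */
    }
    std::memcpy(v.buf.data(), str, len);
}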
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0proc.h +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#ifndef eval0proc_h +#define eval0proc_h + +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/**********************************************************************//** +Performs an execution step of a procedure node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an if-statement node. +@return query thread to run next or NULL */ +que_thr_t* +if_step( +/*====*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a while-statement node. +@return query thread to run next or NULL */ +que_thr_t* +while_step( +/*=======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a for-loop node. +@return query thread to run next or NULL */ +que_thr_t* +for_step( +/*=====*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an assignment statement node. +@return query thread to run next or NULL */ +que_thr_t* +assign_step( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a procedure call node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_eval_step( +/*===========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an exit statement node. +@return query thread to run next or NULL */ +que_thr_t* +exit_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a return-statement node. +@return query thread to run next or NULL */ +que_thr_t* +return_step( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ + +#include "eval0proc.inl" + +#endif diff --git a/storage/innobase/include/eval0proc.inl b/storage/innobase/include/eval0proc.inl new file mode 100644 index 00000000..b0c5f75b --- /dev/null +++ b/storage/innobase/include/eval0proc.inl @@ -0,0 +1,88 @@ +/***************************************************************************** + +Copyright (c) 1998, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0proc.ic +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#include "pars0pars.h" +#include "que0que.h" +#include "eval0eval.h" + +/**********************************************************************//** +Performs an execution step of a procedure node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + proc_node_t* node; + + ut_ad(thr); + + node = static_cast(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_PROC); + + if (thr->prev_node == que_node_get_parent(node)) { + /* Start execution from the first statement in the statement + list */ + + thr->run_node = node->stat_list; + } else { + /* Move to the next statement */ + ut_ad(que_node_get_next(thr->prev_node) == NULL); + + thr->run_node = NULL; + } + + if (thr->run_node == NULL) { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/**********************************************************************//** +Performs an execution step of a procedure call node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_eval_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + func_node_t* node; + + ut_ad(thr); + + node = static_cast(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_FUNC); + + /* Evaluate the procedure */ + + eval_exp(node); + + thr->run_node = que_node_get_parent(node); + + return(thr); +} diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h new file mode 100644 index 00000000..f43965cd --- /dev/null +++ b/storage/innobase/include/fil0crypt.h @@ -0,0 +1,396 @@ +/***************************************************************************** +Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. +Copyright (c) 2015, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
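proc_step() above is a cursor over a statement list: entry from the parent starts at the first statement, re-entry advances, and an exhausted list hands control back to the parent node. A simplified standalone rendering of that control flow:

#include <cstddef>
#include <vector>

struct stmt { /* opaque statement */ };

/* One interpreter step over a statement list: entry from the parent
   positions the cursor at the first statement; otherwise it advances.
   A null return means "run the parent node next". */
const stmt* step(const std::vector<stmt>& list, std::size_t& pos,
                 bool from_parent)
{
    if (from_parent) {
        pos = 0;
    } else {
        ++pos;
    }
    return pos < list.size() ? &list[pos] : nullptr;
}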
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0crypt.h
+The low-level file system encryption support functions
+
+Created 04/01/2015 Jan Lindström
+*******************************************************/
+
+#ifndef fil0crypt_h
+#define fil0crypt_h
+
+#include "my_crypt.h"
+#include "fil0fil.h"
+
+/**
+* Magic pattern in start of crypt data on page 0
+*/
+#define MAGIC_SZ 6
+
+static const unsigned char CRYPT_MAGIC[MAGIC_SZ] = {
+ 's', 0xE, 0xC, 'R', 'E', 't' };
+
+/* This key will be used if nothing else is given */
+#define FIL_DEFAULT_ENCRYPTION_KEY ENCRYPTION_KEY_SYSTEM_DATA
+
+/** Wake up the encryption threads */
+void fil_crypt_threads_signal(bool broadcast= false);
+
+/**
+ * CRYPT_SCHEME_UNENCRYPTED
+ *
+ * Used as intermediate state when converting a space from unencrypted
+ * to encrypted
+ */
+/**
+ * CRYPT_SCHEME_1
+ *
+ * xxx is AES_CTR or AES_CBC (or another block cypher with the same key and iv lengths)
+ * L = AES_ECB(KEY, IV)
+ * CRYPT(PAGE) = xxx(KEY=L, IV=C, PAGE)
+ */
+
+#define CRYPT_SCHEME_1 1
+#define CRYPT_SCHEME_1_IV_LEN 16
+#define CRYPT_SCHEME_UNENCRYPTED 0
+
+/* Cached L or key for given key_version */
+struct key_struct
+{
+ uint key_version; /*!< Version of the key */
+ uint key_length; /*!< Key length */
+ unsigned char key[MY_AES_MAX_KEY_LENGTH]; /*!< Cached key
+ (that is L in CRYPT_SCHEME_1) */
+};
+
+/** is encryption enabled */
+extern ulong srv_encrypt_tables;
+
+/** Mutex helper for crypt_data->scheme
+@param[in, out] scheme encryption scheme
+@param[in] exit should we exit or enter mutex ? */
+void
+crypt_data_scheme_locker(
+ st_encryption_scheme* scheme,
+ int exit);
+
+struct fil_space_rotate_state_t
+{
+ time_t start_time; /*!< time when rotation started */
+ ulint active_threads; /*!< active threads in space */
+ uint32_t next_offset; /*!< next "free" offset */
+ uint32_t max_offset; /*!< max offset needing to be rotated */
+ uint min_key_version_found; /*!< min key version found but not
+ rotated */
+ lsn_t end_lsn; /*!< max lsn created when rotating this
+ space */
+ bool starting; /*!< initial write of IV */
+ bool flushing; /*!< space is being flushed at end of rotate */
+};
+
+#ifndef UNIV_INNOCHECKSUM
+
+struct fil_space_crypt_t : st_encryption_scheme
+{
+ public:
+ /** Constructor. Does not initialize the members!
+ The object is expected to be placed in a buffer that
+ has been zero-initialized.
*/ + fil_space_crypt_t( + uint new_type, + uint new_min_key_version, + uint new_key_id, + fil_encryption_t new_encryption) + : st_encryption_scheme(), + min_key_version(new_min_key_version), + encryption(new_encryption), + key_found(0), + rotate_state() + { + key_id = new_key_id; + my_random_bytes(iv, sizeof(iv)); + mysql_mutex_init(0, &mutex, nullptr); + locker = crypt_data_scheme_locker; + type = new_type; + + if (new_encryption == FIL_ENCRYPTION_OFF || + (!srv_encrypt_tables && + new_encryption == FIL_ENCRYPTION_DEFAULT)) { + type = CRYPT_SCHEME_UNENCRYPTED; + } else { + type = CRYPT_SCHEME_1; + min_key_version = key_get_latest_version(); + } + + key_found = min_key_version; + } + + /** Destructor */ + ~fil_space_crypt_t() + { + mysql_mutex_destroy(&mutex); + } + + /** Get latest key version from encryption plugin + @retval key_version or + @retval ENCRYPTION_KEY_VERSION_INVALID if used key_id + is not found from encryption plugin. */ + uint key_get_latest_version(void); + + /** Returns true if key was found from encryption plugin + and false if not. */ + bool is_key_found() const { + return key_found != ENCRYPTION_KEY_VERSION_INVALID; + } + + /** Returns true if tablespace should be encrypted */ + bool should_encrypt() const { + return ((encryption == FIL_ENCRYPTION_ON) || + (srv_encrypt_tables && + encryption == FIL_ENCRYPTION_DEFAULT)); + } + + /** Return true if tablespace is encrypted. */ + bool is_encrypted() const { + return (encryption != FIL_ENCRYPTION_OFF); + } + + /** Return true if default tablespace encryption is used, */ + bool is_default_encryption() const { + return (encryption == FIL_ENCRYPTION_DEFAULT); + } + + /** Return true if tablespace is not encrypted. */ + bool not_encrypted() const { + return (encryption == FIL_ENCRYPTION_OFF); + } + + /** Write encryption metadata to the first page. + @param[in,out] block first page of the tablespace + @param[in,out] mtr mini-transaction */ + void write_page0(buf_block_t* block, mtr_t* mtr); + + uint min_key_version; // min key version for this space + fil_encryption_t encryption; // Encryption setup + + mysql_mutex_t mutex; // mutex protecting following variables + + /** Return code from encryption_key_get_latest_version. + If ENCRYPTION_KEY_VERSION_INVALID encryption plugin + could not find the key and there is no need to call + get_latest_key_version again as keys are read only + at startup. 
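should_encrypt() above resolves the same kind of three-way setting as the statistics flags earlier: explicit ON always encrypts, explicit OFF never does, and DEFAULT follows the global innodb_encrypt_tables setting. As a standalone predicate:

enum fil_enc_mode { ENC_DEFAULT, ENC_ON, ENC_OFF };

/* Mirror of fil_space_crypt_t::should_encrypt() above: ON forces
   encryption, OFF forbids it, DEFAULT defers to the server setting. */
bool should_encrypt(fil_enc_mode mode, bool srv_encrypt_tables)
{
    return mode == ENC_ON
        || (mode == ENC_DEFAULT && srv_encrypt_tables);
}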
*/ + uint key_found; + + fil_space_rotate_state_t rotate_state; +}; + +/** Status info about encryption */ +struct fil_space_crypt_status_t { + ulint space; /*!< tablespace id */ + ulint scheme; /*!< encryption scheme */ + uint min_key_version; /*!< min key version */ + uint current_key_version;/*!< current key version */ + uint keyserver_requests;/*!< no of key requests to key server */ + uint key_id; /*!< current key_id */ + bool rotating; /*!< is key rotation ongoing */ + bool flushing; /*!< is flush at end of rotation ongoing */ + ulint rotate_next_page_number; /*!< next page if key rotating */ + ulint rotate_max_page_number; /*!< max page if key rotating */ +}; + +/** Statistics about encryption key rotation */ +struct fil_crypt_stat_t +{ + ulint pages_read_from_cache= 0; + ulint pages_read_from_disk= 0; + ulint pages_modified= 0; + ulint pages_flushed= 0; + ulint estimated_iops= 0; +}; + +/** Init space crypt */ +void fil_space_crypt_init(); + +/** Cleanup space crypt */ +void fil_space_crypt_cleanup(); + +/** +Create a fil_space_crypt_t object +@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or + FIL_ENCRYPTION_ON or + FIL_ENCRYPTION_OFF + +@param[in] key_id Encryption key id +@return crypt object */ +fil_space_crypt_t* +fil_space_create_crypt_data( + fil_encryption_t encrypt_mode, + uint key_id) + MY_ATTRIBUTE((warn_unused_result)); + +/** Initialize encryption parameters from a tablespace header page. +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] page first page of the tablespace +@return crypt data from page 0 +@retval NULL if not present or not valid */ +fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** +Free a crypt data object +@param[in,out] crypt_data crypt data to be freed */ +void fil_space_destroy_crypt_data(fil_space_crypt_t **crypt_data); + +/** Amend encryption information from redo log. +@param[in] space tablespace +@param[in] data encryption metadata */ +void fil_crypt_parse(fil_space_t* space, const byte* data); + +/** Encrypt a buffer. +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] src_frame Page to encrypt +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] dst_frame Output buffer +@param[in] use_full_checksum full crc32 algo is used +@return encrypted buffer or NULL */ +byte* +fil_encrypt_buf( + fil_space_crypt_t* crypt_data, + ulint space, + ulint offset, + const byte* src_frame, + ulint zip_size, + byte* dst_frame, + bool use_full_checksum) + MY_ATTRIBUTE((warn_unused_result)); + +/** +Encrypt a page. + +@param[in] space Tablespace +@param[in] offset Page offset +@param[in] src_frame Page to encrypt +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ +byte* fil_space_encrypt( + const fil_space_t* space, + ulint offset, + byte* src_frame, + byte* dst_frame) + MY_ATTRIBUTE((warn_unused_result)); + +/** Decrypt a page. 
+@param[in] space_id space id
+@param[in] fsp_flags Tablespace flags
+@param[in] crypt_data crypt_data
+@param[in] tmp_frame Temporary buffer
+@param[in] physical_size page size
+@param[in,out] src_frame Page to decrypt
+@retval DB_SUCCESS on success
+@retval DB_DECRYPTION_FAILED on error */
+dberr_t
+fil_space_decrypt(
+ uint32_t space_id,
+ uint32_t fsp_flags,
+ fil_space_crypt_t* crypt_data,
+ byte* tmp_frame,
+ ulint physical_size,
+ byte* src_frame);
+
+/******************************************************************
+Decrypt a page
+@param[in] space Tablespace
+@param[in] tmp_frame Temporary buffer used for decrypting
+@param[in,out] src_frame Page to decrypt
+@return decrypted page, or original not encrypted page if decryption is
+not needed.
+@retval nullptr on failure */
+byte*
+fil_space_decrypt(
+ const fil_space_t* space,
+ byte* tmp_frame,
+ byte* src_frame)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************
+Adjust thread count for key rotation
+@param[in] new_cnt Number of threads to be used */
+void fil_crypt_set_thread_cnt(const uint new_cnt);
+
+/*********************************************************************
+Adjust max key age
+@param[in] val New max key age */
+void fil_crypt_set_rotate_key_age(uint val);
+
+/*********************************************************************
+Adjust rotation iops
+@param[in] val New max rotation iops */
+void fil_crypt_set_rotation_iops(uint val);
+
+/*********************************************************************
+Adjust encrypt tables
+@param[in] val New setting for innodb-encrypt-tables */
+void fil_crypt_set_encrypt_tables(ulong val);
+
+/*********************************************************************
+Init threads for key rotation */
+void fil_crypt_threads_init();
+
+/*********************************************************************
+Clean up key rotation threads resources */
+void fil_crypt_threads_cleanup();
+
+/*********************************************************************
+Wait for crypt threads to stop accessing space
+@param[in] space Tablespace */
+void fil_space_crypt_close_tablespace(const fil_space_t *space);
+
+/*********************************************************************
+Get crypt status for a space (used by information_schema)
+@param[in] space Tablespace
+@param[out] status Crypt status
+return 0 if crypt data present */
+void
+fil_space_crypt_get_status(
+ const fil_space_t* space,
+ struct fil_space_crypt_status_t* status);
+
+/*********************************************************************
+Return crypt statistics
+@param[out] stat Crypt statistics */
+void fil_crypt_total_stat(fil_crypt_stat_t *stat);
+
+#include "fil0crypt.inl"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/**
+Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if the tablespace contains crypt_data
+metadata (a strong indication that the tablespace is encrypted).
+The function also verifies that the traditional checksum does not match the
+calculated checksum, because if it does, the page could be valid unencrypted,
+encrypted, or corrupted.
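fil_space_read_crypt_data(), declared above, only treats page 0 as carrying encryption metadata when the CRYPT_MAGIC pattern is found at the expected position. A hedged sketch of that gate; the offset parameter is an assumption here (the real code derives it from the tablespace flags):

#include <cstring>

static const unsigned char MAGIC[6] = { 's', 0xE, 0xC, 'R', 'E', 't' };

/* True if the 6-byte crypt magic is present at crypt_offset on the
   first page; only then is the rest of the metadata worth parsing. */
bool has_crypt_magic(const unsigned char* page0, std::size_t crypt_offset)
{
    return std::memcmp(page0 + crypt_offset, MAGIC, sizeof MAGIC) == 0;
}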
+ +@param[in,out] page page frame (checksum is temporarily modified) +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return true if page is encrypted AND OK, false otherwise */ +bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size) + MY_ATTRIBUTE((warn_unused_result)); + +/** Add the tablespace to the rotation list if +innodb_encrypt_rotate_key_age is 0 or encryption plugin does +not do key version rotation +@return whether the tablespace should be added to rotation list */ +bool fil_crypt_must_default_encrypt(); + +#endif /* fil0crypt_h */ diff --git a/storage/innobase/include/fil0crypt.inl b/storage/innobase/include/fil0crypt.inl new file mode 100644 index 00000000..cc59b394 --- /dev/null +++ b/storage/innobase/include/fil0crypt.inl @@ -0,0 +1,81 @@ +/***************************************************************************** + +Copyright (c) 2015, 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fil0crypt.ic +The low-level file system encryption support functions + +Created 04/01/2015 Jan Lindström +*******************************************************/ + +/*******************************************************************//** +Find out whether the page is page encrypted +@return true if page is page encrypted, false if not */ +UNIV_INLINE +bool +fil_page_is_encrypted( +/*==================*/ + const byte *buf) /*!< in: page */ +{ + return(mach_read_from_4(buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) != 0); +} + +/*******************************************************************//** +Get current encryption mode from crypt_data. +@return string representation */ +UNIV_INLINE +const char * +fil_crypt_get_mode( +/*===============*/ + const fil_space_crypt_t* crypt_data) +{ + switch (crypt_data->encryption) { + case FIL_ENCRYPTION_DEFAULT: + return("Default tablespace encryption mode"); + case FIL_ENCRYPTION_ON: + return("Tablespace encrypted"); + case FIL_ENCRYPTION_OFF: + return("Tablespace not encrypted"); + } + + ut_error; + return ("NULL"); +} + +/*******************************************************************//** +Get current encryption type from crypt_data. 
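fil_page_is_encrypted() above tests the 4-byte key-version field in the page header; zero means an unencrypted page. A standalone sketch, assuming the field sits at byte offset 26 (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION):

#include <cstdint>

/* A page is treated as encrypted when the big-endian key-version
   field in its header is nonzero; the offset 26 is an assumption. */
bool page_is_encrypted(const uint8_t* page)
{
    const uint8_t* p = page + 26;
    uint32_t key_version = uint32_t(p[0]) << 24 | uint32_t(p[1]) << 16
                         | uint32_t(p[2]) << 8  | uint32_t(p[3]);
    return key_version != 0;
}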
+@return string representation */
+UNIV_INLINE
+const char *
+fil_crypt_get_type(
+ const fil_space_crypt_t* crypt_data)
+{
+ ut_ad(crypt_data != NULL);
+ switch (crypt_data->type) {
+ case CRYPT_SCHEME_UNENCRYPTED:
+ return("scheme unencrypted");
+ break;
+ case CRYPT_SCHEME_1:
+ return("scheme encrypted");
+ break;
+ default:
+ ut_error;
+ }
+
+ return ("NULL");
+}
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
new file mode 100644
index 00000000..6f58e3c1
--- /dev/null
+++ b/storage/innobase/include/fil0fil.h
@@ -0,0 +1,1823 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0fil.h
+The low-level file system
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "fsp0types.h"
+#include "mach0data.h"
+#include "assume_aligned.h"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "srw_lock.h"
+#include "buf0dblwr.h"
+#include "hash0hash.h"
+#include "log0recv.h"
+#include "dict0types.h"
+#include "ilist.h"
+#include
+#include
+
+struct unflushed_spaces_tag_t;
+struct default_encrypt_tag_t;
+struct space_list_tag_t;
+struct named_spaces_tag_t;
+
+using space_list_t= ilist<fil_space_t, space_list_tag_t>;
+
+// Forward declaration
+extern my_bool srv_use_doublewrite_buf;
+
+/** Possible values of innodb_flush_method */
+enum srv_flush_t
+{
+ /** fsync, the default */
+ SRV_FSYNC= 0,
+ /** open log files in O_DSYNC mode */
+ SRV_O_DSYNC,
+ /** do not call os_file_flush() when writing data files, but do flush
+ after writing to log files */
+ SRV_LITTLESYNC,
+ /** do not flush after writing */
+ SRV_NOSYNC,
+ /** invoke os_file_set_nocache() on data files. This implies using
+ unbuffered I/O but still fdatasync(), because some filesystems might
+ not flush meta-data on write completion */
+ SRV_O_DIRECT,
+ /** Like O_DIRECT, but skip fdatasync(), assuming that the data is
+ durable on write completion */
+ SRV_O_DIRECT_NO_FSYNC
+#ifdef _WIN32
+ /** Traditional Windows approach to open all files without caching,
+ and do FlushFileBuffers() */
+ ,SRV_ALL_O_DIRECT_FSYNC
+#endif
+};
+
+/** innodb_flush_method */
+extern ulong srv_file_flush_method;
+
+/** Undo tablespaces start with this space_id. */
+extern uint32_t srv_undo_space_id_start;
+/** The number of UNDO tablespaces that are open and ready to use. */
+extern uint32_t srv_undo_tablespaces_open;
+
+/** Check whether given space id is undo tablespace id
+@param[in] space_id space id to check
+@return true if it is undo tablespace else false.
+/** Check whether a given space_id is an undo tablespace ID
+@param[in]	space_id	space_id to check
+@return true if it is an undo tablespace, else false */
+inline bool srv_is_undo_tablespace(uint32_t space_id)
+{
+  return srv_undo_space_id_start > 0 &&
+    space_id >= srv_undo_space_id_start &&
+    space_id < srv_undo_space_id_start + srv_undo_tablespaces_open;
+}
+
+class page_id_t;
+
+/** Structure containing encryption specification */
+struct fil_space_crypt_t;
+
+/** File types */
+enum fil_type_t {
+	/** temporary tablespace (temporary undo log or tables) */
+	FIL_TYPE_TEMPORARY,
+	/** a tablespace that is being imported (no logging until finished) */
+	FIL_TYPE_IMPORT,
+	/** persistent tablespace (for system, undo log or tables) */
+	FIL_TYPE_TABLESPACE,
+};
+
+struct fil_node_t;
+
+/** Structure to store the first and last value of a range */
+struct range_t
+{
+  uint32_t first;
+  uint32_t last;
+};
+
+/** Sort the ranges by their first value */
+struct range_compare
+{
+  bool operator() (const range_t lhs, const range_t rhs) const
+  {
+    return lhs.first < rhs.first;
+  }
+};
+
+using range_set_t= std::set<range_t, range_compare>;
+/** Set of ranges of integers */
+class range_set
+{
+private:
+  range_set_t ranges;
+
+  range_set_t::iterator find(uint32_t value) const
+  {
+    auto r_offset= ranges.lower_bound({value, value});
+    const auto r_end= ranges.end();
+    if (r_offset == r_end)
+    {
+      if (empty())
+        return r_end;
+      r_offset= std::prev(r_end);
+    }
+    if (r_offset->first <= value && r_offset->last >= value)
+      return r_offset;
+    return r_end;
+  }
+public:
+  /** Merge the current range with the previous range.
+  @param[in]	range		range to be merged
+  @param[in]	prev_range	range to be merged with the next */
+  void merge_range(range_set_t::iterator range,
+                   range_set_t::iterator prev_range)
+  {
+    if (range->first != prev_range->last + 1)
+      return;
+
+    /* Merge the current range with the previous range */
+    range_t new_range {prev_range->first, range->last};
+    ranges.erase(prev_range);
+    ranges.erase(range);
+    ranges.emplace(new_range);
+  }
+
+  /** Split a range into two ranges around a removed value
+  @param[in]	range	range to be split
+  @param[in]	value	value to be removed from the range */
+  void split_range(range_set_t::iterator range, uint32_t value)
+  {
+    range_t split1{range->first, value - 1};
+    range_t split2{value + 1, range->last};
+
+    /* Remove the existing element */
+    ranges.erase(range);
+
+    /* Insert the two new elements */
+    ranges.emplace(split1);
+    ranges.emplace(split2);
+  }
+
+  /** Remove a value from within a range
+  @param[in,out]	range	range to be changed
+  @param[in]	value	value to be removed */
+  void remove_within_range(range_set_t::iterator range, uint32_t value)
+  {
+    range_t new_range{range->first, range->last};
+    if (value == range->first)
+    {
+      if (range->first == range->last)
+      {
+        ranges.erase(range);
+        return;
+      }
+      else
+        new_range.first++;
+    }
+    else if (value == range->last)
+      new_range.last--;
+    else if (range->first < value && range->last > value)
+      return split_range(range, value);
+
+    ranges.erase(range);
+    ranges.emplace(new_range);
+  }
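find() leans on a subtlety of std::set::lower_bound with this comparator:
lower_bound({v,v}) returns the first stored range whose first value is >= v,
so a range that covers v but starts below it is only reachable via
std::prev(). A standalone illustration (hypothetical values):

  #include <cassert>
  #include <cstdint>
  #include <iterator>
  #include <set>

  struct rng { uint32_t first, last; };
  struct by_first
  { bool operator()(rng a, rng b) const { return a.first < b.first; } };

  int main()
  {
    std::set<rng, by_first> s{{10, 20}, {40, 50}};
    auto it= s.lower_bound({15, 15}); // first range with first >= 15
    assert(it->first == 40);          // not the range covering 15...
    auto p= std::prev(it);            // ...that one is its predecessor
    assert(p->first <= 15 && 15 <= p->last);
    return 0;
  }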
+  /** Remove a value from the ranges.
+  @param[in]	value	value to be removed */
+  void remove_value(uint32_t value)
+  {
+    if (empty())
+      return;
+    range_t new_range {value, value};
+    range_set_t::iterator range= ranges.lower_bound(new_range);
+    if (range == ranges.end())
+      return remove_within_range(std::prev(range), value);
+
+    if (range->first > value && range != ranges.begin())
+      /* Look into the previous range for the value to delete */
+      return remove_within_range(std::prev(range), value);
+    return remove_within_range(range, value);
+  }
+  /** Add a value to an existing range
+  @param[in]	range	range to be modified
+  @param[in]	value	value to be added */
+  range_set_t::iterator add_within_range(range_set_t::iterator range,
+                                         uint32_t value)
+  {
+    if (range->first <= value && range->last >= value)
+      return range;
+
+    range_t new_range{range->first, range->last};
+    if (range->last + 1 == value)
+      new_range.last++;
+    else if (range->first - 1 == value)
+      new_range.first--;
+    else
+      return ranges.end();
+    ranges.erase(range);
+    return ranges.emplace(new_range).first;
+  }
+  /** Add a range to the set of ranges
+  @param[in]	new_range	range to be added */
+  void add_range(range_t new_range)
+  {
+    auto r_offset= ranges.lower_bound(new_range);
+    auto r_begin= ranges.begin();
+    auto r_end= ranges.end();
+    if (!ranges.size())
+    {
+new_range:
+      ranges.emplace(new_range);
+      return;
+    }
+
+    if (r_offset == r_end)
+    {
+      /* last range */
+      if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+        goto new_range;
+    }
+    else if (r_offset == r_begin)
+    {
+      /* first range */
+      if (add_within_range(r_offset, new_range.first) == r_end)
+        goto new_range;
+    }
+    else if (r_offset->first - 1 == new_range.first)
+    {
+      /* extend the start of the existing range */
+      auto r_value= add_within_range(r_offset, new_range.first);
+      if (r_value != ranges.begin())
+        merge_range(r_value, std::prev(r_value));
+    }
+    else
+    {
+      /* try to extend the last value of the previous range */
+      if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+        goto new_range;
+    }
+  }
+
+  /** Add a value to the ranges
+  @param[in]	value	value to be added */
+  void add_value(uint32_t value)
+  {
+    range_t new_range{value, value};
+    add_range(new_range);
+  }
+
+  bool remove_if_exists(uint32_t value)
+  {
+    auto r_offset= find(value);
+    if (r_offset != ranges.end())
+    {
+      remove_within_range(r_offset, value);
+      return true;
+    }
+    return false;
+  }
+
+  bool contains(uint32_t value) const
+  {
+    return find(value) != ranges.end();
+  }
+
+  ulint size() { return ranges.size(); }
+  void clear() { ranges.clear(); }
+  bool empty() const { return ranges.empty(); }
+  typename range_set_t::iterator begin() { return ranges.begin(); }
+  typename range_set_t::iterator end() { return ranges.end(); }
+};
+#endif
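A minimal usage sketch of the class above (hypothetical page numbers):
adjacent values coalesce into one range, and removing an interior value
splits its range in two.

  range_set freed;
  freed.add_value(10);
  freed.add_value(11);           // merges into the range {10, 11}
  freed.add_range({20, 29});
  freed.remove_value(25);        // splits into {20, 24} and {26, 29}
  assert(freed.contains(29));
  assert(!freed.contains(25));
  assert(freed.size() == 3);     // {10,11}, {20,24}, {26,29}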
+
+/** Tablespace or log data space */
+#ifndef UNIV_INNOCHECKSUM
+struct fil_io_t
+{
+  /** error code */
+  dberr_t err;
+  /** file; node->space->release() must follow IORequestRead call */
+  fil_node_t *node;
+};
+
+/** Tablespace encryption mode */
+enum fil_encryption_t
+{
+  /** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */
+  FIL_ENCRYPTION_DEFAULT,
+  /** Encrypted */
+  FIL_ENCRYPTION_ON,
+  /** Not encrypted */
+  FIL_ENCRYPTION_OFF
+};
+
+struct fil_space_t final : ilist_node<unflushed_spaces_tag_t>,
+                           ilist_node<default_encrypt_tag_t>,
+                           ilist_node<space_list_tag_t>,
+                           ilist_node<named_spaces_tag_t>
+#else
+struct fil_space_t final
+#endif
+{
+#ifndef UNIV_INNOCHECKSUM
+  friend fil_node_t;
+  ~fil_space_t()
+  {
+    ut_ad(!latch_owner);
+    ut_ad(!latch_count);
+    latch.destroy();
+  }
+
+  /** fil_system.spaces chain node */
+  fil_space_t *hash;
+  /** LSN of the most recent fil_names_write_if_was_clean().
+  Reset to 0 by fil_names_clear(). Protected by exclusive log_sys.latch.
+  If and only if max_lsn is nonzero, this is in fil_system.named_spaces. */
+  lsn_t max_lsn;
+  /** tablespace identifier */
+  uint32_t id;
+  /** whether undo tablespace truncation is in progress */
+  bool is_being_truncated;
+  fil_type_t purpose;	/*!< purpose */
+  UT_LIST_BASE_NODE_T(fil_node_t) chain;
+			/*!< base node for the file chain */
+  uint32_t size;	/*!< tablespace file size in pages;
+			0 if not known yet */
+  uint32_t size_in_header;
+			/*!< FSP_SIZE in the tablespace header;
+			0 if not known yet */
+  uint32_t free_len;
+			/*!< length of the FSP_FREE list */
+  uint32_t free_limit;
+			/*!< contents of FSP_FREE_LIMIT */
+  uint32_t recv_size;
+			/*!< recovered tablespace size in pages;
+			0 if no size change was read from the redo log,
+			or if the size change was implemented */
+  uint32_t n_reserved_extents;
+			/*!< number of reserved free extents for
+			ongoing operations like B-tree page split */
+private:
+#ifdef UNIV_DEBUG
+  fil_space_t *next_in_space_list();
+  fil_space_t *prev_in_space_list();
+
+  fil_space_t *next_in_unflushed_spaces();
+  fil_space_t *prev_in_unflushed_spaces();
+#endif
+
+  /** the committed size of the tablespace in pages */
+  Atomic_relaxed<uint32_t> committed_size;
+  /** Number of pending operations on the file.
+  The tablespace cannot be freed while (n_pending & PENDING) != 0. */
+  std::atomic<uint32_t> n_pending;
+  /** Flag in n_pending that indicates that the tablespace is about to be
+  deleted, and no further operations should be performed */
+  static constexpr uint32_t STOPPING_READS= 1U << 31;
+  /** Flag in n_pending that indicates that the tablespace is being
+  deleted, and no further operations should be performed */
+  static constexpr uint32_t STOPPING_WRITES= 1U << 30;
+  /** Flags in n_pending that indicate that the tablespace is being
+  deleted, and no further operations should be performed */
+  static constexpr uint32_t STOPPING= STOPPING_READS | STOPPING_WRITES;
+  /** Flag in n_pending that indicates that the tablespace is a candidate
+  for being closed, and fil_node_t::is_open() can only be trusted after
+  acquiring fil_system.mutex and resetting the flag */
+  static constexpr uint32_t CLOSING= 1U << 29;
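All of this state shares one 32-bit atomic word: the four high bits carry the
flags declared here, and the remaining low bits count references. A
standalone sketch (constants copied for illustration only):

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  int main()
  {
    constexpr uint32_t STOPPING_READS= 1U << 31, STOPPING_WRITES= 1U << 30,
                       CLOSING= 1U << 29, NEEDS_FSYNC= 1U << 28;
    constexpr uint32_t PENDING=
      ~(STOPPING_READS | STOPPING_WRITES | CLOSING | NEEDS_FSYNC);

    std::atomic<uint32_t> n_pending{0};
    n_pending.fetch_add(1);          // acquire a reference
    n_pending.fetch_or(CLOSING);     // flag a close candidate
    const uint32_t n= n_pending.load();
    assert((n & PENDING) == 1);      // the flag does not disturb the count
    assert(n & CLOSING);
    return 0;
  }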
+  /** Flag in n_pending that indicates that the tablespace needs fsync().
+  This must be the least significant flag bit; @see release_flush() */
+  static constexpr uint32_t NEEDS_FSYNC= 1U << 28;
+  /** The reference count */
+  static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC);
+  /** latch protecting all page allocation bitmap pages */
+  srw_lock latch;
+  pthread_t latch_owner;
+  ut_d(Atomic_relaxed<uint32_t> latch_count;)
+public:
+  /** MariaDB encryption data */
+  fil_space_crypt_t *crypt_data;
+
+  /** Whether needs_flush(), or this is in fil_system.unflushed_spaces */
+  bool is_in_unflushed_spaces;
+
+  /** Whether this is in fil_system.default_encrypt_tables (needs key rotation) */
+  bool is_in_default_encrypt;
+
+private:
+  /** Whether any corruption of this tablespace has been reported */
+  mutable std::atomic_flag is_corrupted;
+
+public:
+  /** mutex to protect freed_ranges and last_freed_lsn */
+  std::mutex freed_range_mutex;
+private:
+  /** Ranges of freed page numbers; protected by freed_range_mutex */
+  range_set freed_ranges;
+
+  /** LSN of freeing the last page; protected by freed_range_mutex */
+  lsn_t last_freed_lsn;
+
+public:
+  /** @return whether doublewrite buffering is needed */
+  inline bool use_doublewrite() const;
+
+  /** @return whether a page has been freed */
+  inline bool is_freed(uint32_t page);
+
+  /** Apply freed_ranges to the file.
+  @param writable whether the file is writable
+  @return number of pages written or hole-punched */
+  uint32_t flush_freed(bool writable);
+
+  /** Append a file to the chain of files of a space.
+  @param[in]	name		file name of a file that is not open
+  @param[in]	handle		file handle, or OS_FILE_CLOSED
+  @param[in]	size		file size in entire database pages
+  @param[in]	is_raw		whether this is a raw device
+  @param[in]	atomic_write	true if atomic write could be enabled
+  @param[in]	max_pages	maximum number of pages in the file,
+				or UINT32_MAX for unlimited
+  @return file object */
+  fil_node_t* add(const char* name, pfs_os_file_t handle,
+		  uint32_t size, bool is_raw, bool atomic_write,
+		  uint32_t max_pages = UINT32_MAX);
+#ifdef UNIV_DEBUG
+  /** Assert that the mini-transaction is compatible with
+  updating an allocation bitmap page.
+  @param[in]	mtr	mini-transaction */
+  void modify_check(const mtr_t& mtr) const;
+#endif /* UNIV_DEBUG */
+
+  /** Try to reserve free extents.
+  @param[in]	n_free_now	current number of free extents
+  @param[in]	n_to_reserve	number of extents to reserve
+  @return whether the reservation succeeded */
+  bool reserve_free_extents(uint32_t n_free_now, uint32_t n_to_reserve)
+  {
+    if (n_reserved_extents + n_to_reserve > n_free_now) {
+      return false;
+    }
+
+    n_reserved_extents += n_to_reserve;
+    return true;
+  }
+
+  /** Release the reserved free extents.
+  @param[in]	n_reserved	number of reserved extents */
+  void release_free_extents(uint32_t n_reserved)
+  {
+    if (!n_reserved) return;
+    ut_a(n_reserved_extents >= n_reserved);
+    n_reserved_extents -= n_reserved;
+  }
+
+  /** Rename a file.
+  @param[in]	path	tablespace file name after renaming
+  @param[in]	log	whether to write redo log
+  @param[in]	replace	whether to ignore the existence of path
+  @return error code
+  @retval DB_SUCCESS on success */
+  dberr_t rename(const char *path, bool log, bool replace= false)
+    MY_ATTRIBUTE((nonnull));
+
+  /** Note that the tablespace has been imported.
+  Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
+  written while the space ID is being updated in each page.
*/ + inline void set_imported(); + + /** Report the tablespace as corrupted */ + ATTRIBUTE_COLD void set_corrupted() const; + + /** @return whether the storage device is rotational (HDD, not SSD) */ + inline bool is_rotational() const; + + /** Open each file. Never invoked on .ibd files. + @param create_new_db whether to skip the call to fil_node_t::read_page0() + @return whether all files were opened */ + bool open(bool create_new_db); + /** Close each file. Only invoked on fil_system.temp_space. */ + void close(); + + /** Note that operations on the tablespace must stop. */ + inline void set_stopping(); + + /** Note that operations on the tablespace can resume after truncation */ + inline void clear_stopping(); + + /** Drop the tablespace and wait for any pending operations to cease + @param id tablespace identifier + @param detached_handle pointer to file to be closed later, or nullptr + @return tablespace to invoke fil_space_free() on + @retval nullptr if no tablespace was found, or it was deleted by + another concurrent thread */ + static fil_space_t *drop(uint32_t id, pfs_os_file_t *detached_handle); + +private: + MY_ATTRIBUTE((warn_unused_result)) + /** Try to acquire a tablespace reference (increment referenced()). + @param avoid when these flags are set, nothing will be acquired + @return the old reference count */ + uint32_t acquire_low(uint32_t avoid= STOPPING) + { + uint32_t n= 0; + while (!n_pending.compare_exchange_strong(n, n + 1, + std::memory_order_acquire, + std::memory_order_relaxed) && + !(n & avoid)); + return n; + } +public: + MY_ATTRIBUTE((warn_unused_result)) + /** Acquire a tablespace reference. + @return whether a tablespace reference was successfully acquired */ + inline bool acquire_if_not_stopped(); + + MY_ATTRIBUTE((warn_unused_result)) + /** Acquire a tablespace reference for I/O. + @param avoid when these flags are set, nothing will be acquired + @return whether the file is usable */ + bool acquire(uint32_t avoid= STOPPING | CLOSING) + { + const auto flags= acquire_low(avoid) & (avoid); + return UNIV_LIKELY(!flags) || (flags == CLOSING && acquire_and_prepare()); + } + + /** Acquire a tablespace reference for writing. + @param avoid when these flags are set, nothing will be acquired + @return whether the file is writable */ + bool acquire_for_write() { return acquire(STOPPING_WRITES | CLOSING); } + + /** Acquire another tablespace reference for I/O. */ + inline void reacquire(); + + /** Release a tablespace reference. 
+  @return whether this was the last reference */
+  bool release()
+  {
+    uint32_t n= n_pending.fetch_sub(1, std::memory_order_release);
+    ut_ad(n & PENDING);
+    return (n & PENDING) == 1;
+  }
+
+  /** Clear the NEEDS_FSYNC flag */
+  void clear_flush()
+  {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
+    __asm__ __volatile__("lock btrl $28, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
+    _interlockedbittestandreset(reinterpret_cast<volatile long*>
+                                (&n_pending), 28);
+#else
+    n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
+#endif
+  }
+
+private:
+  /** Clear the CLOSING flag */
+  void clear_closing()
+  {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    static_assert(CLOSING == 1U << 29, "compatibility");
+    __asm__ __volatile__("lock btrl $29, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    static_assert(CLOSING == 1U << 29, "compatibility");
+    _interlockedbittestandreset(reinterpret_cast<volatile long*>
+                                (&n_pending), 29);
+#else
+    n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
+#endif
+  }
+
+  /** @return pending operations (and flags) */
+  uint32_t pending() const { return n_pending.load(std::memory_order_acquire); }
+public:
+  /** @return whether close() of the file handle has been requested */
+  bool is_closing() const { return pending() & CLOSING; }
+  /** @return whether the tablespace is about to be dropped */
+  bool is_stopping() const { return pending() & STOPPING; }
+  /** @return whether the tablespace is going to be dropped */
+  bool is_stopping_writes() const { return pending() & STOPPING_WRITES; }
+  /** @return whether there are no pending operations, and close()
+  of the file handle has been requested */
+  bool is_ready_to_close() const
+  { return (pending() & (PENDING | CLOSING)) == CLOSING; }
+  /** @return whether fsync() or similar is needed */
+  bool needs_flush() const { return pending() & NEEDS_FSYNC; }
+  /** @return whether fsync() or similar is needed, and the tablespace is
+  not being dropped */
+  bool needs_flush_not_stopping() const
+  { return (pending() & (NEEDS_FSYNC | STOPPING_WRITES)) == NEEDS_FSYNC; }
+
+  uint32_t referenced() const { return pending() & PENDING; }
+private:
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Prepare to close the file handle.
+  @return number of pending operations, possibly with NEEDS_FSYNC flag */
+  uint32_t set_closing()
+  {
+    return n_pending.fetch_or(CLOSING, std::memory_order_acquire);
+  }
+
+public:
+  /** Try to close a file to adhere to the innodb_open_files limit.
+  @param print_info   whether to diagnose why a file cannot be closed
+  @return whether a file was closed */
+  static bool try_to_close(bool print_info);
+
+  /** Close all tablespace files at shutdown */
+  static void close_all();
+
+  /** Update last_freed_lsn */
+  void update_last_freed_lsn(lsn_t lsn) { last_freed_lsn= lsn; }
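clear_flush() and clear_closing() clear one flag bit without disturbing the
reference count; on x86 this compiles down to a single "lock btr"
instruction, and elsewhere to the portable fetch_and() of the fallback
branch. A standalone equivalent of the fallback (hypothetical values):

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  int main()
  {
    constexpr uint32_t NEEDS_FSYNC= 1U << 28;
    std::atomic<uint32_t> word{NEEDS_FSYNC | 5};   // flag set, 5 references
    word.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
    assert(word.load() == 5);                      // flag gone, count intact
    return 0;
  }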
+  /** Note that the file will need fsync().
+  @return whether this needs to be added to fil_system.unflushed_spaces */
+  bool set_needs_flush()
+  {
+    uint32_t n= 1;
+    while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
+                                              std::memory_order_acquire,
+                                              std::memory_order_relaxed))
+    {
+      ut_ad(n & PENDING);
+      if (n & (NEEDS_FSYNC | STOPPING_WRITES))
+        return false;
+    }
+
+    return true;
+  }
+
+  /** Clear all freed ranges for an undo tablespace when InnoDB
+  encounters a TRIM redo log record */
+  void clear_freed_ranges() { freed_ranges.clear(); }
+#endif /* !UNIV_INNOCHECKSUM */
+  /** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags;
+  check fsp0types.h for more information about the flags. */
+  uint32_t flags;
+
+  /** Determine if full_crc32 is used for a data file
+  @param[in]	flags	tablespace flags (FSP_SPACE_FLAGS)
+  @return whether the full_crc32 algorithm is active */
+  static bool full_crc32(uint32_t flags)
+  { return flags & FSP_FLAGS_FCRC32_MASK_MARKER; }
+  /** @return whether innodb_checksum_algorithm=full_crc32 is active */
+  bool full_crc32() const { return full_crc32(flags); }
+  /** Determine if full_crc32 is used along with PAGE_COMPRESSED */
+  static bool is_full_crc32_compressed(uint32_t flags)
+  {
+    if (!full_crc32(flags))
+      return false;
+    auto algo= FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags);
+    DBUG_ASSERT(algo <= PAGE_ALGORITHM_LAST);
+    return algo != 0;
+  }
+  /** Determine the logical page size.
+  @param flags  tablespace flags (FSP_SPACE_FLAGS)
+  @return the logical page size
+  @retval 0 if the flags are invalid */
+  static unsigned logical_size(uint32_t flags)
+  {
+    switch (full_crc32(flags)
+            ? FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags)
+            : FSP_FLAGS_GET_PAGE_SSIZE(flags)) {
+    case 3: return 4096;
+    case 4: return 8192;
+    case 5: return full_crc32(flags) ? 16384 : 0;
+    case 0: return full_crc32(flags) ? 0 : 16384;
+    case 6: return 32768;
+    case 7: return 65536;
+    default: return 0;
+    }
+  }
+  /** Determine the ROW_FORMAT=COMPRESSED page size.
+  @param flags  tablespace flags (FSP_SPACE_FLAGS)
+  @return the ROW_FORMAT=COMPRESSED page size
+  @retval 0 if ROW_FORMAT=COMPRESSED is not used */
+  static unsigned zip_size(uint32_t flags)
+  {
+    if (full_crc32(flags))
+      return 0;
+    const uint32_t zip_ssize= FSP_FLAGS_GET_ZIP_SSIZE(flags);
+    return zip_ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize : 0;
+  }
+  /** Determine the physical page size.
+  @param flags  tablespace flags (FSP_SPACE_FLAGS)
+  @return the physical page size */
+  static unsigned physical_size(uint32_t flags)
+  {
+    if (full_crc32(flags))
+      return logical_size(flags);
+
+    const uint32_t zip_ssize= FSP_FLAGS_GET_ZIP_SSIZE(flags);
+    return zip_ssize
+      ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize
+      : unsigned(srv_page_size);
+  }
+
+  /** @return the ROW_FORMAT=COMPRESSED page size
+  @retval 0 if ROW_FORMAT=COMPRESSED is not used */
+  unsigned zip_size() const { return zip_size(flags); }
+  /** @return the physical page size */
+  unsigned physical_size() const { return physical_size(flags); }
+
+  /** Check whether PAGE_COMPRESSED is enabled.
+  @param[in]	flags	tablespace flags */
+  static bool is_compressed(uint32_t flags)
+  {
+    return is_full_crc32_compressed(flags) ||
+      FSP_FLAGS_HAS_PAGE_COMPRESSION(flags);
+  }
+  /** @return whether compression is enabled for the tablespace */
+  bool is_compressed() const { return is_compressed(flags); }
+
+  /** Get the compression algorithm for full crc32 format.
+ @param flags contents of FSP_SPACE_FLAGS + @return PAGE_COMPRESSED algorithm of full_crc32 tablespace + @retval 0 if not PAGE_COMPRESSED or not full_crc32 */ + static unsigned get_compression_algo(uint32_t flags) + { + return full_crc32(flags) + ? FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) + : 0; + } + /** @return the page_compressed algorithm + @retval 0 if not page_compressed */ + unsigned get_compression_algo() const { return get_compression_algo(flags); } + /** Determine if the page_compressed page contains an extra byte + for exact compressed stream length + @param flags contents of FSP_SPACE_FLAGS + @return whether the extra byte is needed */ + static bool full_crc32_page_compressed_len(uint32_t flags) + { + DBUG_ASSERT(full_crc32(flags)); + switch (get_compression_algo(flags)) { + case PAGE_LZ4_ALGORITHM: + case PAGE_LZO_ALGORITHM: + case PAGE_SNAPPY_ALGORITHM: + return true; + } + return false; + } + + /** Whether the full checksum matches with non full checksum flags. + @param flags contents of FSP_SPACE_FLAGS + @param expected expected flags + @return true if it is equivalent */ + static bool is_flags_full_crc32_equal(uint32_t flags, uint32_t expected) + { + ut_ad(full_crc32(flags)); + uint32_t fcrc32_psize= FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags); + + if (full_crc32(expected)) + /* The data file may have been created with a + different innodb_compression_algorithm. But + we only support one innodb_page_size for all files. */ + return fcrc32_psize == FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected); + + uint32_t non_fcrc32_psize = FSP_FLAGS_GET_PAGE_SSIZE(expected); + if (!non_fcrc32_psize) + return fcrc32_psize == 5; + return fcrc32_psize == non_fcrc32_psize; + } + + /** Whether old tablespace flags match full_crc32 flags. + @param flags contents of FSP_SPACE_FLAGS + @param expected expected flags + @return true if it is equivalent */ + static bool is_flags_non_full_crc32_equal(uint32_t flags, uint32_t expected) + { + ut_ad(!full_crc32(flags)); + if (!full_crc32(expected)) + return false; + + uint32_t non_fcrc32_psize= FSP_FLAGS_GET_PAGE_SSIZE(flags); + uint32_t fcrc32_psize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected); + + if (!non_fcrc32_psize) + return fcrc32_psize == 5; + return fcrc32_psize == non_fcrc32_psize; + } + + /** Whether both fsp flags are equivalent */ + static bool is_flags_equal(uint32_t flags, uint32_t expected) + { + if (!((flags ^ expected) & ~(1U << FSP_FLAGS_POS_RESERVED))) + return true; + return full_crc32(flags) + ? is_flags_full_crc32_equal(flags, expected) + : is_flags_non_full_crc32_equal(flags, expected); + } + + /** Validate the tablespace flags for full crc32 format. + @param flags contents of FSP_SPACE_FLAGS + @return whether the flags are correct in full crc32 format */ + static bool is_fcrc32_valid_flags(uint32_t flags) + { + ut_ad(flags & FSP_FLAGS_FCRC32_MASK_MARKER); + const ulint page_ssize= physical_size(flags); + if (page_ssize < 3 || page_ssize & 8) + return false; + flags >>= FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + return flags <= PAGE_ALGORITHM_LAST; + } + /** Validate the tablespace flags. 
+  @param flags	contents of FSP_SPACE_FLAGS
+  @param is_ibd	whether this is an .ibd file (not the system tablespace)
+  @return whether the flags are correct */
+  static bool is_valid_flags(uint32_t flags, bool is_ibd)
+  {
+    DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return false;);
+    if (full_crc32(flags))
+      return is_fcrc32_valid_flags(flags);
+
+    if (flags == 0)
+      return true;
+    if (~FSP_FLAGS_MASK & flags)
+      return false;
+
+    if (FSP_FLAGS_MASK_ATOMIC_BLOBS ==
+        (flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS)))
+      /* If the "atomic blobs" flag (indicating
+      ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) is set, then the
+      ROW_FORMAT!=REDUNDANT flag must also be set. */
+      return false;
+
+    /* Bits 10..14 should be 0b0000d where d is the DATA_DIR flag
+    of MySQL 5.6 and MariaDB 10.0, which we ignore.
+    In the buggy FSP_SPACE_FLAGS written by MariaDB 10.1.0 to 10.1.20,
+    bits 10..14 would be nonzero 0bsssaa where sss is
+    nonzero PAGE_SSIZE (3, 4, 6, or 7)
+    and aa is ATOMIC_WRITES (not 0b11). */
+    if (FSP_FLAGS_GET_RESERVED(flags) & ~1U)
+      return false;
+
+    const uint32_t ssize= FSP_FLAGS_GET_PAGE_SSIZE(flags);
+    if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8)
+      /* the page_size is not between 4k and 64k;
+      16k should be encoded as 0, not 5 */
+      return false;
+
+    const uint32_t zssize= FSP_FLAGS_GET_ZIP_SSIZE(flags);
+    if (zssize == 0)
+      /* not ROW_FORMAT=COMPRESSED */;
+    else if (zssize > (ssize ? ssize : 5))
+      /* invalid KEY_BLOCK_SIZE */
+      return false;
+    else if (~flags &
+             (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS))
+      /* both of these flags must be set for ROW_FORMAT=COMPRESSED */
+      return false;
+
+    /* The flags do look valid. But, avoid misinterpreting
+    buggy MariaDB 10.1 format flags for
+    PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL={0,2,3}
+    as valid-looking PAGE_SSIZE if this is known to be
+    an .ibd file and we are using the default innodb_page_size=16k. */
+    return(ssize == 0 || !is_ibd || srv_page_size != UNIV_PAGE_SIZE_ORIG);
+  }
+
+#ifndef UNIV_INNOCHECKSUM
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Create a tablespace in fil_system.
+  @param id         tablespace identifier
+  @param flags      tablespace flags
+  @param purpose    tablespace purpose
+  @param crypt_data encryption information
+  @param mode       encryption mode
+  @param opened     true if space files are opened
+  @return pointer to created tablespace, to be filled in with add()
+  @retval nullptr on failure (such as when the same tablespace exists) */
+  static fil_space_t *create(uint32_t id, uint32_t flags,
+                             fil_type_t purpose, fil_space_crypt_t *crypt_data,
+                             fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT,
+                             bool opened= false);
+
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Acquire a tablespace reference.
+  @param id tablespace identifier
+  @return tablespace
+  @retval nullptr if the tablespace is missing or inaccessible */
+  static fil_space_t *get(uint32_t id);
+  /** Acquire a tablespace reference for writing.
+  @param id tablespace identifier
+  @return tablespace
+  @retval nullptr if the tablespace is missing or inaccessible */
+  static fil_space_t *get_for_write(uint32_t id);
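A hedged usage sketch of the two-step construction that the comment on
create() describes (space_id, flags, path and size_in_pages are hypothetical
caller-supplied values): create() registers the tablespace object, and add()
attaches its first data file.

  if (fil_space_t *space= fil_space_t::create(space_id, flags,
                                              FIL_TYPE_TABLESPACE,
                                              nullptr /* no encryption data */))
    space->add(path, OS_FILE_CLOSED, size_in_pages,
               false /* not a raw device */, true /* allow atomic writes */);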
+  /** Add or remove a free page in the freed ranges list.
+  @param[in]	offset	page number to be added or removed
+  @param[in]	add	true if the page is to be marked freed */
+  void free_page(uint32_t offset, bool add=true)
+  {
+    std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+    if (add)
+      return freed_ranges.add_value(offset);
+
+    if (freed_ranges.empty())
+      return;
+
+    return freed_ranges.remove_value(offset);
+  }
+
+  /** Set the ranges of freed pages */
+  void add_free_ranges(range_set ranges)
+  {
+    std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+    freed_ranges= std::move(ranges);
+  }
+
+  /** Add a range of freed pages */
+  void add_free_range(const range_t range)
+  {
+    freed_ranges.add_range(range);
+  }
+
+  /** Set the tablespace size in pages */
+  void set_sizes(uint32_t s)
+  {
+    ut_ad(id ? !size : (size >= s));
+    size= s; committed_size= s;
+  }
+
+  /** Update committed_size in mtr_t::commit() */
+  void set_committed_size() { committed_size= size; }
+
+  /** @return the last persisted page number */
+  uint32_t last_page_number() const { return committed_size - 1; }
+
+  /** @return the size in pages (0 if unreadable) */
+  inline uint32_t get_size();
+
+  /** Read or write data.
+  @param type   I/O context
+  @param offset offset in bytes
+  @param len    number of bytes
+  @param buf    the data to be read or written
+  @param bpage  buffer block (for type.is_async() completion callback)
+  @return status and file descriptor */
+  fil_io_t io(const IORequest &type, os_offset_t offset, size_t len,
+              void *buf, buf_page_t *bpage= nullptr);
+  /** Flush pending writes from the file system cache to the file. */
+  template<bool have_reference> inline void flush();
+  /** Flush pending writes from the file system cache to the file. */
+  void flush_low();
+
+  /** Read the first page of a data file.
+  @return whether the page was found valid */
+  bool read_page0();
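A simplified sketch of a synchronous page read through io() (space_id,
page_no and buf are hypothetical; error handling and the exact
reference-release contract on failure paths, which is defined by the
implementation in fil0fil.cc, are glossed over). Per the comment on
fil_io_t, a release must follow a successful IORequestRead call:

  if (fil_space_t *space= fil_space_t::get(space_id))
  {
    const unsigned ps= space->physical_size();
    fil_io_t fio= space->io(IORequestRead, os_offset_t{page_no} * ps, ps, buf);
    if (fio.err == DB_SUCCESS)
      fio.node->space->release();  // node->space == space here
  }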
+  /** Determine the next tablespace for encryption key rotation.
+  @param space    current tablespace (nullptr to start from the beginning)
+  @param recheck  whether the removal condition needs to be rechecked after
+                  encryption parameters were changed
+  @param encrypt  expected state of innodb_encrypt_tables
+  @return the next tablespace
+  @retval nullptr upon reaching the end of the iteration */
+  static space_list_t::iterator next(space_list_t::iterator space,
+                                     bool recheck, bool encrypt);
+
+#ifdef UNIV_DEBUG
+  bool is_latched() const { return latch_count != 0; }
+#endif
+  bool is_owner() const { return latch_owner == pthread_self(); }
+  /** Acquire the allocation latch in exclusive mode */
+  void x_lock()
+  {
+    latch.wr_lock(SRW_LOCK_CALL);
+    ut_ad(!latch_owner);
+    latch_owner= pthread_self();
+    ut_ad(!latch_count.fetch_add(1));
+  }
+  /** Release the allocation latch from exclusive mode */
+  void x_unlock()
+  {
+    ut_ad(latch_count.fetch_sub(1) == 1);
+    ut_ad(latch_owner == pthread_self());
+    latch_owner= 0;
+    latch.wr_unlock();
+  }
+  /** Acquire the allocation latch in shared mode */
+  void s_lock()
+  {
+    ut_ad(!is_owner());
+    latch.rd_lock(SRW_LOCK_CALL);
+    ut_ad(!latch_owner);
+    ut_d(latch_count.fetch_add(1));
+  }
+  /** Release the allocation latch from shared mode */
+  void s_unlock()
+  {
+    ut_ad(latch_count.fetch_sub(1));
+    ut_ad(!latch_owner);
+    latch.rd_unlock();
+  }
+
+  typedef span<const char> name_type;
+
+  /** @return the tablespace name (databasename/tablename) */
+  name_type name() const;
+
+private:
+  /** @return whether the file is usable for io() */
+  ATTRIBUTE_COLD bool prepare_acquired();
+  /** @return whether the file is usable for io() */
+  ATTRIBUTE_COLD bool acquire_and_prepare();
+#endif /*!UNIV_INNOCHECKSUM */
+};
+
+#ifndef UNIV_INNOCHECKSUM
+/** File node of a tablespace or the log data space */
+struct fil_node_t final
+{
+  /** tablespace containing this file */
+  fil_space_t *space;
+  /** file name; protected by fil_system.mutex and exclusive log_sys.latch */
+  char *name;
+  /** file handle */
+  pfs_os_file_t handle;
+  /** whether the file is on non-rotational media (SSD) */
+  unsigned on_ssd:1;
+  /** how to write page_compressed tables
+  (0=do not punch holes but write minimal amount of data, 1=punch holes,
+  2=always write the same amount; thinly provisioned storage will compress) */
+  unsigned punch_hole:2;
+  /** whether this file could use atomic write */
+  unsigned atomic_write:1;
+  /** whether the file actually is a raw device or disk partition */
+  unsigned is_raw_disk:1;
+  /** whether the tablespace discovery is being deferred during crash
+  recovery due to incompletely written page 0 */
+  unsigned deferred:1;
+
+  /** size of the file in database pages (0 if not known yet);
+  the possible last incomplete megabyte may be ignored if space->id == 0 */
+  uint32_t size;
+  /** initial size of the file in database pages;
+  FIL_IBD_FILE_INITIAL_SIZE by default */
+  uint32_t init_size;
+  /** maximum size of the file in database pages (0 if unlimited) */
+  uint32_t max_size;
+  /** whether the file is currently being extended */
+  Atomic_relaxed<bool> being_extended;
+  /** link to other files in this tablespace */
+  UT_LIST_NODE_T(fil_node_t) chain;
+
+  /** Filesystem block size */
+  ulint block_size;
+
+  /** @return whether this file is open */
+  bool is_open() const { return handle != OS_FILE_CLOSED; }
+
+  /** Read the first page of a data file.
+  @return whether the page was found valid */
+  bool read_page0();
+
+  /** Determine some file metadata when creating or reading the file.
+ @param file the file that is being created, or OS_FILE_CLOSED */ + void find_metadata(os_file_t file= OS_FILE_CLOSED +#ifndef _WIN32 + , bool create= false, struct stat *statbuf= nullptr +#endif + ); + + /** Close the file handle. */ + void close(); + /** Same as close() but returns file handle instead of closing it. */ + pfs_os_file_t detach() MY_ATTRIBUTE((warn_unused_result)); + /** Prepare to free a file from fil_system. + @param detach_handle whether to detach instead of closing a handle + @return detached handle or OS_FILE_CLOSED */ + inline pfs_os_file_t close_to_free(bool detach_handle= false); + + /** Update the data structures on write completion */ + inline void complete_write(); + +private: + /** Does stuff common for close() and detach() */ + void prepare_to_close_or_detach(); +}; + +inline bool fil_space_t::use_doublewrite() const +{ + return !UT_LIST_GET_FIRST(chain)->atomic_write && srv_use_doublewrite_buf && + buf_dblwr.is_created(); +} + +inline void fil_space_t::set_imported() +{ + ut_ad(purpose == FIL_TYPE_IMPORT); + purpose= FIL_TYPE_TABLESPACE; + UT_LIST_GET_FIRST(chain)->find_metadata(); +} + +inline bool fil_space_t::is_rotational() const +{ + for (const fil_node_t *node= UT_LIST_GET_FIRST(chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + if (!node->on_ssd) + return true; + return false; +} + +/** Common InnoDB file extensions */ +enum ib_extention { + NO_EXT = 0, + IBD = 1, + ISL = 2, + CFG = 3 +}; +extern const char* dot_ext[]; +#define DOT_IBD dot_ext[IBD] +#define DOT_ISL dot_ext[ISL] +#define DOT_CFG dot_ext[CFG] + +/** When mariadbd is run, the default directory "." is the mysqld datadir, +but in the MariaDB Embedded Server Library and mysqlbackup it is not the default +directory, and we must set the base file path explicitly */ +extern const char* fil_path_to_mysql_datadir; +#else +# include "univ.i" +#endif /* !UNIV_INNOCHECKSUM */ + +/** Initial size of a single-table tablespace in pages */ +#define FIL_IBD_FILE_INITIAL_SIZE 4U + +/** 'null' (undefined) page offset in the context of file spaces */ +#define FIL_NULL ULINT32_UNDEFINED + + +#define FIL_ADDR_PAGE 0U /* first in address is the page offset */ +#define FIL_ADDR_BYTE 4U /* then comes 2-byte byte offset within page*/ +#define FIL_ADDR_SIZE 6U /* address size is 6 bytes */ + +/** File space address */ +struct fil_addr_t { + /** page number within a tablespace */ + uint32_t page; + /** byte offset within the page */ + uint16_t boffset; +}; + +/** The byte offsets on a file page for various variables @{ */ +#define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in < MySQL-4.0.14 space id the + page belongs to (== 0) but in later + versions the 'new' checksum of the + page */ +#define FIL_PAGE_OFFSET 4U /*!< page offset inside space */ +#define FIL_PAGE_PREV 8U /*!< if there is a 'natural' + predecessor of the page, its + offset. Otherwise FIL_NULL. + This field is not set on BLOB + pages, which are stored as a + singly-linked list. See also + FIL_PAGE_NEXT. */ +#define FIL_PAGE_NEXT 12U /*!< if there is a 'natural' successor + of the page, its offset. + Otherwise FIL_NULL. + B-tree index pages + (FIL_PAGE_TYPE contains FIL_PAGE_INDEX) + on the same PAGE_LEVEL are maintained + as a doubly linked list via + FIL_PAGE_PREV and FIL_PAGE_NEXT + in the collation order of the + smallest user record on each page. */ +#define FIL_PAGE_LSN 16U /*!< lsn of the end of the newest + modification log record to the page */ +#define FIL_PAGE_TYPE 24U /*!< file page type: FIL_PAGE_INDEX,..., + 2 bytes. 
+ + The contents of this field can only + be trusted in the following case: + if the page is an uncompressed + B-tree index page, then it is + guaranteed that the value is + FIL_PAGE_INDEX. + The opposite does not hold. + + In tablespaces created by + MySQL/InnoDB 5.1.7 or later, the + contents of this field is valid + for all uncompressed pages. */ + +/** For the first page in a system tablespace data file(ibdata*, not *.ibd): +the file has been flushed to disk at least up to this lsn +For other pages of tablespaces not in innodb_checksum_algorithm=full_crc32 +format: 32-bit key version used to encrypt the page + 32-bit checksum +or 64 bits of zero if no encryption */ +#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U + +/** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */ +#define FIL_RTREE_SPLIT_SEQ_NUM FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + +/** Start of the page_compressed content */ +#define FIL_PAGE_COMP_ALGO FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + +/** starting from 4.1.x this contains the space id of the page */ +#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34U + +#define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID + +#define FIL_PAGE_DATA 38U /*!< start of the data on the page */ + +/** 32-bit key version used to encrypt the page in full_crc32 format. +For non-encrypted page, it contains 0. */ +#define FIL_PAGE_FCRC32_KEY_VERSION 0 + +/** page_compressed without innodb_checksum_algorithm=full_crc32 @{ */ +/** Number of bytes used to store actual payload data size on +page_compressed pages when not using full_crc32. */ +#define FIL_PAGE_COMP_SIZE 0 + +/** Number of bytes for FIL_PAGE_COMP_SIZE */ +#define FIL_PAGE_COMP_METADATA_LEN 2 + +/** Number of bytes used to store actual compression method +for encrypted tables when not using full_crc32. 
*/ +#define FIL_PAGE_ENCRYPT_COMP_ALGO 2 + +/** Extra header size for encrypted page_compressed pages when +not using full_crc32 */ +#define FIL_PAGE_ENCRYPT_COMP_METADATA_LEN 4 +/* @} */ + +/** File page trailer @{ */ +#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used + to store the page checksum, the + last 4 bytes should be identical + to the last 4 bytes of FIL_PAGE_LSN */ +#define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */ + +/** Store the last 4 bytes of FIL_PAGE_LSN */ +#define FIL_PAGE_FCRC32_END_LSN 8 + +/** Store crc32 checksum at the end of the page */ +#define FIL_PAGE_FCRC32_CHECKSUM 4 +/* @} */ + +/** File page types (values of FIL_PAGE_TYPE) @{ */ +/** page_compressed, encrypted=YES (not used for full_crc32) */ +constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED= 37401; +/** page_compressed (not used for full_crc32) */ +constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED= 34354; +/** B-tree index page */ +constexpr uint16_t FIL_PAGE_INDEX= 17855; +/** R-tree index page (SPATIAL INDEX) */ +constexpr uint16_t FIL_PAGE_RTREE= 17854; +/** Undo log page */ +constexpr uint16_t FIL_PAGE_UNDO_LOG= 2; +/** Index node (of file-in-file metadata) */ +constexpr uint16_t FIL_PAGE_INODE= 3; +/** Insert buffer free list */ +constexpr uint16_t FIL_PAGE_IBUF_FREE_LIST= 4; +/** Freshly allocated page */ +constexpr uint16_t FIL_PAGE_TYPE_ALLOCATED= 0; +/** Change buffer bitmap (pages n*innodb_page_size+1) */ +constexpr uint16_t FIL_PAGE_IBUF_BITMAP= 5; +/** System page */ +constexpr uint16_t FIL_PAGE_TYPE_SYS= 6; +/** Transaction system data */ +constexpr uint16_t FIL_PAGE_TYPE_TRX_SYS= 7; +/** Tablespace header (page 0) */ +constexpr uint16_t FIL_PAGE_TYPE_FSP_HDR= 8; +/** Extent descriptor page (pages n*innodb_page_size, except 0) */ +constexpr uint16_t FIL_PAGE_TYPE_XDES= 9; +/** Uncompressed BLOB page */ +constexpr uint16_t FIL_PAGE_TYPE_BLOB= 10; +/** First ROW_FORMAT=COMPRESSED BLOB page */ +constexpr uint16_t FIL_PAGE_TYPE_ZBLOB= 11; +/** Subsequent ROW_FORMAT=COMPRESSED BLOB page */ +constexpr uint16_t FIL_PAGE_TYPE_ZBLOB2= 12; +/** In old tablespaces, garbage in FIL_PAGE_TYPE is replaced with this +value when flushing pages. */ +constexpr uint16_t FIL_PAGE_TYPE_UNKNOWN= 13; + +/* File page types introduced in MySQL 5.7, not supported in MariaDB */ +//constexpr uint16_t FIL_PAGE_COMPRESSED = 14; +//constexpr uint16_t FIL_PAGE_ENCRYPTED = 15; +//constexpr uint16_t FIL_PAGE_COMPRESSED_AND_ENCRYPTED = 16; +//constexpr FIL_PAGE_ENCRYPTED_RTREE = 17; +/** Clustered index root page after instant ADD COLUMN */ +constexpr uint16_t FIL_PAGE_TYPE_INSTANT= 18; + +/** Used by i_s.cc to index into the text description. +Note: FIL_PAGE_TYPE_INSTANT maps to the same as FIL_PAGE_INDEX. */ +constexpr uint16_t FIL_PAGE_TYPE_LAST= FIL_PAGE_TYPE_UNKNOWN; + +/** Set in FIL_PAGE_TYPE for full_crc32 pages in page_compressed format. +If the flag is set, then the following holds for the remaining bits +of FIL_PAGE_TYPE: +Bits 0..7 will contain the compressed page size in bytes. +Bits 8..14 are reserved and must be 0. 
+*/
+constexpr uint16_t FIL_PAGE_COMPRESS_FCRC32_MARKER= 15;
+/* @} */
+
+/** @return whether the page type is B-tree or R-tree index */
+inline bool fil_page_type_is_index(uint16_t page_type)
+{
+	switch (page_type) {
+	case FIL_PAGE_TYPE_INSTANT:
+	case FIL_PAGE_INDEX:
+	case FIL_PAGE_RTREE:
+		return(true);
+	}
+	return(false);
+}
+
+/** Check whether the page is an index page (either a regular B-tree index
+or an R-tree index) */
+#define fil_page_index_page_check(page)			\
+	fil_page_type_is_index(fil_page_get_type(page))
+
+/** Get the file page type.
+@param[in]	page	file page
+@return page type */
+inline uint16_t fil_page_get_type(const byte *page)
+{
+  return mach_read_from_2(my_assume_aligned<2>(page + FIL_PAGE_TYPE));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Number of pending tablespace flushes */
+extern Atomic_counter<ulint> fil_n_pending_tablespace_flushes;
+
+/** Look up a tablespace.
+The caller should hold an InnoDB table lock or a MDL that prevents
+the tablespace from being dropped during the operation,
+or the caller should be in single-threaded crash recovery mode
+(no user connections that could drop tablespaces).
+Normally, fil_space_t::get() should be used instead.
+@param[in]	id	tablespace ID
+@return tablespace, or NULL if not found */
+fil_space_t *fil_space_get(uint32_t id)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** The tablespace memory cache */
+struct fil_system_t
+{
+  /**
+    Constructor.
+
+    Some members may require late initialisation, thus we just mark the
+    object as uninitialised. Real initialisation happens in create().
+  */
+  fil_system_t() : m_initialised(false) {}
+
+  bool is_initialised() const { return m_initialised; }
+
+  /**
+    Create the file system interface at database start.
+
+    @param[in] hash_size	hash table size
+  */
+  void create(ulint hash_size);
+
+  /** Close the file system interface at shutdown */
+  void close();
+
+private:
+  bool m_initialised;
+
+  /** Points to the last opened space in space_list. Protected with
+  fil_system.mutex. */
+  fil_space_t *space_list_last_opened= nullptr;
+
+#ifdef __linux__
+  /** available block devices that reside on non-rotational storage */
+  std::vector<dev_t> ssd;
+public:
+  /** @return whether a file system device is on non-rotational storage */
+  bool is_ssd(dev_t dev) const
+  {
+    /* Linux seems to allow up to 15 partitions per block device.
+    If the detected ssd carries "partition number 0" (it is the whole device),
+    compare the candidate file system number without the partition number. */
+    for (const auto s : ssd)
+      if (dev == s || (dev & ~15U) == s)
+        return true;
+    return false;
+  }
+#endif
+public:
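The `dev & ~15U` comparison in is_ssd() exploits the Linux convention that
the low four bits of a block device number select the partition; clearing
them maps a partition back to its whole disk. A standalone illustration
(hypothetical device numbers):

  #include <cassert>
  #include <sys/types.h>
  #include <sys/sysmacros.h>

  int main()
  {
    const dev_t disk= makedev(8, 0);  // whole disk, e.g. /dev/sda
    const dev_t part= makedev(8, 3);  // a partition, e.g. /dev/sda3
    assert((part & ~15U) == disk);
    return 0;
  }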
+  /** Detach a tablespace from the cache and close the files.
+  @param space         tablespace
+  @param detach_handle whether to detach the handle, instead of closing
+  @return detached handle
+  @retval OS_FILE_CLOSED if no handle was detached */
+  pfs_os_file_t detach(fil_space_t *space, bool detach_handle= false);
+
+  /** the mutex protecting most data fields, and some fields of fil_space_t */
+  mysql_mutex_t mutex;
+  fil_space_t*	sys_space;	/*!< The innodb_system tablespace */
+  fil_space_t*	temp_space;	/*!< The innodb_temporary tablespace */
+  /** Map of fil_space_t::id to fil_space_t* */
+  hash_table_t spaces;
+  /** tablespaces for which fil_space_t::needs_flush() holds */
+  sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
+  /** number of currently open files; protected by mutex */
+  ulint n_open;
+  /** last time we noted n_open exceeding the limit; protected by mutex */
+  time_t n_open_exceeded_time;
+  /** maximum persistent tablespace id that has ever been assigned */
+  uint32_t max_assigned_id;
+  /** nonzero if fil_node_open_file_low() should avoid moving the tablespace
+  to the end of space_list, for the FIFO policy of try_to_close() */
+  ulint freeze_space_list;
+  /** List of all file spaces; opened spaces should be at the top of the list
+  to optimize try_to_close() execution. Protected with fil_system.mutex. */
+  ilist<fil_space_t, space_list_tag_t> space_list;
+  /** list of all tablespaces for which a FILE_MODIFY record has been written
+  since the latest redo log checkpoint.
+  Protected only by exclusive log_sys.latch. */
+  ilist<fil_space_t, named_spaces_tag_t> named_spaces;
+
+  /** list of all ENCRYPTED=DEFAULT tablespaces that need
+  to be converted to the current value of innodb_encrypt_tables */
+  ilist<fil_space_t, default_encrypt_tag_t> default_encrypt_tables;
+
+  /** whether fil_space_t::create() has issued a warning about
+  potential space_id reuse */
+  bool space_id_reuse_warned;
+
+  /** Add the file to the end of the opened spaces list in
+  fil_system.space_list, so that fil_space_t::try_to_close() will close
+  it only as a last resort.
+  @param space	space to add */
+  void add_opened_last_to_space_list(fil_space_t *space);
+
+  /** Move the file to the end of the opened spaces list in
+  fil_system.space_list, so that fil_space_t::try_to_close() will close
+  it only as a last resort.
+  @param space	space to move */
+  inline void move_opened_last_to_space_list(fil_space_t *space)
+  {
+    /* In the case when several files of the same space are added in a
+    row, there is no need to remove and add a space to the same position
+    in space_list. This can happen for system or temporary tablespaces. */
+    if (freeze_space_list || space_list_last_opened == space)
+      return;
+
+    space_list.erase(space_list_t::iterator(space));
+    add_opened_last_to_space_list(space);
+  }
+
+  /** Move a closed file last in fil_system.space_list, so that
+  fil_space_t::try_to_close() iterates opened files first in FIFO order,
+  i.e. first opened, first closed.
+  @param space	space to move */
+  void move_closed_last_to_space_list(fil_space_t *space)
+  {
+    if (UNIV_UNLIKELY(freeze_space_list))
+      return;
+
+    space_list_t::iterator s= space_list_t::iterator(space);
+
+    if (space_list_last_opened == space)
+    {
+      ut_ad(s != space_list.begin());
+      space_list_t::iterator prev= s;
+      space_list_last_opened= &*--prev;
+    }
+
+    space_list.erase(s);
+    space_list.push_back(*space);
+  }
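A toy sketch of the ordering policy described above, with std::list standing
in for the intrusive ilist: spaces whose files are opened move to the back,
so a scan from the front visits first-opened files first (FIFO) when a file
has to be closed.

  #include <cassert>
  #include <list>

  int main()
  {
    std::list<int> spaces{1, 2, 3};          // 1 was opened first
    spaces.splice(spaces.end(), spaces,
                  spaces.begin());           // space 1 was re-opened
    assert(spaces.front() == 2);             // 2 is now the first to close
    assert(spaces.back() == 1);
    return 0;
  }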
+  /** Return the next tablespace from the default_encrypt_tables list.
+  @param space   previous tablespace (nullptr to start from the start)
+  @param recheck whether the removal condition needs to be rechecked after
+                 the encryption parameters were changed
+  @param encrypt expected state of innodb_encrypt_tables
+  @return the next tablespace to process (n_pending_ops incremented)
+  @retval fil_system.temp_space if there is no work to do
+  @retval nullptr upon reaching the end of the iteration */
+  inline fil_space_t* default_encrypt_next(fil_space_t *space, bool recheck,
+                                           bool encrypt);
+
+  /** Extend all open data files to the recovered size */
+  ATTRIBUTE_COLD void extend_to_recv_size();
+
+  /** Determine if a tablespace associated with a file name exists.
+  @param path	tablespace file name to look for
+  @return a matching tablespace */
+  inline fil_space_t *find(const char *path) const;
+};
+
+/** The tablespace memory cache. */
+extern fil_system_t fil_system;
+
+inline void fil_space_t::reacquire()
+{
+  ut_d(uint32_t n=) n_pending.fetch_add(1, std::memory_order_relaxed);
+#ifdef SAFE_MUTEX
+  if (mysql_mutex_is_owner(&fil_system.mutex)) return;
+  ut_ad(n & PENDING);
+  ut_ad(UT_LIST_GET_FIRST(chain)->is_open());
+#endif /* SAFE_MUTEX */
+}
+
+/** Note that operations on the tablespace must stop. */
+inline void fil_space_t::set_stopping()
+{
+  mysql_mutex_assert_owner(&fil_system.mutex);
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+  static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+  __asm__ __volatile__("lock btsl $30, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+  static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+  _interlockedbittestandset(reinterpret_cast<volatile long*>(&n_pending), 30);
+#else
+  n_pending.fetch_or(STOPPING_WRITES, std::memory_order_relaxed);
+#endif
+}
+
+inline void fil_space_t::clear_stopping()
+{
+  mysql_mutex_assert_owner(&fil_system.mutex);
+  static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+  ut_d(auto n=) n_pending.fetch_sub(STOPPING_WRITES, std::memory_order_relaxed);
+  ut_ad((n & STOPPING) == STOPPING_WRITES);
+}
+
+/** Flush pending writes from the file system cache to the file. */
+template<bool have_reference> inline void fil_space_t::flush()
+{
+  mysql_mutex_assert_not_owner(&fil_system.mutex);
+  ut_ad(!have_reference || (pending() & PENDING));
+  ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT);
+  if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
+  {
+    ut_ad(!is_in_unflushed_spaces);
+    ut_ad(!needs_flush());
+  }
+  else if (have_reference)
+    flush_low();
+  else
+  {
+    if (!(acquire_low(STOPPING | CLOSING) & (STOPPING | CLOSING)))
+    {
+      flush_low();
+      release();
+    }
+  }
+}
+
+/** @return the size in pages (0 if unreadable) */
+inline uint32_t fil_space_t::get_size()
+{
+  if (!size)
+  {
+    mysql_mutex_lock(&fil_system.mutex);
+    read_page0();
+    mysql_mutex_unlock(&fil_system.mutex);
+  }
+  return size;
+}
+
+#include "fil0crypt.h"
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion id's is not enough, we may need
+to recycle id's.
+@return true if assigned, false if not */
+bool fil_assign_new_space_id(uint32_t *space_id);
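A hedged sketch of the two instantiations of fil_space_t::flush() defined
above (space and other_space are hypothetical, already-looked-up
tablespaces): with have_reference=true the caller asserts that it already
holds a reference, while flush<false>() acquires and releases one
internally, silently skipping tablespaces that are stopping or closing.

  // Caller already holds a reference from fil_space_t::get():
  space->flush<true>();
  space->release();

  // No reference held; flush<false>() pins the tablespace itself:
  other_space->flush<false>();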
+/** Free a tablespace object from the tablespace memory cache.
+Closes the files in the chain but does not delete them.
+There must not be any pending i/o's or flushes on the files.
+@param id         tablespace identifier
+@param x_latched  whether the caller holds exclusive fil_space_t::latch
+@return true if success */
+bool fil_space_free(uint32_t id, bool x_latched);
+
+/** Set the recovered size of a tablespace in pages.
+@param id	tablespace ID
+@param size	recovered size in pages
+@param flags	tablespace flags */
+void fil_space_set_recv_size_and_flags(uint32_t id, uint32_t size,
+                                       uint32_t flags);
+
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+void fil_set_max_space_id_if_bigger(uint32_t max_id);
+
+MY_ATTRIBUTE((warn_unused_result))
+/** Delete a tablespace and associated .ibd file.
+@param id    tablespace identifier
+@return detached file handle (to be closed by the caller)
+@return OS_FILE_CLOSED if no file existed */
+pfs_os_file_t fil_delete_tablespace(uint32_t id);
+
+/** Close a single-table tablespace on failed IMPORT TABLESPACE.
+The tablespace must be cached in the memory cache.
+Free all pages used by the tablespace. */
+void fil_close_tablespace(uint32_t id);
+
+/*******************************************************************//**
+Allocates and builds a file name from a path, a table or tablespace name
+and a suffix. The string must be freed by the caller with ut_free().
+@param[in] path NULL, or the directory path, or the full path and filename
+@param[in] name {} if path is full, or the table/tablespace name
+@param[in] ext the file extension to use
+@param[in] trim_name true if the last name on the path should be trimmed
+@return own: file name */
+char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
+                        ib_extention ext, bool trim_name);
+
+char *fil_make_filepath(const char* path, const table_name_t name,
+                        ib_extention suffix, bool strip_name);
+
+/** Create a tablespace file.
+@param[in]	space_id	Tablespace ID
+@param[in]	name		Tablespace name in dbname/tablename format.
+@param[in]	path		Path and filename of the datafile to create.
+@param[in]	flags		Tablespace flags
+@param[in]	size		Initial size of the tablespace file in pages,
+must be >= FIL_IBD_FILE_INITIAL_SIZE
+@param[in]	mode		MariaDB encryption mode
+@param[in]	key_id		MariaDB encryption key_id
+@param[out]	err		DB_SUCCESS or error code
+@return	the created tablespace
+@retval	NULL	on error */
+fil_space_t*
+fil_ibd_create(
+	uint32_t	space_id,
+	const table_name_t name,
+	const char*	path,
+	uint32_t	flags,
+	uint32_t	size,
+	fil_encryption_t mode,
+	uint32_t	key_id,
+	dberr_t*	err)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations.
+(Typically when upgrading from MariaDB 10.1.0..10.1.20.)
+@param[in,out]	space	tablespace
+@param[in]	flags	desired tablespace flags */
+void fsp_flags_try_adjust(fil_space_t *space, uint32_t flags);
+
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks that the
+space id in it is correct. If this does not succeed, it prints an error
+message to the .err log. This function is used to open a tablespace when we
+start up mysqld, and also in IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at database startup
+or under the protection of dict_sys.latch, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+ +If the validate boolean is set, we read the first page of the file and +check that the space id in the file is what we expect. We assume that +this function runs much faster if no check is made, since accessing the +file inode probably is much faster (the OS caches them) than accessing +the first page of the file. This boolean may be initially false, but if +a remote tablespace is found it will be changed to true. + +@param[in] validate 0=maybe missing, 1=do not validate, 2=validate +@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY +@param[in] id tablespace ID +@param[in] flags expected FSP_SPACE_FLAGS +@param[in] name table name +If file-per-table, it is the table name in the databasename/tablename format +@param[in] path_in expected filepath, usually read from dictionary +@param[out] err DB_SUCCESS or error code +@return tablespace +@retval NULL if the tablespace could not be opened */ +fil_space_t* +fil_ibd_open( + unsigned validate, + fil_type_t purpose, + uint32_t id, + uint32_t flags, + fil_space_t::name_type name, + const char* path_in, + dberr_t* err = NULL) + MY_ATTRIBUTE((warn_unused_result)); + +enum fil_load_status { + /** The tablespace file(s) were found and valid. */ + FIL_LOAD_OK, + /** The name no longer matches space_id */ + FIL_LOAD_ID_CHANGED, + /** The file(s) were not found */ + FIL_LOAD_NOT_FOUND, + /** The file(s) were not valid */ + FIL_LOAD_INVALID, + /** The tablespace file was deferred to open */ + FIL_LOAD_DEFER +}; + +/** Open a single-file tablespace and add it to the InnoDB data structures. +@param[in] space_id tablespace ID +@param[in] filename path/to/databasename/tablename.ibd +@param[out] space the tablespace, or NULL on error +@return status of the operation */ +enum fil_load_status +fil_ibd_load(uint32_t space_id, const char *filename, fil_space_t *&space) + MY_ATTRIBUTE((warn_unused_result)); + +/** Determine if a matching tablespace exists in the InnoDB tablespace +memory cache. Note that if we have not done a crash recovery at the database +startup, there may be many tablespaces which are not yet in the memory cache. +@param[in] id Tablespace ID +@param[in] table_flags table flags +@return the tablespace +@retval NULL if no matching tablespace exists in the memory cache */ +fil_space_t *fil_space_for_table_exists_in_mem(uint32_t id, + uint32_t table_flags); + +/** Try to extend a tablespace if it is smaller than the specified size. +@param[in,out] space tablespace +@param[in] size desired size in pages +@return whether the tablespace is at least as big as requested */ +bool fil_space_extend(fil_space_t *space, uint32_t size); + +/** Flush to disk the writes in file spaces of the given type +possibly cached by the OS. */ +void fil_flush_file_spaces(); +/******************************************************************//** +Checks the consistency of the tablespace cache. +@return true if ok */ +bool fil_validate(); +/*********************************************************************//** +Sets the file page type. */ +void +fil_page_set_type( +/*==============*/ + byte* page, /*!< in/out: file page */ + ulint type); /*!< in: type */ + +/********************************************************************//** +Delete the tablespace file and any related files like .cfg. +This should not be called for temporary tables. */ +void +fil_delete_file( +/*============*/ + const char* path); /*!< in: filepath of the ibd tablespace */ + +/** Look up a tablespace. 
+@param id tablespace identifier
+@return tablespace
+@retval nullptr if not found */
+fil_space_t *fil_space_get_by_id(uint32_t id);
+
+/** Note that a non-predefined persistent tablespace has been modified
+by redo log.
+@param[in,out]	space	tablespace */
+void
+fil_names_dirty(
+	fil_space_t*	space);
+
+
+bool fil_comp_algo_loaded(ulint comp_algo);
+
+/** On a log checkpoint, reset fil_names_dirty_and_write() flags
+and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT.
+@param lsn	checkpoint LSN
+@return current LSN */
+lsn_t fil_names_clear(lsn_t lsn);
+
+#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+void test_make_filepath();
+#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */
+
+/** Determine the block size of the data file.
+@param[in]	space	tablespace
+@param[in]	offset	page number
+@return block size */
+ulint fil_space_get_block_size(const fil_space_t* space, unsigned offset);
+
+/** Check whether an encryption key was found.
+@param crypt_data	encryption data
+@param f_name		file name
+@return whether the encryption key was found */
+bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name);
+
+#endif /* UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h
new file mode 100644
index 00000000..2927da3c
--- /dev/null
+++ b/storage/innobase/include/fil0pagecompress.h
@@ -0,0 +1,57 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef fil0pagecompress_h
+#define fil0pagecompress_h
+
+#include "fsp0fsp.h"
+
+/******************************************************************//**
+@file include/fil0pagecompress.h
+Helper functions for extracting/storing page compression and
+atomic writes information to table space.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/** Compress a page_compressed page before writing to a data file.
+@param[in]	buf		page to be compressed
+@param[out]	out_buf		compressed page
+@param[in]	flags		tablespace flags
+@param[in]	block_size	file system block size
+@param[in]	encrypted	whether the page will be subsequently encrypted
+@return actual length of compressed page
+@retval 0 if the page was not compressed */
+ulint fil_page_compress(
+	const byte*	buf,
+	byte*		out_buf,
+	uint32_t	flags,
+	ulint		block_size,
+	bool		encrypted)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Decompress a page that may be subject to page_compressed compression.
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf compressed page buffer +@param[in] flags tablespace flags +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +ulint fil_page_decompress(byte *tmp_buf, byte *buf, uint32_t flags) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#endif diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h new file mode 100644 index 00000000..67e79f1a --- /dev/null +++ b/storage/innobase/include/fsp0file.h @@ -0,0 +1,509 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0file.h +Tablespace data file implementation. + +Created 2013-7-26 by Kevin Lewis +*******************************************************/ + +#ifndef fsp0file_h +#define fsp0file_h + +#include "mem0mem.h" +#include "os0file.h" +#include "fil0fil.h" + +/** Types of raw partitions in innodb_data_file_path */ +enum device_t { + SRV_NOT_RAW = 0, /*!< Not a raw partition */ + SRV_NEW_RAW, /*!< A 'newraw' partition, only to be + initialized */ + SRV_OLD_RAW /*!< An initialized raw partition */ +}; + +/** Data file control information. 
*/ +class Datafile { + + friend class Tablespace; + friend class SysTablespace; + +public: + + Datafile() + : + m_filepath(), + m_filename(), + m_handle(), + m_open_flags(OS_FILE_OPEN), + m_size(), + m_order(), + m_type(SRV_NOT_RAW), + m_space_id(UINT32_MAX), + m_flags(), + m_exists(), + m_is_valid(), + m_first_page(), + m_last_os_error(), + m_file_info() + { + /* No op */ + } + + Datafile(uint32_t flags, uint32_t size, ulint order) + : + m_filepath(), + m_filename(), + m_handle(), + m_open_flags(OS_FILE_OPEN), + m_size(size), + m_order(order), + m_type(SRV_NOT_RAW), + m_space_id(UINT32_MAX), + m_flags(flags), + m_exists(), + m_is_valid(), + m_first_page(), + m_last_os_error(), + m_file_info() + { + } + + Datafile(const Datafile& file) + : + m_handle(file.m_handle), + m_open_flags(file.m_open_flags), + m_size(file.m_size), + m_order(file.m_order), + m_type(file.m_type), + m_space_id(file.m_space_id), + m_flags(file.m_flags), + m_exists(file.m_exists), + m_is_valid(file.m_is_valid), + m_first_page(), + m_last_os_error(), + m_file_info() + { + if (file.m_filepath != NULL) { + m_filepath = mem_strdup(file.m_filepath); + ut_a(m_filepath != NULL); + set_filename(); + } else { + m_filepath = NULL; + m_filename = NULL; + } + } + + virtual ~Datafile() + { + shutdown(); + } + + Datafile& operator=(const Datafile& file) + { + ut_a(this != &file); + + m_size = file.m_size; + m_order = file.m_order; + m_type = file.m_type; + + ut_a(m_handle == OS_FILE_CLOSED); + m_handle = file.m_handle; + + m_exists = file.m_exists; + m_is_valid = file.m_is_valid; + m_open_flags = file.m_open_flags; + m_space_id = file.m_space_id; + m_flags = file.m_flags; + m_last_os_error = 0; + + if (m_filepath != NULL) { + ut_free(m_filepath); + m_filepath = NULL; + m_filename = NULL; + } + + if (file.m_filepath != NULL) { + m_filepath = mem_strdup(file.m_filepath); + ut_a(m_filepath != NULL); + set_filename(); + } + + /* Do not make a copy of the first page, + it should be reread if needed */ + m_first_page = NULL; + + return(*this); + } + + /** Initialize the tablespace flags */ + void init(uint32_t flags) { m_flags= flags; } + + /** Release the resources. */ + virtual void shutdown(); + + /** Open a data file in read-only mode to check if it exists + so that it can be validated. + @param[in] strict whether to issue error messages + @return DB_SUCCESS or error code */ + dberr_t open_read_only(bool strict); + + /** Open a data file in read-write mode during start-up so that + doublewrite pages can be restored and then it can be validated. + @return DB_SUCCESS or error code */ + inline dberr_t open_read_write() + MY_ATTRIBUTE((warn_unused_result)); + + /** Initialize OS specific file info. */ + void init_file_info(); + + /** Close a data file. + @return DB_SUCCESS or error code */ + dberr_t close(); + + /** Make a full filepath from a directory path and a filename. + Prepend the dirpath to filename using the extension given. + If dirpath is NULL, prepend the default datadir to filepath. + Store the result in m_filepath. + @param dirpath directory path + @param name tablespace (table) name + @param ext filename extension */ + void make_filepath(const char* dirpath, fil_space_t::name_type name, + ib_extention ext); + + /** Set the filepath by duplicating the filepath sent in */ + void set_filepath(const char* filepath); + + /** Validates the datafile and checks that it conforms with + the expected space ID and flags. The file should exist and be + successfully opened in order for this function to validate it. 
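A minimal model of the validation contract spelled out above, assuming the documented order (open the file read-only first, then validate against the expected ID and flags); the names below are stand-ins for illustration, not the real Datafile API:

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-in: models only "must be open before validating".
struct datafile_model
{
  bool open_ = false;
  bool valid_ = false;              // mirrors Datafile::m_is_valid
  uint32_t space_id_ = 7, flags_ = 0;

  void open_read_only() { open_ = true; }

  bool validate_to_dd(uint32_t id, uint32_t flags)
  {
    assert(open_);                  // file must be opened first
    valid_ = space_id_ == id && flags_ == flags;
    return valid_;
  }

  void close() { open_ = false; }
};

int main()
{
  datafile_model f;
  f.open_read_only();               // open first...
  assert(f.validate_to_dd(7, 0));   // ...then check id/flags from page 0
  f.close();
}
```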
+	@param[in]	space_id	The expected tablespace ID.
+	@param[in]	flags		The expected tablespace flags.
+	@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+	m_is_valid is also set true on success, else false. */
+	dberr_t validate_to_dd(uint32_t space_id, uint32_t flags)
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Validates this datafile for the purpose of recovery.
+	The file should exist and be successfully opened. We initially
+	open it in read-only mode because we just want to read the SpaceID.
+	However, if the first page is corrupt and needs to be restored
+	from the doublewrite buffer, we will reopen it in write mode and
+	try to restore that page.
+	@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+	m_is_valid is also set true on success, else false. */
+	dberr_t validate_for_recovery()
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Checks the consistency of the first page of a datafile when the
+	tablespace is opened. This occurs before the fil_space_t is created
+	so the Space ID found here must not already be in use by an open
+	tablespace.
+	m_is_valid is set true on success, else false.
+	@retval DB_SUCCESS if the datafile is valid
+	@retval DB_CORRUPTION if the datafile is not readable
+	@retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */
+	dberr_t validate_first_page()
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Get Datafile::m_filepath.
+	@return m_filepath */
+	const char*	filepath()	const
+	{
+		return(m_filepath);
+	}
+
+	/** Get Datafile::m_handle.
+	@return m_handle */
+	pfs_os_file_t	handle()	const
+	{
+		return(m_handle);
+	}
+
+	/** @return detached file handle */
+	pfs_os_file_t detach()
+	{
+		pfs_os_file_t detached = m_handle;
+		m_handle = OS_FILE_CLOSED;
+		return detached;
+	}
+
+	/** Get Datafile::m_order.
+	@return m_order */
+	ulint	order()	const
+	{
+		return(m_order);
+	}
+
+	/** Get Datafile::m_space_id.
+	@return m_space_id */
+	uint32_t space_id() const { return m_space_id; }
+
+	/** Get Datafile::m_flags.
+	@return m_flags */
+	uint32_t flags() const { return m_flags; }
+
+	/**
+	@return true if m_handle is open, false if not */
+	bool	is_open()	const { return m_handle != OS_FILE_CLOSED; }
+
+	/** Get Datafile::m_is_valid.
+	@return m_is_valid */
+	bool	is_valid()	const
+	{
+		return(m_is_valid);
+	}
+
+	/** Get the last OS error reported
+	@return m_last_os_error */
+	ulint	last_os_error()		const
+	{
+		return(m_last_os_error);
+	}
+
+	/** Check whether the file is empty.
+	@return true if file is empty */
+	bool	is_empty_file()		const
+	{
+#ifdef _WIN32
+		os_offset_t	offset =
+			(os_offset_t) m_file_info.nFileSizeLow
+			| ((os_offset_t) m_file_info.nFileSizeHigh << 32);
+
+		return (offset == 0);
+#else
+		return (m_file_info.st_size == 0);
+#endif
+	}
+
+	/** Check if the file exists.
+	@return true if file exists. */
+	bool exists() const { return m_exists; }
+
+	/** Test if the filepath provided looks the same as this filepath
+	by string comparison. If they are two different paths to the same
+	file, same_as() will be used to show that after the files are opened.
+	@param[in]	other	filepath to compare with
+	@retval true if it is the same filename by char comparison
+	@retval false if it looks different */
+	bool same_filepath_as(const char* other) const;
+
+	/** Test if another opened datafile is the same file as this object.
+	@param[in]	other	Datafile to compare with
+	@return true if it is the same file, else false */
+	bool same_as(const Datafile&	other) const;
+
+	/** Get access to the first data page.
+ It is valid after open_read_only() succeeded. + @return the first data page */ + const byte* get_first_page() const { return(m_first_page); } + + void set_space_id(uint32_t space_id) { m_space_id= space_id; } + + void set_flags(uint32_t flags) { m_flags = flags; } +private: + /** Free the filepath buffer. */ + void free_filepath(); + + /** Set the filename pointer to the start of the file name + in the filepath. */ + void set_filename() + { + if (!m_filepath) { + return; + } + + if (char *last_slash = strrchr(m_filepath, '/')) { +#if _WIN32 + if (char *last = strrchr(m_filepath, '\\')) { + if (last > last_slash) { + last_slash = last; + } + } +#endif + m_filename = last_slash + 1; + } else { + m_filename = m_filepath; + } + } + + /** Create/open a data file. + @param[in] read_only_mode if true, then readonly mode checks + are enforced. + @return DB_SUCCESS or error code */ + dberr_t open_or_create(bool read_only_mode) + MY_ATTRIBUTE((warn_unused_result)); + + /** Reads a few significant fields from the first page of the + datafile, which must already be open. + @param[in] read_only_mode if true, then readonly mode checks + are enforced. + @return DB_SUCCESS or DB_IO_ERROR if page cannot be read */ + dberr_t read_first_page(bool read_only_mode) + MY_ATTRIBUTE((warn_unused_result)); + + /** Free the first page from memory when it is no longer needed. */ + void free_first_page(); + + /** Set the Datafile::m_open_flags. + @param open_flags The Open flags to set. */ + void set_open_flags(os_file_create_t open_flags) + { + m_open_flags = open_flags; + }; + + /** Determine if this datafile is on a Raw Device + @return true if it is a RAW device. */ + bool is_raw_device() + { + return(m_type != SRV_NOT_RAW); + } + + /* DATA MEMBERS */ + +protected: + /** Physical file path with base name and extension */ + char* m_filepath; + +private: + /** Determine the space id of the given file descriptor by reading + a few pages from the beginning of the .ibd file. + @return DB_SUCCESS if space id was successfully identified, + else DB_ERROR. */ + dberr_t find_space_id(); + + /** Points into m_filepath to the file name with extension */ + char* m_filename; + + /** Open file handle */ + pfs_os_file_t m_handle; + + /** Flags to use for opening the data file */ + os_file_create_t m_open_flags; + + /** size in megabytes or pages; converted from megabytes to + pages in SysTablespace::normalize_size() */ + uint32_t m_size; + + /** ordinal position of this datafile in the tablespace */ + ulint m_order; + + /** The type of the data file */ + device_t m_type; + + /** Tablespace ID. Contained in the datafile header. + If this is a system tablespace, FSP_SPACE_ID is only valid + in the first datafile. */ + uint32_t m_space_id; + + /** Tablespace flags. Contained in the datafile header. + If this is a system tablespace, FSP_SPACE_FLAGS are only valid + in the first datafile. */ + uint32_t m_flags; + + /** true if file already existed on startup */ + bool m_exists; + + /* true if the tablespace is valid */ + bool m_is_valid; + + /** Aligned buffer to hold first page */ + byte* m_first_page; + +protected: + /** Last OS error received so it can be reported if needed. */ + ulint m_last_os_error; + +public: + /** true if table is deferred during recovery */ + bool m_defer=false; + /** Use the following to determine the uniqueness of this datafile. */ +#ifdef _WIN32 + /* Use fields dwVolumeSerialNumber, nFileIndexLow, nFileIndexHigh. */ + BY_HANDLE_FILE_INFORMATION m_file_info; +#else + /* Use field st_ino. 
*/ + struct stat m_file_info; +#endif /* WIN32 */ +}; + + +/** Data file control information. */ +class RemoteDatafile : public Datafile +{ +private: + /** Link filename (full path) */ + char* m_link_filepath; + +public: + + RemoteDatafile() + : + m_link_filepath() + { + /* No op - base constructor is called. */ + } + + RemoteDatafile(const char*, ulint, ulint) + : + m_link_filepath() + { + /* No op - base constructor is called. */ + } + + ~RemoteDatafile() override + { + shutdown(); + } + + /** Release the resources. */ + void shutdown() override; + + /** Get the link filepath. + @return m_link_filepath */ + const char* link_filepath() const + { + return(m_link_filepath); + } + + /** Attempt to read the contents of an .isl file into m_filepath. + @param name table name + @return filepath() + @retval nullptr if the .isl file does not exist or cannot be read */ + const char* open_link_file(const fil_space_t::name_type name); + + /** Delete an InnoDB Symbolic Link (ISL) file. */ + void delete_link_file(void); + + /****************************************************************** + Global Static Functions; Cannot refer to data members. + ******************************************************************/ + + /** Create InnoDB Symbolic Link (ISL) file. + @param name tablespace name + @param filepath full file name + @return DB_SUCCESS or error code */ + static dberr_t create_link_file(fil_space_t::name_type name, + const char *filepath); + + /** Delete an InnoDB Symbolic Link (ISL) file by name. + @param name tablespace name */ + static void delete_link_file(fil_space_t::name_type name); +}; +#endif /* fsp0file_h */ diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h new file mode 100644 index 00000000..26261554 --- /dev/null +++ b/storage/innobase/include/fsp0fsp.h @@ -0,0 +1,762 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0fsp.h +File space management + +Created 12/18/1995 Heikki Tuuri +*******************************************************/ + +#ifndef fsp0fsp_h +#define fsp0fsp_h + +#include "assume_aligned.h" +#include "fsp0types.h" +#include "fut0lst.h" +#include "ut0byte.h" + +#ifndef UNIV_INNOCHECKSUM +#include "mtr0mtr.h" +#include "page0types.h" +#include "rem0types.h" +#else +# include "mach0data.h" +#endif /* !UNIV_INNOCHECKSUM */ + +/** @return the PAGE_SSIZE flags for the current innodb_page_size */ +#define FSP_FLAGS_PAGE_SSIZE() \ + ((srv_page_size == UNIV_PAGE_SIZE_ORIG) ? 
\ + 0U : (srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ + << FSP_FLAGS_POS_PAGE_SSIZE) + +/** @return the PAGE_SSIZE flags for the current innodb_page_size in +full checksum format */ +#define FSP_FLAGS_FCRC32_PAGE_SSIZE() \ + ((srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ + << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) + +/* @defgroup Compatibility macros for MariaDB 10.1.0 through 10.1.20; +see the table in fsp0types.h @{ */ +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101 \ + (FSP_FLAGS_POS_ATOMIC_BLOBS \ + + FSP_FLAGS_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101 \ + (FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101 + 1) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101 \ + (FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101 + 4) +/** Zero relative shift position of the PAGE_SSIZE field */ +#define FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101 \ + (FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101 + 2) + +/** Bit mask of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101 \ + (1U << FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101 \ + (15U << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101) +/** Bit mask of the ATOMIC_WRITES field */ +#define FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101 \ + (3U << FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101) +/** Bit mask of the PAGE_SSIZE field */ +#define FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101 \ + (15U << FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101) + +/** Return the value of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101) +/** Return the value of the PAGE_SSIZE field */ +#define FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101) \ + >> FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101) + +/* @} */ + +/* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */ + +/** Offset of the space header within a file page */ +#define FSP_HEADER_OFFSET FIL_PAGE_DATA + +/* The data structures in files are defined just as byte strings in C */ +typedef byte xdes_t; + +/* SPACE HEADER + ============ + +File space header data structure: this data structure is contained in the +first page of a space. The space for this header is reserved in every extent +descriptor page, but used only in the first. 
*/ + +/*-------------------------------------*/ +#define FSP_SPACE_ID 0 /* space id */ +#define FSP_NOT_USED 4 /* this field contained a value up to + which we know that the modifications + in the database have been flushed to + the file space; not used now */ +#define FSP_SIZE 8 /* Current size of the space in + pages */ +#define FSP_FREE_LIMIT 12 /* Minimum page number for which the + free list has not been initialized: + the pages >= this limit are, by + definition, free; note that in a + single-table tablespace where size + < 64 pages, this number is 64, i.e., + we have initialized the space + about the first extent, but have not + physically allocated those pages to the + file */ +#define FSP_SPACE_FLAGS 16 /* fsp_space_t.flags, similar to + dict_table_t::flags */ +#define FSP_FRAG_N_USED 20 /* number of used pages in the + FSP_FREE_FRAG list */ +#define FSP_FREE 24 /* list of free extents */ +#define FSP_FREE_FRAG (24 + FLST_BASE_NODE_SIZE) + /* list of partially free extents not + belonging to any segment */ +#define FSP_FULL_FRAG (24 + 2 * FLST_BASE_NODE_SIZE) + /* list of full extents not belonging + to any segment */ +#define FSP_SEG_ID (24 + 3 * FLST_BASE_NODE_SIZE) + /* 8 bytes which give the first unused + segment id */ +#define FSP_SEG_INODES_FULL (32 + 3 * FLST_BASE_NODE_SIZE) + /* list of pages containing segment + headers, where all the segment inode + slots are reserved */ +#define FSP_SEG_INODES_FREE (32 + 4 * FLST_BASE_NODE_SIZE) + /* list of pages containing segment + headers, where not all the segment + header slots are reserved */ +/*-------------------------------------*/ +/* File space header size */ +#define FSP_HEADER_SIZE (32 + 5 * FLST_BASE_NODE_SIZE) + +#define FSP_FREE_ADD 4 /* this many free extents are added + to the free list from above + FSP_FREE_LIMIT at a time */ +/* @} */ + +/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */ + +/* FILE SEGMENT INODE + ================== + +Segment inode which is created for each segment in a tablespace. NOTE: in +purge we assume that a segment having only one currently used page can be +freed in a few steps, so that the freeing cannot fill the file buffer with +bufferfixed file pages. 
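The offset arithmetic above can be sanity-checked at compile time. The sketch below assumes FLST_BASE_NODE_SIZE == 16 (the fut0lst.h value: a 4-byte list length plus two 6-byte file addresses); the remaining constants simply restate the #defines:

```cpp
#include <cstddef>

// Assumption of this sketch; see fut0lst.h for the real definition.
constexpr std::size_t FLST_BASE_NODE_SIZE = 16;

constexpr std::size_t FSP_FREE            = 24;
constexpr std::size_t FSP_FREE_FRAG       = 24 + FLST_BASE_NODE_SIZE;      // 40
constexpr std::size_t FSP_FULL_FRAG       = 24 + 2 * FLST_BASE_NODE_SIZE;  // 56
constexpr std::size_t FSP_SEG_ID          = 24 + 3 * FLST_BASE_NODE_SIZE;  // 72
constexpr std::size_t FSP_SEG_INODES_FULL = 32 + 3 * FLST_BASE_NODE_SIZE;  // 80
constexpr std::size_t FSP_SEG_INODES_FREE = 32 + 4 * FLST_BASE_NODE_SIZE;  // 96
constexpr std::size_t FSP_HEADER_SIZE     = 32 + 5 * FLST_BASE_NODE_SIZE;  // 112

// FSP_SEG_ID is an 8-byte counter, so the next field starts 8 bytes later.
static_assert(FSP_SEG_INODES_FULL == FSP_SEG_ID + 8, "layout");
// Each list base node consumes FLST_BASE_NODE_SIZE bytes.
static_assert(FSP_FREE_FRAG == FSP_FREE + FLST_BASE_NODE_SIZE, "layout");
static_assert(FSP_HEADER_SIZE == FSP_SEG_INODES_FREE + FLST_BASE_NODE_SIZE,
              "layout");

int main() {}
```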
*/ + +typedef byte fseg_inode_t; + +#define FSEG_INODE_PAGE_NODE FSEG_PAGE_DATA + /* the list node for linking + segment inode pages */ + +#define FSEG_ARR_OFFSET (FSEG_PAGE_DATA + FLST_NODE_SIZE) +/*-------------------------------------*/ +#define FSEG_ID 0 /* 8 bytes of segment id: if this is 0, + it means that the header is unused */ +#define FSEG_NOT_FULL_N_USED 8 + /* number of used segment pages in + the FSEG_NOT_FULL list */ +#define FSEG_FREE 12 + /* list of free extents of this + segment */ +#define FSEG_NOT_FULL (12 + FLST_BASE_NODE_SIZE) + /* list of partially free extents */ +#define FSEG_FULL (12 + 2 * FLST_BASE_NODE_SIZE) + /* list of full extents */ +#define FSEG_MAGIC_N (12 + 3 * FLST_BASE_NODE_SIZE) + /* magic number used in debugging */ +#define FSEG_FRAG_ARR (16 + 3 * FLST_BASE_NODE_SIZE) + /* array of individual pages + belonging to this segment in fsp + fragment extent lists */ +#define FSEG_FRAG_ARR_N_SLOTS (FSP_EXTENT_SIZE / 2) + /* number of slots in the array for + the fragment pages */ +#define FSEG_FRAG_SLOT_SIZE 4 /* a fragment page slot contains its + page number within space, FIL_NULL + means that the slot is not in use */ +/*-------------------------------------*/ +#define FSEG_INODE_SIZE \ + (16 + 3 * FLST_BASE_NODE_SIZE \ + + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE) + +static constexpr byte FSEG_MAGIC_N_BYTES[4]={0x05,0xd6,0x69,0xd2}; + +#define FSEG_FILLFACTOR 8 /* If the number of unused but reserved + pages in a segment is less than + reserved pages / FSEG_FILLFACTOR, + and there are + at least FSEG_FRAG_LIMIT used pages, + then we allow a new empty extent to + be added to the segment in + fseg_alloc_free_page_general(). + Otherwise, we + use unused pages of the segment. */ + +#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS + /* If the segment has >= this many + used pages, it may be expanded by + allocating extents to the segment; + until that only individual fragment + pages are allocated from the space */ + +#define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment + is at least this many extents, we + allow extents to be put to the free + list of the extent: at most + FSEG_FREE_LIST_MAX_LEN many */ +#define FSEG_FREE_LIST_MAX_LEN 4 +/* @} */ + +/* @defgroup Extent Descriptor Constants (moved from fsp0fsp.c) @{ */ + +/* EXTENT DESCRIPTOR + ================= + +File extent descriptor data structure: contains bits to tell which pages in +the extent are free and which contain old tuple version to clean. */ + +/*-------------------------------------*/ +#define XDES_ID 0 /* The identifier of the segment + to which this extent belongs */ +#define XDES_FLST_NODE 8 /* The list node data structure + for the descriptors */ +#define XDES_STATE (FLST_NODE_SIZE + 8) + /* contains state information + of the extent */ +#define XDES_BITMAP (FLST_NODE_SIZE + 12) + /* Descriptor bitmap of the pages + in the extent */ +/*-------------------------------------*/ + +#define XDES_BITS_PER_PAGE 2 /* How many bits are there per page */ +#define XDES_FREE_BIT 0 /* Index of the bit which tells if + the page is free */ +#define XDES_CLEAN_BIT 1 /* NOTE: currently not used! 
+ Index of the bit which tells if + there are old versions of tuples + on the page */ +/* States of a descriptor */ +#define XDES_FREE 1 /* extent is in free list of space */ +#define XDES_FREE_FRAG 2 /* extent is in free fragment list of + space */ +#define XDES_FULL_FRAG 3 /* extent is in full fragment list of + space */ +#define XDES_FSEG 4 /* extent belongs to a segment */ + +/** File extent data structure size in bytes. */ +#define XDES_SIZE \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE)) + +/** File extent data structure size in bytes for MAX page size. */ +#define XDES_SIZE_MAX \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MAX * XDES_BITS_PER_PAGE)) + +/** File extent data structure size in bytes for MIN page size. */ +#define XDES_SIZE_MIN \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MIN * XDES_BITS_PER_PAGE)) + +/** Offset of the descriptor array on a descriptor page */ +#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE) + +/** +Determine if a page is marked free. +@param[in] descr extent descriptor +@param[in] offset page offset within extent +@return whether the page is free */ +inline bool xdes_is_free(const xdes_t *descr, ulint offset) +{ + ut_ad(offset < FSP_EXTENT_SIZE); + ulint index= XDES_FREE_BIT + XDES_BITS_PER_PAGE * offset; + return ut_bit_get_nth(descr[XDES_BITMAP + (index >> 3)], index & 7); +} + +#ifndef UNIV_INNOCHECKSUM +/* @} */ + +/** Read a tablespace header field. +@param[in] page first page of a tablespace +@param[in] field the header field +@return the contents of the header field */ +inline uint32_t fsp_header_get_field(const page_t* page, ulint field) +{ + return mach_read_from_4(FSP_HEADER_OFFSET + field + + my_assume_aligned(page)); +} + +/** Read the flags from the tablespace header page. +@param[in] page first page of a tablespace +@return the contents of FSP_SPACE_FLAGS */ +inline uint32_t fsp_header_get_flags(const page_t *page) +{ + return fsp_header_get_field(page, FSP_SPACE_FLAGS); +} + +/** Get the byte offset of encryption information in page 0. +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return byte offset relative to FSP_HEADER_OFFSET */ +inline MY_ATTRIBUTE((pure, warn_unused_result)) +ulint fsp_header_get_encryption_offset(ulint zip_size) +{ + return zip_size + ? XDES_ARR_OFFSET + XDES_SIZE * zip_size / FSP_EXTENT_SIZE + : XDES_ARR_OFFSET + (XDES_SIZE << srv_page_size_shift) + / FSP_EXTENT_SIZE; +} + +/** Check the encryption key from the first page of a tablespace. +@param[in] fsp_flags tablespace flags +@param[in] page first page of a tablespace +@return true if success */ +bool +fsp_header_check_encryption_key( + ulint fsp_flags, + page_t* page); + +/** Initialize a tablespace header. +@param[in,out] space tablespace +@param[in] size current size in blocks +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Create a new segment. 
+@param space tablespace +@param byte_offset byte offset of the created segment header +@param mtr mini-transaction +@param err error code +@param has_done_reservation whether fsp_reserve_free_extents() was invoked +@param block block where segment header is placed, + or NULL to allocate an additional page for that +@return the block where the segment header is placed, x-latched +@retval nullptr if could not create segment */ +buf_block_t* +fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err, + bool has_done_reservation= false, buf_block_t *block= nullptr) + MY_ATTRIBUTE((nonnull(1,3,4), warn_unused_result)); + +/** Calculate the number of pages reserved by a segment, +and how many pages are currently used. +@param[in] block buffer block containing the file segment header +@param[in] header file segment header +@param[out] used number of pages that are used (not more than reserved) +@param[in,out] mtr mini-transaction +@return number of reserved pages */ +ulint fseg_n_reserved_pages(const buf_block_t &block, + const fseg_header_t *header, ulint *used, + mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. +@retval NULL if no page could be allocated */ +buf_block_t* +fseg_alloc_free_page_general( +/*=========================*/ + fseg_header_t* seg_header,/*!< in/out: segment header */ + uint32_t hint, /*!< in: hint of which page would be + desirable */ + byte direction,/*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + bool has_done_reservation, /*!< in: true if the caller has + already done the reservation for the page + with fsp_reserve_free_extents, then there + is no need to do the check for this individual + page */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr,/*!< in/out: mtr or another mini-transaction + in which the page should be initialized. */ + dberr_t* err) /*!< out: error code */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); + +/** Reserves free pages from a tablespace. All mini-transactions which may +use several pages from the tablespace should call this function beforehand +and reserve enough free extents so that they certainly will be able +to do their operation, like a B-tree page split, fully. Reservations +must be released with function fil_space_t::release_free_extents()! + +The alloc_type below has the following meaning: FSP_NORMAL means an +operation which will probably result in more space usage, like an +insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are +deleting rows, then this allocation will in the long run result in +less space usage (after a purge); FSP_CLEANING means allocation done +in a physical record delete (like in a purge) or other cleaning operation +which will result in less space usage in the long run. We prefer the latter +two types of allocation: when space is scarce, FSP_NORMAL allocations +will not succeed, but the latter two allocations will succeed, if possible. +The purpose is to avoid dead end where the database is full but the +user cannot free any space because these freeing operations temporarily +reserve some space. + +Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special +case. 
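The reserve-then-release discipline that this comment prescribes can be sketched as follows; every type and function here is a simplified stand-in for the fil_space_t / fsp_reserve_free_extents() machinery, not InnoDB code:

```cpp
#include <cassert>
#include <cstdint>

// Stand-in modelling only the reserve/release contract.
struct space_stub {
  uint32_t free_extents = 8;
  uint32_t reserved = 0;

  bool reserve_free_extents(uint32_t n_ext, uint32_t *n_reserved) {
    if (free_extents < n_ext) return false;  // DB_OUT_OF_FILE_SPACE
    *n_reserved = n_ext;                     // may be 0 for tiny tablespaces
    reserved += n_ext;
    return true;
  }
  void release_free_extents(uint32_t n) {
    assert(n <= reserved);
    reserved -= n;
  }
};

// A B-tree page split reserves extents up front so it cannot run out of
// space halfway, and always releases the reservation when done.
bool split_page_sketch(space_stub &space) {
  uint32_t n_reserved;
  if (!space.reserve_free_extents(2, &n_reserved))
    return false;
  bool ok = true;                  // ... perform the multi-page split ...
  space.release_free_extents(n_reserved);
  return ok;
}

int main() {
  space_stub s;
  assert(split_page_sketch(s) && s.reserved == 0);
}
```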
In this function we would liberally reserve several extents for
+every page split or merge in a B-tree. But we do not want to waste disk space
+if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply
+different rules in that special case, just ensuring that there are n_pages
+free pages available.
+
+@param[out]	n_reserved	number of extents actually reserved; if we
+				return DB_SUCCESS and the tablespace size is <
+				FSP_EXTENT_SIZE pages, then this can be 0,
+				otherwise it is n_ext
+@param[in,out]	space		tablespace
+@param[in]	n_ext		number of extents to reserve
+@param[in]	alloc_type	page reservation type (FSP_BLOB, etc)
+@param[in,out]	mtr		the mini-transaction
+@param[in]	n_pages		for small tablespaces (tablespace size is
+				less than FSP_EXTENT_SIZE), number of free
+				pages to reserve.
+@return error code
+@retval DB_SUCCESS if we were able to make the reservation */
+dberr_t
+fsp_reserve_free_extents(
+	uint32_t*	n_reserved,
+	fil_space_t*	space,
+	uint32_t	n_ext,
+	fsp_reserve_t	alloc_type,
+	mtr_t*		mtr,
+	uint32_t	n_pages = 2);
+
+/** Free a page in a file segment.
+@param[in,out]	seg_header	file segment header
+@param[in,out]	space		tablespace
+@param[in]	offset		page number
+@param[in,out]	mtr		mini-transaction
+@param[in]	have_latch	whether space->x_lock() was already called
+@return error code */
+dberr_t
+fseg_free_page(
+	fseg_header_t*	seg_header,
+	fil_space_t*	space,
+	uint32_t	offset,
+	mtr_t*		mtr,
+	bool		have_latch = false)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Determine whether a page is allocated.
+@param space	tablespace
+@param page	page number
+@return error code
+@retval DB_SUCCESS		if the page is marked as free
+@retval DB_SUCCESS_LOCKED_REC	if the page is marked as allocated */
+dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Frees part of a segment. This function can be used to free
+a segment by repeatedly calling this function in different
+mini-transactions. Doing the freeing in a single mini-transaction
+might result in too big a mini-transaction.
+@param header	segment header; NOTE: if the header resides on the first
+		page of the frag list of the segment, this pointer
+		becomes obsolete after the last freeing step
+@param mtr	mini-transaction
+@param ahi	drop the adaptive hash index
+@return whether the freeing was completed */
+bool
+fseg_free_step(
+	fseg_header_t*	header,
+	mtr_t*		mtr
+#ifdef BTR_CUR_HASH_ADAPT
+	,bool		ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+	)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Frees part of a segment. Differs from fseg_free_step because
+this function leaves the header page unfreed.
+@param header	segment header which must reside on the first
+		fragment page of the segment
+@param mtr	mini-transaction
+@param ahi	drop the adaptive hash index
+@return whether the freeing was completed, except for the header page */
+bool
+fseg_free_step_not_header(
+	fseg_header_t*	header,
+	mtr_t*		mtr
+#ifdef BTR_CUR_HASH_ADAPT
+	,bool		ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+	)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Reset the page type.
+Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
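The piecewise freeing protocol of fseg_free_step() above implies a driver loop that commits one small mini-transaction per step. A stub-level sketch of that loop (mtr_stub and segment_stub are invented stand-ins, not the real mtr_t API):

```cpp
#include <cassert>

// Trivial "mini-transaction" and a segment needing N freeing steps;
// they model only the repeat-until-done contract.
struct mtr_stub { void start() {} void commit() {} };

struct segment_stub {
  int steps_left = 5;
  // Mirrors fseg_free_step(): frees one piece, returns true when complete.
  bool free_step(mtr_stub &) {
    if (steps_left > 0) --steps_left;
    return steps_left == 0;
  }
};

int main() {
  segment_stub seg;
  bool done = false;
  while (!done) {
    mtr_stub mtr;
    mtr.start();
    done = seg.free_step(mtr);  // free a little...
    mtr.commit();               // ...and commit, keeping each mtr small
  }
  assert(seg.steps_left == 0);
}
```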
+@param[in]	block	block with invalid FIL_PAGE_TYPE
+@param[in]	type	expected page type
+@param[in,out]	mtr	mini-transaction */
+ATTRIBUTE_COLD
+void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr);
+
+/** Check (and if needed, reset) the page type.
+Data files created before MySQL 5.1.48 may contain
+garbage in the FIL_PAGE_TYPE field.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
+@param[in]	block	block with possibly invalid FIL_PAGE_TYPE
+@param[in]	type	expected page type
+@param[in,out]	mtr	mini-transaction */
+inline void
+fil_block_check_type(
+	const buf_block_t&	block,
+	ulint			type,
+	mtr_t*			mtr)
+{
+	if (UNIV_UNLIKELY(type != fil_page_get_type(block.page.frame)))
+		fil_block_reset_type(block, type, mtr);
+}
+
+/** Checks if a page address is an extent descriptor page address.
+@param[in]	page_id		page id
+@param[in]	physical_size	page size
+@return whether a descriptor page */
+inline bool fsp_descr_page(const page_id_t page_id, ulint physical_size)
+{
+	return (page_id.page_no() & (physical_size - 1)) == FSP_XDES_OFFSET;
+}
+
+/** Initialize a file page whose prior contents should be ignored.
+@param[in,out]	block	buffer pool block */
+void fsp_apply_init_file_page(buf_block_t *block);
+
+/** Initialize a file page.
+@param[in]	space	tablespace
+@param[in,out]	block	file page
+@param[in,out]	mtr	mini-transaction */
+inline void fsp_init_file_page(
+#ifdef UNIV_DEBUG
+	const fil_space_t* space,
+#endif
+	buf_block_t* block, mtr_t* mtr)
+{
+	ut_d(space->modify_check(*mtr));
+	ut_ad(space->id == block->page.id().space());
+	fsp_apply_init_file_page(block);
+	mtr->init(block);
+}
+
+#ifndef UNIV_DEBUG
+# define fsp_init_file_page(space, block, mtr) fsp_init_file_page(block, mtr)
+#endif
+
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+void
+fseg_print(
+/*=======*/
+	fseg_header_t*	header,	/*!< in: segment header */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+#endif /* UNIV_BTR_PRINT */
+
+/** Convert FSP_SPACE_FLAGS from the buggy MariaDB 10.1.0..10.1.20 format.
+@param[in]	flags	the contents of FSP_SPACE_FLAGS
+@return	the flags corrected from the buggy MariaDB 10.1 format
+@retval	UINT32_MAX	if the flags are not in the buggy 10.1 format */
+MY_ATTRIBUTE((warn_unused_result, const))
+inline uint32_t fsp_flags_convert_from_101(uint32_t flags)
+{
+	DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return UINT32_MAX;);
+	if (flags == 0 || fil_space_t::full_crc32(flags)) {
+		return(flags);
+	}
+
+	if (flags >> 18) {
+		/* The most significant FSP_SPACE_FLAGS bit that was ever set
+		by MariaDB 10.1.0 to 10.1.20 was bit 17 (misplaced DATA_DIR flag).
+		The flags must be less than 1<<18 in order to be valid. */
+		return UINT32_MAX;
+	}
+
+	if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS))
+	    == FSP_FLAGS_MASK_ATOMIC_BLOBS) {
+		/* If the "atomic blobs" flag (indicating
+		ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED)
+		is set, then the "post Antelope" (ROW_FORMAT!=REDUNDANT) flag
+		must also be set. */
+		return UINT32_MAX;
+	}
+
+	/* Bits 6..10 denote compression in MariaDB 10.1.0 to 10.1.20.
+	They must be either 0b00000 or 0b00011 through 0b10011.
+	In correct versions, these bits would be
+	0bd0sss where d is the DATA_DIR flag (garbage bit) and
+	sss is the PAGE_SSIZE (3, 4, 6, or 7).
+ + NOTE: MariaDB 10.1.0 to 10.1.20 can misinterpret + uncompressed data files with innodb_page_size=4k or 64k as + compressed innodb_page_size=16k files. Below is an exhaustive + state space analysis. + + -0by1zzz: impossible (the bit 4 must be clean; see above) + -0b101xx: DATA_DIR, innodb_page_size>4k: invalid (COMPRESSION_LEVEL>9) + +0bx0011: innodb_page_size=4k: + !!! Misinterpreted as COMPRESSION_LEVEL=9 or 1, COMPRESSION=1. + -0bx0010: impossible, because sss must be 0b011 or 0b1xx + -0bx0001: impossible, because sss must be 0b011 or 0b1xx + -0b10000: DATA_DIR, innodb_page_size=16: + invalid (COMPRESSION_LEVEL=8 but COMPRESSION=0) + +0b00111: no DATA_DIR, innodb_page_size=64k: + !!! Misinterpreted as COMPRESSION_LEVEL=3, COMPRESSION=1. + -0b00101: impossible, because sss must be 0 for 16k, not 0b101 + -0b001x0: no DATA_DIR, innodb_page_size=32k or 8k: + invalid (COMPRESSION_LEVEL=3 but COMPRESSION=0) + +0b00000: innodb_page_size=16k (looks like COMPRESSION=0) + ??? Could actually be compressed; see PAGE_SSIZE below */ + const uint32_t level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101( + flags); + if (FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) != (level != 0) + || level > 9) { + /* The compression flags are not in the buggy MariaDB + 10.1 format. */ + return UINT32_MAX; + } + if (!(~flags & FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101)) { + /* The ATOMIC_WRITES flags cannot be 0b11. + (The bits 11..12 should actually never be 0b11, + because in MySQL they would be SHARED|TEMPORARY.) */ + return UINT32_MAX; + } + + /* Bits 13..16 are the wrong position for PAGE_SSIZE, and they + should contain one of the values 3,4,6,7, that is, be of the form + 0b0011 or 0b01xx (except 0b0101). + In correct versions, these bits should be 0bc0se + where c is the MariaDB COMPRESSED flag + and e is the MySQL 5.7 ENCRYPTION flag + and s is the MySQL 8.0 SDI flag. MariaDB can only support s=0, e=0. + + Compressed innodb_page_size=16k tables with correct FSP_SPACE_FLAGS + will be properly rejected by older MariaDB 10.1.x because they + would read as PAGE_SSIZE>=8 which is not valid. */ + + const uint32_t ssize = FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags); + if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) { + /* the page_size is not between 4k and 64k; + 16k should be encoded as 0, not 5 */ + return UINT32_MAX; + } + const uint32_t zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + if (zssize == 0) { + /* not ROW_FORMAT=COMPRESSED */ + } else if (zssize > (ssize ? ssize : 5)) { + /* invalid KEY_BLOCK_SIZE */ + return UINT32_MAX; + } else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE + | FSP_FLAGS_MASK_ATOMIC_BLOBS)) { + /* both these flags should be set for + ROW_FORMAT=COMPRESSED */ + return UINT32_MAX; + } + + flags = ((flags & 0x3f) | ssize << FSP_FLAGS_POS_PAGE_SSIZE + | FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) + << FSP_FLAGS_POS_PAGE_COMPRESSION); + ut_ad(fil_space_t::is_valid_flags(flags, false)); + return(flags); +} + +/** Compare tablespace flags. +@param[in] expected expected flags from dict_tf_to_fsp_flags() +@param[in] actual flags read from FSP_SPACE_FLAGS +@return whether the flags match */ +MY_ATTRIBUTE((warn_unused_result)) +inline bool fsp_flags_match(uint32_t expected, uint32_t actual) +{ + expected&= ~FSP_FLAGS_MEM_MASK; + ut_ad(fil_space_t::is_valid_flags(expected, false)); + return actual == expected || fsp_flags_convert_from_101(actual) == expected; +} + +/** Determine if FSP_SPACE_FLAGS are from an incompatible MySQL format. 
+@param	flags	the contents of FSP_SPACE_FLAGS
+@return	MySQL flags shifted.
+@retval	0 if not a MySQL-incompatible format. */
+MY_ATTRIBUTE((warn_unused_result, const))
+inline uint32_t fsp_flags_is_incompatible_mysql(uint32_t flags)
+{
+	/*
+	MySQL-8.0 SDI flag (bit 14),
+	or MySQL 5.7 Encryption flag (bit 13)
+	*/
+	return flags >> 13 & 3;
+}
+
+/** Determine the descriptor index within a descriptor page.
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	offset		page offset
+@return descriptor index */
+inline ulint xdes_calc_descriptor_index(ulint zip_size, ulint offset)
+{
+	return ut_2pow_remainder(offset,
+				 zip_size ? zip_size : srv_page_size)
+		/ FSP_EXTENT_SIZE;
+}
+
+/** Determine the descriptor page number for a page.
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	offset		page offset
+@return descriptor page offset */
+inline uint32_t xdes_calc_descriptor_page(ulint zip_size, uint32_t offset)
+{
+	compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET
+			    + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX)
+			    * XDES_SIZE_MAX);
+	compile_time_assert(UNIV_PAGE_SIZE_MIN > XDES_ARR_OFFSET
+			    + (UNIV_PAGE_SIZE_MIN / FSP_EXTENT_SIZE_MIN)
+			    * XDES_SIZE_MIN);
+
+	ut_ad(srv_page_size > XDES_ARR_OFFSET
+	      + (srv_page_size / FSP_EXTENT_SIZE)
+	      * XDES_SIZE);
+	ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET
+	      + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE)
+	      * XDES_SIZE);
+	ut_ad(!zip_size
+	      || zip_size > XDES_ARR_OFFSET
+	      + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE);
+	return ut_2pow_round(offset,
+			     uint32_t(zip_size ? zip_size : srv_page_size));
+}
+
+#endif /* UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/fsp0space.h b/storage/innobase/include/fsp0space.h
new file mode 100644
index 00000000..a2bb46d3
--- /dev/null
+++ b/storage/innobase/include/fsp0space.h
@@ -0,0 +1,209 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0space.h
+Shared tablespace interface
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#ifndef fsp0space_h
+#define fsp0space_h
+
+#include "fsp0file.h"
+#include "fsp0fsp.h"
+#include "fsp0types.h"
+
+#include <vector>
+
+/** Data structure that contains the information about shared tablespaces.
+Currently this can be the system tablespace or a temporary table tablespace */
+class Tablespace {
+
+public:
+	typedef std::vector<Datafile, ut_allocator<Datafile> >	files_t;
+
+	/** Data file information - each Datafile can be accessed globally */
+	files_t		m_files;
+	/** Data file iterator */
+	typedef files_t::iterator iterator;
+	/** Data file iterator */
+	typedef files_t::const_iterator const_iterator;
+
+	Tablespace() {}
+
+	virtual ~Tablespace()
+	{
+		shutdown();
+		ut_ad(m_files.empty());
+		ut_ad(m_space_id == UINT32_MAX);
+	}
+
+	// Disable copying
+	Tablespace(const Tablespace&);
+	Tablespace& operator=(const Tablespace&);
+
+	/** Data file iterator */
+	const_iterator begin() const { return m_files.begin(); }
+	/** Data file iterator */
+	const_iterator end() const { return m_files.end(); }
+	/** Data file iterator */
+	iterator begin() { return m_files.begin(); }
+	/** Data file iterator */
+	iterator end() { return m_files.end(); }
+
+	/** Set tablespace path and filename members.
+	@param[in]	path	where tablespace file(s) resides
+	@param[in]	len	length of the file path */
+	void set_path(const char* path, size_t len)
+	{
+		ut_ad(m_path == NULL);
+		m_path = mem_strdupl(path, len);
+		ut_ad(m_path != NULL);
+	}
+
+	/** Set tablespace path and filename members.
+	@param[in]	path	where tablespace file(s) resides */
+	void set_path(const char* path)
+	{
+		set_path(path, strlen(path));
+	}
+
+	/** Get tablespace path
+	@return tablespace path */
+	const char* path() const
+	{
+		return(m_path);
+	}
+
+	/** Set the space id of the tablespace
+	@param[in]	space_id	tablespace ID to set */
+	void set_space_id(uint32_t space_id)
+	{
+		ut_ad(m_space_id == UINT32_MAX);
+		m_space_id = space_id;
+	}
+
+	/** Get the space id of the tablespace
+	@return m_space_id space id of the tablespace */
+	uint32_t space_id() const { return m_space_id; }
+
+	/** Set the tablespace flags
+	@param[in]	fsp_flags	tablespace flags */
+	void set_flags(uint32_t fsp_flags)
+	{
+		ut_ad(fil_space_t::is_valid_flags(fsp_flags, false));
+		m_flags = fsp_flags;
+	}
+
+	/** Get the tablespace flags
+	@return m_flags tablespace flags */
+	uint32_t flags() const { return m_flags; }
+
+	/** Get the tablespace encryption mode
+	@return m_mode tablespace encryption mode */
+	fil_encryption_t encryption_mode() const { return m_mode; }
+
+	/** Get the tablespace encryption key_id
+	@return m_key_id tablespace encryption key_id */
+	uint32_t key_id() const { return m_key_id; }
+
+	/** Set Ignore Read Only Status for tablespace.
+	@param[in]	read_only_status	read only status indicator */
+	void set_ignore_read_only(bool read_only_status)
+	{
+		m_ignore_read_only = read_only_status;
+	}
+
+	/** Free the memory allocated by the Tablespace object */
+	void shutdown();
+
+	/** @return the sum of the file sizes of each Datafile */
+	uint32_t get_sum_of_sizes() const
+	{
+		uint32_t sum = 0;
+
+		for (const_iterator it = begin(); it != end(); ++it) {
+			sum += it->m_size;
+		}
+
+		return(sum);
+	}
+
+	/** Open or Create the data files if they do not exist.
+	@param[in]	is_temp	whether this is a temporary tablespace
+	@return DB_SUCCESS or error code */
+	dberr_t open_or_create(bool is_temp)
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Delete all the data files. */
+	void delete_files();
+
+	/** Check if two tablespaces have common data file names.
+	@param[in]	other_space	Tablespace to check against this.
+ @return true if they have the same data filenames and paths */ + bool intersection(const Tablespace* other_space); + + /** Use the ADD DATAFILE path to create a Datafile object and add + it to the front of m_files. Parse the datafile path into a path + and a basename with extension 'ibd'. This datafile_path provided + may be an absolute or relative path, but it must end with the + extension .ibd and have a basename of at least 1 byte. + + Set tablespace m_path member and add a Datafile with the filename. + @param[in] datafile_path full path of the tablespace file. */ + dberr_t add_datafile( + const char* datafile_path); + + /* Return a pointer to the first Datafile for this Tablespace + @return pointer to the first Datafile for this Tablespace*/ + Datafile* first_datafile() + { + ut_a(!m_files.empty()); + return(&m_files.front()); + } +private: + /** + @param[in] filename Name to lookup in the data files. + @return true if the filename exists in the data files */ + bool find(const char* filename) const; + + /** Note that the data file was found. + @param[in] file data file object */ + void file_found(Datafile& file); + + /** Tablespace ID */ + uint32_t m_space_id = UINT32_MAX; + /** Tablespace flags */ + uint32_t m_flags = UINT32_MAX; + + /** Path where tablespace files will reside, excluding a filename */ + char* m_path; + + /** Encryption mode and key_id */ + fil_encryption_t m_mode; + uint32_t m_key_id; + +protected: + /** Ignore server read only configuration for this tablespace. */ + bool m_ignore_read_only = false; +}; + +#endif /* fsp0space_h */ diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h new file mode 100644 index 00000000..514f3fdb --- /dev/null +++ b/storage/innobase/include/fsp0sysspace.h @@ -0,0 +1,278 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0sysspace.h +Multi file, shared, system tablespace implementation. + +Created 2013-7-26 by Kevin Lewis +*******************************************************/ + +#ifndef fsp0sysspace_h +#define fsp0sysspace_h + +#include "fsp0space.h" + +/** If the last data file is auto-extended, we add this many pages to it +at a time. We have to make this public because it is a config variable. */ +extern uint sys_tablespace_auto_extend_increment; + +/** Data structure that contains the information about shared tablespaces. 
+Currently this can be the system tablespace or a temporary table tablespace */
+class SysTablespace : public Tablespace
+{
+public:
+
+	SysTablespace()
+		:
+		m_auto_extend_last_file(),
+		m_last_file_size_max(),
+		m_created_new_raw(),
+		m_is_tablespace_full(false),
+		m_sanity_checks_done(false)
+	{
+		/* No op */
+	}
+
+	~SysTablespace() override
+	{
+		shutdown();
+	}
+
+	/** Set tablespace full status
+	@param[in]	is_full	true if full */
+	void set_tablespace_full_status(bool is_full)
+	{
+		m_is_tablespace_full = is_full;
+	}
+
+	/** Get tablespace full status
+	@return true if table is full */
+	bool get_tablespace_full_status()
+	{
+		return(m_is_tablespace_full);
+	}
+
+	/** Set sanity check status
+	@param[in]	status	true if sanity checks are done */
+	void set_sanity_check_status(bool status)
+	{
+		m_sanity_checks_done = status;
+	}
+
+	/** Get sanity check status
+	@return true if sanity checks are done */
+	bool get_sanity_check_status()
+	{
+		return(m_sanity_checks_done);
+	}
+
+	/** Parse the input params and populate member variables.
+	@param	filepath	path to data files
+	@param	supports_raw	true if it supports raw devices
+	@return true on successful parse */
+	bool parse_params(const char* filepath, bool supports_raw);
+
+	/** Check the data file specification.
+	@param[out]	create_new_db		true if a new database
+						is to be created
+	@param[in]	min_expected_tablespace_size	expected tablespace
+						size in bytes
+	@return DB_SUCCESS if all OK else error code */
+	dberr_t check_file_spec(
+		bool*	create_new_db,
+		ulint	min_expected_tablespace_size);
+
+	/** Free the memory allocated by parse() */
+	void shutdown();
+
+	/** Normalize the file size, convert to extents. */
+	void normalize_size();
+
+	/**
+	@return true if a new raw device was created. */
+	bool created_new_raw() const
+	{
+		return(m_created_new_raw);
+	}
+
+	/**
+	@return auto_extend value setting */
+	ulint can_auto_extend_last_file() const
+	{
+		return(m_auto_extend_last_file);
+	}
+
+	/** Set the last file size.
+	@param[in]	size	the size to set */
+	void set_last_file_size(uint32_t size)
+	{
+		ut_ad(!m_files.empty());
+		m_files.back().m_size = size;
+	}
+
+	/** Get the size of the last data file in the tablespace
+	@return the size of the last data file in the array */
+	uint32_t last_file_size() const
+	{
+		ut_ad(!m_files.empty());
+		return(m_files.back().m_size);
+	}
+
+	/**
+	@return the autoextend increment in pages. */
+	uint32_t get_autoextend_increment() const
+	{
+		return sys_tablespace_auto_extend_increment
+			<< (20 - srv_page_size_shift);
+	}
+
+	/**
+	@return next increment size */
+	uint32_t get_increment() const;
+
+	/** Open or create the data files
+	@param[in]	is_temp		whether this is a temporary tablespace
+	@param[in]	create_new_db	whether we are creating a new database
+	@param[out]	sum_new_sizes	sum of sizes of the new files added
+	@return DB_SUCCESS or error code */
+	dberr_t open_or_create(
+		bool	is_temp,
+		bool	create_new_db,
+		ulint*	sum_new_sizes)
+		MY_ATTRIBUTE((warn_unused_result));
+
+private:
+	/** Check the tablespace header for this tablespace.
+	@return DB_SUCCESS or error code */
+	inline dberr_t read_lsn_and_check_flags();
+
+	/**
+	@return true if the last file size is valid. */
+	bool is_valid_size() const
+	{
+		return(m_last_file_size_max >= last_file_size());
+	}
+
+	/**
+	@return true if configured to use raw devices */
+	bool has_raw_device();
+
+	/** Note that the data file was not found.
/** Note that the data file was not found. + @param[in] file data file object + @param[out] create_new_db true if a new instance is to be created + @return DB_SUCCESS or error code */ + dberr_t file_not_found(Datafile& file, bool* create_new_db); + + /** Note that the data file was found. + @param[in,out] file data file object + @return true if a new instance is to be created */ + bool file_found(Datafile& file); + + /** Create a data file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t create(Datafile& file); + + /** Create a data file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t create_file(Datafile& file); + + /** Open a data file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t open_file(Datafile& file); + + /** Set the size of the file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t set_size(Datafile& file); + + /** Convert a numeric string that optionally ends in G or M, to a + number containing megabytes. + @param[in] ptr string with a quantity in bytes + @param[out] megs the number in megabytes + @return next character in string */ + static char* parse_units(char* ptr, ulint* megs); + +private: + enum file_status_t { + FILE_STATUS_VOID = 0, /** status not set */ + FILE_STATUS_RW_PERMISSION_ERROR,/** permission error */ + FILE_STATUS_READ_WRITE_ERROR, /** not readable/writable */ + FILE_STATUS_NOT_REGULAR_FILE_ERROR /** not a regular file */ + }; + + /** Verify the size of the physical file + @param[in] file data file object + @return DB_SUCCESS if OK else error code. */ + dberr_t check_size(Datafile& file); + + /** Check if a file can be opened in the correct mode. + @param[in,out] file data file object + @param[out] reason exact reason if file_status check failed. + @return DB_SUCCESS or error code. */ + dberr_t check_file_status( + const Datafile& file, + file_status_t& reason); + + /* DATA MEMBERS */ + + /** if true, then we auto-extend the last data file */ + bool m_auto_extend_last_file; + + /** maximum size of the last data file (0=unlimited) */ + ulint m_last_file_size_max; + + /** If the following is true we do not allow + inserts etc. This protects the user from forgetting + the 'newraw' keyword to my.cnf */ + bool m_created_new_raw; + + /** Tablespace full status */ + bool m_is_tablespace_full; + + /** if false, then sanity checks are still pending */ + bool m_sanity_checks_done; +}; + +/* GLOBAL OBJECTS */ + +/** The control info of the system tablespace. */ +extern SysTablespace srv_sys_space; + +/** The control info of a temporary table shared tablespace. */ +extern SysTablespace srv_tmp_space; + +/** Check if the space_id is for a system-tablespace (shared + temp). +@param[in] id Space ID to check +@return true if id is a system tablespace, false if not. */ +inline bool is_system_tablespace(uint32_t id) +{ + return id == TRX_SYS_SPACE || id == SRV_TMP_SPACE_ID; +} + +/** Check if predefined shared tablespace. +@return true if predefined shared tablespace */ +inline bool is_predefined_tablespace(uint32_t id) +{ + return is_system_tablespace(id) || srv_is_undo_tablespace(id); +} +#endif /* fsp0sysspace_h */ diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h new file mode 100644 index 00000000..9a23e840 --- /dev/null +++ b/storage/innobase/include/fsp0types.h @@ -0,0 +1,404 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates.
All Rights Reserved. +Copyright (c) 2014, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/****************************************************** +@file include/fsp0types.h +File space management types + +Created May 26, 2009 Vasil Dimov +*******************************************************/ + +#pragma once +#include "ut0byte.h" + +/** All persistent tablespaces have a smaller fil_space_t::id than this. */ +constexpr uint32_t SRV_SPACE_ID_UPPER_BOUND= 0xFFFFFFF0U; +/** The fil_space_t::id of the innodb_temporary tablespace. */ +constexpr uint32_t SRV_TMP_SPACE_ID= 0xFFFFFFFEU; + +/* Possible values of innodb_compression_algorithm */ +#define PAGE_UNCOMPRESSED 0 +#define PAGE_ZLIB_ALGORITHM 1 +#define PAGE_LZ4_ALGORITHM 2 +#define PAGE_LZO_ALGORITHM 3 +#define PAGE_LZMA_ALGORITHM 4 +#define PAGE_BZIP2_ALGORITHM 5 +#define PAGE_SNAPPY_ALGORITHM 6 +#define PAGE_ALGORITHM_LAST PAGE_SNAPPY_ALGORITHM + +extern const char *page_compression_algorithms[]; + +/** @name Flags for inserting records in order +If records are inserted in order, there are the following +flags to tell this (their type is made byte for the compiler +to warn if direction and hint parameters are switched in +fseg_alloc_free_page_general) */ +/* @{ */ +#define FSP_UP ((byte)111) /*!< alphabetically upwards */ +#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */ +#define FSP_NO_DIR ((byte)113) /*!< no order */ +/* @} */ + +/** File space extent size in pages +page size | file space extent size +----------+----------------------- + 4 KiB | 256 pages = 1 MiB + 8 KiB | 128 pages = 1 MiB + 16 KiB | 64 pages = 1 MiB + 32 KiB | 64 pages = 2 MiB + 64 KiB | 64 pages = 4 MiB +*/ +#define FSP_EXTENT_SIZE (srv_page_size_shift < 14 ? \ + (1048576U >> srv_page_size_shift) : 64U) + +/** File space extent size (four megabytes) in pages for MAX page size */ +#define FSP_EXTENT_SIZE_MAX (4194304 / UNIV_PAGE_SIZE_MAX) + +/** File space extent size (one megabyte) in pages for MIN page size */ +#define FSP_EXTENT_SIZE_MIN (1048576 / UNIV_PAGE_SIZE_MIN)
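The conditional in FSP_EXTENT_SIZE restates the table above: below a 16 KiB page size an extent is a fixed 1 MiB worth of pages, while at 16 KiB and larger it is a fixed 64 pages. A standalone restatement of the rule (a hedged sketch; extent_size_in_pages() is illustrative, with page_size_shift being log2 of the page size):

#include <cstdint>

// Mirrors the FSP_EXTENT_SIZE macro above.
static uint32_t extent_size_in_pages(uint32_t page_size_shift)
{
    return page_size_shift < 14                  // page size below 16 KiB?
        ? uint32_t{1048576} >> page_size_shift   // 1 MiB worth of pages
        : 64;                                    // fixed 64 pages
}
// 4 KiB pages (shift 12) -> 256 pages; 16 KiB (14) -> 64; 64 KiB (16) -> 64.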
/** On a page of any file segment, data may be put starting from this +offset */ +#define FSEG_PAGE_DATA FIL_PAGE_DATA + +/** @name File segment header +The file segment header points to the inode describing the file segment. */ +/* @{ */ +/** Data type for file segment header */ +typedef byte fseg_header_t; + +#define FSEG_HDR_SPACE 0 /*!< space id of the inode */ +#define FSEG_HDR_PAGE_NO 4 /*!< page number of the inode */ +#define FSEG_HDR_OFFSET 8 /*!< byte offset of the inode */ + +#define FSEG_HEADER_SIZE 10 /*!< Length of the file segment + header, in bytes */ +/* @} */ + +#ifndef UNIV_INNOCHECKSUM +#ifdef UNIV_DEBUG + +struct mtr_t; + +/** A wrapper class to print the file segment header information. */ +class fseg_header +{ +public: + /** Constructor of fseg_header. + @param[in] header the underlying file segment header object + @param[in] mtr the mini-transaction. No redo logs are + generated, only latches are checked within + mini-transaction */ + fseg_header( + const fseg_header_t* header, + mtr_t* mtr) + : + m_header(header), + m_mtr(mtr) + {} + + /** Print the file segment header to the given output stream. + @param[in,out] out the output stream into which the object + is printed. + @return the output stream into which the object was printed. */ + std::ostream& + to_stream(std::ostream& out) const; +private: + /** The underlying file segment header */ + const fseg_header_t* m_header; + + /** The mini-transaction, which is used mainly to check whether + appropriate latches have been taken by the calling thread. */ + mtr_t* m_mtr; +}; + +/** Overloading the global output operator to print a file segment header +@param[in,out] out the output stream into which object will be printed +@param[in] header the file segment header to be printed +@return the output stream */ +inline +std::ostream& +operator<<( + std::ostream& out, + const fseg_header& header) +{ + return(header.to_stream(out)); +} +#endif /* UNIV_DEBUG */ + +/** Flags for fsp_reserve_free_extents */ +enum fsp_reserve_t { + FSP_NORMAL, /* reservation during normal B-tree operations */ + FSP_UNDO, /* reservation done for undo logging */ + FSP_CLEANING, /* reservation done during purge operations */ + FSP_BLOB /* reservation being done for BLOB insertion */ +}; + +/* Number of pages described in a single descriptor page: currently each page +description takes less than 1 byte; a descriptor page is repeated every +this many file pages */ +/* #define XDES_DESCRIBED_PER_PAGE srv_page_size */ +/* This has been replaced with either srv_page_size or page_zip->size. */ + +/** @name The space low address page map +The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated +every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */ +/* @{ */ +/*--------------------------------------*/ +#define FSP_XDES_OFFSET 0U /* !< extent descriptor */ +#define FSP_IBUF_BITMAP_OFFSET 1U /* !< insert buffer bitmap */ + /* The ibuf bitmap pages are the ones whose + page number is the number above plus a + multiple of XDES_DESCRIBED_PER_PAGE */ + +#define FSP_FIRST_INODE_PAGE_NO 2U /*!< in every tablespace */ + /* The following pages exist + in the system tablespace (space 0). */ +#define FSP_IBUF_HEADER_PAGE_NO 3U /*!< insert buffer + header page, in + tablespace 0 */ +#define FSP_IBUF_TREE_ROOT_PAGE_NO 4U /*!< insert buffer + B-tree root page in + tablespace 0 */ + /* The ibuf tree root page number in + tablespace 0; its fseg inode is on the page + number FSP_FIRST_INODE_PAGE_NO */ +#define FSP_TRX_SYS_PAGE_NO 5U /*!< transaction + system header, in + tablespace 0 */ +#define FSP_FIRST_RSEG_PAGE_NO 6U /*!< first rollback segment + page, in tablespace 0 */ +#define FSP_DICT_HDR_PAGE_NO 7U /*!< data dictionary header + page, in tablespace 0 */ +/*--------------------------------------*/ +/* @} */ +
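Because the pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET repeat every XDES_DESCRIBED_PER_PAGE pages, the role of a low-address page follows from its page number modulo that interval. A hedged sketch of that periodicity (xdes_page_count stands in for XDES_DESCRIBED_PER_PAGE, which the comment above equates with the page size):

#include <cstdint>

// Every xdes_page_count pages, the pattern of special pages repeats.
static bool is_xdes_page(uint32_t page_no, uint32_t xdes_page_count)
{
    return page_no % xdes_page_count == 0;  // FSP_XDES_OFFSET
}

static bool is_ibuf_bitmap_page(uint32_t page_no, uint32_t xdes_page_count)
{
    return page_no % xdes_page_count == 1;  // FSP_IBUF_BITMAP_OFFSET
}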
/** Check if tablespace is system temporary. +@param[in] space_id tablespace ID to check +@return true if tablespace is system temporary. */ +inline +bool +fsp_is_system_temporary(ulint space_id) +{ + return(space_id == SRV_TMP_SPACE_ID); +} +#endif /* !UNIV_INNOCHECKSUM */ + +/* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */ + +/** Width of the POST_ANTELOPE flag */ +#define FSP_FLAGS_WIDTH_POST_ANTELOPE 1 +/** Number of flag bits used to indicate the tablespace zip page size */ +#define FSP_FLAGS_WIDTH_ZIP_SSIZE 4 +/** Width of the ATOMIC_BLOBS flag. The ability to break up a long +column into an in-record prefix and an externally stored part is available +to ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. */ +#define FSP_FLAGS_WIDTH_ATOMIC_BLOBS 1 +/** Number of flag bits used to indicate the tablespace page size */ +#define FSP_FLAGS_WIDTH_PAGE_SSIZE 4 +/** Number of reserved bits */ +#define FSP_FLAGS_WIDTH_RESERVED 6 +/** Number of flag bits used to indicate the page compression */ +#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION 1 + +/** Width of all the currently known persistent tablespace flags */ +#define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \ + + FSP_FLAGS_WIDTH_ZIP_SSIZE \ + + FSP_FLAGS_WIDTH_ATOMIC_BLOBS \ + + FSP_FLAGS_WIDTH_PAGE_SSIZE \ + + FSP_FLAGS_WIDTH_RESERVED \ + + FSP_FLAGS_WIDTH_PAGE_COMPRESSION) + +/** A mask of all the known/used bits in FSP_SPACE_FLAGS */ +#define FSP_FLAGS_MASK (~(~0U << FSP_FLAGS_WIDTH)) + +/** Number of flag bits used to indicate the tablespace page size */ +#define FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE 4 + +/** Marker to indicate whether tablespace is in full checksum format. */ +#define FSP_FLAGS_FCRC32_WIDTH_MARKER 1 + +/** Stores the compressed algo for full checksum format. */ +#define FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO 3 + +/* FSP_SPACE_FLAGS position and name in MySQL 5.6/MariaDB 10.0 or older, +in MariaDB 10.1.20 or older MariaDB 10.1, and in MariaDB 10.1.21 +or newer.
+MySQL 5.6 MariaDB 10.1.x MariaDB 10.1.21 +==================================================================== +Below flags in same offset +==================================================================== +0: POST_ANTELOPE 0:POST_ANTELOPE 0: POST_ANTELOPE +1..4: ZIP_SSIZE(0..5) 1..4:ZIP_SSIZE(0..5) 1..4: ZIP_SSIZE(0..5) +(NOTE: bit 4 is always 0) +5: ATOMIC_BLOBS 5:ATOMIC_BLOBS 5: ATOMIC_BLOBS +===================================================================== +Below note the order difference: +===================================================================== +6..9: PAGE_SSIZE(3..7) 6: COMPRESSION 6..9: PAGE_SSIZE(3..7) +10: DATA_DIR 7..10: COMP_LEVEL(0..9) 10: RESERVED (5.6 DATA_DIR) +===================================================================== +The flags below were in incorrect position in MariaDB 10.1, +or have been introduced in MySQL 5.7 or 8.0: +===================================================================== +11: UNUSED 11..12:ATOMIC_WRITES 11: RESERVED (5.7 SHARED) + 12: RESERVED (5.7 TEMPORARY) + 13..15:PAGE_SSIZE(3..7) 13: RESERVED (5.7 ENCRYPTION) + 14: RESERVED (8.0 SDI) + 15: RESERVED + 16: PAGE_SSIZE_msb(0) 16: COMPRESSION + 17: DATA_DIR 17: UNUSED + 18: UNUSED +===================================================================== +The flags below only exist in fil_space_t::flags, not in FSP_SPACE_FLAGS: +===================================================================== + 27: DATA_DIR + 28..31: COMPRESSION_LEVEL +*/ + +/** A mask of the memory-only flags in fil_space_t::flags */ +#define FSP_FLAGS_MEM_MASK (~0U << FSP_FLAGS_MEM_DATA_DIR) + +/** Zero relative shift position of the DATA_DIR flag */ +#define FSP_FLAGS_MEM_DATA_DIR 27 +/** Zero relative shift position of the COMPRESSION_LEVEL field */ +#define FSP_FLAGS_MEM_COMPRESSION_LEVEL 28 + +/** Zero relative shift position of the POST_ANTELOPE field */ +#define FSP_FLAGS_POS_POST_ANTELOPE 0 +/** Zero relative shift position of the ZIP_SSIZE field */ +#define FSP_FLAGS_POS_ZIP_SSIZE (FSP_FLAGS_POS_POST_ANTELOPE \ + + FSP_FLAGS_WIDTH_POST_ANTELOPE) +/** Zero relative shift position of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_POS_ATOMIC_BLOBS (FSP_FLAGS_POS_ZIP_SSIZE \ + + FSP_FLAGS_WIDTH_ZIP_SSIZE) +/** Zero relative shift position of the start of the PAGE_SSIZE bits */ +#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \ + + FSP_FLAGS_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the start of the RESERVED bits +these are only used in MySQL 5.7 and used for compatibility. */ +#define FSP_FLAGS_POS_RESERVED (FSP_FLAGS_POS_PAGE_SSIZE \ + + FSP_FLAGS_WIDTH_PAGE_SSIZE) +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION (FSP_FLAGS_POS_RESERVED \ + + FSP_FLAGS_WIDTH_RESERVED) + +/** Zero relative shift position of the PAGE_SIZE field +in full crc32 format */ +#define FSP_FLAGS_FCRC32_POS_PAGE_SSIZE 0 + +/** Zero relative shift position of the MARKER field in full crc32 format. */ +#define FSP_FLAGS_FCRC32_POS_MARKER (FSP_FLAGS_FCRC32_POS_PAGE_SSIZE \ + + FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE) + +/** Zero relative shift position of the compressed algorithm stored +in full crc32 format. 
*/ +#define FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO (FSP_FLAGS_FCRC32_POS_MARKER \ + + FSP_FLAGS_FCRC32_WIDTH_MARKER) + +/** Bit mask of the POST_ANTELOPE field */ +#define FSP_FLAGS_MASK_POST_ANTELOPE \ + ((~(~0U << FSP_FLAGS_WIDTH_POST_ANTELOPE)) \ + << FSP_FLAGS_POS_POST_ANTELOPE) +/** Bit mask of the ZIP_SSIZE field */ +#define FSP_FLAGS_MASK_ZIP_SSIZE \ + ((~(~0U << FSP_FLAGS_WIDTH_ZIP_SSIZE)) \ + << FSP_FLAGS_POS_ZIP_SSIZE) +/** Bit mask of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_MASK_ATOMIC_BLOBS \ + ((~(~0U << FSP_FLAGS_WIDTH_ATOMIC_BLOBS)) \ + << FSP_FLAGS_POS_ATOMIC_BLOBS) +/** Bit mask of the PAGE_SSIZE field */ +#define FSP_FLAGS_MASK_PAGE_SSIZE \ + ((~(~0U << FSP_FLAGS_WIDTH_PAGE_SSIZE)) \ + << FSP_FLAGS_POS_PAGE_SSIZE) +/** Bit mask of the RESERVED1 field */ +#define FSP_FLAGS_MASK_RESERVED \ + ((~(~0U << FSP_FLAGS_WIDTH_RESERVED)) \ + << FSP_FLAGS_POS_RESERVED) +/** Bit mask of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION \ + ((~(~0U << FSP_FLAGS_WIDTH_PAGE_COMPRESSION)) \ + << FSP_FLAGS_POS_PAGE_COMPRESSION) + +/** Bit mask of the in-memory COMPRESSION_LEVEL field */ +#define FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL \ + (15U << FSP_FLAGS_MEM_COMPRESSION_LEVEL) + +/** Bit mask of the PAGE_SIZE field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE)) \ + << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) + +/** Bit mask of the MARKER field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_MARKER \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_MARKER)) \ + << FSP_FLAGS_FCRC32_POS_MARKER) + +/** Bit mask of the COMPRESSED ALGO field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO)) \ + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO) + +/** Return the value of the POST_ANTELOPE field */ +#define FSP_FLAGS_GET_POST_ANTELOPE(flags) \ + ((flags & FSP_FLAGS_MASK_POST_ANTELOPE) \ + >> FSP_FLAGS_POS_POST_ANTELOPE) +/** Return the value of the ZIP_SSIZE field */ +#define FSP_FLAGS_GET_ZIP_SSIZE(flags) \ + ((flags & FSP_FLAGS_MASK_ZIP_SSIZE) \ + >> FSP_FLAGS_POS_ZIP_SSIZE) +/** Return the value of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_HAS_ATOMIC_BLOBS(flags) \ + ((flags & FSP_FLAGS_MASK_ATOMIC_BLOBS) \ + >> FSP_FLAGS_POS_ATOMIC_BLOBS) +/** Return the value of the PAGE_SSIZE field */ +#define FSP_FLAGS_GET_PAGE_SSIZE(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_SSIZE) \ + >> FSP_FLAGS_POS_PAGE_SSIZE) +/** @return the RESERVED flags */ +#define FSP_FLAGS_GET_RESERVED(flags) \ + ((flags & FSP_FLAGS_MASK_RESERVED) \ + >> FSP_FLAGS_POS_RESERVED) +/** @return the PAGE_COMPRESSION flag */ +#define FSP_FLAGS_HAS_PAGE_COMPRESSION(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION) +/** @return the PAGE_SSIZE flags in full crc32 format */ +#define FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags) \ + ((flags & FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE) \ + >> FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) +/** @return the COMPRESSED_ALGO flags in full crc32 format */ +#define FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) \ + ((flags & FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO) \ + >> FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO) + +/** @return the value of the DATA_DIR field */ +#define FSP_FLAGS_HAS_DATA_DIR(flags) \ + (flags & 1U << FSP_FLAGS_MEM_DATA_DIR) +/** @return the COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL) \ + >> FSP_FLAGS_MEM_COMPRESSION_LEVEL) + +/* @} */ + 
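Each getter above is a plain mask-and-shift over the same flags word, so a tablespace flags value can be decoded field by field. A minimal sketch, assuming the FSP_FLAGS_* macros above are in scope; the flags argument would come from FSP_SPACE_FLAGS:

#include <cstdio>

// Hedged sketch: print the fields packed into a tablespace flags word.
static void decode_space_flags(unsigned flags)
{
    std::printf("post_antelope=%u zip_ssize=%u atomic_blobs=%u\n",
                FSP_FLAGS_GET_POST_ANTELOPE(flags),
                FSP_FLAGS_GET_ZIP_SSIZE(flags),
                FSP_FLAGS_HAS_ATOMIC_BLOBS(flags));
    std::printf("page_ssize=%u page_compression=%u\n",
                FSP_FLAGS_GET_PAGE_SSIZE(flags),
                FSP_FLAGS_HAS_PAGE_COMPRESSION(flags));
}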
+struct fil_node_t; +struct fil_space_t; +class buf_page_t; diff --git a/storage/innobase/include/fts0ast.h b/storage/innobase/include/fts0ast.h new file mode 100644 index 00000000..15bf30bc --- /dev/null +++ b/storage/innobase/include/fts0ast.h @@ -0,0 +1,340 @@ +/***************************************************************************** + +Copyright (c) 2007, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0ast.h +The FTS query parser (AST) abstract syntax tree routines + +Created 2007/03/16/03 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FST0AST_H +#define INNOBASE_FST0AST_H + +#include "mem0mem.h" + +/* The type of AST Node */ +enum fts_ast_type_t { + FTS_AST_OPER, /*!< Operator */ + FTS_AST_NUMB, /*!< Number */ + FTS_AST_TERM, /*!< Term (or word) */ + FTS_AST_TEXT, /*!< Text string */ + FTS_AST_PARSER_PHRASE_LIST, /*!< Phrase for plugin parser + The difference from text type + is that we tokenize text into + term list */ + FTS_AST_LIST, /*!< Expression list */ + FTS_AST_SUBEXP_LIST /*!< Sub-Expression list */ +}; + +/* The FTS query operators that we support */ +enum fts_ast_oper_t { + FTS_NONE, /*!< No operator */ + + FTS_IGNORE, /*!< Ignore rows that contain + this word */ + + FTS_EXIST, /*!< Include rows that contain + this word */ + + FTS_NEGATE, /*!< Include rows that contain + this word but rank them + lower */ + + FTS_INCR_RATING, /*!< Increase the rank for this + word */ + + FTS_DECR_RATING, /*!< Decrease the rank for this + word */ + + FTS_DISTANCE, /*!< Proximity distance */ + FTS_IGNORE_SKIP, /*!< Transient node operator + signifies that this is a + FTS_IGNORE node, and ignored in + the first pass of + fts_ast_visit() */ + FTS_EXIST_SKIP /*!< Transient node operator + signifies that this is a + FTS_EXIST node, and ignored in + the first pass of + fts_ast_visit() */ +}; + +/* Data types used by the FTS parser */ +struct fts_lexer_t; +struct fts_ast_node_t; +struct fts_ast_state_t; +struct fts_ast_string_t; + +typedef dberr_t (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*); + +/******************************************************************** +Parse the string using the lexer setup within state.*/ +int +fts_parse( +/*======*/ + /* out: 0 on OK, 1 on error */ + fts_ast_state_t* state); /*!< in: ast state instance.*/ + +/******************************************************************** +Create an AST operator node */ +extern +fts_ast_node_t* +fts_ast_create_node_oper( +/*=====================*/ + void* arg, /*!< in: ast state */ + fts_ast_oper_t oper); /*!< in: ast operator */ +/******************************************************************** +Create an
AST term node, makes a copy of ptr */ +extern +fts_ast_node_t* +fts_ast_create_node_term( +/*=====================*/ + void* arg, /*!< in: ast state */ + const fts_ast_string_t* ptr); /*!< in: term string */ +/******************************************************************** +Create an AST text node */ +extern +fts_ast_node_t* +fts_ast_create_node_text( +/*=====================*/ + void* arg, /*!< in: ast state */ + const fts_ast_string_t* ptr); /*!< in: text string */ +/******************************************************************** +Create an AST expr list node */ +extern +fts_ast_node_t* +fts_ast_create_node_list( +/*=====================*/ + void* arg, /*!< in: ast state */ + fts_ast_node_t* expr); /*!< in: ast expr */ +/******************************************************************** +Create a sub-expression list node. This function takes ownership of +expr and is responsible for deleting it. */ +extern +fts_ast_node_t* +fts_ast_create_node_subexp_list( +/*============================*/ + /* out: new node */ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr); /*!< in: ast expr instance */ +/******************************************************************** +Set the wildcard attribute of a term.*/ +extern +void +fts_ast_term_set_wildcard( +/*======================*/ + fts_ast_node_t* node); /*!< in: term to change */ +/******************************************************************** +Set the proximity attribute of a text node. */ +void +fts_ast_text_set_distance( +/*======================*/ + fts_ast_node_t* node, /*!< in/out: text node */ + ulint distance); /*!< in: the text proximity + distance */ +/********************************************************************//** +Free a fts_ast_node_t instance. +@return next node to free */ +fts_ast_node_t* +fts_ast_free_node( +/*==============*/ + fts_ast_node_t* node); /*!< in: node to free */ +/******************************************************************** +Add a sub-expression to an AST*/ +extern +fts_ast_node_t* +fts_ast_add_node( +/*=============*/ + fts_ast_node_t* list, /*!< in: list node instance */ + fts_ast_node_t* node); /*!< in: (sub) expr to add */ +/******************************************************************** +Print the AST node recursively.*/ +extern +void +fts_ast_node_print( +/*===============*/ + fts_ast_node_t* node); /*!< in: ast node to print */ +/******************************************************************** +Free node and expr allocations.*/ +extern +void +fts_ast_state_free( +/*===============*/ + fts_ast_state_t*state); /*!< in: state instance + to free */ +/** Check only union operation involved in the node +@param[in] node ast node to check +@return true if the node contains only union else false. */ +bool +fts_ast_node_check_union( + fts_ast_node_t* node); + +/******************************************************************//** +Traverse the AST - in-order traversal. 
+@return DB_SUCCESS if all went well */ +dberr_t +fts_ast_visit( +/*==========*/ + fts_ast_oper_t oper, /*!< in: FTS operator */ + fts_ast_node_t* node, /*!< in: instance to traverse*/ + fts_ast_callback visitor, /*!< in: callback */ + void* arg, /*!< in: callback arg */ + bool* has_ignore) /*!< out: whether we encountered + and ignored an operator; + currently we only + ignore the FTS_IGNORE operator */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************** +Create a lex instance.*/ +fts_lexer_t* +fts_lexer_create( +/*=============*/ + ibool boolean_mode, /*!< in: query type */ + const byte* query, /*!< in: query string */ + ulint query_len) /*!< in: query string len */ + MY_ATTRIBUTE((nonnull, malloc, warn_unused_result)); +/******************************************************************** +Free an fts_lexer_t instance.*/ +void +fts_lexer_free( +/*===========*/ + fts_lexer_t* fts_lexer) /*!< in: lexer instance to + free */ + MY_ATTRIBUTE((nonnull)); + +/** +Create an ast string object, with NUL-terminator, so the string +has one more byte than len +@param[in] str pointer to string +@param[in] len length of the string +@return ast string with NUL-terminator */ +fts_ast_string_t* +fts_ast_string_create( + const byte* str, + ulint len); + +/** +Free an ast string instance +@param[in,out] ast_str string to free */ +void +fts_ast_string_free( + fts_ast_string_t* ast_str); + +/** +Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul +@param[in] ast_str string to translate +@param[in] base the base +@return translated number */ +ulint +fts_ast_string_to_ul( + const fts_ast_string_t* ast_str, + int base);
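fts_ast_visit() above drives a callback of the fts_ast_callback type (declared earlier in this header) over the expression list, passing the current operator alongside each node. A minimal sketch of such a visitor; the counting logic is illustrative only, not real query processing:

// Hedged sketch of an fts_ast_callback for use with fts_ast_visit().
static dberr_t fts_ast_count_nodes(
    fts_ast_oper_t  oper,   /*!< in: current FTS operator */
    fts_ast_node_t* node,   /*!< in: node being visited */
    void*           arg)    /*!< in: user argument: a counter */
{
    ulint*  n_nodes = static_cast<ulint*>(arg);

    (void) oper;
    (void) node;
    ++*n_nodes;             /* count every node we are handed */

    return(DB_SUCCESS);     /* keep traversing */
}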
/* String of length len. +We always store the string of length len with a terminating '\0', +regardless of whether there is any 0x00 in the string itself */ +struct fts_ast_string_t { + /*!< Pointer to string. */ + byte* str; + + /*!< Length of the string. */ + ulint len; +}; + +/* Query term type */ +struct fts_ast_term_t { + fts_ast_string_t* ptr; /*!< Pointer to term string.*/ + ibool wildcard; /*!< TRUE if wild card set.*/ +}; + +/* Query text type */ +struct fts_ast_text_t { + fts_ast_string_t* ptr; /*!< Pointer to text string.*/ + ulint distance; /*!< > 0 if proximity distance + set */ +}; + +/* The list of nodes in an expr list */ +struct fts_ast_list_t { + fts_ast_node_t* head; /*!< Children list head */ + fts_ast_node_t* tail; /*!< Children list tail */ +}; + +/* FTS AST node to store the term, text, operator and sub-expressions.*/ +struct fts_ast_node_t { + fts_ast_type_t type; /*!< The type of node */ + fts_ast_text_t text; /*!< Text node */ + fts_ast_term_t term; /*!< Term node */ + fts_ast_oper_t oper; /*!< Operator value */ + fts_ast_list_t list; /*!< Expression list */ + fts_ast_node_t* next; /*!< Link for expr list */ + fts_ast_node_t* next_alloc; /*!< For tracking allocations */ + bool visited; /*!< whether this node is + already processed */ + /** current transaction */ + const trx_t* trx; + /* Used by plugin parser */ + fts_ast_node_t* up_node; /*!< Direct up node */ + bool go_up; /*!< Flag if go one level up */ +}; + +/* To track state during parsing */ +struct fts_ast_state_t { + mem_heap_t* heap; /*!< Heap to use for alloc */ + fts_ast_node_t* root; /*!< If all goes OK, then this + will point to the root.*/ + + fts_ast_list_t list; /*!< List of nodes allocated */ + + fts_lexer_t* lexer; /*!< Lexer callback + arg */ + CHARSET_INFO* charset; /*!< charset used for + tokenization */ + /* Used by plugin parser */ + fts_ast_node_t* cur_node; /*!< Current node into which + we add new node */ + int depth; /*!< Depth of parsing state */ +}; + +/******************************************************************//** +Create an AST term node, makes a copy of ptr for plugin parser +@return node */ +extern +fts_ast_node_t* +fts_ast_create_node_term_for_parser( +/*================================*/ + void* arg, /*!< in: ast state */ + const char* ptr, /*!< in: term string */ + const ulint len); /*!< in: term string length */ + +/******************************************************************//** +Create an AST phrase list node for plugin parser +@return node */ +extern +fts_ast_node_t* +fts_ast_create_node_phrase_list( +/*============================*/ + void* arg); /*!< in: ast state */ + +#ifdef UNIV_DEBUG +const char* +fts_ast_node_type_get(fts_ast_type_t type); +#endif /* UNIV_DEBUG */ + +#endif /* INNOBASE_FST0AST_H */ diff --git a/storage/innobase/include/fts0blex.h b/storage/innobase/include/fts0blex.h new file mode 100644 index 00000000..b16e7f2c --- /dev/null +++ b/storage/innobase/include/fts0blex.h @@ -0,0 +1,702 @@ +#ifndef fts0bHEADER_H +#define fts0bHEADER_H 1 +#define fts0bIN_HEADER 1 + +#line 6 "../include/fts0blex.h" + +#line 8 "../include/fts0blex.h" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 6 +#define YY_FLEX_SUBMINOR_VERSION 4 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +#ifdef yy_create_buffer +#define fts0b_create_buffer_ALREADY_DEFINED +#else +#define yy_create_buffer fts0b_create_buffer +#endif + +#ifdef yy_delete_buffer +#define fts0b_delete_buffer_ALREADY_DEFINED +#else +#define yy_delete_buffer fts0b_delete_buffer +#endif + +#ifdef yy_scan_buffer +#define fts0b_scan_buffer_ALREADY_DEFINED +#else +#define yy_scan_buffer fts0b_scan_buffer +#endif + +#ifdef
yy_scan_string +#define fts0b_scan_string_ALREADY_DEFINED +#else +#define yy_scan_string fts0b_scan_string +#endif + +#ifdef yy_scan_bytes +#define fts0b_scan_bytes_ALREADY_DEFINED +#else +#define yy_scan_bytes fts0b_scan_bytes +#endif + +#ifdef yy_init_buffer +#define fts0b_init_buffer_ALREADY_DEFINED +#else +#define yy_init_buffer fts0b_init_buffer +#endif + +#ifdef yy_flush_buffer +#define fts0b_flush_buffer_ALREADY_DEFINED +#else +#define yy_flush_buffer fts0b_flush_buffer +#endif + +#ifdef yy_load_buffer_state +#define fts0b_load_buffer_state_ALREADY_DEFINED +#else +#define yy_load_buffer_state fts0b_load_buffer_state +#endif + +#ifdef yy_switch_to_buffer +#define fts0b_switch_to_buffer_ALREADY_DEFINED +#else +#define yy_switch_to_buffer fts0b_switch_to_buffer +#endif + +#ifdef yypush_buffer_state +#define fts0bpush_buffer_state_ALREADY_DEFINED +#else +#define yypush_buffer_state fts0bpush_buffer_state +#endif + +#ifdef yypop_buffer_state +#define fts0bpop_buffer_state_ALREADY_DEFINED +#else +#define yypop_buffer_state fts0bpop_buffer_state +#endif + +#ifdef yyensure_buffer_stack +#define fts0bensure_buffer_stack_ALREADY_DEFINED +#else +#define yyensure_buffer_stack fts0bensure_buffer_stack +#endif + +#ifdef yylex +#define fts0blex_ALREADY_DEFINED +#else +#define yylex fts0blex +#endif + +#ifdef yyrestart +#define fts0brestart_ALREADY_DEFINED +#else +#define yyrestart fts0brestart +#endif + +#ifdef yylex_init +#define fts0blex_init_ALREADY_DEFINED +#else +#define yylex_init fts0blex_init +#endif + +#ifdef yylex_init_extra +#define fts0blex_init_extra_ALREADY_DEFINED +#else +#define yylex_init_extra fts0blex_init_extra +#endif + +#ifdef yylex_destroy +#define fts0blex_destroy_ALREADY_DEFINED +#else +#define yylex_destroy fts0blex_destroy +#endif + +#ifdef yyget_debug +#define fts0bget_debug_ALREADY_DEFINED +#else +#define yyget_debug fts0bget_debug +#endif + +#ifdef yyset_debug +#define fts0bset_debug_ALREADY_DEFINED +#else +#define yyset_debug fts0bset_debug +#endif + +#ifdef yyget_extra +#define fts0bget_extra_ALREADY_DEFINED +#else +#define yyget_extra fts0bget_extra +#endif + +#ifdef yyset_extra +#define fts0bset_extra_ALREADY_DEFINED +#else +#define yyset_extra fts0bset_extra +#endif + +#ifdef yyget_in +#define fts0bget_in_ALREADY_DEFINED +#else +#define yyget_in fts0bget_in +#endif + +#ifdef yyset_in +#define fts0bset_in_ALREADY_DEFINED +#else +#define yyset_in fts0bset_in +#endif + +#ifdef yyget_out +#define fts0bget_out_ALREADY_DEFINED +#else +#define yyget_out fts0bget_out +#endif + +#ifdef yyset_out +#define fts0bset_out_ALREADY_DEFINED +#else +#define yyset_out fts0bset_out +#endif + +#ifdef yyget_leng +#define fts0bget_leng_ALREADY_DEFINED +#else +#define yyget_leng fts0bget_leng +#endif + +#ifdef yyget_text +#define fts0bget_text_ALREADY_DEFINED +#else +#define yyget_text fts0bget_text +#endif + +#ifdef yyget_lineno +#define fts0bget_lineno_ALREADY_DEFINED +#else +#define yyget_lineno fts0bget_lineno +#endif + +#ifdef yyset_lineno +#define fts0bset_lineno_ALREADY_DEFINED +#else +#define yyset_lineno fts0bset_lineno +#endif + +#ifdef yyget_column +#define fts0bget_column_ALREADY_DEFINED +#else +#define yyget_column fts0bget_column +#endif + +#ifdef yyset_column +#define fts0bset_column_ALREADY_DEFINED +#else +#define yyset_column fts0bset_column +#endif + +#ifdef yywrap +#define fts0bwrap_ALREADY_DEFINED +#else +#define yywrap fts0bwrap +#endif + +#ifdef yyalloc +#define fts0balloc_ALREADY_DEFINED +#else +#define yyalloc fts0balloc +#endif + +#ifdef yyrealloc +#define 
fts0brealloc_ALREADY_DEFINED +#else +#define yyrealloc fts0brealloc +#endif + +#ifdef yyfree +#define fts0bfree_ALREADY_DEFINED +#else +#define yyfree fts0bfree +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +/* begin standard C++ headers. */ + +/* TODO: this is always defined, so inline it */ +#define yyconst const + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define yynoreturn __attribute__((__noreturn__)) +#else +#define yynoreturn +#endif + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters.
+ */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +void yyrestart ( FILE *input_file , yyscan_t yyscanner ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner ); +void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +void yypop_buffer_state ( yyscan_t yyscanner ); + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner ); + +void *yyalloc ( yy_size_t , yyscan_t yyscanner ); +void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner ); +void yyfree ( void * , yyscan_t yyscanner ); + +/* Begin user sect3 */ + +#define fts0bwrap(yyscanner) (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP + +#define yytext_ptr yytext_r + +#ifdef YY_HEADER_EXPORT_START_CONDITIONS +#define INITIAL 0 + +#endif + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +int yylex_init (yyscan_t* scanner); + +int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int yylex_destroy ( yyscan_t yyscanner ); + +int yyget_debug ( yyscan_t yyscanner ); + +void yyset_debug ( int debug_flag , yyscan_t yyscanner ); + +YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner ); + +void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner ); + +FILE *yyget_in ( yyscan_t yyscanner ); + +void yyset_in ( FILE * _in_str , yyscan_t yyscanner ); + +FILE *yyget_out ( yyscan_t yyscanner ); + +void yyset_out ( FILE * _out_str , yyscan_t yyscanner ); + + int yyget_leng ( yyscan_t yyscanner ); + +char *yyget_text ( yyscan_t yyscanner ); + +int yyget_lineno ( yyscan_t yyscanner ); + +void yyset_lineno ( int _line_number , yyscan_t yyscanner ); + +int yyget_column ( yyscan_t yyscanner ); + +void yyset_column ( int _column_no , yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1.
+ */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap ( yyscan_t yyscanner ); +#else +extern int yywrap ( yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen ( const char * , yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (yyscan_t yyscanner); + +#define YY_DECL int yylex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + +#undef YY_NEW_FILE +#undef YY_FLUSH_BUFFER +#undef yy_set_bol +#undef yy_new_buffer +#undef yy_set_interactive +#undef YY_DO_BEFORE_ACTION + +#ifdef YY_DECL_IS_OURS +#undef YY_DECL_IS_OURS +#undef YY_DECL +#endif + +#ifndef fts0b_create_buffer_ALREADY_DEFINED +#undef yy_create_buffer +#endif +#ifndef fts0b_delete_buffer_ALREADY_DEFINED +#undef yy_delete_buffer +#endif +#ifndef fts0b_scan_buffer_ALREADY_DEFINED +#undef yy_scan_buffer +#endif +#ifndef fts0b_scan_string_ALREADY_DEFINED +#undef yy_scan_string +#endif +#ifndef fts0b_scan_bytes_ALREADY_DEFINED +#undef yy_scan_bytes +#endif +#ifndef fts0b_init_buffer_ALREADY_DEFINED +#undef yy_init_buffer +#endif +#ifndef fts0b_flush_buffer_ALREADY_DEFINED +#undef yy_flush_buffer +#endif +#ifndef fts0b_load_buffer_state_ALREADY_DEFINED +#undef yy_load_buffer_state +#endif +#ifndef fts0b_switch_to_buffer_ALREADY_DEFINED +#undef yy_switch_to_buffer +#endif +#ifndef fts0bpush_buffer_state_ALREADY_DEFINED +#undef yypush_buffer_state +#endif +#ifndef fts0bpop_buffer_state_ALREADY_DEFINED +#undef yypop_buffer_state +#endif +#ifndef fts0bensure_buffer_stack_ALREADY_DEFINED +#undef yyensure_buffer_stack +#endif +#ifndef fts0blex_ALREADY_DEFINED +#undef yylex +#endif +#ifndef fts0brestart_ALREADY_DEFINED +#undef yyrestart +#endif +#ifndef fts0blex_init_ALREADY_DEFINED +#undef yylex_init +#endif +#ifndef fts0blex_init_extra_ALREADY_DEFINED +#undef yylex_init_extra +#endif +#ifndef fts0blex_destroy_ALREADY_DEFINED +#undef yylex_destroy +#endif +#ifndef fts0bget_debug_ALREADY_DEFINED +#undef yyget_debug +#endif +#ifndef fts0bset_debug_ALREADY_DEFINED +#undef yyset_debug +#endif +#ifndef fts0bget_extra_ALREADY_DEFINED +#undef yyget_extra +#endif +#ifndef fts0bset_extra_ALREADY_DEFINED +#undef yyset_extra +#endif +#ifndef fts0bget_in_ALREADY_DEFINED +#undef yyget_in +#endif +#ifndef fts0bset_in_ALREADY_DEFINED +#undef yyset_in +#endif +#ifndef fts0bget_out_ALREADY_DEFINED +#undef yyget_out +#endif +#ifndef fts0bset_out_ALREADY_DEFINED +#undef yyset_out +#endif +#ifndef fts0bget_leng_ALREADY_DEFINED +#undef yyget_leng +#endif +#ifndef fts0bget_text_ALREADY_DEFINED +#undef yyget_text +#endif +#ifndef fts0bget_lineno_ALREADY_DEFINED +#undef yyget_lineno +#endif +#ifndef fts0bset_lineno_ALREADY_DEFINED +#undef yyset_lineno +#endif +#ifndef fts0bget_column_ALREADY_DEFINED +#undef yyget_column +#endif +#ifndef fts0bset_column_ALREADY_DEFINED +#undef 
yyset_column +#endif +#ifndef fts0bwrap_ALREADY_DEFINED +#undef yywrap +#endif +#ifndef fts0bget_lval_ALREADY_DEFINED +#undef yyget_lval +#endif +#ifndef fts0bset_lval_ALREADY_DEFINED +#undef yyset_lval +#endif +#ifndef fts0bget_lloc_ALREADY_DEFINED +#undef yyget_lloc +#endif +#ifndef fts0bset_lloc_ALREADY_DEFINED +#undef yyset_lloc +#endif +#ifndef fts0balloc_ALREADY_DEFINED +#undef yyalloc +#endif +#ifndef fts0brealloc_ALREADY_DEFINED +#undef yyrealloc +#endif +#ifndef fts0bfree_ALREADY_DEFINED +#undef yyfree +#endif +#ifndef fts0btext_ALREADY_DEFINED +#undef yytext +#endif +#ifndef fts0bleng_ALREADY_DEFINED +#undef yyleng +#endif +#ifndef fts0bin_ALREADY_DEFINED +#undef yyin +#endif +#ifndef fts0bout_ALREADY_DEFINED +#undef yyout +#endif +#ifndef fts0b_flex_debug_ALREADY_DEFINED +#undef yy_flex_debug +#endif +#ifndef fts0blineno_ALREADY_DEFINED +#undef yylineno +#endif +#ifndef fts0btables_fload_ALREADY_DEFINED +#undef yytables_fload +#endif +#ifndef fts0btables_destroy_ALREADY_DEFINED +#undef yytables_destroy +#endif +#ifndef fts0bTABLES_NAME_ALREADY_DEFINED +#undef yyTABLES_NAME +#endif + +#line 74 "fts0blex.l" + + +#line 701 "../include/fts0blex.h" +#undef fts0bIN_HEADER +#endif /* fts0bHEADER_H */ diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h new file mode 100644 index 00000000..c0151b44 --- /dev/null +++ b/storage/innobase/include/fts0fts.h @@ -0,0 +1,947 @@ +/***************************************************************************** + +Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0fts.h +Full text search header file + +Created 2011/09/02 Sunny Bains +***********************************************************************/ + +#pragma once + +#include "data0type.h" +#include "data0types.h" +#include "mem0mem.h" +#include "rem0types.h" +#include "row0types.h" +#include "trx0types.h" +#include "ut0vec.h" +#include "ut0rbt.h" +#include "ut0wqueue.h" +#include "que0types.h" +#include "ft_global.h" +#include "mysql/plugin_ftparser.h" + +/** "NULL" value of a document id. */ +#define FTS_NULL_DOC_ID 0 + +/** FTS hidden column that is used to map to and from the row */ +#define FTS_DOC_ID_COL_NAME "FTS_DOC_ID" + +/** The name of the index created by FTS */ +#define FTS_DOC_ID_INDEX_NAME "FTS_DOC_ID_INDEX" + +#define FTS_DOC_ID_INDEX_NAME_LEN 16 + +/** Doc ID is a 8 byte value */ +#define FTS_DOC_ID_LEN 8 + +/** The number of fields to sort when we build FT index with +FIC. 
Three fields are sorted: (word, doc_id, position) */ +#define FTS_NUM_FIELDS_SORT 3 + +/** Maximum number of rows in a table, below which we will +optimize using a 4 byte Doc ID for FIC merge sort to reduce sort size */ +#define MAX_DOC_ID_OPT_VAL 1073741824 + +/** Document id type. */ +typedef ib_id_t doc_id_t; + +/** doc_id_t printf format */ +#define FTS_DOC_ID_FORMAT IB_ID_FMT + +/** Convert document id to the InnoDB (BIG ENDIAN) storage format. */ +#define fts_write_doc_id(d, s) mach_write_to_8(d, s) + +/** Read a document id to internal format. */ +#define fts_read_doc_id(s) mach_read_from_8(s) + +/** Bind the doc id to a variable */ +#define fts_bind_doc_id(i, n, v) pars_info_bind_int8_literal(i, n, v) + +/** Defines for FTS query mode; they have the same values as +those defined in mysql file ft_global.h */ +#define FTS_NL 0 +#define FTS_BOOL 1 +#define FTS_SORTED 2 +#define FTS_EXPAND 4 +#define FTS_NO_RANKING 8 +#define FTS_PROXIMITY 16 +#define FTS_PHRASE 32 +#define FTS_OPT_RANKING 64 + +#define FTS_INDEX_TABLE_IND_NAME "FTS_INDEX_TABLE_IND" + +/** The number of FTS index partitions for a fulltext index */ +#define FTS_NUM_AUX_INDEX 6 + +/** Threshold where our optimize thread automatically kicks in */ +#define FTS_OPTIMIZE_THRESHOLD 10000000 + +/** Maximum possible Fulltext word length in bytes (assuming mbmaxlen=4) */ +#define FTS_MAX_WORD_LEN (HA_FT_MAXCHARLEN * 4) + +/** Maximum possible Fulltext word length (in characters) */ +#define FTS_MAX_WORD_LEN_IN_CHAR HA_FT_MAXCHARLEN + +/** Number of columns in FTS AUX Tables */ +#define FTS_DELETED_TABLE_NUM_COLS 1 +#define FTS_CONFIG_TABLE_NUM_COLS 2 +#define FTS_AUX_INDEX_TABLE_NUM_COLS 5 + +/** DELETED_TABLE(doc_id BIGINT UNSIGNED) */ +#define FTS_DELETED_TABLE_COL_LEN 8 +/** CONFIG_TABLE(key CHAR(50), value CHAR(200)) */ +#define FTS_CONFIG_TABLE_KEY_COL_LEN 50 +#define FTS_CONFIG_TABLE_VALUE_COL_LEN 200 + +#define FTS_INDEX_FIRST_DOC_ID_LEN 8 +#define FTS_INDEX_LAST_DOC_ID_LEN 8 +#define FTS_INDEX_DOC_COUNT_LEN 4 +/* BLOB COLUMN, 0 means VARIABLE SIZE */ +#define FTS_INDEX_ILIST_LEN 0 + + +/** Variable specifying the FTS parallel sort degree */ +extern ulong fts_sort_pll_degree; + +/** Variable specifying the number of words to optimize for each optimize table +call */ +extern ulong fts_num_word_optimize; + +/** Variable specifying whether we do additional FTS diagnostic printout +in the log */ +extern char fts_enable_diag_print; + +/** FTS rank type, which will be between 0 .. 1 inclusive */ +typedef float fts_rank_t; + +/** Type of a row during a transaction. FTS_NOTHING means the row can be +forgotten from the FTS system's POV, FTS_INVALID is an internal value used +to mark invalid states. + +NOTE: Do not change the order or value of these, fts_trx_row_get_new_state +depends on them being exactly as they are. */ +enum fts_row_state { + FTS_INSERT = 0, + FTS_MODIFY, + FTS_DELETE, + FTS_NOTHING, + FTS_INVALID +};
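fts_write_doc_id() and fts_read_doc_id() above delegate to mach_write_to_8()/mach_read_from_8(), which store the 8-byte id most significant byte first. A standalone equivalent of that big-endian conversion, for illustration only (not the mach_* implementation):

#include <cstdint>

static void write_doc_id_be(unsigned char* dst, uint64_t doc_id)
{
    for (int i = 0; i < 8; i++) {   // most significant byte first
        dst[i] = static_cast<unsigned char>(doc_id >> (56 - 8 * i));
    }
}

static uint64_t read_doc_id_be(const unsigned char* src)
{
    uint64_t doc_id = 0;
    for (int i = 0; i < 8; i++) {
        doc_id = (doc_id << 8) | src[i];
    }
    return doc_id;
}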
/** The FTS table types. */ +enum fts_table_type_t { + FTS_INDEX_TABLE, /*!< FTS auxiliary table that is + specific to a particular FTS index + on a table */ + + FTS_COMMON_TABLE /*!< FTS auxiliary table that is common + for all FTS indexes on a table */ +}; + +struct fts_doc_t; +struct fts_cache_t; +struct fts_token_t; +struct fts_doc_ids_t; +struct fts_index_cache_t; + + +/** Initialize the "fts_table" for internal query into FTS auxiliary +tables */ +#define FTS_INIT_FTS_TABLE(fts_table, m_suffix, m_type, m_table)\ +do { \ + (fts_table)->suffix = m_suffix; \ + (fts_table)->type = m_type; \ + (fts_table)->table_id = m_table->id; \ + (fts_table)->table = m_table; \ +} while (0); + +#define FTS_INIT_INDEX_TABLE(fts_table, m_suffix, m_type, m_index)\ +do { \ + (fts_table)->suffix = m_suffix; \ + (fts_table)->type = m_type; \ + (fts_table)->table_id = m_index->table->id; \ + (fts_table)->table = m_index->table; \ + (fts_table)->index_id = m_index->id; \ +} while (0); + +/** Information about changes in a single transaction affecting +the FTS system. */ +struct fts_trx_t { + trx_t* trx; /*!< InnoDB transaction */ + + ib_vector_t* savepoints; /*!< Active savepoints, must have at + least one element, the implied + savepoint */ + ib_vector_t* last_stmt; /*!< last_stmt */ + + mem_heap_t* heap; /*!< heap */ +}; + +/** Information required for transaction savepoint handling. */ +struct fts_savepoint_t { + char* name; /*!< First entry is always NULL, the + default instance. Otherwise the name + of the savepoint */ + + ib_rbt_t* tables; /*!< Modified FTS tables */ +}; + +/** Information about changed rows in a transaction for a single table. */ +struct fts_trx_table_t { + dict_table_t* table; /*!< table */ + + fts_trx_t* fts_trx; /*!< link to parent */ + + ib_rbt_t* rows; /*!< rows changed; indexed by doc-id, + cells are fts_trx_row_t* */ + + fts_doc_ids_t* added_doc_ids; /*!< list of added doc ids (NULL until + the first addition) */ + + /*!< for adding doc ids */ + que_t* docs_added_graph; +}; + +/** Information about one changed row in a transaction. */ +struct fts_trx_row_t { + doc_id_t doc_id; /*!< Id of the ins/upd/del document */ + + fts_row_state state; /*!< state of the row */ + + ib_vector_t* fts_indexes; /*!< The indexes that are affected */ +}; + +/** List of document ids that were added during a transaction. This +list is passed on to a background 'Add' thread and OPTIMIZE, so it +needs its own memory heap. */ +struct fts_doc_ids_t { + ib_vector_t* doc_ids; /*!< document ids (each element is + of type doc_id_t). */ + + ib_alloc_t* self_heap; /*!< Allocator used to create an + instance of this type and the + doc_ids vector */ +};
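FTS_INIT_FTS_TABLE above just stamps an fts_table_t with the identity of its parent table. A hedged usage sketch; the "CONFIG" suffix is illustrative of the common auxiliary tables described later in this header, and the surrounding function exists only to give the statement a valid scope:

// Hedged usage sketch of FTS_INIT_FTS_TABLE.
static void init_config_fts_table(const dict_table_t* table)
{
    fts_table_t fts_table;

    /* After this, fts_table carries the suffix, type, table id and
    table pointer needed to name the CONFIG auxiliary table. */
    FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, table);
    (void) fts_table;
}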
// FIXME: Get rid of this if possible. +/** Since MySQL's character set support for Unicode is woefully inadequate +(it supports basic operations like isalpha etc. only for 8-bit characters), +we have to implement our own. We use UTF-16 without surrogate processing +as our in-memory format. This typedef is a single such character. */ +typedef unsigned short ib_uc_t; + +/** A UTF-16 or UTF-8 string. */ +struct fts_string_t { + byte* f_str; /*!< string, not necessarily terminated in + any way */ + ulint f_len; /*!< Length of the string in bytes */ + ulint f_n_char; /*!< Number of characters */ +}; + +/** Query ranked doc ids. */ +struct fts_ranking_t { + doc_id_t doc_id; /*!< Document id */ + + fts_rank_t rank; /*!< Rank is between 0 .. 1 */ + + byte* words; /*!< this contains the words + that were queried + and found in this document */ + ulint words_len; /*!< words len */ +}; + +/** Query result. */ +struct fts_result_t { + ib_rbt_node_t* current; /*!< Current element */ + + ib_rbt_t* rankings_by_id; /*!< RB tree of type fts_ranking_t + indexed by doc id */ + ib_rbt_t* rankings_by_rank;/*!< RB tree of type fts_ranking_t + indexed by rank */ +}; + +/** This is used to generate the FTS auxiliary table name, we need the +table id and the index id to generate the column specific FTS auxiliary +table name. */ +struct fts_table_t { + fts_table_type_t + type; /*!< The auxiliary table type */ + + table_id_t table_id; /*!< The table id */ + + index_id_t index_id; /*!< The index id */ + + const char* suffix; /*!< The suffix of the fts auxiliary + table name, can be NULL, not used + everywhere (yet) */ + const dict_table_t* + table; /*!< Parent table */ + CHARSET_INFO* charset; /*!< charset info if it is for FTS + index auxiliary table */ +}; + +/** The state of the FTS subsystem. */ +class fts_t { +public: + /** fts_t constructor. + @param[in] table table with FTS indexes + @param[in,out] heap memory heap where 'this' is stored */ + fts_t( + const dict_table_t* table, + mem_heap_t* heap); + + /** fts_t destructor. */ + ~fts_t(); + + /** Whether the ADDED table record sync-ed after crash recovery */ + unsigned added_synced:1; + /** Whether the table holds dict_sys.latch */ + unsigned dict_locked:1; + + /** Work queue for scheduling jobs for the FTS 'Add' thread, or NULL + if the thread has not yet been created. Each work item is a + fts_trx_doc_ids_t*. */ + ib_wqueue_t* add_wq; + + /** FTS memory buffer for this table, or NULL if the table has no FTS + index. */ + fts_cache_t* cache; + + /** FTS doc id hidden column number in the CLUSTERED index. */ + ulint doc_col; + + /** Vector of FTS indexes, this is mainly for caching purposes. */ + ib_vector_t* indexes; + + /** Whether the table exists in fts_optimize_wq; + protected by fts_optimize_wq mutex */ + bool in_queue; + + /** Whether the sync message exists in fts_optimize_wq; + protected by fts_optimize_wq mutex */ + bool sync_message; + + /** Heap for fts_t allocation. */ + mem_heap_t* fts_heap; +}; + +struct fts_stopword_t; + +/** status bits for fts_stopword_t status field. */ +#define STOPWORD_NOT_INIT 0x1 +#define STOPWORD_OFF 0x2 +#define STOPWORD_FROM_DEFAULT 0x4 +#define STOPWORD_USER_TABLE 0x8 + +extern const char* fts_default_stopword[]; + +/** Variable specifying the maximum FTS cache size for each table */ +extern Atomic_relaxed<size_t> fts_max_cache_size; + +/** Variable specifying the total memory allocated for FTS cache */ +extern Atomic_relaxed<size_t> fts_max_total_cache_size; + +/** Variable specifying the FTS result cache limit for each query */ +extern size_t fts_result_cache_limit; + +/** Variable specifying the maximum FTS max token size */ +extern ulong fts_max_token_size; + +/** Variable specifying the minimum FTS max token size */ +extern ulong fts_min_token_size; + +/** Whether the total memory used for FTS cache is exhausted, and we will +need a sync to free some memory */ +extern bool fts_need_sync; + +/******************************************************************//** +Create a FTS cache. */ +fts_cache_t* +fts_cache_create( +/*=============*/ + dict_table_t* table); /*!< table owns the FTS cache */ + +/******************************************************************//** +Create a FTS index cache.
+/******************************************************************//**
+Create an FTS index cache.
+@return Index Cache */
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+	dict_table_t*	table,		/*!< in: table with FTS index */
+	dict_index_t*	index);		/*!< in: FTS index */
+
+/******************************************************************//**
+Get the next available document id. This function creates a new
+transaction to generate the document id.
+@return DB_SUCCESS if OK */
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+	const dict_table_t*	table,	/*!< in: table */
+	doc_id_t*		doc_id);/*!< out: new document id */
+
+/******************************************************************//**
+Create a new fts_doc_ids_t.
+@return new fts_doc_ids_t. */
+fts_doc_ids_t*
+fts_doc_ids_create(void);
+/*=====================*/
+
+/** Free fts_doc_ids_t */
+inline void fts_doc_ids_free(fts_doc_ids_t* doc_ids)
+{
+	mem_heap_free(static_cast<mem_heap_t*>(doc_ids->self_heap->arg));
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+void
+fts_trx_add_op(
+/*===========*/
+	trx_t*		trx,		/*!< in: InnoDB transaction */
+	dict_table_t*	table,		/*!< in: table */
+	doc_id_t	doc_id,		/*!< in: doc id */
+	fts_row_state	state,		/*!< in: state of the row */
+	ib_vector_t*	fts_indexes);	/*!< in: FTS indexes affected
+					(NULL=all) */
+
+/******************************************************************//**
+Free an FTS trx. */
+void
+fts_trx_free(
+/*=========*/
+	fts_trx_t*	fts_trx);	/*!< in, own: FTS trx */
+
+/** Creates the common auxiliary tables needed for supporting an FTS index
+on the given table.
+The following tables are created.
+CREATE TABLE $FTS_PREFIX_DELETED
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_DELETED_CACHE
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED_CACHE
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_CONFIG
+	(key CHAR(50), value CHAR(200), UNIQUE CLUSTERED INDEX on key)
+@param[in,out]	trx			transaction
+@param[in]	table			table with FTS index
+@param[in]	skip_doc_id_index	Skip index on doc id
+@return DB_SUCCESS on success */
+dberr_t
+fts_create_common_tables(
+	trx_t*		trx,
+	dict_table_t*	table,
+	bool		skip_doc_id_index)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table.
+
+All FTS AUX Index tables have the following schema.
+CREATE TABLE $FTS_PREFIX_INDEX_[1-6](
+	word		VARCHAR(FTS_MAX_WORD_LEN),
+	first_doc_id	INT NOT NULL,
+	last_doc_id	UNSIGNED NOT NULL,
+	doc_count	UNSIGNED INT NOT NULL,
+	ilist		VARBINARY NOT NULL,
+	UNIQUE CLUSTERED INDEX ON (word, first_doc_id))
+@param[in,out]	trx	dictionary transaction
+@param[in]	index	fulltext index
+@param[in]	id	table id
+@return DB_SUCCESS or error code */
+dberr_t
+fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Add the FTS document id hidden column. */
+void
+fts_add_doc_id_column(
+/*==================*/
+	dict_table_t*	table,	/*!< in/out: Table with FTS index */
+	mem_heap_t*	heap);	/*!< in: temporary memory heap, or NULL */
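/* Editorial sketch: the intended lifecycle of fts_doc_ids_t, created by
fts_doc_ids_create() and released by fts_doc_ids_free() above.  The doc
id value is made up for illustration. */
static void example_doc_ids_lifecycle()
{
	fts_doc_ids_t*	ids = fts_doc_ids_create();

	doc_id_t	doc_id = 42;	/* hypothetical document id */

	/* ids->doc_ids is a vector of doc_id_t backed by ids->self_heap. */
	ib_vector_push(ids->doc_ids, &doc_id);

	/* Freeing releases the heap that owns both the vector and the
	fts_doc_ids_t instance itself. */
	fts_doc_ids_free(ids);
}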
+/** Lock the internal FTS_ tables for an index, before fts_drop_index_tables().
+@param trx	transaction
+@param index	fulltext index */
+dberr_t fts_lock_index_tables(trx_t *trx, const dict_index_t &index);
+
+/** Lock the internal common FTS_ tables, before fts_drop_common_tables().
+@param trx	transaction
+@param table	table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_lock_common_tables(trx_t *trx, const dict_table_t &table);
+
+/** Lock the internal FTS_ tables for table, before fts_drop_tables().
+@param trx	transaction
+@param table	table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_lock_tables(trx_t *trx, const dict_table_t &table);
+
+/** Drop the internal FTS_ tables for table.
+@param trx	transaction
+@param table	table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_drop_tables(trx_t *trx, const dict_table_t &table);
+
+/******************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_commit(
+/*=======*/
+	trx_t*	trx)	/*!< in: transaction */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** FTS Query entry point.
+@param[in,out]	trx		transaction
+@param[in]	index		fts index to search
+@param[in]	flags		FTS search mode
+@param[in]	query_str	FTS query
+@param[in]	query_len	FTS query string len in bytes
+@param[in,out]	result		result doc ids
+@return DB_SUCCESS if successful otherwise error code */
+dberr_t
+fts_query(
+	trx_t*		trx,
+	dict_index_t*	index,
+	uint		flags,
+	const byte*	query_str,
+	ulint		query_len,
+	fts_result_t**	result)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value. */
+float
+fts_retrieve_ranking(
+/*=================*/
+	fts_result_t*	result,	/*!< in: FTS result structure */
+	doc_id_t	doc_id);	/*!< in: doc_id of the document
+					of interest */
+
+/******************************************************************//**
+Sort the result returned by fts_query() on fts_ranking_t::rank. */
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+	fts_result_t*	result);	/*!< out: result instance
+					to sort.*/
+
+/******************************************************************//**
+Free a query result returned by fts_query(). */
+void
+fts_query_free_result(
+/*==================*/
+	fts_result_t*	result);	/*!< in: result instance
+					to free.*/
+
+/******************************************************************//**
+Extract the doc id from the FTS hidden column. */
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table */
+	dtuple_t*	row);	/*!< in: row whose FTS doc id we
+				want to extract.*/
+
+/** Extract the doc id from the record that belongs to index.
+@param[in]	rec	record containing FTS_DOC_ID
+@param[in]	index	index of rec
+@param[in]	offsets	rec_get_offsets(rec,index)
+@return doc id that was extracted from rec */
+doc_id_t
+fts_get_doc_id_from_rec(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	const rec_offs*		offsets);
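/* Editorial sketch: a typical call sequence for the query API declared
above.  The flags value and the query text are illustrative, and error
handling is reduced to the bare minimum. */
dberr_t example_query(trx_t* trx, dict_index_t* index)
{
	fts_result_t*	result = NULL;
	const byte*	q = reinterpret_cast<const byte*>("database");

	/* 0 = plain natural-language mode here; real callers OR in the
	query flag bits defined elsewhere in this header. */
	dberr_t	err = fts_query(trx, index, 0, q, 8, &result);

	if (err == DB_SUCCESS && result != NULL) {
		/* Rankings can be re-sorted by rank before reading. */
		fts_query_sort_result_on_rank(result);

		fts_query_free_result(result);
	}

	return(err);
}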
+/** Add new fts doc id to the update vector.
+@param[in]	table		the table that contains the FTS index.
+@param[in,out]	ufield		the fts doc id field in the update vector.
+				No new memory is allocated for this in this
+				function.
+@param[in,out]	next_doc_id	the fts doc id that has been added to the
+				update vector. If 0, a new fts doc id is
+				automatically generated. The memory provided
+				for this argument will be used by the update
+				vector. Ensure that the lifetime of this
+				memory matches that of the update vector.
+@return the fts doc id used in the update vector */
+doc_id_t
+fts_update_doc_id(
+	dict_table_t*	table,
+	upd_field_t*	ufield,
+	doc_id_t*	next_doc_id);
+
+/******************************************************************//**
+FTS initialize. */
+void
+fts_startup(void);
+/*==============*/
+
+/******************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+fts_t*
+fts_create(
+/*=======*/
+	dict_table_t*	table);		/*!< out: table with FTS
+					indexes */
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+dberr_t
+fts_optimize_table(
+/*===============*/
+	dict_table_t*	table);	/*!< in: table to optimize */
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. */
+void
+fts_optimize_init(void);
+/*====================*/
+
+/****************************************************************//**
+Drops index ancillary tables for an FTS index
+@return DB_SUCCESS or error code */
+dberr_t fts_drop_index_tables(trx_t *trx, const dict_index_t &index)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Add the table to the OPTIMIZER's list.
+@param[in]	table	table to add */
+void
+fts_optimize_add_table(
+	dict_table_t*	table);
+
+/******************************************************************//**
+Remove the table from the OPTIMIZER's list. We do wait for
+acknowledgement from the consumer of the message. */
+void
+fts_optimize_remove_table(
+/*======================*/
+	dict_table_t*	table);	/*!< in: table to remove */
+
+/** Shutdown fts optimize thread. */
+void
+fts_optimize_shutdown();
+
+/** Send sync fts cache for the table.
+@param[in]	table	table to sync */
+void
+fts_optimize_request_sync_table(
+	dict_table_t*	table);
+
+/**********************************************************************//**
+Take an FTS savepoint. */
+void
+fts_savepoint_take(
+/*===============*/
+	fts_trx_t*	fts_trx,	/*!< in: fts transaction */
+	const char*	name);		/*!< in: savepoint name */
+
+/**********************************************************************//**
+Refresh last statement savepoint. */
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+	trx_t*		trx);	/*!< in: transaction */
+
+/**********************************************************************//**
+Release the savepoint data identified by name. */
+void
+fts_savepoint_release(
+/*==================*/
+	trx_t*		trx,	/*!< in: transaction */
+	const char*	name);	/*!< in: savepoint name */
+
+/** Clear cache.
+@param[in,out]	cache	fts cache */
+void
+fts_cache_clear(
+	fts_cache_t*	cache);
+
+/*********************************************************************//**
+Initialize things in cache. */
+void
+fts_cache_init(
+/*===========*/
+	fts_cache_t*	cache);	/*!< in: cache */
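/* Editorial sketch: how the savepoint hooks above line up with SQL
statements.  It assumes trx->fts_trx was set up via fts_trx_create();
the savepoint name is illustrative, and fts_savepoint_rollback() is
declared just below. */
void example_savepoints(trx_t* trx)
{
	/* SAVEPOINT sp1; */
	fts_savepoint_take(trx->fts_trx, "sp1");

	/* ... FTS-indexed DML recorded via fts_trx_add_op() ... */

	/* ROLLBACK TO SAVEPOINT sp1; discards the FTS operations
	recorded since the savepoint was taken. */
	fts_savepoint_rollback(trx, "sp1");
}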
+/*********************************************************************//**
+Rollback to and including savepoint identified by name. */
+void
+fts_savepoint_rollback(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction */
+	const char*	name);	/*!< in: savepoint name */
+
+/*********************************************************************//**
+Rollback the FTS operations of the last statement. */
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+	trx_t*		trx);	/*!< in: transaction */
+
+/** Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@param[in,out]	table	fts table
+@param[in]	wait	whether to wait for existing sync to finish
+@return DB_SUCCESS on success, error code on failure. */
+dberr_t fts_sync_table(dict_table_t* table, bool wait = true);
+
+/****************************************************************//**
+Get the character set of an FTS index. */
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+	dict_index_t*	index);	/*!< in: FTS index */
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID */
+doc_id_t
+fts_init_doc_id(
+/*============*/
+	const dict_table_t*	table);	/*!< in: table */
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp(
+/*==================*/
+	const void*	cs,	/*!< in: Character set */
+	const void*	p1,	/*!< in: key */
+	const void*	p2);	/*!< in: node */
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+extern
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+	CHARSET_INFO*	cs,	/*!< in: Character set */
+	char*		src,	/*!< in: string to put in
+				lower case */
+	size_t		src_len,	/*!< in: input string length */
+	char*		dst,	/*!< in: buffer for result
+				string */
+	size_t		dst_len);	/*!< in: buffer size */
+
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+	const void*	cs,	/*!< in: Character set */
+	const void*	p1,	/*!< in: key */
+	const void*	p2);	/*!< in: node */
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+extern
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+	CHARSET_INFO*	charset,	/*!< in: Character set */
+	const byte*	start,		/*!< in: start of text */
+	const byte*	end,		/*!< in: one character past
+					end of text */
+	fts_string_t*	token);		/*!< out: token's text */
+
+/*************************************************************//**
+Get the token's character count under the given charset
+@return number of characters in the token */
+ulint
+fts_get_token_size(
+/*===============*/
+	const CHARSET_INFO*	cs,	/*!< in: Character set */
+	const char*		token,	/*!< in: token */
+	ulint			len);	/*!< in: token length */
+
+/*************************************************************//**
+FULLTEXT tokenizer internal in MYSQL_FTPARSER_SIMPLE_MODE
+@return 0 if tokenized successfully */
+int
+fts_tokenize_document_internal(
+/*===========================*/
+	MYSQL_FTPARSER_PARAM*	param,	/*!< in: parser parameter */
+	const char*		doc,	/*!< in: document to tokenize */
+	int			len);	/*!< in: document length */
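/* Editorial sketch: driving innobase_mysql_fts_get_token() above over a
UTF-8 buffer.  The return value is taken to be the number of bytes
consumed, matching its use elsewhere in the FTS code; treat that as an
assumption of this sketch. */
void example_tokenize(CHARSET_INFO* cs, const byte* doc, ulint len)
{
	const byte*	start = doc;
	const byte*	end = doc + len;
	fts_string_t	token;

	while (start < end) {
		ulint	consumed = innobase_mysql_fts_get_token(
			cs, start, end, &token);

		if (consumed == 0) {
			break;	/* nothing more to scan */
		}

		if (token.f_len > 0) {
			/* token.f_str / token.f_len frame one word. */
		}

		start += consumed;
	}
}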
+/*********************************************************************//**
+Fetch COUNT(*) from the specified table.
+@return the number of rows in the table */
+ulint
+fts_get_rows_count(
+/*===============*/
+	fts_table_t*	fts_table);	/*!< in: fts table to read */
+
+/*************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+	dict_table_t*	table);	/*!< in: user table */
+
+/** Check whether a stopword table is in the right format.
+@param stopword_table_name	table name
+@param row_end			name of the system-versioning end column,
+				or "value"
+@return the stopword column charset
+@retval NULL if the table does not exist or qualify */
+CHARSET_INFO *fts_valid_stopword_table(const char *stopword_table_name,
+				       const char **row_end= NULL);
+
+/****************************************************************//**
+This function loads the specified stopwords into the FTS cache
+@return true if success */
+bool
+fts_load_stopword(
+/*==============*/
+	const dict_table_t*
+			table,			/*!< in: Table with FTS */
+	trx_t*		trx,			/*!< in: Transaction */
+	const char*	session_stopword_table,	/*!< in: Session stopword table
+						name */
+	bool		stopword_is_on,		/*!< in: Whether stopword
+						option is turned on/off */
+	bool		reload);		/*!< in: Whether it is during
+						reload of FTS table */
+
+/****************************************************************//**
+Read the rows from the FTS index
+@return DB_SUCCESS if OK */
+dberr_t
+fts_table_fetch_doc_ids(
+/*====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: aux table */
+	fts_doc_ids_t*	doc_ids);	/*!< in: For collecting
+					doc ids */
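/* Editorial sketch: loading a session-level stopword table through the
API above, roughly what happens when innodb_ft_user_stopword_table is
set.  The table name is an assumption for illustration. */
void example_load_stopwords(dict_table_t* table, trx_t* trx)
{
	if (!fts_load_stopword(table, trx, "test/my_stopwords",
			       /* stopword_is_on */ true,
			       /* reload */ false)) {
		/* fall back to defaults / report the error here */
	}
}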
+/****************************************************************//**
+This function brings the FTS index in sync when the FTS index is first
+used. There are documents that have not yet been synced to the auxiliary
+tables after the last abnormal server shutdown; we will need to bring
+such documents into the FTS cache before any further operations */
+void
+fts_init_index(
+/*===========*/
+	dict_table_t*	table,		/*!< in: Table with FTS */
+	bool		has_cache_lock);	/*!< in: Whether we already
+						have cache lock */
+/*******************************************************************//**
+Add a newly created index to the FTS cache */
+void
+fts_add_index(
+/*==========*/
+	dict_index_t*	index,	/*!< FTS index to be added */
+	dict_table_t*	table);	/*!< table */
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number */
+dberr_t
+fts_drop_index(
+/*===========*/
+	dict_table_t*	table,	/*!< in: Table where indexes are dropped */
+	dict_index_t*	index,	/*!< in: Index to be dropped */
+	trx_t*		trx);	/*!< in: Transaction for the drop */
+
+/****************************************************************//**
+Rename auxiliary tables for all fts index for a table
+@return DB_SUCCESS or error code */
+dberr_t
+fts_rename_aux_tables(
+/*==================*/
+	dict_table_t*	table,		/*!< in: user Table */
+	const char*	new_name,	/*!< in: new table name */
+	trx_t*		trx);		/*!< in: transaction */
+
+/*******************************************************************//**
+Check that the indexes in fts->indexes are also present in the index
+cache and the table->indexes list
+@return TRUE if all indexes match */
+ibool
+fts_check_cached_index(
+/*===================*/
+	dict_table_t*	table);	/*!< in: Table where indexes are dropped */
+
+/** Fetch the document from tuple, tokenize the text data and
+insert the text data into the fts auxiliary table and
+its cache. Moreover, the tuple fields don't contain any information
+about externally stored fields. This tuple contains data directly
+converted from MySQL.
+@param[in]	ftt	FTS transaction table
+@param[in]	doc_id	doc id
+@param[in]	tuple	tuple from where data can be retrieved
+			and tuple should be arranged in table
+			schema order. */
+void
+fts_add_doc_from_tuple(
+	fts_trx_table_t*ftt,
+	doc_id_t	doc_id,
+	const dtuple_t*	tuple);
+
+/** Create an FTS trx.
+@param[in,out]	trx	InnoDB Transaction
+@return FTS transaction. */
+fts_trx_t*
+fts_trx_create(
+	trx_t*	trx);
+
+/** Clear all fts resources when there is no internal DOC_ID
+and there are no new fts indexes to add.
+@param[in,out]	table	table where fts is to be freed */
+void fts_clear_all(dict_table_t *table);
+
+/** Check whether the given name is an fts auxiliary table
+and fetch the parent table id and index id
+@param[in]	name		table name
+@param[in,out]	table_id	parent table id
+@param[in,out]	index_id	index id
+@return true if it is an auxiliary table */
+bool fts_check_aux_table(const char *name,
+			 table_id_t *table_id,
+			 index_id_t *index_id);
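/* Editorial sketch: decoding an auxiliary table name back into its
parent table id and index id via fts_check_aux_table() above.  The name
below is fabricated for illustration; real names encode both ids in
hexadecimal. */
void example_decode_aux_name()
{
	table_id_t	table_id;
	index_id_t	index_id;

	if (fts_check_aux_table(
		    "test/FTS_0000000000000123_00000000000000a7_INDEX_1",
		    &table_id, &index_id)) {
		/* table_id and index_id now identify the parent objects. */
	}
}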
+/** Update the last document id. This function could create a new
+transaction to update the last document id.
+@param table	table to be updated
+@param doc_id	last document id
+@param trx	update trx or null
+@retval DB_SUCCESS if OK */
+dberr_t
+fts_update_sync_doc_id(const dict_table_t *table,
+		       doc_id_t doc_id,
+		       trx_t *trx)
+	MY_ATTRIBUTE((nonnull(1)));
+
+/** Sync the table during commit phase
+@param[in]	table	table to be synced */
+void fts_sync_during_ddl(dict_table_t* table);
diff --git a/storage/innobase/include/fts0opt.h b/storage/innobase/include/fts0opt.h
new file mode 100644
index 00000000..c527ad8e
--- /dev/null
+++ b/storage/innobase/include/fts0opt.h
@@ -0,0 +1,39 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0opt.h
+Full Text Search optimize thread
+
+Created 2011-02-15 Jimmy Yang
+***********************************************************************/
+#ifndef INNODB_FTS0OPT_H
+#define INNODB_FTS0OPT_H
+
+/** The FTS optimize thread's work queue. */
+extern ib_wqueue_t* fts_optimize_wq;
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record. */
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+					/* out: always returns non-NULL */
+	void*		row,		/* in: sel_node_t* */
+	void*		user_arg);	/* in: pointer to ib_vector_t */
+#endif
diff --git a/storage/innobase/include/fts0pars.h b/storage/innobase/include/fts0pars.h
new file mode 100644
index 00000000..8108e811
--- /dev/null
+++ b/storage/innobase/include/fts0pars.h
@@ -0,0 +1,72 @@
+/* A Bison parser, made by GNU Bison 2.5. */
+
+/* Bison interface for Yacc-like parsers in C
+
+   Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.
Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison. */
+
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them. */
+   enum yytokentype {
+     FTS_OPER = 258,
+     FTS_TEXT = 259,
+     FTS_TERM = 260,
+     FTS_NUMB = 261
+   };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 2068 of yacc.c */
+#line 61 "fts0pars.y"
+
+	int	oper;
+	fts_ast_string_t*	token;
+	fts_ast_node_t*	node;
+
+
+
+/* Line 2068 of yacc.c */
+#line 64 "fts0pars.hh"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+
+
diff --git a/storage/innobase/include/fts0plugin.h b/storage/innobase/include/fts0plugin.h
new file mode 100644
index 00000000..18ec2d6d
--- /dev/null
+++ b/storage/innobase/include/fts0plugin.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0plugin.h
+Full text search plugin header file
+
+Created 2013/06/04 Shaohua Wang
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PLUGIN_H
+#define INNOBASE_FTS0PLUGIN_H
+
+#include "univ.i"
+
+extern struct st_mysql_ftparser fts_default_parser;
+
+struct fts_ast_state_t;
+
+#define PARSER_INIT(parser, arg) if (parser->init) { parser->init(arg); }
+#define PARSER_DEINIT(parser, arg) if (parser->deinit) { parser->deinit(arg); }
+
+/******************************************************************//**
+Parse an FTS query with a plugin parser.
+@return 0 if parsed successfully, non-zero otherwise.
*/
+int
+fts_parse_by_parser(
+/*================*/
+	ibool		mode,		/*!< in: query boolean mode */
+	uchar*		query,		/*!< in: query string */
+	ulint		len,		/*!< in: query string length */
+	st_mysql_ftparser*	parse,	/*!< in: fts plugin parser */
+	fts_ast_state_t*	state);	/*!< in: query parser state */
+
+#endif /* INNOBASE_FTS0PLUGIN_H */
diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h
new file mode 100644
index 00000000..ae0bb036
--- /dev/null
+++ b/storage/innobase/include/fts0priv.h
@@ -0,0 +1,485 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.h
+Full text search internal header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PRIV_H
+#define INNOBASE_FTS0PRIV_H
+
+#include "dict0dict.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "que0types.h"
+#include "fts0types.h"
+
+/* The various states of the FTS sub system pertaining to a table with
+FTS indexes defined on it. */
+enum fts_table_state_enum {
+					/* !<This must be 0 since we insert
+					a hard coded '0' at create time
+					to the config table */
+
+	FTS_TABLE_STATE_RUNNING = 0,	/*!< Auxiliary tables created OK */
+
+	FTS_TABLE_STATE_OPTIMIZING,	/*!< This is a substate of RUNNING */
+
+	FTS_TABLE_STATE_DELETED		/*!< All aux tables to be dropped when
+					it's safe to do so */
+};
+
+typedef enum fts_table_state_enum fts_table_state_t;
+
+/** The minimum length of an FTS auxiliary table name's id component,
+e.g., for an auxiliary table name
+
+	"FTS_<TABLE_ID>_SUFFIX"
+
+This constant is for the minimum length required to store the <TABLE_ID>
+component.
+*/
+#define FTS_AUX_MIN_TABLE_ID_LENGTH	48
+
+/** Maximum length of an integer stored in the config table value column. */
+#define FTS_MAX_INT_LEN			32
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id.
+@return query graph */
+que_t*
+fts_parse_sql(
+/*==========*/
+	fts_table_t*	fts_table,	/*!< in: FTS aux table */
+	pars_info_t*	info,		/*!< in: info struct, or NULL */
+	const char*	sql)		/*!< in: SQL string to evaluate */
+	MY_ATTRIBUTE((nonnull(3), malloc, warn_unused_result));
+/******************************************************************//**
+Evaluate a parsed SQL statement
+@return DB_SUCCESS or error code */
+dberr_t
+fts_eval_sql(
+/*=========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t*		graph)		/*!< in: Parsed statement */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
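/* Editorial sketch: preparing and running one internal statement with
the two functions above.  The SQL text, the bound parameter, and the
config key are illustrative; per the comment above, "%s" in the string
is replaced with the auxiliary table's name. */
dberr_t example_run_internal_sql(trx_t* trx, fts_table_t* fts_table)
{
	pars_info_t*	info = pars_info_create();

	pars_info_add_str_literal(info, "name", "use_stopword");

	que_t*	graph = fts_parse_sql(
		fts_table, info,
		"BEGIN DELETE FROM \"%s\" WHERE key = :name;");

	dberr_t	err = fts_eval_sql(trx, graph);

	que_graph_free(graph);

	return(err);
}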
+/** Construct the name of an internal FTS table for the given table.
+@param[in]	fts_table	metadata on fulltext-indexed table
+@param[out]	table_name	a name up to MAX_FULL_NAME_LEN
+@param[in]	dict_locked	whether dict_sys.latch is being held */
+void fts_get_table_name(const fts_table_t* fts_table, char* table_name,
+			bool dict_locked = false)
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated column specification string */
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+	dict_index_t*	index,	/*!< in: FTS index */
+	pars_info_t*	info,	/*!< in/out: parser info */
+	mem_heap_t*	heap)	/*!< in: memory heap */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Values for the "option" parameter of fts_doc_fetch_by_doc_id(),
+selecting whether we want the Doc whose ID is equal to, greater than,
+or smaller than the supplied ID */
+#define	FTS_FETCH_DOC_BY_ID_EQUAL	1
+#define	FTS_FETCH_DOC_BY_ID_LARGE	2
+#define	FTS_FETCH_DOC_BY_ID_SMALL	3
+
+/*************************************************************//**
+Fetch document (= a single row's indexed text) with the given
+document id.
+@return DB_SUCCESS if fetch is successful, else error */
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+	fts_get_doc_t*	get_doc,	/*!< in: state */
+	doc_id_t	doc_id,		/*!< in: id of document to fetch */
+	dict_index_t*	index_to_use,	/*!< in: caller supplied FTS index,
+					or NULL */
+	ulint		option,		/*!< in: search option, if it is
+					greater than doc_id or equal */
+	fts_sql_callback
+			callback,	/*!< in: callback to read
+					records */
+	void*		arg)		/*!< in: callback arg */
+	MY_ATTRIBUTE((nonnull(6)));
+
+/*******************************************************************//**
+Callback function for fetch that stores the text of an FTS document,
+converting each column to UTF-16.
+@return always FALSE */
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+	void*	row,		/*!< in: sel_node_t* */
+	void*	user_arg)	/*!< in: fts_doc_t* */
+	MY_ATTRIBUTE((nonnull));
+/********************************************************************
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+dberr_t
+fts_write_node(
+/*===========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: query graph */
+	fts_table_t*	fts_table,	/*!< in: the FTS aux index */
+	fts_string_t*	word,		/*!< in: word in UTF-8 */
+	fts_node_t*	node)		/*!< in: node columns */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if an FTS token is a stopword or less than fts_min_token_size
+or greater than fts_max_token_size.
+@param[in]	token		token string
+@param[in]	stopwords	stopwords rb tree
+@param[in]	cs		token charset
+@retval	true	if it is not stopword and length in range
+@retval	false	if it is stopword or length not in range */
+bool
+fts_check_token(
+	const fts_string_t*	token,
+	const ib_rbt_t*		stopwords,
+	const CHARSET_INFO*	cs);
+
+/******************************************************************//**
+Initialize a document.
*/
+void
+fts_doc_init(
+/*=========*/
+	fts_doc_t*	doc)	/*!< in: doc to initialize */
+	MY_ATTRIBUTE((nonnull));
+
+/******************************************************************//**
+Do a binary search for a doc id in the array
+@return +ve index if found, -ve index where it should be
+	inserted if not found */
+int
+fts_bsearch(
+/*========*/
+	doc_id_t*	array,	/*!< in: array to sort */
+	int		lower,	/*!< in: lower bound of array*/
+	int		upper,	/*!< in: upper bound of array*/
+	doc_id_t	doc_id)	/*!< in: doc id to lookup */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Free document. */
+void
+fts_doc_free(
+/*=========*/
+	fts_doc_t*	doc)	/*!< in: document */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Free a fts_word_t instance.*/
+void
+fts_word_free(
+/*==========*/
+	fts_word_t*	word)	/*!< in: instance to free.*/
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Read the rows from the FTS index
+@return DB_SUCCESS or error code */
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: prepared statement */
+	fts_table_t*	fts_table,	/*!< in: FTS aux table */
+	const fts_string_t*
+			word,		/*!< in: the word to fetch */
+	fts_fetch_t*	fetch)		/*!< in: fetch callback.*/
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Compare two fts_trx_table_t instances; we actually compare the
+table ids here.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+	const void*	v1,	/*!< in: id1 */
+	const void*	v2)	/*!< in: id2 */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Compare a table id with a trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+	const void*	p1,	/*!< in: id1 */
+	const void*	p2)	/*!< in: id2 */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#define fts_sql_commit(trx) trx_commit_for_mysql(trx)
+#define fts_sql_rollback(trx) (trx)->rollback()
+/******************************************************************//**
+Get value from config table. The caller must ensure that enough
+space is allocated for value to hold the column contents.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_value(
+/*=================*/
+	trx_t*		trx,		/* transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: get config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< out: value read from
+					config table */
+	MY_ATTRIBUTE((nonnull));
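/* Editorial sketch: reading one entry from the CONFIG auxiliary table
via fts_config_get_value() above.  The caller supplies the output
buffer, as the comment requires; the key name and buffer size are
illustrative. */
dberr_t example_read_config(trx_t* trx, fts_table_t* fts_table)
{
	byte		buf[1024];	/* illustrative buffer size */
	fts_string_t	value;

	value.f_str = buf;
	value.f_len = sizeof buf;

	return(fts_config_get_value(trx, fts_table, "synced_doc_id",
				    &value));
}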
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_index_value(
+/*=======================*/
+	trx_t*		trx,		/*!< transaction */
+	dict_index_t*	index,		/*!< in: index */
+	const char*	param,		/*!< in: get config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< out: value read from
+					config table */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_value(
+/*=================*/
+	trx_t*		trx,		/*!< transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: set config value for
+					this parameter name */
+	const fts_string_t*
+			value)		/*!< in: value to update */
+	MY_ATTRIBUTE((nonnull));
+/****************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_index_value(
+/*=======================*/
+	trx_t*		trx,		/*!< transaction */
+	dict_index_t*	index,		/*!< in: index */
+	const char*	param,		/*!< in: set config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< in: value to write to the
+					config table */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* FTS_OPTIMIZE_DEBUG */
+
+/******************************************************************//**
+Set an ulint value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Search cache for word.
+@return the word node vector if found else NULL */ +const ib_vector_t* +fts_cache_find_word( +/*================*/ + const fts_index_cache_t* + index_cache, /*!< in: cache to search */ + const fts_string_t* + text) /*!< in: word to search for */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/******************************************************************//** +Append deleted doc ids to vector and sort the vector. */ +void +fts_cache_append_deleted_doc_ids( +/*=============================*/ + fts_cache_t* cache, /*!< in: cache to use */ + ib_vector_t* vector); /*!< in: append to this vector */ +/******************************************************************//** +Search the index specific cache for a particular FTS index. +@return the index specific cache else NULL */ +fts_index_cache_t* +fts_find_index_cache( +/*================*/ + const fts_cache_t* + cache, /*!< in: cache to search */ + const dict_index_t* + index) /*!< in: index to search for */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Write the table id to the given buffer (including final NUL). Buffer must be +at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long. +@return number of bytes written */ +UNIV_INLINE +int +fts_write_object_id( +/*================*/ + ib_id_t id, /*!< in: a table/index id */ + char* str); /*!< in: buffer to write the id to */ +/******************************************************************//** +Read the table id from the string generated by fts_write_object_id(). +@return TRUE if parse successful */ +UNIV_INLINE +ibool +fts_read_object_id( +/*===============*/ + ib_id_t* id, /*!< out: a table id */ + const char* str) /*!< in: buffer to read from */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Get the table id. +@return number of bytes written */ +int +fts_get_table_id( +/*=============*/ + const fts_table_t* + fts_table, /*!< in: FTS Auxiliary table */ + char* table_id) /*!< out: table id, must be at least + FTS_AUX_MIN_TABLE_ID_LENGTH bytes + long */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Add node positions. */ +void +fts_cache_node_add_positions( +/*=========================*/ + fts_cache_t* cache, /*!< in: cache */ + fts_node_t* node, /*!< in: word node */ + doc_id_t doc_id, /*!< in: doc id */ + ib_vector_t* positions) /*!< in: fts_token_t::positions */ + MY_ATTRIBUTE((nonnull(2,4))); + +/******************************************************************//** +Create the config table name for retrieving index specific value. +@return index config parameter name */ +char* +fts_config_create_index_param_name( +/*===============================*/ + const char* param, /*!< in: base name of param */ + const dict_index_t* index) /*!< in: index for config */ + MY_ATTRIBUTE((nonnull, malloc, warn_unused_result)); + +#include "fts0priv.inl" + +#endif /* INNOBASE_FTS0PRIV_H */ diff --git a/storage/innobase/include/fts0priv.inl b/storage/innobase/include/fts0priv.inl new file mode 100644 index 00000000..da14cfcb --- /dev/null +++ b/storage/innobase/include/fts0priv.inl @@ -0,0 +1,121 @@ +/***************************************************************************** + +Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.inl
+Full text search internal header file
+
+Created 2011/11/12 Sunny Bains
+***********************************************************************/
+
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+	ib_id_t		id,	/* in: a table/index id */
+	char*		str)	/* in: buffer to write the id to */
+{
+
+#ifdef _WIN32
+
+	DBUG_EXECUTE_IF("innodb_test_wrong_non_windows_fts_aux_table_name",
+			return(sprintf(str, UINT64PFx, id)););
+
+	/* Use this to construct old(5.6.14 and 5.7.3) windows
+	ambiguous aux table names */
+	DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+			return(sprintf(str, "%016llu", (ulonglong) id)););
+
+#else /* _WIN32 */
+
+	/* Use this to construct old(5.6.14 and 5.7.3) windows
+	ambiguous aux table names */
+	DBUG_EXECUTE_IF("innodb_test_wrong_windows_fts_aux_table_name",
+			return(sprintf(str, "%016llu", (ulonglong) id)););
+
+	DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+			return(sprintf(str, "%016llx", (ulonglong) id)););
+
+#endif /* _WIN32 */
+
+	return(sprintf(str, "%016llx", (ulonglong) id));
+}
+
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+	ib_id_t*	id,	/* out: an id */
+	const char*	str)	/* in: buffer to read from */
+{
+	/* NOTE: this func doesn't care about whether current table
+	is set with HEX_NAME, the user of the id read here will check
+	if the id is HEX or DEC and do the right thing with it. */
+	return(sscanf(str, UINT64PFx, id) == 1);
+}
+
+/******************************************************************//**
+Compare two fts_trx_table_t instances.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+	const void*	p1,	/*!< in: id1 */
+	const void*	p2)	/*!< in: id2 */
+{
+	const dict_table_t*	table1
+		= (*static_cast<const fts_trx_table_t* const*>(p1))->table;
+
+	const dict_table_t*	table2
+		= (*static_cast<const fts_trx_table_t* const*>(p2))->table;
+
+	return((table1->id > table2->id)
+	       ? 1
+	       : (table1->id == table2->id)
+	       ? 0
+	       : -1);
+}
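/* Editorial sketch: round-tripping an id through the two helpers above.
The id value is arbitrary; per the contract above, the buffer must be at
least FTS_AUX_MIN_TABLE_ID_LENGTH bytes. */
void example_object_id_roundtrip()
{
	char	buf[FTS_AUX_MIN_TABLE_ID_LENGTH];
	ib_id_t	id = 0x1234;
	ib_id_t	parsed;

	int	len = fts_write_object_id(id, buf);	/* hex, NUL-terminated */

	if (len > 0 && fts_read_object_id(&parsed, buf)) {
		ut_ad(parsed == id);	/* parses back to the same id */
	}
}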
+/******************************************************************//**
+Compare a table id with a fts_trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+	const void*	p1,	/*!< in: id1 */
+	const void*	p2)	/*!< in: id2 */
+{
+	const uintmax_t*	table_id = static_cast<const uintmax_t*>(p1);
+	const dict_table_t*	table2
+		= (*static_cast<const fts_trx_table_t* const*>(p2))->table;
+
+	return((*table_id > table2->id)
+	       ? 1
+	       : (*table_id == table2->id)
+	       ? 0
+	       : -1);
+}
diff --git a/storage/innobase/include/fts0tlex.h b/storage/innobase/include/fts0tlex.h
new file mode 100644
index 00000000..89655ca1
--- /dev/null
+++ b/storage/innobase/include/fts0tlex.h
@@ -0,0 +1,702 @@
+#ifndef fts0tHEADER_H
+#define fts0tHEADER_H 1
+#define fts0tIN_HEADER 1
+
+#line 6 "../include/fts0tlex.h"
+
+#line 8 "../include/fts0tlex.h"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 6
+#define YY_FLEX_SUBMINOR_VERSION 4
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+#ifdef yy_create_buffer
+#define fts0t_create_buffer_ALREADY_DEFINED
+#else
+#define yy_create_buffer fts0t_create_buffer
+#endif
+
+#ifdef yy_delete_buffer
+#define fts0t_delete_buffer_ALREADY_DEFINED
+#else
+#define yy_delete_buffer fts0t_delete_buffer
+#endif
+
+#ifdef yy_scan_buffer
+#define fts0t_scan_buffer_ALREADY_DEFINED
+#else
+#define yy_scan_buffer fts0t_scan_buffer
+#endif
+
+#ifdef yy_scan_string
+#define fts0t_scan_string_ALREADY_DEFINED
+#else
+#define yy_scan_string fts0t_scan_string
+#endif
+
+#ifdef yy_scan_bytes
+#define fts0t_scan_bytes_ALREADY_DEFINED
+#else
+#define yy_scan_bytes fts0t_scan_bytes
+#endif
+
+#ifdef yy_init_buffer
+#define fts0t_init_buffer_ALREADY_DEFINED
+#else
+#define yy_init_buffer fts0t_init_buffer
+#endif
+
+#ifdef yy_flush_buffer
+#define fts0t_flush_buffer_ALREADY_DEFINED
+#else
+#define yy_flush_buffer fts0t_flush_buffer
+#endif
+
+#ifdef yy_load_buffer_state
+#define fts0t_load_buffer_state_ALREADY_DEFINED
+#else
+#define yy_load_buffer_state fts0t_load_buffer_state
+#endif
+
+#ifdef yy_switch_to_buffer
+#define fts0t_switch_to_buffer_ALREADY_DEFINED
+#else
+#define yy_switch_to_buffer fts0t_switch_to_buffer
+#endif
+
+#ifdef yypush_buffer_state
+#define fts0tpush_buffer_state_ALREADY_DEFINED
+#else
+#define yypush_buffer_state fts0tpush_buffer_state
+#endif
+
+#ifdef yypop_buffer_state
+#define fts0tpop_buffer_state_ALREADY_DEFINED
+#else
+#define yypop_buffer_state fts0tpop_buffer_state
+#endif
+
+#ifdef yyensure_buffer_stack
+#define fts0tensure_buffer_stack_ALREADY_DEFINED
+#else
+#define yyensure_buffer_stack fts0tensure_buffer_stack
+#endif
+
+#ifdef yylex
+#define fts0tlex_ALREADY_DEFINED
+#else
+#define yylex fts0tlex
+#endif
+
+#ifdef yyrestart
+#define fts0trestart_ALREADY_DEFINED
+#else
+#define yyrestart fts0trestart
+#endif
+
+#ifdef yylex_init
+#define fts0tlex_init_ALREADY_DEFINED
+#else
+#define yylex_init fts0tlex_init
+#endif
+
+#ifdef yylex_init_extra
+#define fts0tlex_init_extra_ALREADY_DEFINED
+#else
+#define yylex_init_extra fts0tlex_init_extra
+#endif
+
+#ifdef yylex_destroy
+#define fts0tlex_destroy_ALREADY_DEFINED
+#else
+#define yylex_destroy fts0tlex_destroy
+#endif
+
+#ifdef yyget_debug
+#define fts0tget_debug_ALREADY_DEFINED
+#else
+#define yyget_debug fts0tget_debug
+#endif
+
+#ifdef yyset_debug
+#define fts0tset_debug_ALREADY_DEFINED
+#else
+#define yyset_debug fts0tset_debug
+#endif
+
+#ifdef yyget_extra
+#define fts0tget_extra_ALREADY_DEFINED
+#else
+#define yyget_extra fts0tget_extra
+#endif
+
+#ifdef yyset_extra
+#define fts0tset_extra_ALREADY_DEFINED
+#else
+#define yyset_extra fts0tset_extra
+#endif
+
+#ifdef yyget_in
+#define fts0tget_in_ALREADY_DEFINED
+#else
+#define yyget_in fts0tget_in
+#endif
+
+#ifdef yyset_in
+#define fts0tset_in_ALREADY_DEFINED
+#else
+#define yyset_in fts0tset_in
+#endif
+
+#ifdef yyget_out
+#define fts0tget_out_ALREADY_DEFINED
+#else
+#define yyget_out fts0tget_out
+#endif
+
+#ifdef yyset_out
+#define fts0tset_out_ALREADY_DEFINED
+#else
+#define yyset_out fts0tset_out
+#endif
+
+#ifdef yyget_leng
+#define fts0tget_leng_ALREADY_DEFINED
+#else
+#define yyget_leng fts0tget_leng
+#endif
+
+#ifdef yyget_text
+#define fts0tget_text_ALREADY_DEFINED
+#else
+#define yyget_text fts0tget_text
+#endif
+
+#ifdef yyget_lineno
+#define fts0tget_lineno_ALREADY_DEFINED
+#else
+#define yyget_lineno fts0tget_lineno
+#endif
+
+#ifdef yyset_lineno
+#define fts0tset_lineno_ALREADY_DEFINED
+#else
+#define yyset_lineno fts0tset_lineno
+#endif
+
+#ifdef yyget_column
+#define fts0tget_column_ALREADY_DEFINED
+#else
+#define yyget_column fts0tget_column
+#endif
+
+#ifdef yyset_column
+#define fts0tset_column_ALREADY_DEFINED
+#else
+#define yyset_column fts0tset_column
+#endif
+
+#ifdef yywrap
+#define fts0twrap_ALREADY_DEFINED
+#else
+#define yywrap fts0twrap
+#endif
+
+#ifdef yyalloc
+#define fts0talloc_ALREADY_DEFINED
+#else
+#define yyalloc fts0talloc
+#endif
+
+#ifdef yyrealloc
+#define fts0trealloc_ALREADY_DEFINED
+#else
+#define yyrealloc fts0trealloc
+#endif
+
+#ifdef yyfree
+#define fts0tfree_ALREADY_DEFINED
+#else
+#define yyfree fts0tfree
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX (~(size_t)0)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+/* begin standard C++ headers. */
+
+/* TODO: this is always defined, so inline it */
+#define yyconst const
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define yynoreturn __attribute__((__noreturn__))
+#else
+#define yynoreturn
+#endif
+
+/* An opaque pointer.
*/ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +void yyrestart ( FILE *input_file , yyscan_t yyscanner ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner ); +void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +void yypop_buffer_state ( yyscan_t yyscanner ); + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner ); + +void *yyalloc ( yy_size_t , yyscan_t yyscanner ); +void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner ); +void yyfree ( void * , yyscan_t yyscanner ); + +/* Begin user sect3 */ + +#define fts0twrap(yyscanner) (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP + +#define yytext_ptr yytext_r + +#ifdef YY_HEADER_EXPORT_START_CONDITIONS +#define INITIAL 0 + +#endif + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. 
We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int yylex_init (yyscan_t* scanner);
+
+int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int yylex_destroy ( yyscan_t yyscanner );
+
+int yyget_debug ( yyscan_t yyscanner );
+
+void yyset_debug ( int debug_flag , yyscan_t yyscanner );
+
+YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
+
+void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner );
+
+FILE *yyget_in ( yyscan_t yyscanner );
+
+void yyset_in ( FILE * _in_str , yyscan_t yyscanner );
+
+FILE *yyget_out ( yyscan_t yyscanner );
+
+void yyset_out ( FILE * _out_str , yyscan_t yyscanner );
+
+			int yyget_leng ( yyscan_t yyscanner );
+
+char *yyget_text ( yyscan_t yyscanner );
+
+int yyget_lineno ( yyscan_t yyscanner );
+
+void yyset_lineno ( int _line_number , yyscan_t yyscanner );
+
+int yyget_column ( yyscan_t yyscanner );
+
+void yyset_column ( int _column_no , yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap ( yyscan_t yyscanner );
+#else
+extern int yywrap ( yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen ( const char * , yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (yyscan_t yyscanner); + +#define YY_DECL int yylex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + +#undef YY_NEW_FILE +#undef YY_FLUSH_BUFFER +#undef yy_set_bol +#undef yy_new_buffer +#undef yy_set_interactive +#undef YY_DO_BEFORE_ACTION + +#ifdef YY_DECL_IS_OURS +#undef YY_DECL_IS_OURS +#undef YY_DECL +#endif + +#ifndef fts0t_create_buffer_ALREADY_DEFINED +#undef yy_create_buffer +#endif +#ifndef fts0t_delete_buffer_ALREADY_DEFINED +#undef yy_delete_buffer +#endif +#ifndef fts0t_scan_buffer_ALREADY_DEFINED +#undef yy_scan_buffer +#endif +#ifndef fts0t_scan_string_ALREADY_DEFINED +#undef yy_scan_string +#endif +#ifndef fts0t_scan_bytes_ALREADY_DEFINED +#undef yy_scan_bytes +#endif +#ifndef fts0t_init_buffer_ALREADY_DEFINED +#undef yy_init_buffer +#endif +#ifndef fts0t_flush_buffer_ALREADY_DEFINED +#undef yy_flush_buffer +#endif +#ifndef fts0t_load_buffer_state_ALREADY_DEFINED +#undef yy_load_buffer_state +#endif +#ifndef fts0t_switch_to_buffer_ALREADY_DEFINED +#undef yy_switch_to_buffer +#endif +#ifndef fts0tpush_buffer_state_ALREADY_DEFINED +#undef yypush_buffer_state +#endif +#ifndef fts0tpop_buffer_state_ALREADY_DEFINED +#undef yypop_buffer_state +#endif +#ifndef fts0tensure_buffer_stack_ALREADY_DEFINED +#undef yyensure_buffer_stack +#endif +#ifndef fts0tlex_ALREADY_DEFINED +#undef yylex +#endif +#ifndef fts0trestart_ALREADY_DEFINED +#undef yyrestart +#endif +#ifndef fts0tlex_init_ALREADY_DEFINED +#undef yylex_init +#endif +#ifndef fts0tlex_init_extra_ALREADY_DEFINED +#undef yylex_init_extra +#endif +#ifndef fts0tlex_destroy_ALREADY_DEFINED +#undef yylex_destroy +#endif +#ifndef fts0tget_debug_ALREADY_DEFINED +#undef yyget_debug +#endif +#ifndef fts0tset_debug_ALREADY_DEFINED +#undef yyset_debug +#endif +#ifndef fts0tget_extra_ALREADY_DEFINED +#undef yyget_extra +#endif +#ifndef fts0tset_extra_ALREADY_DEFINED +#undef yyset_extra +#endif +#ifndef fts0tget_in_ALREADY_DEFINED +#undef yyget_in +#endif +#ifndef fts0tset_in_ALREADY_DEFINED +#undef yyset_in +#endif +#ifndef fts0tget_out_ALREADY_DEFINED +#undef yyget_out +#endif +#ifndef fts0tset_out_ALREADY_DEFINED +#undef yyset_out +#endif +#ifndef fts0tget_leng_ALREADY_DEFINED +#undef yyget_leng +#endif +#ifndef fts0tget_text_ALREADY_DEFINED +#undef yyget_text +#endif +#ifndef fts0tget_lineno_ALREADY_DEFINED +#undef yyget_lineno +#endif +#ifndef fts0tset_lineno_ALREADY_DEFINED +#undef yyset_lineno +#endif +#ifndef fts0tget_column_ALREADY_DEFINED +#undef yyget_column +#endif +#ifndef fts0tset_column_ALREADY_DEFINED +#undef yyset_column +#endif +#ifndef fts0twrap_ALREADY_DEFINED +#undef yywrap +#endif +#ifndef fts0tget_lval_ALREADY_DEFINED +#undef yyget_lval +#endif +#ifndef fts0tset_lval_ALREADY_DEFINED +#undef yyset_lval +#endif +#ifndef fts0tget_lloc_ALREADY_DEFINED +#undef yyget_lloc +#endif +#ifndef fts0tset_lloc_ALREADY_DEFINED +#undef yyset_lloc +#endif +#ifndef fts0talloc_ALREADY_DEFINED +#undef yyalloc +#endif +#ifndef fts0trealloc_ALREADY_DEFINED +#undef yyrealloc +#endif +#ifndef fts0tfree_ALREADY_DEFINED +#undef yyfree +#endif +#ifndef fts0ttext_ALREADY_DEFINED +#undef yytext +#endif +#ifndef fts0tleng_ALREADY_DEFINED +#undef yyleng +#endif +#ifndef fts0tin_ALREADY_DEFINED +#undef yyin +#endif +#ifndef fts0tout_ALREADY_DEFINED +#undef yyout +#endif +#ifndef fts0t_flex_debug_ALREADY_DEFINED +#undef yy_flex_debug +#endif +#ifndef fts0tlineno_ALREADY_DEFINED +#undef yylineno +#endif 
+#ifndef fts0ttables_fload_ALREADY_DEFINED
+#undef yytables_fload
+#endif
+#ifndef fts0ttables_destroy_ALREADY_DEFINED
+#undef yytables_destroy
+#endif
+#ifndef fts0tTABLES_NAME_ALREADY_DEFINED
+#undef yyTABLES_NAME
+#endif
+
+#line 69 "fts0tlex.l"
+
+
+#line 701 "../include/fts0tlex.h"
+#undef fts0tIN_HEADER
+#endif /* fts0tHEADER_H */
diff --git a/storage/innobase/include/fts0tokenize.h b/storage/innobase/include/fts0tokenize.h
new file mode 100644
index 00000000..1cddaf5b
--- /dev/null
+++ b/storage/innobase/include/fts0tokenize.h
@@ -0,0 +1,189 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0tokenize.h
+Full Text Search plugin tokenizer, based on the MyISAM tokenizer
+
+Created 2014/11/17 Shaohua Wang
+***********************************************************************/
+
+#include "ft_global.h"
+#include "mysql/plugin_ftparser.h"
+#include "m_ctype.h"
+
+/* Macros and structs below are from ftdefs.h in MyISAM */
+/** Check whether a char is a true word character */
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+/** Check whether a char is a misc word character */
+#define misc_word_char(X)	0
+
+/** Boolean search syntax */
+static const char*	fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
+
+#define FTB_YES		(fts_boolean_syntax[0])
+#define FTB_EGAL	(fts_boolean_syntax[1])
+#define FTB_NO		(fts_boolean_syntax[2])
+#define FTB_INC		(fts_boolean_syntax[3])
+#define FTB_DEC		(fts_boolean_syntax[4])
+#define FTB_LBR		(fts_boolean_syntax[5])
+#define FTB_RBR		(fts_boolean_syntax[6])
+#define FTB_NEG		(fts_boolean_syntax[7])
+#define FTB_TRUNC	(fts_boolean_syntax[8])
+#define FTB_LQUOT	(fts_boolean_syntax[10])
+#define FTB_RQUOT	(fts_boolean_syntax[11])
+
+/** FTS query token */
+typedef struct st_ft_word {
+	uchar*	pos;	/*!< word start pointer */
+	uint	len;	/*!< word len */
+	double	weight;	/*!< word weight, unused in innodb */
+} FT_WORD;
+
+/** Tokenizer for ngram, referring to ft_get_word(ft_parser.c) in MyISAM.
+Differences: a. code format changed; b. stopword processing removed.
+@param[in]	cs	charset
+@param[in,out]	start	doc start pointer
+@param[in,out]	end	doc end pointer
+@param[in,out]	word	token
+@param[in,out]	info	token info
+@retval	0	eof
+@retval	1	word found
+@retval	2	left bracket
+@retval	3	right bracket
+@retval	4	stopword found */
+inline
+uchar
+fts_get_word(
+	const CHARSET_INFO*	cs,
+	uchar**			start,
+	uchar*			end,
+	FT_WORD*		word,
+	MYSQL_FTPARSER_BOOLEAN_INFO*
+				info)
+{
+	uchar*	doc = *start;
+	int	ctype;
+	uint	mwc;
+	uint	length;
+	int	mbl;
+
+	info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
+	info->weight_adjust = info->wasign = 0;
+	info->type = FT_TOKEN_EOF;
+
+	while (doc < end) {
+		for (; doc < end;
+		     doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+			mbl = cs->ctype(&ctype, doc, end);
+
+			if (true_word_char(ctype, *doc)) {
+				break;
+			}
+
+			if (*doc == FTB_RQUOT && info->quot) {
+				*start = doc + 1;
+				info->type = FT_TOKEN_RIGHT_PAREN;
+
+				return(info->type);
+			}
+
+			if (!info->quot) {
+				if (*doc == FTB_LBR
+				    || *doc == FTB_RBR
+				    || *doc == FTB_LQUOT) {
+					/* param->prev=' '; */
+					*start = doc + 1;
+					if (*doc == FTB_LQUOT) {
+						info->quot = (char*)1;
+					}
+
+					info->type = (*doc == FTB_RBR ?
+						      FT_TOKEN_RIGHT_PAREN :
+						      FT_TOKEN_LEFT_PAREN);
+
+					return(info->type);
+				}
+
+				if (info->prev == ' ') {
+					if (*doc == FTB_YES) {
+						info->yesno = +1;
+						continue;
+					} else if (*doc == FTB_EGAL) {
+						info->yesno = 0;
+						continue;
+					} else if (*doc == FTB_NO) {
+						info->yesno = -1;
+						continue;
+					} else if (*doc == FTB_INC) {
+						info->weight_adjust++;
+						continue;
+					} else if (*doc == FTB_DEC) {
+						info->weight_adjust--;
+						continue;
+					} else if (*doc == FTB_NEG) {
+						info->wasign = !info->wasign;
+						continue;
+					}
+				}
+			}
+
+			info->prev = char(*doc);
+			info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
+			info->weight_adjust = info->wasign = 0;
+		}
+
+		mwc = length = 0;
+		for (word->pos = doc;
+		     doc < end;
+		     length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+			mbl = cs->ctype(&ctype, doc, end);
+
+			if (true_word_char(ctype, *doc)) {
+				mwc = 0;
+			} else if (!misc_word_char(*doc) || mwc) {
+				break;
+			} else {
+				mwc++;
+			}
+		}
+
+		/* Be sure *prev is true_word_char. */
+		info->prev = 'A';
+		word->len = (uint)(doc - word->pos) - mwc;
+
+		if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
+			doc++;
+		}
+
+		/* We don't check stopword here. */
+		*start = doc;
+		info->type = FT_TOKEN_WORD;
+
+		return(info->type);
+	}
+
+	if (info->quot) {
+		*start = doc;
+		info->type = FT_TOKEN_RIGHT_PAREN;
+	}
+
+	return(info->type);
+}
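A note on usage: fts_get_word() is a pull tokenizer, and each call consumes one token (or one run of boolean operators) from the document, reporting its kind through info->type. A minimal sketch of a driver loop follows; the charset handle and the zero-initialized parser info are assumptions for illustration, not code from this patch:

	/* Hypothetical driver loop for fts_get_word(); assumes a server
	charset handle `cs` and the plugin_ftparser types used above. */
	static void tokenize_boolean_query(const CHARSET_INFO* cs,
					   uchar* doc, size_t len)
	{
		MYSQL_FTPARSER_BOOLEAN_INFO	info = {};
		info.prev = ' ';	/* treat start-of-doc as whitespace */

		uchar*	start = doc;
		uchar*	end = doc + len;
		FT_WORD	word;

		/* FT_TOKEN_EOF is 0, so the loop stops once the doc is
		consumed; each iteration yields one word or bracket token. */
		while (fts_get_word(cs, &start, end, &word, &info)) {
			/* word.pos/word.len delimit the token text;
			info.yesno reflects a leading '+' or '-'. */
		}
	}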
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
new file mode 100644
index 00000000..fb278d54
--- /dev/null
+++ b/storage/innobase/include/fts0types.h
@@ -0,0 +1,354 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.h
+Full text search types file
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_H
+#define INNOBASE_FTS0TYPES_H
+
+#include "fts0fts.h"
+#include "pars0pars.h"
+#include "que0types.h"
+#include "ut0byte.h"
+#include "ut0rbt.h"
+
+/** Types used within FTS. */
+struct fts_que_t;
+struct fts_node_t;
+
+/** Callbacks used within FTS. */
+typedef pars_user_func_cb_t fts_sql_callback;
+typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
+
+/** Statistics relevant to a particular document, used during retrieval. */
+struct fts_doc_stats_t {
+	doc_id_t	doc_id;		/*!< Document id */
+	ulint		word_count;	/*!< Total words in the document */
+};
+
+/** Its main purpose is to store the SQL prepared statements that
+are required to retrieve a document from the database. */
+struct fts_get_doc_t {
+	fts_index_cache_t*
+			index_cache;	/*!< The index cache instance */
+
+	/*!< Parsed sql statement */
+	que_t*		get_document_graph;
+	fts_cache_t*	cache;		/*!< The parent cache */
+};
+
+/** Since we can have multiple FTS indexes on a table, we keep a
+per index cache of words etc. */
+struct fts_index_cache_t {
+	dict_index_t*	index;		/*!< The FTS index instance */
+
+	ib_rbt_t*	words;		/*!< Nodes; indexed by fts_string_t*,
+					cells are fts_tokenizer_word_t*.*/
+
+	ib_vector_t*	doc_stats;	/*!< Array of the fts_doc_stats_t
+					contained in the memory buffer.
+					Must be in sorted order (ascending).
+					The ideal choice is an rb tree but
+					the rb tree imposes a space overhead
+					that we can do without */
+
+	que_t**		ins_graph;	/*!< Insert query graphs */
+
+	que_t**		sel_graph;	/*!< Select query graphs */
+	CHARSET_INFO*	charset;	/*!< charset */
+};
+
+/** Stop word control information. */
+struct fts_stopword_t {
+	ulint		status;		/*!< Status of the stopword tree */
+	ib_alloc_t*	heap;		/*!< The memory allocator to use */
+	ib_rbt_t*	cached_stopword;/*!< This stores all active stopwords */
+	CHARSET_INFO*	charset;	/*!< charset for stopword */
+};
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_t {
+	trx_t*		trx;		/*!< The transaction used for SYNCing
+					the cache to disk */
+	dict_table_t*	table;		/*!< Table with FTS index(es) */
+	ulint		max_cache_size;	/*!< Max size in bytes of the cache */
+	ibool		cache_full;	/*!< flag, when true it indicates that
+					we need to sync the cache to disk */
+	ulint		lower_index;	/*!< the start index of the doc id
+					vector from where to start adding
+					documents to the FTS cache */
+	ulint		upper_index;	/*!< max index of the doc id vector to
+					add to the FTS cache */
+	ibool		interrupted;	/*!< TRUE if SYNC was interrupted */
+	doc_id_t	min_doc_id;	/*!< The smallest doc id added to the
+					cache. It should be equal to
+					doc_ids[lower_index] */
+	doc_id_t	max_doc_id;	/*!< The doc id at which the cache was
+					noted as being full, we use this to
+					set the upper_limit field */
+	time_t		start_time;	/*!< SYNC start time; only used if
+					fts_enable_diag_print */
+	bool		in_progress;	/*!< flag whether sync is in progress.*/
+	bool		unlock_cache;	/*!< flag whether unlock cache when
+					write fts node */
+	/** condition variable for in_progress; used with table->fts->cache->lock */
+	pthread_cond_t	cond;
+};
+
+/** The cache for the FTS system. It is a memory-based inverted index
+that new entries are added to, until it grows over the configured maximum
+size, at which time its contents are written to the INDEX table.
*/ +struct fts_cache_t +{ + /** lock protecting all access to the memory buffer */ + mysql_mutex_t lock; + /** cache initialization */ + mysql_mutex_t init_lock; + + /** protection for deleted_doc_ids */ + mysql_mutex_t deleted_lock; + + /** protection for DOC_ID */ + mysql_mutex_t doc_id_lock; + + ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each + element is of type fts_update_t */ + + ib_vector_t* indexes; /*!< We store the stats and inverted + index for the individual FTS indexes + in this vector. Each element is + an instance of fts_index_cache_t */ + + ib_vector_t* get_docs; /*!< information required to read + the document from the table. Each + element is of type fts_doc_t */ + + size_t total_size; /*!< total size consumed by the ilist + field of all nodes. SYNC is run + whenever this gets too big */ + /** total_size at the time of the previous SYNC request */ + size_t total_size_at_sync; + + fts_sync_t* sync; /*!< sync structure to sync data to + disk */ + ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes + and deleted_doc_ids, ie. transient + objects, they are recreated after + a SYNC is completed */ + + ib_alloc_t* self_heap; /*!< This heap is the heap out of + which an instance of the cache itself + was created. Objects created using + this heap will last for the lifetime + of the cache */ + + doc_id_t next_doc_id; /*!< Next doc id */ + + doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */ + + doc_id_t first_doc_id; /*!< first doc id since this table + was opened */ + + ulint deleted; /*!< Number of doc ids deleted since + last optimized. This variable is + covered by deleted_lock */ + + ulint added; /*!< Number of doc ids added since last + optimized. This variable is covered by + the deleted lock */ + + fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */ + mem_heap_t* cache_heap; /*!< Cache Heap */ +}; + +/** Columns of the FTS auxiliary INDEX table */ +struct fts_node_t { + doc_id_t first_doc_id; /*!< First document id in ilist. */ + + doc_id_t last_doc_id; /*!< Last document id in ilist. */ + + byte* ilist; /*!< Binary list of documents & word + positions the token appears in. + TODO: For now, these are simply + ut_malloc'd, but if testing shows + that they waste memory unacceptably, a + special memory allocator will have + to be written */ + + ulint doc_count; /*!< Number of doc ids in ilist */ + + ulint ilist_size; /*!< Used size of ilist in bytes. */ + + ulint ilist_size_alloc; + /*!< Allocated size of ilist in + bytes */ + bool synced; /*!< flag whether the node is synced */ +}; + +/** A tokenizer word. Contains information about one word. */ +struct fts_tokenizer_word_t { + fts_string_t text; /*!< Token text. 
*/
+
+	ib_vector_t*	nodes;		/*!< Word node ilists, each element is
+					of type fts_node_t */
+};
+
+/** Word text plus its array of nodes as on disk in FTS index */
+struct fts_word_t {
+	fts_string_t	text;		/*!< Word value in UTF-8 */
+	ib_vector_t*	nodes;		/*!< Nodes read from disk */
+
+	ib_alloc_t*	heap_alloc;	/*!< For handling all allocations */
+};
+
+/** Callback for reading and filtering nodes that are read from FTS index */
+struct fts_fetch_t {
+	void*		read_arg;	/*!< Arg for the sql_callback */
+
+	fts_sql_callback
+			read_record;	/*!< Callback for reading index
+					record */
+	size_t		total_memory;	/*!< Total memory used */
+};
+
+/** For horizontally splitting an FTS auxiliary index */
+struct fts_index_selector_t {
+	ulint		value;		/*!< Character value at which
+					to split */
+
+	const char*	suffix;		/*!< FTS aux index suffix */
+};
+
+/** This type represents a single document. */
+struct fts_doc_t {
+	fts_string_t	text;		/*!< document text */
+
+	ibool		found;		/*!< TRUE if the document was found
+					successfully in the database */
+
+	ib_rbt_t*	tokens;		/*!< This is filled when the document
+					is tokenized. Tokens; indexed by
+					fts_string_t*, cells are of type
+					fts_token_t* */
+
+	ib_alloc_t*	self_heap;	/*!< An instance of this type is
+					allocated from this heap along
+					with any objects that have the
+					same lifespan, most notably
+					the vector of token positions */
+	CHARSET_INFO*	charset;	/*!< Document's charset info */
+
+	st_mysql_ftparser*	parser;	/*!< fts plugin parser */
+
+	ib_rbt_t*	stopwords;	/*!< Stopwords */
+};
+
+/** A token and its positions within a document. */
+struct fts_token_t {
+	fts_string_t	text;		/*!< token text */
+
+	ib_vector_t*	positions;	/*!< an array of the positions the
+					token is found in; each item is
+					actually an ulint. */
+};
+
+/** It's defined in fts/fts0fts.cc */
+extern const fts_index_selector_t fts_index_selector[];
+
+/******************************************************************//**
+Compare two fts_trx_row_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2);		/*!< in: id2 */
+
+/******************************************************************//**
+Compare two fts_ranking_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2);		/*!< in: id2 */
+
+/******************************************************************//**
+Compare two doc_ids. */
+UNIV_INLINE
+int fts_doc_id_cmp(
+/*==================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2);		/*!< in: id2 */
+
+/******************************************************************//**
+Duplicate a string. */
+UNIV_INLINE
+void
+fts_string_dup(
+/*===========*/
+	fts_string_t*		dst,	/*!< in: dup to here */
+	const fts_string_t*	src,	/*!< in: src string */
+	mem_heap_t*		heap);	/*!< in: heap to use */
+
+/******************************************************************//**
+Get the selected FTS aux INDEX suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+	ulint		selected);	/*!< in: selected index */
+
+/** Select the FTS auxiliary index for the given character.
+@param[in]	cs	charset
+@param[in]	str	string
+@param[in]	len	string length in bytes
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len);
+
+#include "fts0types.inl"
+
+#endif /* INNOBASE_FTS0TYPES_H */
diff --git a/storage/innobase/include/fts0types.inl b/storage/innobase/include/fts0types.inl
new file mode 100644
index 00000000..facc1e5c
--- /dev/null
+++ b/storage/innobase/include/fts0types.inl
@@ -0,0 +1,231 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.inl
+Full text search types.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_IC
+#define INNOBASE_FTS0TYPES_IC
+
+/******************************************************************//**
+Duplicate a string. */
+UNIV_INLINE
+void
+fts_string_dup(
+/*===========*/
+	fts_string_t*		dst,	/*!< in: dup to here */
+	const fts_string_t*	src,	/*!< in: src string */
+	mem_heap_t*		heap)	/*!< in: heap to use */
+{
+	dst->f_str = (byte*)mem_heap_alloc(heap, src->f_len + 1);
+	memcpy(dst->f_str, src->f_str, src->f_len);
+
+	dst->f_len = src->f_len;
+	dst->f_str[src->f_len] = 0;
+	dst->f_n_char = src->f_n_char;
+}
+
+/******************************************************************//**
+Compare two fts_trx_row_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2)		/*!< in: id2 */
+{
+	const fts_trx_row_t*	tr1 = (const fts_trx_row_t*) p1;
+	const fts_trx_row_t*	tr2 = (const fts_trx_row_t*) p2;
+
+	return((int)(tr1->doc_id - tr2->doc_id));
+}
+
+/******************************************************************//**
+Compare two fts_ranking_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2)		/*!< in: id2 */
+{
+	const fts_ranking_t*	rk1 = (const fts_ranking_t*) p1;
+	const fts_ranking_t*	rk2 = (const fts_ranking_t*) p2;
+
+	return((int)(rk1->doc_id - rk2->doc_id));
+}
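These comparators narrow a 64-bit doc id difference to int, which is only safe while the two ids being compared are close together, as they are within one transaction's working set. For contrast only, a hedged sketch of an overflow-proof three-way comparison (not part of this patch):

	/* Hypothetical overflow-safe variant: compare, do not subtract. */
	static inline int doc_id_cmp_safe(doc_id_t a, doc_id_t b)
	{
		return(a < b ? -1 : (a > b ? 1 : 0));
	}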
+/******************************************************************//**
+Compare two doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int fts_doc_id_cmp(
+/*==================*/
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2)		/*!< in: id2 */
+{
+	const doc_id_t*	up1 = static_cast<const doc_id_t*>(p1);
+	const doc_id_t*	up2 = static_cast<const doc_id_t*>(p2);
+
+	return static_cast<int>(*up1 - *up2);
+}
+
+/******************************************************************//**
+Get the first character's code position for FTS index partition */
+extern
+ulint
+innobase_strnxfrm(
+/*==============*/
+	const CHARSET_INFO*	cs,	/*!< in: Character set */
+	const uchar*		p2,	/*!< in: string */
+	const ulint		len2);	/*!< in: string length */
+
+/** Check if fts index charset is cjk
+@param[in]	cs	charset
+@retval	true	if the charset is cjk
+@retval	false	if not. */
+inline bool fts_is_charset_cjk(const CHARSET_INFO* cs)
+{
+	switch (cs->number) {
+	case 24: /* my_charset_gb2312_chinese_ci */
+	case 28: /* my_charset_gbk_chinese_ci */
+	case 1: /* my_charset_big5_chinese_ci */
+	case 12: /* my_charset_ujis_japanese_ci */
+	case 13: /* my_charset_sjis_japanese_ci */
+	case 95: /* my_charset_cp932_japanese_ci */
+	case 97: /* my_charset_eucjpms_japanese_ci */
+	case 19: /* my_charset_euckr_korean_ci */
+		return true;
+	default:
+		return false;
+	}
}
+
+/** Select the FTS auxiliary index for the given character by range.
+@param[in]	cs	charset
+@param[in]	str	string
+@param[in]	len	string length
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index_by_range(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
+{
+	ulint	selected = 0;
+	ulint	value = innobase_strnxfrm(cs, str, len);
+
+	while (fts_index_selector[selected].value != 0) {
+
+		if (fts_index_selector[selected].value == value) {
+
+			return(selected);
+
+		} else if (fts_index_selector[selected].value > value) {
+
+			return(selected > 0 ? selected - 1 : 0);
+		}
+
+		++selected;
+	}
+
+	ut_ad(selected > 1);
+
+	return(selected - 1);
+}
+
+/** Select the FTS auxiliary index for the given character by hash.
+@param[in]	cs	charset
+@param[in]	str	string
+@param[in]	len	string length
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index_by_hash(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
+{
+	ulong	nr1 = 1;
+	ulong	nr2 = 4;
+
+	ut_ad(!(str == NULL && len > 0));
+
+	if (str == NULL || len == 0) {
+		return 0;
+	}
+
+	/* Get the first char */
+	/* JAN: TODO: MySQL 5.7 had
+	char_len = my_mbcharlen_ptr(cs, reinterpret_cast<const char*>(str),
+				    reinterpret_cast<const char*>(str + len));
+	*/
+	size_t	char_len = size_t(cs->charlen(str, str + len));
+
+	ut_ad(char_len <= len);
+
+	/* Get collation hash code */
+	my_ci_hash_sort(cs, str, char_len, &nr1, &nr2);
+
+	return(nr1 % FTS_NUM_AUX_INDEX);
+}
+
+/** Select the FTS auxiliary index for the given character.
+@param[in]	cs	charset
+@param[in]	str	string
+@param[in]	len	string length in bytes
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
+{
+	ulint	selected;
+
+	if (fts_is_charset_cjk(cs)) {
+		selected = fts_select_index_by_hash(cs, str, len);
+	} else {
+		selected = fts_select_index_by_range(cs, str, len);
+	}
+
+	return(selected);
+}
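Read together, the functions above are the routing step of the FTS inverted index: a token's first character picks one of FTS_NUM_AUX_INDEX auxiliary tables, by collation hash for CJK charsets and by the fts_index_selector[] range table otherwise. A hedged sketch of a hypothetical caller (the name route_token and the charset handle are illustrative only):

	/* Hypothetical illustration of routing a token to an aux table. */
	void route_token(const CHARSET_INFO* cs, const fts_string_t* token)
	{
		ulint	selected = fts_select_index(cs, token->f_str,
						    token->f_len);

		/* fts_get_suffix(selected), defined below, then names the
		auxiliary table suffix (e.g. "INDEX_1") that holds the
		ilists for every token sorted or hashed into this bucket. */
		const char*	suffix = fts_get_suffix(selected);
		(void) suffix;
	}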
+/******************************************************************//**
+Return the selected FTS aux index suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+	ulint		selected)	/*!< in: selected index */
+{
+	return(fts_index_selector[selected].suffix);
+}
+
+#endif /* INNOBASE_FTS0TYPES_IC */
diff --git a/storage/innobase/include/fts0vlc.h b/storage/innobase/include/fts0vlc.h
new file mode 100644
index 00000000..d6e60377
--- /dev/null
+++ b/storage/innobase/include/fts0vlc.h
@@ -0,0 +1,124 @@
+/**
+
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+**/
+/**
+@file include/fts0vlc.h
+Full text variable length integer encoding/decoding.
+
+Created 2021-10-19 Thirunarayanan Balathandayuthapani
+**/
+
+/** Return length of val if it were encoded using our VLC scheme.
+@param val  value to encode
+@return length of value encoded, in bytes */
+inline size_t fts_get_encoded_len(doc_id_t val)
+{
+  if (val < static_cast<doc_id_t>(1) << 7)
+    return 1;
+  if (val < static_cast<doc_id_t>(1) << 14)
+    return 2;
+  if (val < static_cast<doc_id_t>(1) << 21)
+    return 3;
+  if (val < static_cast<doc_id_t>(1) << 28)
+    return 4;
+  if (val < static_cast<doc_id_t>(1) << 35)
+    return 5;
+  if (val < static_cast<doc_id_t>(1) << 42)
+    return 6;
+  if (val < static_cast<doc_id_t>(1) << 49)
+    return 7;
+  if (val < static_cast<doc_id_t>(1) << 56)
+    return 8;
+  if (val < static_cast<doc_id_t>(1) << 63)
+    return 9;
+  return 10;
+}
+
+/** Encode an integer using our VLC scheme and return a pointer
+just past the encoded value.
+@param val  value to encode
+@param buf  buffer, must have enough space
+@return end of the encoded value in buf */
+inline byte *fts_encode_int(doc_id_t val, byte *buf)
+{
+  if (val < static_cast<doc_id_t>(1) << 7)
+    goto add_1;
+  if (val < static_cast<doc_id_t>(1) << 14)
+    goto add_2;
+  if (val < static_cast<doc_id_t>(1) << 21)
+    goto add_3;
+  if (val < static_cast<doc_id_t>(1) << 28)
+    goto add_4;
+  if (val < static_cast<doc_id_t>(1) << 35)
+    goto add_5;
+  if (val < static_cast<doc_id_t>(1) << 42)
+    goto add_6;
+  if (val < static_cast<doc_id_t>(1) << 49)
+    goto add_7;
+  if (val < static_cast<doc_id_t>(1) << 56)
+    goto add_8;
+  if (val < static_cast<doc_id_t>(1) << 63)
+    goto add_9;
+
+  *buf++= static_cast<byte>(val >> 63);
+add_9:
+  *buf++= static_cast<byte>(val >> 56) & 0x7F;
+add_8:
+  *buf++= static_cast<byte>(val >> 49) & 0x7F;
+add_7:
+  *buf++= static_cast<byte>(val >> 42) & 0x7F;
+add_6:
+  *buf++= static_cast<byte>(val >> 35) & 0x7F;
+add_5:
+  *buf++= static_cast<byte>(val >> 28) & 0x7F;
+add_4:
+  *buf++= static_cast<byte>(val >> 21) & 0x7F;
+add_3:
+  *buf++= static_cast<byte>(val >> 14) & 0x7F;
+add_2:
+  *buf++= static_cast<byte>(val >> 7) & 0x7F;
+add_1:
+  *buf++= static_cast<byte>(val) | 0x80;
+  return buf;
+}
+
+/** Decode and return the integer that was encoded using
+our VLC scheme.
+@param ptr  pointer to decode from, this ptr is
+            incremented by the number of bytes decoded
+@return value decoded */
+inline doc_id_t fts_decode_vlc(const byte **ptr)
+{
+  ut_d(const byte *const start= *ptr);
+  ut_ad(*start);
+
+  doc_id_t val= 0;
+  for (;;)
+  {
+    byte b= *(*ptr)++;
+    val|= (b & 0x7F);
+
+    /* High-bit on means "last byte in the encoded integer". */
+    if (b & 0x80)
+      break;
+    ut_ad(val < static_cast<doc_id_t>(1) << (64 - 7));
+    val <<= 7;
+  }
+
+  ut_ad(*ptr - start <= 10);
+
+  return(val);
+}
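To make the scheme concrete: each byte carries seven payload bits, the most significant group is emitted first, and 0x80 marks the final byte rather than a continuation. So 300 (= 2*128 + 44) encodes as the two bytes 0x02 0xAC, and fts_get_encoded_len(300) is 2. A small round-trip sketch under those definitions (illustrative, not part of the patch):

	/* Round-trip check: encode then decode doc id 300. */
	void vlc_roundtrip_example()
	{
	  byte     buf[10];
	  doc_id_t val= 300;

	  byte *end= fts_encode_int(val, buf);      /* writes 0x02, 0xAC */
	  ut_ad(size_t(end - buf) == fts_get_encoded_len(val));

	  const byte *p= buf;
	  ut_ad(fts_decode_vlc(&p) == val);         /* reads 300 back */
	  ut_ad(p == end);
	}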
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
new file mode 100644
index 00000000..746dab80
--- /dev/null
+++ b/storage/innobase/include/fut0lst.h
@@ -0,0 +1,156 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.h
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#pragma once
+
+/* The physical size of a list base node in bytes */
+#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
+/* The physical size of a list node in bytes */
+#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
+
+#ifdef UNIV_INNOCHECKSUM
+# include "fil0fil.h"
+#else
+# include "mtr0log.h"
+
+typedef byte	flst_base_node_t;
+typedef byte	flst_node_t;
+
+/* We define the field offsets of a node for the list */
+#define FLST_PREV	0	/* 6-byte address of the previous list element;
+				the page part of address is FIL_NULL, if no
+				previous element */
+#define FLST_NEXT	FIL_ADDR_SIZE	/* 6-byte address of the next
+				list element; the page part of address
+				is FIL_NULL, if no next element */
+
+/* We define the field offsets of a base node for the list */
+#define FLST_LEN	0	/* 32-bit list length field */
+#define FLST_FIRST	4	/* 6-byte address of the first element
+				of the list; undefined if empty list */
+#define FLST_LAST	(4 + FIL_ADDR_SIZE)	/* 6-byte address of the
+				last element of the list; undefined
+				if empty list */
+
+/** Initialize a zero-initialized list base node.
+@param[in,out]	block	file page
+@param[in]	ofs	byte offset of the list base node
+@param[in,out]	mtr	mini-transaction */
+inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr)
+{
+  ut_d(const page_t *page= block->page.frame);
+  ut_ad(!mach_read_from_2(FLST_LEN + ofs + page));
+  ut_ad(!mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + page));
+  ut_ad(!mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + page));
+  compile_time_assert(FIL_NULL == 0xffU * 0x1010101U);
+  mtr->memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff);
+  mtr->memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff);
+}
+
+/** Initialize a list base node.
+@param[in]	block	file page
+@param[in,out]	base	base node
+@param[in,out]	mtr	mini-transaction */
+void flst_init(const buf_block_t &block, byte *base, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+
+/** Append a file list node to a list.
+@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] add block to be added +@param[in] aoffset byte offset of the node to be added +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t flst_add_last(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Prepend a file list node to a list. +@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] add block to be added +@param[in] aoffset byte offset of the node to be added +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t flst_add_first(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Remove a file list node. +@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] cur block to be removed +@param[in] coffset byte offset of the current record to be removed +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t flst_remove(buf_block_t *base, uint16_t boffset, + buf_block_t *cur, uint16_t coffset, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** @return the length of a list */ +inline uint32_t flst_get_len(const flst_base_node_t *base) +{ + return mach_read_from_4(base + FLST_LEN); +} + +/** @return a file address */ +inline fil_addr_t flst_read_addr(const byte *faddr) +{ + fil_addr_t addr= { mach_read_from_4(faddr + FIL_ADDR_PAGE), + mach_read_from_2(faddr + FIL_ADDR_BYTE) }; + ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); + return addr; +} + +/** @return list first node address */ +inline fil_addr_t flst_get_first(const flst_base_node_t *base) +{ + return flst_read_addr(base + FLST_FIRST); +} + +/** @return list last node address */ +inline fil_addr_t flst_get_last(const flst_base_node_t *base) +{ + return flst_read_addr(base + FLST_LAST); +} + +/** @return list next node address */ +inline fil_addr_t flst_get_next_addr(const flst_node_t* node) +{ + return flst_read_addr(node + FLST_NEXT); +} + +/** @return list prev node address */ +inline fil_addr_t flst_get_prev_addr(const flst_node_t *node) +{ + return flst_read_addr(node + FLST_PREV); +} + +# ifdef UNIV_DEBUG +/** Validate a file-based list. */ +void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr); +# endif + +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/gis0geo.h b/storage/innobase/include/gis0geo.h new file mode 100644 index 00000000..3fd01a3a --- /dev/null +++ b/storage/innobase/include/gis0geo.h @@ -0,0 +1,122 @@ +/***************************************************************************** +Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software Foundation,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*****************************************************************************/
+
+/**************************************************//**
+@file gis0geo.h
+The R-tree definitions from MyISAM
+*******************************************************/
+
+#ifndef _gis0geo_h
+#define _gis0geo_h
+
+#include "my_global.h"
+#include "string.h"
+
+#define SPTYPE HA_KEYTYPE_DOUBLE
+#define SPLEN  8
+
+/* Since the mbr could be a point or a linestring, in which case the area
+of the mbr is 0, we define this macro for calculating the area increase
+when we need to enlarge the mbr. */
+#define LINE_MBR_WEIGHTS	0.001
+
+/* Types of "well-known binary representation" (wkb) format. */
+enum wkbType
+{
+	wkbPoint = 1,
+	wkbLineString = 2,
+	wkbPolygon = 3,
+	wkbMultiPoint = 4,
+	wkbMultiLineString = 5,
+	wkbMultiPolygon = 6,
+	wkbGeometryCollection = 7
+};
+
+/* Byte order of "well-known binary representation" (wkb) format. */
+enum wkbByteOrder
+{
+	wkbXDR = 0,	/* Big Endian */
+	wkbNDR = 1	/* Little Endian */
+};
+
+/*************************************************************//**
+Calculate minimal bounding rectangle (mbr) of the spatial object
+stored in "well-known binary representation" (wkb) format.
+@return 0 if ok */
+int
+rtree_mbr_from_wkb(
+/*===============*/
+	const uchar*	wkb,	/*!< in: pointer to wkb. */
+	uint		size,	/*!< in: size of wkb. */
+	uint		n_dims,	/*!< in: dimensions. */
+	double*		mbr);	/*!< in/out: mbr. */
+
+/* Rtree split node structure. */
+struct rtr_split_node_t
+{
+	double	square;		/* square of the mbr.*/
+	int	n_node;		/* which group in.*/
+	uchar*	key;		/* key. */
+	double*	coords;		/* mbr. */
+};
+
+/*************************************************************//**
+Inline function for reserving coords */
+inline
+static
+double*
+reserve_coords(double	**d_buffer,	/*!< in/out: buffer. */
+	       int	n_dim)		/*!< in: dimensions. */
+/*===========*/
+{
+	double *coords = *d_buffer;
+	(*d_buffer) += n_dim * 2;
+	return coords;
+}
+
+/*************************************************************//**
+Split rtree nodes.
+Return which group the first rec is in. */
+int
+split_rtree_node(
+/*=============*/
+	rtr_split_node_t*	node,		/*!< in: split nodes.*/
+	int			n_entries,	/*!< in: entries number.*/
+	int			all_size,	/*!< in: total key's size.*/
+	int			key_size,	/*!< in: key's size.*/
+	int			min_size,	/*!< in: minimal group size.*/
+	int			size1,		/*!< in: size of group.*/
+	int			size2,		/*!< in: initial group sizes */
+	double**		d_buffer,	/*!< in/out: buffer.*/
+	int			n_dim,		/*!< in: dimensions. */
+	uchar*			first_rec);	/*!< in: the first rec. */
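To make the WKB layout concrete before the comparison API: a 2-D point is a 1-byte byte-order tag, a 4-byte geometry type (wkbPoint), then two doubles, 21 bytes in all, and its MBR degenerates to xmin == xmax and ymin == ymax. A standalone sketch of building such a buffer (assumes a little-endian host, as wkbNDR implies; illustrative only):

	#include <cstdint>
	#include <cstring>

	/* Build little-endian WKB for POINT(x y); wkb must hold 21 bytes. */
	static size_t make_wkb_point(unsigned char* wkb, double x, double y)
	{
		size_t   off = 0;
		uint32_t type = 1;              /* wkbPoint */

		wkb[off++] = 1;                 /* wkbNDR, little endian */
		memcpy(wkb + off, &type, 4);    off += 4;
		memcpy(wkb + off, &x, 8);       off += 8;
		memcpy(wkb + off, &y, 8);       off += 8;
		return off;                     /* 21 bytes of wkb */
	}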
+/** Compare two minimum bounding rectangles.
+@param mode	comparison operator
+		MBR_INTERSECT(a,b)	a overlaps b
+		MBR_CONTAIN(a,b)	a contains b
+		MBR_DISJOINT(a,b)	a disjoint b
+		MBR_WITHIN(a,b)		a within b
+		MBR_EQUAL(a,b)		All coordinates of MBRs are equal
+		MBR_DATA(a,b)		Data reference is the same
+@param b	first MBR
+@param a	second MBR
+@retval 0	if the predicate holds
+@retval 1	if the predicate does not hold */
+int rtree_key_cmp(page_cur_mode_t mode, const void *b, const void *a);
+#endif
diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h
new file mode 100644
index 00000000..b07261ce
--- /dev/null
+++ b/storage/innobase/include/gis0rtree.h
@@ -0,0 +1,513 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/gis0rtree.h
+R-tree header file
+
+Created 2013/03/27 Jimmy Yang and Allen Lai
+***********************************************************************/
+
+#ifndef gis0rtree_h
+#define gis0rtree_h
+
+#include "btr0cur.h"
+#include "rem0types.h"
+
+/* Whether MBR 'a' contains 'b' */
+#define MBR_CONTAIN_CMP(a, b) \
+	((((b)->xmin >= (a)->xmin) && ((b)->xmax <= (a)->xmax) \
+	  && ((b)->ymin >= (a)->ymin) && ((b)->ymax <= (a)->ymax)))
+
+/* Whether MBR 'a' equals to 'b' */
+#define MBR_EQUAL_CMP(a, b) \
+	((((b)->xmin == (a)->xmin) && ((b)->xmax == (a)->xmax)) \
+	 && (((b)->ymin == (a)->ymin) && ((b)->ymax == (a)->ymax)))
+
+/* Whether MBR 'a' intersects 'b' */
+#define MBR_INTERSECT_CMP(a, b) \
+	((((b)->xmin <= (a)->xmax) || ((b)->xmax >= (a)->xmin)) \
+	 && (((b)->ymin <= (a)->ymax) || ((b)->ymax >= (a)->ymin)))
+
+/* Whether MBR 'a' and 'b' disjoint */
+#define MBR_DISJOINT_CMP(a, b) (!MBR_INTERSECT_CMP(a, b))
+
+/* Whether MBR 'a' within 'b' */
+#define MBR_WITHIN_CMP(a, b) \
+	((((b)->xmin <= (a)->xmin) && ((b)->xmax >= (a)->xmax)) \
+	 && (((b)->ymin <= (a)->ymin) && ((b)->ymax >= (a)->ymax)))
+
+/* Define it for rtree search mode checking. */
+#define RTREE_SEARCH_MODE(mode) \
+	(((mode) >= PAGE_CUR_CONTAIN) && ((mode <= PAGE_CUR_RTREE_GET_FATHER)))
+
+/* Geometry data header */
+#define GEO_DATA_HEADER_SIZE	4
+
+/** Search for a spatial index leaf page record.
+@param cur	cursor
+@param tuple	search tuple
+@param latch_mode latching mode
+@param mtr	mini-transaction
+@param mode	search mode */
+dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+			btr_latch_mode latch_mode, mtr_t *mtr,
+			page_cur_mode_t mode= PAGE_CUR_RTREE_LOCATE)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Search for inserting a spatial index leaf page record.
+@param cur cursor +@param tuple search tuple +@param latch_mode latching mode +@param mtr mini-transaction */ +inline dberr_t rtr_insert_leaf(btr_cur_t *cur, const dtuple_t *tuple, + btr_latch_mode latch_mode, mtr_t *mtr) +{ + return rtr_search_leaf(cur, tuple, latch_mode, mtr, PAGE_CUR_RTREE_INSERT); +} + +/** Search for a spatial index leaf page record. +@param pcur cursor +@param tuple search tuple +@param mode search mode +@param mtr mini-transaction */ +dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple, + page_cur_mode_t mode, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple, + page_cur_mode_t mode, + btr_latch_mode latch_mode, + btr_cur_t *cur, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/**********************************************************************//** +Builds a Rtree node pointer out of a physical record and a page number. +@return own: node pointer */ +dtuple_t* +rtr_index_build_node_ptr( +/*=====================*/ + const dict_index_t* index, /*!< in: index */ + const rtr_mbr_t* mbr, /*!< in: mbr of lower page */ + const rec_t* rec, /*!< in: record for which to build node + pointer */ + ulint page_no,/*!< in: page number to put in node + pointer */ + mem_heap_t* heap); /*!< in: memory heap where pointer + created */ + +/*************************************************************//** +Splits an R-tree index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. +@return inserted record */ +rec_t* +rtr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in/out: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err); /*!< out: error code */ + +/**************************************************************//** +Sets the child node mbr in a node pointer. */ +UNIV_INLINE +void +rtr_page_cal_mbr( +/*=============*/ + const dict_index_t* index, /*!< in: index */ + const buf_block_t* block, /*!< in: buffer block */ + rtr_mbr_t* mbr, /*!< out: MBR encapsulates the page */ + mem_heap_t* heap); /*!< in: heap for the memory + allocation */ +/*************************************************************//** +Find the next matching record. This function will first exhaust +the copied record listed in the rtr_info->matches vector before +moving to next page +@return true if there is next qualified record found, otherwise(if +exhausted) false */ +bool +rtr_pcur_move_to_next( +/*==================*/ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! 
*/ + page_cur_mode_t mode, /*!< in: cursor search mode */ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + ulint cur_level, + /*!< in: current level */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((warn_unused_result)); + +/****************************************************************//** +Searches the right position in rtree for a page cursor. */ +bool +rtr_cur_search_with_match( +/*======================*/ + const buf_block_t* block, /*!< in: buffer block */ + dict_index_t* index, /*!< in: index descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + page_cur_t* cursor, /*!< in/out: page cursor */ + rtr_info_t* rtr_info);/*!< in/out: search stack */ + +/****************************************************************//** +Calculate the area increased for a new record +@return area increased */ +double +rtr_rec_cal_increase( +/*=================*/ + const dtuple_t* dtuple, /*!< in: data tuple to insert, which + cause area increase */ + const rec_t* rec, /*!< in: physical record which differs from + dtuple in some of the common fields, or which + has an equal number or more fields than + dtuple */ + double* area); /*!< out: increased area */ + +/****************************************************************//** +Following the right link to find the proper block for insert. +@return the proper block.*/ +dberr_t +rtr_ins_enlarge_mbr( +/*=================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +push a nonleaf index node to the search path */ +UNIV_INLINE +void +rtr_non_leaf_stack_push( +/*====================*/ + rtr_node_path_t* path, /*!< in/out: search path */ + uint32_t pageno, /*!< in: pageno to insert */ + node_seq_t seq_no, /*!< in: Node sequence num */ + ulint level, /*!< in: index level */ + uint32_t child_no, /*!< in: child page no */ + btr_pcur_t* cursor, /*!< in: position cursor */ + double mbr_inc); /*!< in: MBR needs to be + enlarged */ + +/**************************************************************//** +push a nonleaf index node to the search path for insertion */ +void +rtr_non_leaf_insert_stack_push( +/*===========================*/ + dict_index_t* index, /*!< in: index descriptor */ + rtr_node_path_t* path, /*!< in/out: search path */ + ulint level, /*!< in: index level */ + const buf_block_t* block, /*!< in: block of the page */ + const rec_t* rec, /*!< in: positioned record */ + double mbr_inc); /*!< in: MBR needs to be + enlarged */ + +#define rtr_get_new_ssn_id(index) (index)->assign_ssn() +#define rtr_get_current_ssn_id(index) (index)->ssn() + +/********************************************************************//** +Create a RTree search info structure */ +rtr_info_t* +rtr_create_rtr_info( +/******************/ + bool need_prdt, /*!< in: Whether predicate lock is + needed */ + bool init_matches, /*!< in: Whether to initiate the + "matches" structure for collecting + matched leaf records */ + btr_cur_t* cursor, /*!< in: tree search cursor */ + dict_index_t* index); /*!< in: index struct */ + +/********************************************************************//** +Update a btr_cur_t with rtr_info */ +void +rtr_info_update_btr( +/******************/ + btr_cur_t* cursor, /*!< in/out: tree cursor */ + rtr_info_t* rtr_info); /*!< in: rtr_info to set to the + cursor */ + 
+/********************************************************************//** +Update a btr_cur_t with rtr_info */ +void +rtr_init_rtr_info( +/****************/ + rtr_info_t* rtr_info, /*!< in: rtr_info to set to the + cursor */ + bool need_prdt, /*!< in: Whether predicate lock is + needed */ + btr_cur_t* cursor, /*!< in: tree search cursor */ + dict_index_t* index, /*!< in: index structure */ + bool reinit); /*!< in: Whether this is a reinit */ + +/**************************************************************//** +Clean up Rtree cursor */ +void +rtr_clean_rtr_info( +/*===============*/ + rtr_info_t* rtr_info, /*!< in: RTree search info */ + bool free_all); /*!< in: need to free rtr_info itself */ + +/****************************************************************//** +Get the bounding box content from an index record*/ +void +rtr_get_mbr_from_rec( +/*=================*/ + const rec_t* rec, /*!< in: data tuple */ + const rec_offs* offsets,/*!< in: offsets array */ + rtr_mbr_t* mbr); /*!< out MBR */ + +/****************************************************************//** +Get the bounding box content from a MBR data record */ +void +rtr_get_mbr_from_tuple( +/*===================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + rtr_mbr* mbr); /*!< out: mbr to fill */ + +/* Get the rtree page father. +@param[in,out] mtr mtr +@param[in] sea_cur search cursor, contains information + about parent nodes in search +@param[in,out] cursor cursor on node pointer record, + its page x-latched +@return whether the cursor was successfully positioned */ +bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor) + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); + +/************************************************************//** +Returns the father block to a page. It is assumed that mtr holds +an X or SX latch on the tree. +@return rec_get_offsets() of the node pointer record */ +rec_offs* +rtr_page_get_father_block( +/*======================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* sea_cur,/*!< in: search cursor, contains information + about parent nodes in search */ + btr_cur_t* cursor);/*!< out: cursor on node pointer record, + its page x-latched */ +/**************************************************************//** +Store the parent path cursor +@return number of cursor stored */ +ulint +rtr_store_parent_path( +/*==================*/ + const buf_block_t* block, /*!< in: block of the page */ + btr_cur_t* btr_cur,/*!< in/out: persistent cursor */ + btr_latch_mode latch_mode, + /*!< in: latch_mode */ + ulint level, /*!< in: index level */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +bool rtr_search( + const dtuple_t* tuple, /*!< in: tuple on which search done */ + btr_latch_mode latch_mode,/*!< in: BTR_MODIFY_LEAF, ... 
*/
+	btr_pcur_t*	cursor, /*!< in: memory buffer for persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************//**
+Returns the R-Tree node stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+node_visit_t*
+rtr_get_parent_node(
+/*================*/
+	btr_cur_t*	btr_cur,	/*!< in: persistent cursor */
+	ulint		level,		/*!< in: index level of buffer page */
+	ulint		is_insert);	/*!< in: whether it is insert */
+
+/*********************************************************//**
+Returns the R-Tree cursor stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+btr_pcur_t*
+rtr_get_parent_cursor(
+/*==================*/
+	btr_cur_t*	btr_cur,	/*!< in: persistent cursor */
+	ulint		level,		/*!< in: index level of buffer page */
+	ulint		is_insert);	/*!< in: whether insert operation */
+
+MY_ATTRIBUTE((warn_unused_result))
+/*************************************************************//**
+Copy recs from a page to new_block of rtree.
+
+@return error code */
+dberr_t
+rtr_page_copy_rec_list_end_no_locks(
+/*================================*/
+	buf_block_t*	new_block,	/*!< in: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page of rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	rtr_rec_move_t*	rec_move,	/*!< in: recording records moved */
+	ulint		max_move,	/*!< in: num of rec to move */
+	ulint*		num_moved,	/*!< out: num of rec to move */
+	mtr_t*		mtr);		/*!< in: mtr */
+
+MY_ATTRIBUTE((warn_unused_result))
+/*************************************************************//**
+Copy recs till a specified rec from a page to new_block of rtree.
+
+@return error code */
+dberr_t
+rtr_page_copy_rec_list_start_no_locks(
+/*==================================*/
+	buf_block_t*	new_block,	/*!< in: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page of rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	rtr_rec_move_t*	rec_move,	/*!< in: recording records moved */
+	ulint		max_move,	/*!< in: num of rec to move */
+	ulint*		num_moved,	/*!< out: num of rec to move */
+	mtr_t*		mtr);		/*!< in: mtr */
+
+/****************************************************************//**
+Merge 2 mbrs and update the mbr that cursor is on. */
+void
+rtr_merge_and_update_mbr(
+/*=====================*/
+	btr_cur_t*		cursor,		/*!< in/out: cursor */
+	btr_cur_t*		cursor2,	/*!< in: the other cursor */
+	rec_offs*		offsets,	/*!< in: rec offsets */
+	rec_offs*		offsets2,	/*!< in: rec offsets */
+	page_t*			child_page,	/*!< in: the child page. */
+	mtr_t*			mtr);		/*!< in: mtr */
+
+/*************************************************************//**
+Deletes on the upper level the node pointer to a page.
*/ +void +rtr_node_ptr_delete( +/*================*/ + btr_cur_t* cursor, /*!< in: search cursor, contains information + about parent nodes in search */ + mtr_t* mtr); /*!< in: mtr */ + +/****************************************************************//** +Check two MBRs are identical or need to be merged */ +bool +rtr_merge_mbr_changed( +/*==================*/ + btr_cur_t* cursor, /*!< in: cursor */ + btr_cur_t* cursor2, /*!< in: the other cursor */ + rec_offs* offsets, /*!< in: rec offsets */ + rec_offs* offsets2, /*!< in: rec offsets */ + rtr_mbr_t* new_mbr); /*!< out: MBR to update */ + + +/**************************************************************//** +Update the mbr field of a spatial index row. */ +void +rtr_update_mbr_field( +/*=================*/ + btr_cur_t* cursor, /*!< in: cursor pointed to rec.*/ + rec_offs* offsets, /*!< in: offsets on rec. */ + btr_cur_t* cursor2, /*!< in/out: cursor pointed to rec + that should be deleted. + this cursor is for btr_compress to + delete the merged page's father rec.*/ + page_t* child_page, /*!< in: child page. */ + rtr_mbr_t* new_mbr, /*!< in: the new mbr. */ + rec_t* new_rec, /*!< in: rec to use */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +Check whether a Rtree page is child of a parent page +@return true if there is child/parent relationship */ +bool +rtr_check_same_block( +/*=================*/ + dict_index_t* index, /*!< in: index tree */ + btr_cur_t* cur, /*!< in/out: position at the parent entry + pointing to the child if successful */ + buf_block_t* parentb,/*!< in: parent page to check */ + mem_heap_t* heap); /*!< in: memory heap */ + +/*********************************************************************//** +Sets pointer to the data and length in a field. */ +UNIV_INLINE +void +rtr_write_mbr( +/*==========*/ + byte* data, /*!< out: data */ + const rtr_mbr_t* mbr); /*!< in: data */ + +/*********************************************************************//** +Sets pointer to the data and length in a field. */ +UNIV_INLINE +void +rtr_read_mbr( +/*==========*/ + const byte* data, /*!< in: data */ + rtr_mbr_t* mbr); /*!< out: data */ + +/**************************************************************//** +Check whether a discarding page is in anyone's search path */ +void +rtr_check_discard_page( +/*===================*/ + dict_index_t* index, /*!< in: index */ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + buf_block_t* block); /*!< in: block of page to be discarded */ + +/********************************************************************//** +Reinitialize a RTree search info */ +UNIV_INLINE +void +rtr_info_reinit_in_cursor( +/************************/ + btr_cur_t* cursor, /*!< in/out: tree cursor */ + dict_index_t* index, /*!< in: index struct */ + bool need_prdt); /*!< in: Whether predicate lock is + needed */ + +/** Estimates the number of rows in a given area. 
+@param[in] index index +@param[in] tuple range tuple containing mbr, may also be empty tuple +@param[in] mode search mode +@return estimated number of rows */ +ha_rows +rtr_estimate_n_rows_in_range( + dict_index_t* index, + const dtuple_t* tuple, + page_cur_mode_t mode); + +#include "gis0rtree.inl" +#endif /*!< gis0rtree.h */ diff --git a/storage/innobase/include/gis0rtree.inl b/storage/innobase/include/gis0rtree.inl new file mode 100644 index 00000000..5101eeb6 --- /dev/null +++ b/storage/innobase/include/gis0rtree.inl @@ -0,0 +1,245 @@ +/***************************************************************************** + +Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/gis0rtree.inl +R-tree inline code + +Created 2013/03/27 Jimmy Yang and Allen Lai +***********************************************************************/ + +/**************************************************************//** +Calculates the MBR that encapsulates all the records on a page. */ +UNIV_INLINE +void +rtr_page_cal_mbr( +/*=============*/ + const dict_index_t* index, /*!< in: index */ + const buf_block_t* block, /*!< in: buffer block */ + rtr_mbr_t* rtr_mbr,/*!< out: MBR encapsulates the page */ + mem_heap_t* heap) /*!< in: heap for the memory + allocation */ +{ + page_t* page; + rec_t* rec; + const byte* field; + ulint len; + rec_offs* offsets = NULL; + double bmin, bmax; + double* amin; + double* amax; + ulint inc = 0; + double* mbr; + + rtr_mbr->xmin = DBL_MAX; + rtr_mbr->ymin = DBL_MAX; + rtr_mbr->xmax = -DBL_MAX; + rtr_mbr->ymax = -DBL_MAX; + + mbr = reinterpret_cast<double*>(rtr_mbr); + + page = buf_block_get_frame(block); + + rec = page_rec_get_next(page_get_infimum_rec(page)); + if (UNIV_UNLIKELY(!rec)) { + return; + } + offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) + ? index->n_fields : 0, + ULINT_UNDEFINED, &heap); + + do { + /* The mbr address is in the first field.
*/ + field = rec_get_nth_field(rec, offsets, 0, &len); + + ut_ad(len == DATA_MBR_LEN); + inc = 0; + for (unsigned i = 0; i < SPDIMS; i++) { + bmin = mach_double_read(field + inc); + bmax = mach_double_read(field + inc + sizeof(double)); + + amin = mbr + i * SPDIMS; + amax = mbr + i * SPDIMS + 1; + + if (*amin > bmin) + *amin = bmin; + if (*amax < bmax) + *amax = bmax; + + inc += 2 * sizeof(double); + } + + rec = page_rec_get_next(rec); + + if (rec == NULL) { + break; + } + } while (!page_rec_is_supremum(rec)); +} + +/**************************************************************//** +Push a non-leaf index node to the search path */ +UNIV_INLINE +void +rtr_non_leaf_stack_push( +/*====================*/ + rtr_node_path_t* path, /*!< in/out: search path */ + uint32_t pageno, /*!< in: pageno to insert */ + node_seq_t seq_no, /*!< in: Node sequence num */ + ulint level, /*!< in: index page level */ + uint32_t child_no, /*!< in: child page no */ + btr_pcur_t* cursor, /*!< in: position cursor */ + double mbr_inc) /*!< in: MBR needs to be + enlarged */ +{ + node_visit_t insert_val; + + insert_val.page_no = pageno; + insert_val.seq_no = seq_no; + insert_val.level = level; + insert_val.child_no = child_no; + insert_val.cursor = cursor; + insert_val.mbr_inc = mbr_inc; + + path->push_back(insert_val); + +#ifdef RTR_SEARCH_DIAGNOSTIC + fprintf(stderr, "INNODB_RTR: Push page %d, level %d, seq %d" + " to search stack \n", + static_cast<int>(pageno), static_cast<int>(level), + static_cast<int>(seq_no)); +#endif /* RTR_SEARCH_DIAGNOSTIC */ +} + +/*********************************************************************//** +Writes an MBR to a data buffer. */ +UNIV_INLINE +void +rtr_write_mbr( +/*==========*/ + byte* data, /*!< out: data */ + const rtr_mbr_t* mbr) /*!< in: MBR */ +{ + const double* my_mbr = reinterpret_cast<const double*>(mbr); + + for (unsigned i = 0; i < SPDIMS * 2; i++) { + mach_double_write(data + i * sizeof(double), my_mbr[i]); + } +}
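[Editor's note] rtr_write_mbr() above and rtr_read_mbr() below define the on-page MBR layout: SPDIMS * 2 doubles, one (min, max) pair per dimension, written back to back. A standalone round-trip sketch of that layout (illustrative only: it hard-codes two dimensions, matching SPDIMS in gis0geo.h, and uses host byte order via memcpy, whereas mach_double_write()/mach_double_read() use InnoDB's canonical on-disk byte order):

#include <cstring>

enum { DEMO_SPDIMS = 2 };	/* two dimensions, as for SPDIMS */

/* Serialize xmin, xmax, ymin, ymax as four consecutive doubles. */
static void
write_mbr_demo(unsigned char* data, const double* mbr)
{
	for (int i = 0; i < DEMO_SPDIMS * 2; i++) {
		std::memcpy(data + i * sizeof(double), &mbr[i],
			    sizeof(double));
	}
}

/* Deserialize the four doubles back into an MBR array. */
static void
read_mbr_demo(const unsigned char* data, double* mbr)
{
	for (int i = 0; i < DEMO_SPDIMS * 2; i++) {
		std::memcpy(&mbr[i], data + i * sizeof(double),
			    sizeof(double));
	}
}

Because rtr_mbr_t is laid out as xmin, xmax, ymin, ymax, the real functions simply reinterpret the struct as an array of doubles, as the casts in their bodies show.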
+/*********************************************************************//** +Reads an MBR from a data buffer. */ +UNIV_INLINE +void +rtr_read_mbr( +/*==========*/ + const byte* data, /*!< in: data */ + rtr_mbr_t* mbr) /*!< out: MBR */ +{ + for (unsigned i = 0; i < SPDIMS * 2; i++) { + (reinterpret_cast<double*>(mbr))[i] = mach_double_read( + data + + i * sizeof(double)); + } +} + +/*********************************************************//** +Returns the R-Tree node stored in the parent search path +@return pointer to R-Tree cursor component in the parent path, +NULL if parent path is empty or index is larger than num of items contained */ +UNIV_INLINE +node_visit_t* +rtr_get_parent_node( +/*================*/ + btr_cur_t* btr_cur, /*!< in: persistent cursor */ + ulint level, /*!< in: index level of buffer page */ + ulint is_insert) /*!< in: whether it is insert */ +{ + ulint num; + ulint tree_height = btr_cur->tree_height; + node_visit_t* found_node = NULL; + + if (level >= tree_height) { + return(NULL); + } + + mysql_mutex_lock(&btr_cur->rtr_info->rtr_path_mutex); + + num = btr_cur->rtr_info->parent_path->size(); + + if (!num) { + mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex); + return(NULL); + } + + if (is_insert) { + ulint idx = tree_height - level - 1; + ut_ad(idx < num); + + found_node = &(*btr_cur->rtr_info->parent_path)[idx]; + } else { + node_visit_t* node; + + while (num > 0) { + node = &(*btr_cur->rtr_info->parent_path)[num - 1]; + + if (node->level == level) { + found_node = node; + break; + } + num--; + } + } + + mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex); + + return(found_node); +} + +/*********************************************************//** +Returns the R-Tree cursor stored in the parent search path +@return pointer to R-Tree cursor component */ +UNIV_INLINE +btr_pcur_t* +rtr_get_parent_cursor( +/*==================*/ + btr_cur_t* btr_cur, /*!< in: persistent cursor */ + ulint level, /*!< in: index level of buffer page */ + ulint is_insert) /*!< in: whether insert operation */ +{ + node_visit_t* found_node = rtr_get_parent_node( + btr_cur, level, is_insert); + + return((found_node) ? found_node->cursor : NULL); +} + +/********************************************************************//** +Reinitialize an R-tree search info in btr_cur_t */ +UNIV_INLINE +void +rtr_info_reinit_in_cursor( +/************************/ + btr_cur_t* cursor, /*!< in/out: tree cursor */ + dict_index_t* index, /*!< in: index struct */ + bool need_prdt) /*!< in: Whether predicate lock is + needed */ +{ + rtr_clean_rtr_info(cursor->rtr_info, false); + rtr_init_rtr_info(cursor->rtr_info, need_prdt, cursor, index, true); +} diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h new file mode 100644 index 00000000..d6a4ef67 --- /dev/null +++ b/storage/innobase/include/gis0type.h @@ -0,0 +1,146 @@ +/***************************************************************************** + +Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/gis0type.h +R-tree header file + +Created 2013/03/27 Jimmy Yang +***********************************************************************/ + +#ifndef gis0type_h +#define gis0type_h + +#include "buf0buf.h" +#include "data0type.h" +#include "data0types.h" +#include "dict0types.h" +#include "ut0vec.h" +#include "gis0geo.h" + +#include <vector> +#include <forward_list> + +/** Node Sequence Number. Only updated when page splits */ +typedef uint32_t node_seq_t; + +/* R-tree internal non-leaf nodes to be searched, from root to leaf */ +struct node_visit_t { + uint32_t page_no; /*!< the page number */ + node_seq_t seq_no; /*!< the SSN (split sequence number) */ + ulint level; /*!< the page's index level */ + uint32_t child_no; /*!< child page num if for parent + recording */ + btr_pcur_t* cursor; /*!< cursor structure if we positioned + FIXME: there is no need to use whole + btr_pcur_t, just the position related + members */ + double mbr_inc; /*!< whether this node needs to be + enlarged for insertion */ +}; + +typedef std::vector<node_visit_t, ut_allocator<node_visit_t> > rtr_node_path_t; + +typedef struct rtr_rec { + rec_t* r_rec; /*!< matched record */ + bool locked; /*!< whether the record is locked */ +} rtr_rec_t; + +typedef std::vector<rtr_rec_t, ut_allocator<rtr_rec_t> > rtr_rec_vector; + +/* Structure for matched records on the leaf page */ +typedef struct matched_rec { + byte* bufp; /*!< aligned buffer pointer */ + byte rec_buf[UNIV_PAGE_SIZE_MAX * 2]; + /*!< buffer used to copy matching rec */ + buf_block_t block; /*!< the shadow buffer block */ + ulint used; /*!< memory used */ + rtr_rec_vector* matched_recs; /*!< vector holding the matching rec */ + mysql_mutex_t rtr_match_mutex;/*!< mutex protecting the matched_recs + vector */ + bool valid; /*!< whether result in matched_recs + or this search is valid (page not + dropped) */ + bool locked; /*!< whether these recs are locked */ +} matched_rec_t; + +/* In memory representation of a minimum bounding rectangle */ +typedef struct rtr_mbr { + double xmin; /*!< minimum on x */ + double xmax; /*!< maximum on x */ + double ymin; /*!< minimum on y */ + double ymax; /*!< maximum on y */ +} rtr_mbr_t; + +/* Maximum index level for R-Tree, this is consistent with BTR_MAX_LEVELS */ +#define RTR_MAX_LEVELS 100 + +/* Number of pages we latch at leaf level when there is possible Tree +modification (split, shrink), we always latch left, current +and right pages */ +#define RTR_LEAF_LATCH_NUM 3 + +/** Vectors holding the matching internal pages/nodes and leaf records */ +typedef struct rtr_info{ + rtr_node_path_t*path; /*!< vector holding matching pages */ + rtr_node_path_t*parent_path; + /*!< vector holding parent pages during + search */ + matched_rec_t* matches;/*!< struct holding matching leaf records */ + mysql_mutex_t rtr_path_mutex; + /*!< mutex protecting the "path" vector */ + rtr_mbr_t mbr; /*!< the search MBR */ + que_thr_t* thr; /*!< the search thread */ + mem_heap_t* heap; /*!< memory heap */ + btr_cur_t* cursor; /*!< cursor used for search */ + dict_index_t* index; /*!< index it is searching */ + bool need_prdt_lock; + /*!< whether we will need predicate lock + the tree */ + bool need_page_lock; + /*!< whether we will need predicate page lock + the tree */
+ bool allocated;/*!< whether this structure is allocated + or on stack */ + bool mbr_adj;/*!< whether mbr will need to be enlarged + for an insertion operation */ + bool fd_del; /*!< found deleted row */ + const dtuple_t* search_tuple; + /*!< search tuple being used */ + page_cur_mode_t search_mode; + /*!< current search mode */ +} rtr_info_t; + +/* Tracking structure for all ongoing searches on an index */ +struct rtr_info_track_t { + /** Active search info */ + std::forward_list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_active; + mysql_mutex_t rtr_active_mutex; + /*!< mutex to protect + rtr_active */ +}; + +/* This is to record the record movement between pages. Used for corresponding +lock movement */ +typedef struct rtr_rec_move { + rec_t* old_rec; /*!< record being moved in old page */ + rec_t* new_rec; /*!< new record location */ + bool moved; /*!< whether locks are moved too */ +} rtr_rec_move_t; +#endif /* gis0type_h */ diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h new file mode 100644 index 00000000..5aaa559b --- /dev/null +++ b/storage/innobase/include/ha0ha.h @@ -0,0 +1,60 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0ha.h +The hash table interface for the adaptive hash index + +Created 8/18/1994 Heikki Tuuri +*******************************************************/ + +#ifndef ha0ha_h +#define ha0ha_h + +#include "hash0hash.h" +#include "page0types.h" +#include "buf0types.h" +#include "rem0types.h" + +#ifdef BTR_CUR_HASH_ADAPT +/*************************************************************//** +Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain +having the fold number, NULL if not found */ +UNIV_INLINE +const rec_t* +ha_search_and_get_data( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: folded value of the searched data */ + +/** The hash table external chain node */ +struct ha_node_t { + ulint fold; /*!< fold value for the data */ + ha_node_t* next; /*!< next chain node or NULL if none */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block; /*!< buffer block containing the data, or NULL */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data; /*!< pointer to the data */ +}; + +#include "ha0ha.inl" +#endif /* BTR_CUR_HASH_ADAPT */ + +#endif diff --git a/storage/innobase/include/ha0ha.inl b/storage/innobase/include/ha0ha.inl new file mode 100644 index 00000000..0b256257 --- /dev/null +++ b/storage/innobase/include/ha0ha.inl @@ -0,0 +1,154 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ha0ha.ic +The hash table interface for the adaptive hash index + +Created 8/18/1994 Heikki Tuuri +*************************************************************************/ + +#ifdef BTR_CUR_HASH_ADAPT +#include "btr0types.h" + +/******************************************************************//** +Gets a hash node data. +@return pointer to the data */ +UNIV_INLINE +const rec_t* +ha_node_get_data( +/*=============*/ + const ha_node_t* node) /*!< in: hash chain node */ +{ + return(node->data); +} + +/******************************************************************//** +Sets hash node data. */ +UNIV_INLINE +void +ha_node_set_data_func( +/*==================*/ + ha_node_t* node, /*!< in: hash chain node */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block, /*!< in: buffer block containing the data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data) /*!< in: pointer to the data */ +{ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + node->block = block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + node->data = data; +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** Sets hash node data. +@param n in: hash chain node +@param b in: buffer block containing the data +@param d in: pointer to the data */ +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/** Sets hash node data. 
+@param n in: hash chain node +@param b in: buffer block containing the data +@param d in: pointer to the data */ +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +/******************************************************************//** +Gets the next node in a hash chain. +@return next node, NULL if none */ +UNIV_INLINE +ha_node_t* +ha_chain_get_next( +/*==============*/ + const ha_node_t* node) /*!< in: hash chain node */ +{ + return(node->next); +} + +/******************************************************************//** +Gets the first node in a hash chain. +@return first node, NULL if none */ +UNIV_INLINE +ha_node_t* +ha_chain_get_first( +/*===============*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold value determining the chain */ +{ + return static_cast<ha_node_t*>(table->array[table->calc_hash(fold)].node); +} + +/*************************************************************//** +Looks for an element in a hash table. +@return pointer to the data of the first hash table node in chain +having the fold number, NULL if not found */ +UNIV_INLINE +const rec_t* +ha_search_and_get_data( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: folded value of the searched data */ +{ + ut_ad(btr_search_enabled); + + for (const ha_node_t* node = ha_chain_get_first(table, fold); + node != NULL; + node = ha_chain_get_next(node)) { + + if (node->fold == fold) { + + return(node->data); + } + } + + return(NULL); +} + +/*********************************************************//** +Looks for an element when we know the pointer to the data. +@return pointer to the hash table node, NULL if not found in the table */ +UNIV_INLINE +ha_node_t* +ha_search_with_data( +/*================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of the searched data */ + const rec_t* data) /*!< in: pointer to the data */ +{ + ha_node_t* node; + + ut_ad(btr_search_enabled); + + node = ha_chain_get_first(table, fold); + + while (node) { + if (node->data == data) { + + return(node); + } + + node = ha_chain_get_next(node); + } + + return(NULL); +} + +#endif /* BTR_CUR_HASH_ADAPT */ diff --git a/storage/innobase/include/ha0storage.h b/storage/innobase/include/ha0storage.h new file mode 100644 index 00000000..fdf50a2e --- /dev/null +++ b/storage/innobase/include/ha0storage.h @@ -0,0 +1,137 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0storage.h +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates.
+ +Created September 22, 2007 Vasil Dimov +*******************************************************/ + +#ifndef ha0storage_h +#define ha0storage_h + +#include "univ.i" + +/** This value is used by default by ha_storage_create(). More memory +is allocated later when/if it is needed. */ +#define HA_STORAGE_DEFAULT_HEAP_BYTES 1024 + +/** This value is used by default by ha_storage_create(). It is a +constant per ha_storage's lifetime. */ +#define HA_STORAGE_DEFAULT_HASH_CELLS 4096 + +/** Hash storage */ +struct ha_storage_t; + +/*******************************************************************//** +Creates a hash storage. If any of the parameters is 0, then a default +value is used. +@return own: hash storage */ +UNIV_INLINE +ha_storage_t* +ha_storage_create( +/*==============*/ + ulint initial_heap_bytes, /*!< in: initial heap's size */ + ulint initial_hash_cells); /*!< in: initial number of cells + in the hash table */ + +/*******************************************************************//** +Copies data into the storage and returns a pointer to the copy. If the +same data chunk is already present, then pointer to it is returned. +Data chunks are considered to be equal if len1 == len2 and +memcmp(data1, data2, len1) == 0. If "data" is not present (and thus +data_len bytes need to be allocated) and the size of storage is going to +become more than "memlim" then "data" is not added and NULL is returned. +To disable this behavior "memlim" can be set to 0, which stands for +"no limit". +@return pointer to the copy */ +const void* +ha_storage_put_memlim( +/*==================*/ + ha_storage_t* storage, /*!< in/out: hash storage */ + const void* data, /*!< in: data to store */ + ulint data_len, /*!< in: data length */ + ulint memlim); /*!< in: memory limit to obey */ + +/*******************************************************************//** +Same as ha_storage_put_memlim() but without memory limit. +@param storage in/out: hash storage +@param data in: data to store +@param data_len in: data length +@return pointer to the copy of the string */ +#define ha_storage_put(storage, data, data_len) \ + ha_storage_put_memlim((storage), (data), (data_len), 0) + +/*******************************************************************//** +Copies string into the storage and returns a pointer to the copy. If the +same string is already present, then pointer to it is returned. +Strings are considered to be equal if strcmp(str1, str2) == 0. +@param storage in/out: hash storage +@param str in: string to put +@return pointer to the copy of the string */ +#define ha_storage_put_str(storage, str) \ + ((const char*) ha_storage_put((storage), (str), strlen(str) + 1)) + +/*******************************************************************//** +Copies string into the storage and returns a pointer to the copy obeying +a memory limit. +If the same string is already present, then pointer to it is returned. +Strings are considered to be equal if strcmp(str1, str2) == 0. +@param storage in/out: hash storage +@param str in: string to put +@param memlim in: memory limit to obey +@return pointer to the copy of the string */ +#define ha_storage_put_str_memlim(storage, str, memlim) \ + ((const char*) ha_storage_put_memlim((storage), (str), \ + strlen(str) + 1, (memlim))) + +/*******************************************************************//** +Empties a hash storage, freeing memory occupied by data chunks. +This invalidates any pointers previously returned by ha_storage_put(). 
+The hash storage is not invalidated itself and can be used again. */ +UNIV_INLINE +void +ha_storage_empty( +/*=============*/ + ha_storage_t** storage); /*!< in/out: hash storage */ + +/*******************************************************************//** +Frees a hash storage and everything it contains, it cannot be used after +this call. +This invalidates any pointers previously returned by ha_storage_put(). */ +UNIV_INLINE +void +ha_storage_free( +/*============*/ + ha_storage_t* storage); /*!< in, own: hash storage */ + +/*******************************************************************//** +Gets the size of the memory used by a storage. +@return bytes used */ +UNIV_INLINE +ulint +ha_storage_get_size( +/*================*/ + const ha_storage_t* storage); /*!< in: hash storage */ + +#include "ha0storage.inl" + +#endif /* ha0storage_h */ diff --git a/storage/innobase/include/ha0storage.inl b/storage/innobase/include/ha0storage.inl new file mode 100644 index 00000000..df9679cf --- /dev/null +++ b/storage/innobase/include/ha0storage.inl @@ -0,0 +1,142 @@ +/***************************************************************************** + +Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0storage.ic +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. + +Created September 24, 2007 Vasil Dimov +*******************************************************/ + +#include "hash0hash.h" +#include "mem0mem.h" + +/** Hash storage for strings */ +struct ha_storage_t { + mem_heap_t* heap; /*!< memory heap from which memory is + allocated */ + hash_table_t hash; /*!< hash table used to avoid + duplicates */ +}; + +/** Objects of this type are stored in ha_storage_t */ +struct ha_storage_node_t { + ulint data_len;/*!< length of the data */ + const void* data; /*!< pointer to data */ + ha_storage_node_t* next; /*!< next node in hash chain */ +}; + +/*******************************************************************//** +Creates a hash storage. If any of the parameters is 0, then a default +value is used. 
+@return own: hash storage */ +UNIV_INLINE +ha_storage_t* +ha_storage_create( +/*==============*/ + ulint initial_heap_bytes, /*!< in: initial heap's size */ + ulint initial_hash_cells) /*!< in: initial number of cells + in the hash table */ +{ + ha_storage_t* storage; + mem_heap_t* heap; + + if (initial_heap_bytes == 0) { + + initial_heap_bytes = HA_STORAGE_DEFAULT_HEAP_BYTES; + } + + if (initial_hash_cells == 0) { + + initial_hash_cells = HA_STORAGE_DEFAULT_HASH_CELLS; + } + + /* we put "storage" within "storage->heap" */ + + heap = mem_heap_create(sizeof(ha_storage_t) + + initial_heap_bytes); + + storage = (ha_storage_t*) mem_heap_alloc(heap, + sizeof(ha_storage_t)); + + storage->heap = heap; + storage->hash.create(initial_hash_cells); + + return(storage); +} + +/*******************************************************************//** +Empties a hash storage, freeing memory occupied by data chunks. +This invalidates any pointers previously returned by ha_storage_put(). +The hash storage is not invalidated itself and can be used again. */ +UNIV_INLINE +void +ha_storage_empty( +/*=============*/ + ha_storage_t** storage) /*!< in/out: hash storage */ +{ + ha_storage_t temp_storage; + + temp_storage.heap = (*storage)->heap; + temp_storage.hash = (*storage)->hash; + + temp_storage.hash.clear(); + mem_heap_empty(temp_storage.heap); + + *storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap, + sizeof(ha_storage_t)); + + (*storage)->heap = temp_storage.heap; + (*storage)->hash = temp_storage.hash; +} + +/*******************************************************************//** +Frees a hash storage and everything it contains, it cannot be used after +this call. +This invalidates any pointers previously returned by ha_storage_put(). */ +UNIV_INLINE +void +ha_storage_free( +/*============*/ + ha_storage_t* storage) /*!< in, own: hash storage */ +{ + storage->hash.free(); + mem_heap_free(storage->heap); +} + +/*******************************************************************//** +Gets the size of the memory used by a storage. +@return bytes used */ +UNIV_INLINE +ulint +ha_storage_get_size( +/*================*/ + const ha_storage_t* storage) /*!< in: hash storage */ +{ + ulint ret; + + ret = mem_heap_get_size(storage->heap); + + /* this assumes hash->heap and hash->heaps are NULL */ + ret += sizeof(hash_table_t); + ret += sizeof(hash_cell_t) * storage->hash.n_cells; + + return(ret); +}
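[Editor's note] A hypothetical usage sketch of the ha_storage API implemented above (it assumes an InnoDB build environment for ha_storage_t, ulint and the ut_ad() debug assertion; every call used is one declared in ha0storage.h):

/* Store two identical chunks and observe the deduplication. */
static void
ha_storage_usage_demo()
{
	ha_storage_t*	storage = ha_storage_create(0, 0); /* use defaults */

	/* Identical chunks are stored only once: both puts return
	the same pointer into the storage's private heap. */
	const void*	p1 = ha_storage_put(storage, "abc", 4);
	const void*	p2 = ha_storage_put(storage, "abc", 4);
	ut_ad(p1 == p2);

	/* Convenience wrapper for NUL-terminated strings. */
	const char*	str = ha_storage_put_str(storage, "hello");
	ut_ad(str != NULL);

	ulint		size = ha_storage_get_size(storage);
	ut_ad(size > 0);

	ha_storage_empty(&storage);	/* p1, p2 and str now dangle */
	ha_storage_free(storage);
}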
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h new file mode 100644 index 00000000..d5239ec3 --- /dev/null +++ b/storage/innobase/include/ha_prototypes.h @@ -0,0 +1,476 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ha_prototypes.h +Prototypes for global functions in ha_innodb.cc that are called by +InnoDB C code. + +NOTE: This header is intended to insulate InnoDB from SQL names and functions. +Do not include any headers other than univ.i into this unless they are very +simple headers. +************************************************************************/ + +#ifndef HA_INNODB_PROTOTYPES_H +#define HA_INNODB_PROTOTYPES_H + +#include "univ.i" + +#ifndef UNIV_INNOCHECKSUM + +/* Forward declarations */ +class THD; +class Field; + +// JAN: TODO missing features: +#undef MYSQL_FT_INIT_EXT +#undef MYSQL_PFS +#undef MYSQL_STORE_FTS_DOC_ID + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes +the result to "buf". The result is converted to "system_charset_info". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +ulint +innobase_raw_format( +/*================*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint charset_coll, /*!< in: charset collation */ + char* buf, /*!< out: output buffer */ + ulint buf_size); /*!< in: output buffer size + in bytes */ + +/*****************************************************************//** +Invalidates the MySQL query cache for the table. */ +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /*!< in: transaction which + modifies the table */ + const char* full_name); /*!< in: concatenation of + database name, path separator, + table name, null char NUL; + NOTE that in Windows this is + always in LOWER CASE! */ + +/** Quote a standard SQL identifier like tablespace, index or column name. +@param[in] file output stream +@param[in] trx InnoDB transaction, or NULL +@param[in] id identifier to quote */ +void +innobase_quote_identifier( + FILE* file, + trx_t* trx, + const char* id); + +/** Quote a standard SQL identifier like tablespace, index or column name. +Return the string as a std::string object. +@param[in] trx InnoDB transaction, or NULL +@param[in] id identifier to quote +@return a std::string with id properly quoted. */ +std::string +innobase_quote_identifier( + trx_t* trx, + const char* id); + +/*****************************************************************//** +Convert a table name to the MySQL system_charset_info (UTF-8).
+@return pointer to the end of buf */ +char* +innobase_convert_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* id, /*!< in: table name to convert */ + ulint idlen, /*!< in: length of id, in bytes */ + THD* thd); /*!< in: MySQL connection thread, or NULL */ + +/******************************************************************//** +Returns true if the transaction this thread is processing has edited +non-transactional tables. Used by the deadlock detector when deciding +which transaction to rollback in case of a deadlock - we try to avoid +rolling back transactions that have edited non-transactional tables. +@return true if non-transactional tables have been edited */ +ibool +thd_has_edited_nontrans_tables( +/*===========================*/ + THD* thd); /*!< in: thread handle */ + +/*************************************************************//** +Prints info of a THD object (== user session thread) to the given file. */ +void +innobase_mysql_print_thd( +/*=====================*/ + FILE* f, /*!< in: output stream */ + THD* thd, /*!< in: pointer to a MySQL THD object */ + uint max_query_len); /*!< in: max query length to print, or 0 to + use the default max length */ + +/** Converts a MySQL type to an InnoDB type. Note that this function returns +the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 +VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'. +@param[out] unsigned_flag DATA_UNSIGNED if an 'unsigned type'; +at least ENUM and SET, and unsigned integer types are 'unsigned types' +@param[in] f MySQL Field +@return DATA_BINARY, DATA_VARCHAR, ... */ +uint8_t +get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field); + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. +@return 0 if a=b, <0 if a<b, >0 if a>b */ +int +innobase_strcasecmp( +/*================*/ + const char* a, /*!< in: first string to compare */ + const char* b); /*!< in: second string to compare */ + +/** Strip dir name from a full path name and return only the file name +@param[in] path_name full path name +@return file name or "null" if no file name */ +const char* +innobase_basename( + const char* path_name); + +/******************************************************************//** +Converts an identifier to a table name. */ +void +innobase_convert_from_table_id( +/*===========================*/ + CHARSET_INFO* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len); /*!< in: length of 'to', in bytes; should + be at least 5 * strlen(to) + 1 */ +/******************************************************************//** +Converts an identifier to UTF-8. */ +void +innobase_convert_from_id( +/*=====================*/ + CHARSET_INFO* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len); /*!< in: length of 'to', in bytes; + should be at least 3 * strlen(to) + 1 */ +/******************************************************************//** +Makes all characters in a NUL-terminated UTF-8 string lower case.
*/ +void +innobase_casedn_str( +/*================*/ + char* a); /*!< in/out: string to put in lower case */ + +#ifdef WITH_WSREP +ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number, + unsigned char* str, ulint str_length, + ulint buf_length); +#endif /* WITH_WSREP */ + +extern "C" struct charset_info_st *thd_charset(THD *thd); + +/** Get high resolution timestamp for the current query start time. +The timestamp is not anchored to any specific point in time, +but can be used for comparison. +@param thd user thread +@retval timestamp in microseconds precision +*/ +extern "C" unsigned long long thd_start_utime(const MYSQL_THD thd); + + +/** Determines the current SQL statement. +Thread unsafe, can only be called from the thread owning the THD. +@param[in] thd MySQL thread handle +@param[out] length Length of the SQL statement +@return SQL statement string */ +const char* +innobase_get_stmt_unsafe( + THD* thd, + size_t* length); + +/******************************************************************//** +This function is used to find the storage length in bytes of the first n +characters for prefix indexes using a multibyte character set. The function +finds charset information and returns length of prefix_len characters in the +index field in bytes. +@return number of bytes occupied by the first n characters */ +ulint +innobase_get_at_most_n_mbchars( +/*===========================*/ + ulint charset_id, /*!< in: character set id */ + ulint prefix_len, /*!< in: prefix length in bytes of the index + (this has to be divided by mbmaxlen to get the + number of CHARACTERS n in the prefix) */ + ulint data_len, /*!< in: length of the string in bytes */ + const char* str); /*!< in: character string */ + +/** Get status of innodb_tmpdir. +@param[in] thd thread handle, or NULL to query + the global innodb_tmpdir. +@retval NULL if innodb_tmpdir="" */ +const char *thd_innodb_tmpdir(THD *thd); + +/******************************************************************//** +Returns the lock wait timeout for the current connection. +@return the lock wait timeout, in seconds */ +uint& +thd_lock_wait_timeout( +/*==================*/ + THD* thd); /*!< in: thread handle, or NULL to query + the global innodb_lock_wait_timeout */ + +/******************************************************************//** +compare two character string case insensitively according to their charset. */ +int +innobase_fts_text_case_cmp( +/*=======================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/******************************************************************//** +Returns true if transaction should be flagged as read-only. +@return true if the thd is marked as read-only */ +bool +thd_trx_is_read_only( +/*=================*/ + THD* thd); /*!< in/out: thread handle */ + +/******************************************************************//** +Check if the transaction is an auto-commit transaction. TRUE also +implies that it is a SELECT (read-only) transaction. +@return true if the transaction is an auto commit read-only transaction. */ +ibool +thd_trx_is_auto_commit( +/*===================*/ + THD* thd); /*!< in: thread handle, or NULL */ + +/*****************************************************************//** +A wrapper function of innobase_convert_name(), convert a table name +to the MySQL system_charset_info (UTF-8) and quote it if needed. 
+@return pointer to the end of buf */ +void +innobase_format_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* name); /*!< in: table name to format */ + +/** Corresponds to Sql_condition:enum_warning_level. */ +enum ib_log_level_t { + IB_LOG_LEVEL_INFO, + IB_LOG_LEVEL_WARN, + IB_LOG_LEVEL_ERROR, + IB_LOG_LEVEL_FATAL +}; + +/******************************************************************//** +Use this when the args are first converted to a formatted string and then +passed to the format string from errmsg-utf8.txt. The error message format +must be: "Some string ... %s". + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +void +ib_errf( +/*====*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + const char* format, /*!< printf format */ + ...) /*!< Args */ + MY_ATTRIBUTE((format(printf, 4, 5))); + +/******************************************************************//** +Use this when the args are passed to the format string from +errmsg-utf8.txt directly as is. + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +void +ib_senderrf( +/*========*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + ...); /*!< Args */ + +extern const char* TROUBLESHOOTING_MSG; +extern const char* TROUBLESHOOT_DATADICT_MSG; +extern const char* BUG_REPORT_MSG; +extern const char* FORCE_RECOVERY_MSG; +extern const char* OPERATING_SYSTEM_ERROR_MSG; +extern const char* FOREIGN_KEY_CONSTRAINTS_MSG; +extern const char* SET_TRANSACTION_MSG; +extern const char* INNODB_PARAMETERS_MSG; + +/******************************************************************//** +Returns the NUL terminated value of glob_hostname. +@return pointer to glob_hostname. */ +const char* +server_get_hostname(); +/*=================*/ + +/*********************************************************************//** +Compute the next autoinc value. + +For MySQL replication the autoincrement values can be partitioned among +the nodes. The offset is the start or origin of the autoincrement value +for a particular node. For n nodes the increment will be n and the offset +will be in the interval [1, n]. The formula tries to allocate the next +value for a particular node. + +Note: This function is also called with increment set to the number of +values we want to reserve for multi-value inserts e.g., + + INSERT INTO T VALUES(), (), (); + +innobase_next_autoinc() will be called with increment set to 3 where +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for +the multi-value INSERT above. 
+@return the next value (see the worked sketch following these declarations) */ +ulonglong +innobase_next_autoinc( +/*==================*/ + ulonglong current, /*!< in: Current value */ + ulonglong need, /*!< in: count of values needed */ + ulonglong step, /*!< in: AUTOINC increment step */ + ulonglong offset, /*!< in: AUTOINC offset */ + ulonglong max_value) /*!< in: max value for type */ + MY_ATTRIBUTE((pure, warn_unused_result)); + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. */ +uint +innobase_convert_to_system_charset( +/*===============================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len, /* in: length of 'to', in bytes */ + uint* errors); /* out: error return */ +/********************************************************************** +Check if the length of the identifier exceeds the maximum allowed. +The input to this function is an identifier in charset my_charset_filename. +@return true when the length of the identifier is too long. */ +my_bool +innobase_check_identifier_length( +/*=============================*/ + const char* id); /* in: identifier to check. it must belong + to charset my_charset_filename */ + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. */ +uint +innobase_convert_to_system_charset( +/*===============================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len, /* in: length of 'to', in bytes */ + uint* errors); /* out: error return */ + +/********************************************************************** +Converts an identifier from UTF-8 to my_charset_filename charset. */ +uint +innobase_convert_to_filename_charset( +/*=================================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len); /* in: length of 'to', in bytes */ + +/********************************************************************//** +Helper function to push warnings from InnoDB internals to SQL-layer. */ +void +ib_push_warning( + trx_t* trx, /*!< in: trx */ + dberr_t error, /*!< in: error code to push as warning */ + const char *format,/*!< in: warning message */ + ...); + +/********************************************************************//** +Helper function to push warnings from InnoDB internals to SQL-layer. */ +void +ib_push_warning( + void* ithd, /*!< in: thd */ + dberr_t error, /*!< in: error code to push as warning */ + const char *format,/*!< in: warning message */ + ...); + +/********************************************************************//** +Helper function to push warnings from InnoDB internals to SQL-layer. */ +void +ib_foreign_warn( + trx_t* trx, /*!< in: trx */ + dberr_t error, /*!< in: error code to push as warning */ + const char *table_name, + const char *format,/*!< in: warning message */ + ...);
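[Editor's note] The interval arithmetic described in the innobase_next_autoinc() comment above can be made concrete. A worked sketch follows (illustrative only; this hypothetical helper is one plausible reading of the partitioning scheme, not the server's implementation, which additionally handles offset > step, offset = 0 and unsigned overflow against max_value):

#include <cstdint>

/* With n nodes, step = n and offset is in [1, n], so each node draws
values from its own residue class offset + k * step. */
static uint64_t
next_autoinc_demo(uint64_t current, uint64_t need, uint64_t step,
		  uint64_t offset)
{
	/* Index of the smallest class member strictly above current. */
	uint64_t k = (current < offset) ? 0 : (current - offset) / step + 1;

	/* Reserving "need" values advances the counter to the last
	value handed out. */
	return offset + (k + need - 1) * step;
}

For example, with step = 3 and offset = 2 (node 2 of 3), current = 5 and need = 3 reserve the values 8, 11 and 14, and the sketch returns 14.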
+/*****************************************************************//** +Normalizes a table name string. A normalized name consists of the +database name catenated to '/' and table name. An example: +test/mytable. On Windows normalization puts both the database name and the +table name always to lower case if "set_lower_case" is set to TRUE. */ +void +normalize_table_name_c_low( +/*=======================*/ + char* norm_name, /*!< out: normalized name as a + null-terminated string */ + const char* name, /*!< in: table name string */ + bool set_lower_case); /*!< in: true if we want to set + name to lower case */ + +/** Create a MYSQL_THD for a background thread and mark it as such. +@param name thread info for SHOW PROCESSLIST +@return new MYSQL_THD */ +MYSQL_THD innobase_create_background_thd(const char* name); + +/** Destroy a THD object associated with a background task. +@param[in] thd MYSQL_THD to destroy */ +void destroy_background_thd(MYSQL_THD thd); + +/** Close opened tables, free memory, delete items for a MYSQL_THD. +@param[in] thd MYSQL_THD to reset */ +void +innobase_reset_background_thd(MYSQL_THD); + +#ifdef WITH_WSREP +/** Append table-level exclusive key. +@param thd MySQL thread handle +@param table table +@retval false on success +@retval true on failure */ +struct dict_table_t; +bool wsrep_append_table_key(MYSQL_THD thd, const dict_table_t &table); +#endif /* WITH_WSREP */ + +#endif /* !UNIV_INNOCHECKSUM */ +#endif /* HA_INNODB_PROTOTYPES_H */ diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h new file mode 100644 index 00000000..add983a0 --- /dev/null +++ b/storage/innobase/include/handler0alter.h @@ -0,0 +1,108 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/handler0alter.h +Smart ALTER TABLE +*******************************************************/ + +#include "rem0types.h" + +/*************************************************************//** +Copies an InnoDB record to table->record[0]. */ +void +innobase_rec_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets)/*!< in: rec_get_offsets( + rec, index, ...) */ + MY_ATTRIBUTE((nonnull)); + +/*************************************************************//** +Copies an InnoDB index entry to table->record[0]. */ +void +innobase_fields_to_mysql( +/*=====================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_index_t* index, /*!< in: InnoDB index */ + const dfield_t* fields) /*!< in: InnoDB index fields */ + MY_ATTRIBUTE((nonnull)); + +/*************************************************************//** +Copies an InnoDB row to table->record[0].
*/ +void +innobase_row_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_table_t* itab, /*!< in: InnoDB table */ + const dtuple_t* row) /*!< in: InnoDB row */ + MY_ATTRIBUTE((nonnull)); + +/** Generate the next autoinc based on a snapshot of the session +auto_increment_increment and auto_increment_offset variables. */ +struct ib_sequence_t { + + /** + @param thd the session + @param start_value the lower bound + @param max_value the upper bound (inclusive) */ + ib_sequence_t(THD* thd, ulonglong start_value, ulonglong max_value); + + /** Postfix increment + @return the value to insert */ + ulonglong operator++(int) UNIV_NOTHROW; + + /** Check if the autoinc "sequence" is exhausted. + @return true if the sequence is exhausted */ + bool eof() const UNIV_NOTHROW + { + return(m_eof); + } + + /** + @return the next value in the sequence */ + ulonglong last() const UNIV_NOTHROW + { + ut_ad(m_next_value > 0); + + return(m_next_value); + } + + /** @return maximum column value + @retval 0 if not adding AUTO_INCREMENT column */ + ulonglong max_value() const { return m_max_value; } + +private: + /** Maximum value if adding an AUTO_INCREMENT column, else 0 */ + ulonglong m_max_value; + + /** Value of auto_increment_increment */ + ulong m_increment; + + /** Value of auto_increment_offset */ + ulong m_offset; + + /** Next value in the sequence */ + ulonglong m_next_value; + + /** true if no more values left in the sequence */ + bool m_eof; +}; diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h new file mode 100644 index 00000000..867ad9e0 --- /dev/null +++ b/storage/innobase/include/hash0hash.h @@ -0,0 +1,190 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/hash0hash.h +The simple hash table utility + +Created 5/20/1997 Heikki Tuuri +*******************************************************/ + +#pragma once +#include "ut0rnd.h" +#include "ut0new.h" + +struct hash_table_t; +struct hash_cell_t +{ + /** singly-linked, nullptr terminated list of hash buckets */ + void *node; + + /** Append an element. + @tparam T type of the element + @param insert the being-inserted element + @param next the next-element pointer in T */ + template + void append(T &insert, T *T::*next) + { + void **after; + for (after= &node; *after; + after= reinterpret_cast(&(static_cast(*after)->*next))); + insert.*next= nullptr; + *after= &insert; + } +}; + +/*******************************************************************//** +Inserts a struct to a hash table. 
*/ + +#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + (DATA)->NAME = NULL;\ +\ + cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \ +\ + if (cell3333->node == NULL) {\ + cell3333->node = DATA;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != NULL) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + }\ +\ + struct3333->NAME = DATA;\ + }\ +} while (0) + +#ifdef UNIV_HASH_DEBUG +# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1) +# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1 +#else +# define HASH_ASSERT_VALID(DATA) do {} while (0) +# define HASH_INVALIDATE(DATA, NAME) do {} while (0) +#endif + +/*******************************************************************//** +Deletes a struct from a hash table. */ + +#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \ +\ + if (cell3333->node == DATA) {\ + HASH_ASSERT_VALID(DATA->NAME);\ + cell3333->node = DATA->NAME;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != DATA) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + ut_a(struct3333);\ + }\ +\ + struct3333->NAME = DATA->NAME;\ + }\ + HASH_INVALIDATE(DATA, NAME);\ +} while (0) + +/*******************************************************************//** +Gets the first struct in a hash chain, NULL if none. */ + +#define HASH_GET_FIRST(TABLE, HASH_VAL) (TABLE)->array[HASH_VAL].node + +/*******************************************************************//** +Gets the next struct in a hash chain, NULL if none. */ + +#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME) + +/********************************************************************//** +Looks for a struct in a hash table. */ +#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\ +{\ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, (TABLE)->calc_hash(FOLD)); \ + HASH_ASSERT_VALID(DATA);\ +\ + while ((DATA) != NULL) {\ + ASSERTION;\ + if (TEST) {\ + break;\ + } else {\ + HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\ + }\ + }\ +} + +/********************************************************************//** +Looks for an item in all hash buckets. */ +#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST) \ +do { \ + ulint i3333; \ + \ + for (i3333 = (TABLE)->n_cells; i3333--; ) { \ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333); \ + \ + while ((DATA) != NULL) { \ + HASH_ASSERT_VALID(DATA); \ + ASSERTION; \ + \ + if (TEST) { \ + break; \ + } \ + \ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA); \ + } \ + \ + if ((DATA) != NULL) { \ + break; \ + } \ + } \ +} while (0) + +/** Hash table with singly-linked overflow lists */ +struct hash_table_t +{ + /** number of elements in array (a prime number) */ + ulint n_cells; + /** the hash array */ + hash_cell_t *array; + + /** Create the hash table. + @param n the lower bound of n_cells */ + void create(ulint n) + { + n_cells= ut_find_prime(n); + array= static_cast(ut_zalloc_nokey(n_cells * sizeof *array)); + } + + /** Clear the hash table. */ + void clear() { memset(array, 0, n_cells * sizeof *array); } + + /** Free the hash table. 
+
+/** Hash table with singly-linked overflow lists */
+struct hash_table_t
+{
+  /** number of elements in array (a prime number) */
+  ulint n_cells;
+  /** the hash array */
+  hash_cell_t *array;
+
+  /** Create the hash table.
+  @param n  the lower bound of n_cells */
+  void create(ulint n)
+  {
+    n_cells= ut_find_prime(n);
+    array= static_cast<hash_cell_t*>
+      (ut_zalloc_nokey(n_cells * sizeof *array));
+  }
+
+  /** Clear the hash table. */
+  void clear() { memset(array, 0, n_cells * sizeof *array); }
+
+  /** Free the hash table. */
+  void free() { ut_free(array); array= nullptr; }
+
+  ulint calc_hash(ulint fold) const { return ut_hash_ulint(fold, n_cells); }
+};
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
new file mode 100644
index 00000000..c246b2ef
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -0,0 +1,436 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.h
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0ibuf_h
+#define ibuf0ibuf_h
+
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+
+/** Default value for maximum on-disk size of change buffer in terms
+of percentage of the buffer pool. */
+#define CHANGE_BUFFER_DEFAULT_SIZE	(25)
+
+/* Possible operations buffered in the insert/whatever buffer. See
+ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */
+typedef enum {
+	IBUF_OP_INSERT = 0,
+	IBUF_OP_DELETE_MARK = 1,
+	IBUF_OP_DELETE = 2,
+
+	/* Number of different operation types. */
+	IBUF_OP_COUNT = 3
+} ibuf_op_t;
+
+/** Combinations of operations that can be buffered.
+@see innodb_change_buffering_names */
+enum ibuf_use_t {
+	IBUF_USE_NONE = 0,
+	IBUF_USE_INSERT,		/* insert */
+	IBUF_USE_DELETE_MARK,		/* delete */
+	IBUF_USE_INSERT_DELETE_MARK,	/* insert+delete */
+	IBUF_USE_DELETE,		/* delete+purge */
+	IBUF_USE_ALL			/* insert+delete+purge */
+};
+
+/** Operations that can currently be buffered. */
+extern ulong	innodb_change_buffering;
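Reading the enumerators together: each innodb_change_buffering setting admits
only a subset of the three on-disk operation types. The helper below does not
exist in the source; it is a hedged restatement of the inline comments on
ibuf_use_t:

// Illustrative only: which ibuf_op_t values a given ibuf_use_t admits,
// per the comments on the enumerators above.
inline bool ibuf_use_admits(ibuf_use_t use, ibuf_op_t op)
{
	switch (use) {
	case IBUF_USE_NONE:			return false;
	case IBUF_USE_INSERT:			return op == IBUF_OP_INSERT;
	case IBUF_USE_DELETE_MARK:		return op == IBUF_OP_DELETE_MARK;
	case IBUF_USE_INSERT_DELETE_MARK:	return op == IBUF_OP_INSERT
						    || op == IBUF_OP_DELETE_MARK;
	case IBUF_USE_DELETE:			return op == IBUF_OP_DELETE_MARK
						    || op == IBUF_OP_DELETE;
	case IBUF_USE_ALL:			return true;
	}
	return false;
}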
+
+/** Insert buffer struct */
+struct ibuf_t{
+	Atomic_relaxed<ulint>	size;	/*!< current size of the ibuf index
+					tree, in pages */
+	Atomic_relaxed<ulint>	max_size;
+					/*!< recommended maximum size of the
+					ibuf index tree, in pages */
+	ulint		seg_size;	/*!< allocated pages of the file
+					segment containing ibuf header and
+					tree */
+	bool		empty;		/*!< Protected by the page
+					latch of the root page of the
+					insert buffer tree
+					(FSP_IBUF_TREE_ROOT_PAGE_NO). true
+					if and only if the insert
+					buffer tree is empty. */
+	ulint		free_list_len;	/*!< length of the free list */
+	ulint		height;		/*!< tree height */
+	dict_index_t*	index;		/*!< insert buffer index */
+
+	/** number of pages merged */
+	Atomic_counter<ulint>	n_merges;
+	Atomic_counter<ulint>	n_merged_ops[IBUF_OP_COUNT];
+					/*!< number of operations of each type
+					merged to index pages */
+	Atomic_counter<ulint>	n_discarded_ops[IBUF_OP_COUNT];
+					/*!< number of operations of each type
+					discarded without merging due to the
+					tablespace being deleted or the
+					index being dropped */
+};
+
+/** The insert buffer control structure */
+extern ibuf_t	ibuf;
+
+/* The purpose of the insert buffer is to reduce random disk access.
+When we wish to insert a record into a non-unique secondary index and
+the B-tree leaf page where the record belongs to is not in the buffer
+pool, we insert the record into the insert buffer B-tree, indexed by
+(space_id, page_no). When the page is eventually read into the buffer
+pool, we look up the insert buffer B-tree for any modifications to the
+page, and apply these upon the completion of the read operation. This
+is called the insert buffer merge. */
+
+/* The insert buffer merge must always succeed. To guarantee this,
+the insert buffer subsystem keeps track of the free space in pages for
+which it can buffer operations. Two bits per page in the insert
+buffer bitmap indicate the available space in coarse increments. The
+free bits in the insert buffer bitmap must never exceed the free space
+on a page. It is safe to decrement or reset the bits in the bitmap in
+a mini-transaction that is committed before the mini-transaction that
+affects the free space. It is unsafe to increment the bits in a
+separately committed mini-transaction, because in crash recovery, the
+free bits could momentarily be set too high. */
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup.
+@return DB_SUCCESS or failure */
+dberr_t
+ibuf_init_at_db_start(void);
+/*=======================*/
+/*********************************************************************//**
+Updates the max_size value for ibuf. */
+void
+ibuf_max_size_update(
+/*=================*/
+	ulint	new_val);	/*!< in: new value in terms of
+				percentage of the buffer pool size */
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+void
+ibuf_update_max_tablespace_id(void);
+/*===============================*/
+/***************************************************************//**
+Starts an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_start(
+/*===========*/
+	mtr_t*	mtr)	/*!< out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Commits an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_commit(
+/*============*/
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
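The two comment blocks above are the whole design in miniature: file changes
under a (space_id, page_no) key while the page is absent from the buffer
pool, then replay them when the page is finally read. A standalone toy
analogue, using ordinary containers in place of the persistent ibuf B-tree
(all names are illustrative, not InnoDB's):

#include <cstdint>
#include <map>
#include <utility>
#include <vector>

enum class toy_op { INSERT, DELETE_MARK, DELETE };
struct toy_change { toy_op op; int rec_key; };
using page_key = std::pair<uint32_t, uint32_t>;  // (space_id, page_no)

// Changes buffered for pages that are not in the buffer pool,
// keyed like the insert buffer B-tree.
static std::map<page_key, std::vector<toy_change>> buffered;

void toy_buffer(page_key page, toy_change change)
{
  buffered[page].push_back(change);  // cheap: no page read required
}

// The "merge": once the page is read, replay the buffered changes in
// buffering order, then forget them.
template<typename Page>
void toy_merge_on_read(page_key key, Page &page)
{
  auto it = buffered.find(key);
  if (it == buffered.end()) return;
  for (const toy_change &c : it->second) page.apply(c);
  buffered.erase(it);
}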
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+void
+ibuf_reset_free_bits(
+/*=================*/
+	buf_block_t*	block);	/*!< in: index page; free bits are set to 0
+				if the index is a non-clustered
+				non-unique, and page level is 0 */
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: index page to which we have added new
+				records; the free bits are updated if the
+				index is non-clustered and non-unique and
+				the page level is 0, and the page becomes
+				fuller */
+	ulint		max_ins_size,/*!< in: value of maximum insert size with
+				reorganize before the latest operation
+				performed to the page */
+	ulint		increase);/*!< in: upper limit for the additional space
+				used in the latest operation, if known, or
+				ULINT_UNDEFINED */
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_low(
+/*======================*/
+	const buf_block_t*	block,		/*!< in: index page */
+	ulint			max_ins_size,	/*!< in: value of
+						maximum insert size
+						with reorganize before
+						the latest operation
+						performed to the page */
+	mtr_t*			mtr);		/*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+	buf_block_t*	block,	/*!< in/out: index page */
+	mtr_t*		mtr);	/*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page.
It is safe to set the free bits in the same +mini-transaction that updated the pages. */ +void +ibuf_update_free_bits_for_two_pages_low( +/*====================================*/ + buf_block_t* block1, /*!< in: index page */ + buf_block_t* block2, /*!< in: index page */ + mtr_t* mtr); /*!< in: mtr */ +/**********************************************************************//** +A basic partial test if an insert to the insert buffer could be possible and +recommended. */ +UNIV_INLINE +ibool +ibuf_should_try( +/*============*/ + dict_index_t* index, /*!< in: index where to insert */ + ulint ignore_sec_unique); /*!< in: if != 0, we should + ignore UNIQUE constraint on + a secondary index when we + decide */ +/******************************************************************//** +Returns TRUE if the current OS thread is performing an insert buffer +routine. + +For instance, a read-ahead of non-ibuf pages is forbidden by threads +that are executing an insert buffer routine. +@return TRUE if inside an insert buffer routine */ +UNIV_INLINE +ibool +ibuf_inside( +/*========*/ + const mtr_t* mtr) /*!< in: mini-transaction */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Checks if a page address is an ibuf bitmap page (level 3 page) address. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return TRUE if a bitmap page */ +inline bool ibuf_bitmap_page(const page_id_t page_id, ulint zip_size) +{ + ut_ad(ut_is_2pow(zip_size)); + ulint size = zip_size ? zip_size : srv_page_size; + return (page_id.page_no() & (size - 1)) == FSP_IBUF_BITMAP_OFFSET; +} + +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] x_latch FALSE if relaxed check (avoid latching the +bitmap page) +@param[in,out] mtr mtr which will contain an x-latch to the +bitmap page if the page is not one of the fixed address ibuf pages, or NULL, +in which case a new transaction is created. +@return true if level 2 or level 3 page */ +bool +ibuf_page_low( + const page_id_t page_id, + ulint zip_size, +#ifdef UNIV_DEBUG + bool x_latch, +#endif /* UNIV_DEBUG */ + mtr_t* mtr) + MY_ATTRIBUTE((warn_unused_result)); + +#ifdef UNIV_DEBUG +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. +@param[in] page_id tablespace/page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction or NULL +@return TRUE if level 2 or level 3 page */ +# define ibuf_page(page_id, zip_size, mtr) \ + ibuf_page_low(page_id, zip_size, true, mtr) + +#else /* UNIV_DEBUG */ + +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. +@param[in] page_id tablespace/page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction or NULL +@return TRUE if level 2 or level 3 page */ +# define ibuf_page(page_id, zip_size, mtr) \ + ibuf_page_low(page_id, zip_size, mtr) + +#endif /* UNIV_DEBUG */ +/***********************************************************************//** +Frees excess pages from the ibuf free list. This function is called when an OS +thread calls fsp services to allocate a new file segment, or a new page to a +file segment, and the thread did not own the fsp latch before this call. 
*/ +void +ibuf_free_excess_pages(void); +/*========================*/ + +/** Buffer an operation in the change buffer, instead of applying it +directly to the file page, if this is possible. Does not do it if the index +is clustered or unique. +@param[in] op operation type +@param[in] entry index entry to insert +@param[in,out] index index where to insert +@param[in] page_id page id where to insert +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] thr query thread +@return true if success */ +bool +ibuf_insert( + ibuf_op_t op, + const dtuple_t* entry, + dict_index_t* index, + const page_id_t page_id, + ulint zip_size, + que_thr_t* thr); + +/** Check whether buffered changes exist for a page. +@param[in] id page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return whether buffered changes exist */ +bool ibuf_page_exists(const page_id_t id, ulint zip_size); + +/** When an index page is read from a disk to the buffer pool, this function +applies any buffered operations to the page and deletes the entries from the +insert buffer. If the page is not read, but created in the buffer pool, this +function deletes its buffered entries from the insert buffer; there can +exist entries for such a page if the page belonged to an index which +subsequently was dropped. +@param block X-latched page to try to apply changes to, or NULL to discard +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return error code */ +dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block, + const page_id_t page_id, + ulint zip_size); + +/** Delete all change buffer entries for a tablespace, +in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead. +@param[in] space missing or to-be-discarded tablespace */ +void ibuf_delete_for_discarded_space(uint32_t space); + +/** Contract the change buffer by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read +@retval 0 if ibuf.empty */ +ulint ibuf_contract(); + +/** Contracts insert buffer trees by reading pages referring to space_id +to the buffer pool. +@returns number of pages merged.*/ +ulint +ibuf_merge_space( +/*=============*/ + ulint space); /*!< in: space id */ + +/******************************************************************//** +Looks if the insert buffer is empty. +@return true if empty */ +bool +ibuf_is_empty(void); +/*===============*/ +/******************************************************************//** +Prints info of ibuf. */ +void +ibuf_print( +/*=======*/ + FILE* file); /*!< in: file where to print */ +/******************************************************************** +Read the first two bytes from a record's fourth field (counter field in new +records; something else in older records). +@return "counter" field, or ULINT_UNDEFINED if for some reason it can't be read */ +ulint +ibuf_rec_get_counter( +/*=================*/ + const rec_t* rec); /*!< in: ibuf record */ +/******************************************************************//** +Closes insert buffer and frees the data structures. */ +void +ibuf_close(void); +/*============*/ + +/** Check the insert buffer bitmaps on IMPORT TABLESPACE. 
+@param[in]	trx	transaction
+@param[in,out]	space	tablespace being imported
+@return DB_SUCCESS or error code */
+dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Update free bits and buffered bits for bulk loaded page.
+@param block	secondary index leaf page
+@param mtr	mini-transaction
+@param reset	whether the page is full */
+void ibuf_set_bitmap_for_bulk_load(buf_block_t *block, mtr_t *mtr, bool reset);
+
+#define IBUF_HEADER_PAGE_NO	FSP_IBUF_HEADER_PAGE_NO
+#define IBUF_TREE_ROOT_PAGE_NO	FSP_IBUF_TREE_ROOT_PAGE_NO
+
+/* The ibuf header page currently contains only the file segment header
+for the file segment from which the pages for the ibuf tree are allocated */
+#define IBUF_HEADER		PAGE_DATA
+#define IBUF_TREE_SEG_HEADER	0	/* fseg header for ibuf tree */
+
+/* The insert buffer tree itself is always located in space 0. */
+#define IBUF_SPACE_ID	static_cast<uint32_t>(0)
+
+#include "ibuf0ibuf.inl"
+
+#endif
diff --git a/storage/innobase/include/ibuf0ibuf.inl b/storage/innobase/include/ibuf0ibuf.inl
new file mode 100644
index 00000000..003bf22a
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.inl
@@ -0,0 +1,282 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.inl
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "page0page.h"
+#include "page0zip.h"
+#include "fsp0types.h"
+#include "buf0lru.h"
+
+/** An index page must contain at least srv_page_size /
+IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
+buffer inserts to this page. If there is this much of free space, the
+corresponding bits are set in the ibuf bitmap. */
+#define IBUF_PAGE_SIZE_PER_FREE_SPACE	32
+
+/***************************************************************//**
+Starts an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_start(
+/*===========*/
+	mtr_t*	mtr)	/*!< out: mini-transaction */
+{
+	mtr_start(mtr);
+	mtr->enter_ibuf();
+
+	if (high_level_read_only || srv_read_only_mode) {
+		mtr_set_log_mode(mtr, MTR_LOG_NO_REDO);
+	}
+
+}
+/***************************************************************//**
+Commits an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_commit(
+/*============*/
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+{
+	ut_ad(mtr->is_inside_ibuf());
+	ut_d(mtr->exit_ibuf());
+
+	mtr_commit(mtr);
+}
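IBUF_PAGE_SIZE_PER_FREE_SPACE fixes the granularity of the bitmap encoding:
one increment corresponds to page_size/32 bytes, i.e. 512 bytes on a 16KiB
page. A standalone check of the resulting thresholds; the bit computation
mirrors ibuf_index_page_calc_free_bits() further below, including its quirk
of demoting a raw quotient of 3:

#include <cassert>

// Mirrors ibuf_index_page_calc_free_bits(): quantize the maximum insert
// size into the 2-bit bitmap value 0..3. A raw quotient of exactly 3 is
// demoted to 2, so value 3 is reserved for pages with even more room.
static unsigned free_bits(unsigned long page_size, unsigned long max_ins_size)
{
  unsigned long n = max_ins_size / (page_size / 32);
  if (n == 3) n = 2;
  if (n > 3) n = 3;
  return unsigned(n);
}

int main()
{
  // 16KiB page: one increment is 512 bytes.
  assert(free_bits(16384, 400) == 0);
  assert(free_bits(16384, 600) == 1);
  assert(free_bits(16384, 1100) == 2);
  assert(free_bits(16384, 1600) == 2);  // quotient 3 demoted to 2
  assert(free_bits(16384, 2100) == 3);
}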
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+void
+ibuf_set_free_bits_func(
+/*====================*/
+	buf_block_t*	block,	/*!< in: index page of a non-clustered index;
+				free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+	ulint		max_val,/*!< in: ULINT_UNDEFINED or a maximum
+				value which the bits must have before
+				setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+	ulint		val);	/*!< in: value to set: < 4 */
+#ifdef UNIV_IBUF_DEBUG
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v)
+#else /* UNIV_IBUF_DEBUG */
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v)
+#endif /* UNIV_IBUF_DEBUG */
+
+/**********************************************************************//**
+A basic partial test if an insert to the insert buffer could be possible and
+recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+	dict_index_t*	index,			/*!< in: index where to insert */
+	ulint		ignore_sec_unique)	/*!< in: if != 0, we should
+						ignore UNIQUE constraint on
+						a secondary index when we
+						decide */
+{
+  if (index->type & (DICT_CLUSTERED | DICT_IBUF | DICT_SPATIAL) ||
+      !innodb_change_buffering || !ibuf.max_size)
+    return false;
+  if (!ignore_sec_unique && index->is_unique())
+    return false;
+  if (index->table->quiesce != QUIESCE_NONE)
+    return false;
+  for (unsigned i= 0; i < index->n_fields; i++)
+    if (index->fields[i].descending)
+      return false;
+  return true;
+}
+
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INLINE
+ibool
+ibuf_inside(
+/*========*/
+	const mtr_t*	mtr)	/*!< in: mini-transaction */
+{
+	return(mtr->is_inside_ibuf());
+}
+
+/** Translates the free space on a page to a value in the ibuf bitmap.
+@param[in]	page_size	page size in bytes
+@param[in]	max_ins_size	maximum insert size after reorganize for
+the page
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_bits(
+	ulint	page_size,
+	ulint	max_ins_size)
+{
+	ulint	n;
+	ut_ad(ut_is_2pow(page_size));
+	ut_ad(page_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+
+	n = max_ins_size / (page_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+
+	if (n == 3) {
+		n = 2;
+	}
+
+	if (n > 3) {
+		n = 3;
+	}
+
+	return(n);
+}
+
+/*********************************************************************//**
+Translates the free space on a compressed page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_zip(
+/*==========================*/
+	const buf_block_t*	block)	/*!< in: buffer block */
+{
+	ulint			max_ins_size;
+	const page_zip_des_t*	page_zip;
+	lint			zip_max_ins;
+
+	ut_ad(block->page.zip.data);
+
+	/* Consider the maximum insert size on the uncompressed page
+	without reorganizing the page. We must not assume anything
+	about the compression ratio. If zip_max_ins > max_ins_size and
+	there is 1/4 garbage on the page, recompression after the
+	reorganize could fail, in theory. So, let us guarantee that
+	merging a buffered insert to a compressed page will always
+	succeed without reorganizing or recompressing the page, just
+	by using the page modification log.
*/ + max_ins_size = page_get_max_insert_size( + buf_block_get_frame(block), 1); + + page_zip = buf_block_get_page_zip(block); + zip_max_ins = page_zip_max_ins_size(page_zip, + FALSE/* not clustered */); + + if (zip_max_ins < 0) { + return(0); + } else if (max_ins_size > (ulint) zip_max_ins) { + max_ins_size = (ulint) zip_max_ins; + } + + return(ibuf_index_page_calc_free_bits(block->physical_size(), + max_ins_size)); +} + +/*********************************************************************//** +Translates the free space on a page to a value in the ibuf bitmap. +@return value for ibuf bitmap bits */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free( +/*======================*/ + const buf_block_t* block) /*!< in: buffer block */ +{ + if (!block->page.zip.data) { + ulint max_ins_size; + + max_ins_size = page_get_max_insert_size_after_reorganize( + buf_block_get_frame(block), 1); + + return(ibuf_index_page_calc_free_bits( + block->physical_size(), max_ins_size)); + } else { + return(ibuf_index_page_calc_free_zip(block)); + } +} + +/************************************************************************//** +Updates the free bits of an uncompressed page in the ibuf bitmap if +there is not enough free on the page any more. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is +unsafe to increment the bits in a separately committed +mini-transaction, because in crash recovery, the free bits could +momentarily be set too high. It is only safe to use this function for +decrementing the free bits. Should more free space become available, +we must not update the free bits here, because that would break crash +recovery. 
*/ +UNIV_INLINE +void +ibuf_update_free_bits_if_full( +/*==========================*/ + buf_block_t* block, /*!< in: index page to which we have added new + records; the free bits are updated if the + index is non-clustered and non-unique and + the page level is 0, and the page becomes + fuller */ + ulint max_ins_size,/*!< in: value of maximum insert size with + reorganize before the latest operation + performed to the page */ + ulint increase)/*!< in: upper limit for the additional space + used in the latest operation, if known, or + ULINT_UNDEFINED */ +{ + ulint before; + ulint after; + + ut_ad(buf_block_get_page_zip(block) == NULL); + + before = ibuf_index_page_calc_free_bits( + srv_page_size, max_ins_size); + + if (max_ins_size >= increase) { + compile_time_assert(ULINT32_UNDEFINED > UNIV_PAGE_SIZE_MAX); + after = ibuf_index_page_calc_free_bits( + srv_page_size, max_ins_size - increase); +#ifdef UNIV_IBUF_DEBUG + ut_a(after <= ibuf_index_page_calc_free(block)); +#endif + } else { + after = ibuf_index_page_calc_free(block); + } + + if (after == 0) { + /* We move the page to the front of the buffer pool LRU list: + the purpose of this is to prevent those pages to which we + cannot make inserts using the insert buffer from slipping + out of the buffer pool */ + + buf_page_make_young(&block->page); + } + + if (before > after) { + ibuf_set_free_bits(block, after, before); + } +} diff --git a/storage/innobase/include/lock0iter.h b/storage/innobase/include/lock0iter.h new file mode 100644 index 00000000..a7e61395 --- /dev/null +++ b/storage/innobase/include/lock0iter.h @@ -0,0 +1,66 @@ +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0iter.h +Lock queue iterator type and function prototypes. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +#ifndef lock0iter_h +#define lock0iter_h + +#include "lock0types.h" + +struct lock_queue_iterator_t { + const lock_t* current_lock; + /* In case this is a record lock queue (not table lock queue) + then bit_no is the record number within the heap in which the + record is stored. */ + ulint bit_no; +}; + +/*******************************************************************//** +Initialize lock queue iterator so that it starts to iterate from +"lock". bit_no specifies the record number within the heap where the +record is stored. It can be undefined (ULINT_UNDEFINED) in two cases: +1. If the lock is a table lock, thus we have a table lock queue; +2. If the lock is a record lock and it is a wait lock. In this case + bit_no is calculated in this function by using + lock_rec_find_set_bit(). 
There is exactly one bit set in the bitmap + of a wait lock. */ +void +lock_queue_iterator_reset( +/*======================*/ + lock_queue_iterator_t* iter, /*!< out: iterator */ + const lock_t* lock, /*!< in: lock to start from */ + ulint bit_no);/*!< in: record number in the + heap */ + +/*******************************************************************//** +Gets the previous lock in the lock queue, returns NULL if there are no +more locks (i.e. the current lock is the first one). The iterator is +receded (if not-NULL is returned). +@return previous lock or NULL */ +const lock_t* +lock_queue_iterator_get_prev( +/*=========================*/ + lock_queue_iterator_t* iter); /*!< in/out: iterator */ + +#endif /* lock0iter_h */ diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h new file mode 100644 index 00000000..59ee7f55 --- /dev/null +++ b/storage/innobase/include/lock0lock.h @@ -0,0 +1,1271 @@ +/***************************************************************************** + +Copyright (c) 1996, 2022, Oracle and/or its affiliates. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0lock.h +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#ifndef lock0lock_h +#define lock0lock_h + +#include "buf0types.h" +#include "trx0trx.h" +#include "mtr0types.h" +#include "rem0types.h" +#include "hash0hash.h" +#include "srv0srv.h" +#include "ut0vec.h" +#include "gis0rtree.h" +#include "lock0prdt.h" +#include "transactional_lock_guard.h" + +// Forward declaration +class ReadView; + +/** The value of innodb_deadlock_detect */ +extern my_bool innodb_deadlock_detect; +/** The value of innodb_deadlock_report */ +extern ulong innodb_deadlock_report; + +namespace Deadlock +{ + /** The allowed values of innodb_deadlock_report */ + enum report { REPORT_OFF, REPORT_BASIC, REPORT_FULL }; +} + +/*********************************************************************//** +Gets the heap_no of the smallest user record on a page. +@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */ +UNIV_INLINE +ulint +lock_get_min_heap_no( +/*=================*/ + const buf_block_t* block); /*!< in: buffer block */ + +/** Discard locks for an index when purging DELETE FROM SYS_INDEXES +after an aborted CREATE INDEX operation. +@param index a stale index on which ADD INDEX operation was aborted */ +ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index); + +/*************************************************************//** +Updates the lock table when we have reorganized a page. 
NOTE: we copy +also the locks set on the infimum of the page; the infimum may carry +locks if an update of a record is occurring on the page, and its locks +were temporarily stored on the infimum. */ +void +lock_move_reorganize_page( +/*======================*/ + const buf_block_t* block, /*!< in: old index page, now + reorganized */ + const buf_block_t* oblock);/*!< in: copy of the old, not + reorganized page */ +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list end is moved to another page. */ +void +lock_move_rec_list_end( +/*===================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec); /*!< in: record on page: this + is the first record moved */ +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. */ +void +lock_move_rec_list_start( +/*=====================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec, /*!< in: record on page: + this is the first + record NOT copied */ + const rec_t* old_end); /*!< in: old + previous-to-last + record on new_page + before the records + were copied */ +/*************************************************************//** +Updates the lock table when a page is split to the right. */ +void +lock_update_split_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block); /*!< in: left page */ +/*************************************************************//** +Updates the lock table when a page is merged to the right. */ +void +lock_update_merge_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page to + which merged */ + const rec_t* orig_succ, /*!< in: original + successor of infimum + on the right page + before merge */ + const buf_block_t* left_block); /*!< in: merged index + page which will be + discarded */ +/** Update locks when the root page is copied to another in +btr_root_raise_and_insert(). Note that we leave lock structs on the +root page, even though they do not make sense on other than leaf +pages: the reason is that in a pessimistic update the infimum record +of the root page will act as a dummy carrier of the locks of the record +to be updated. */ +void lock_update_root_raise(const buf_block_t &block, const page_id_t root); +/** Update the lock table when a page is copied to another. +@param new_block the target page +@param old old page (not index root page) */ +void lock_update_copy_and_discard(const buf_block_t &new_block, page_id_t old); + +/** Update gap locks between the last record of the left_block and the +first record of the right_block when a record is about to be inserted +at the start of the right_block, even though it should "naturally" be +inserted as the last record of the left_block according to the +current node pointer in the parent page. + +That is, we assume that the lowest common ancestor of the left_block +and right_block routes the key of the new record to the left_block, +but a heuristic which tries to avoid overflowing left_block has chosen +to insert the record into right_block instead. 
Said ancestor performs
+this routing by comparing the key of the record to a "split point" -
+all records greater than or equal to the split point (node pointer)
+are in right_block, and smaller ones in left_block.
+The split point may be smaller than the smallest key in right_block.
+
+The gap between the last record on the left_block and the first record
+on the right_block is represented as a gap lock attached to the supremum
+pseudo-record of left_block, and a gap lock attached to the new first
+record of right_block.
+
+Thus, inserting the new record, and subsequently adjusting the node
+pointers in parent pages to values smaller than or equal to the new
+records' key, will mean that gap will be sliced at a different place
+("moved to the left"): fragment of the 1st gap will now become treated
+as 2nd. Therefore, we must copy any GRANTED locks from 1st gap to the
+2nd gap. Any WAITING locks must be of INSERT_INTENTION type (as no
+other GAP locks ever wait for anything) and can stay at 1st gap, as
+their only purpose is to notify the requester they can retry
+insertion, and there's no correctness requirement to avoid waking them
+up too soon.
+@param left_block	left page
+@param right_block	right page */
+void lock_update_node_pointer(const buf_block_t *left_block,
+                              const buf_block_t *right_block);
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+void
+lock_update_split_left(
+/*===================*/
+	const buf_block_t*	right_block,	/*!< in: right page */
+	const buf_block_t*	left_block);	/*!< in: left page */
+/** Update the lock table when a page is merged to the left.
+@param left	left page
+@param orig_pred	original predecessor of supremum on the left page
+			before merge
+@param right	merged, to-be-discarded right page */
+void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred,
+                            const page_id_t right);
+
+/** Update the locks when a page is split and merged to two pages,
+in defragmentation. */
+void lock_update_split_and_merge(
+	const buf_block_t* left_block,	/*!< in: left page to which merged */
+	const rec_t* orig_pred,		/*!< in: original predecessor of
+					supremum on the left page before merge*/
+	const buf_block_t* right_block);/*!< in: right page from which merged */
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+	const buf_block_t&	heir_block,	/*!< in: block containing the
+						record which inherits */
+	const page_id_t		donor,		/*!< in: page containing the
+						record from which inherited;
+						does NOT reset the locks on
+						this record */
+	ulint			heir_heap_no,	/*!< in: heap_no of the
+						inheriting record */
+	ulint			heap_no);	/*!< in: heap_no of the
+						donating record */
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+void
+lock_update_discard(
+/*================*/
+	const buf_block_t*	heir_block,	/*!< in: index page
+						which will inherit the locks */
+	ulint			heir_heap_no,	/*!< in: heap_no of the record
+						which will inherit the locks */
+	const buf_block_t*	block);		/*!< in: index page
+						which will be discarded */
+/*************************************************************//**
+Updates the lock table when a new user record is inserted.
*/ +void +lock_update_insert( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: the inserted record */ +/*************************************************************//** +Updates the lock table when a record is removed. */ +void +lock_update_delete( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: the record to be removed */ +/*********************************************************************//** +Stores on the page infimum record the explicit locks of another record. +This function is used to store the lock state of a record when it is +updated and the size of the record changes in the update. The record +is in such an update moved, perhaps to another page. The infimum record +acts as a dummy carrier record, taking care of lock releases while the +actual record is being moved. */ +void +lock_rec_store_on_page_infimum( +/*===========================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: record whose lock state + is stored on the infimum + record of the same page; lock + bits are reset on the + record */ +/** Restore the explicit lock requests on a single record, where the +state was stored on the infimum of a page. +@param block buffer block containing rec +@param rec record whose lock state is restored +@param donator page (rec is not necessarily on this page) +whose infimum stored the lock state; lock bits are reset on the infimum */ +void lock_rec_restore_from_page_infimum(const buf_block_t &block, + const rec_t *rec, page_id_t donator); + +/** +Create a table lock, without checking for deadlocks or lock compatibility. +@param table table on which the lock is created +@param type_mode lock type and mode +@param trx transaction +@param c_lock conflicting lock +@return the created lock object */ +lock_t *lock_table_create(dict_table_t *table, unsigned type_mode, trx_t *trx, + lock_t *c_lock= nullptr); + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate insert of +a record. If they do, first tests if the query thread should anyway +be suspended for some reason; if not, then puts the transaction and +the query thread to the lock wait state and inserts a waiting request +for a gap x-lock to the lock queue. +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_rec_insert_check_and_lock( +/*===========================*/ + const rec_t* rec, /*!< in: record after which to insert */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + dict_index_t* index, /*!< in: index */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + bool* inherit)/*!< out: set to true if the new + inserted record maybe should inherit + LOCK_GAP type locks from the successor + record */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate modify (update, +delete mark, or delete unmark) of a clustered index record. If they do, +first tests if the query thread should anyway be suspended for some +reason; if not, then puts the transaction and the query thread to the +lock wait state and inserts a waiting request for a record x-lock to the +lock queue. 
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_clust_rec_modify_check_and_lock( +/*=================================*/ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record which should be + modified */ + dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate modify +(delete mark or delete unmark) of a secondary index record. +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_sec_rec_modify_check_and_lock( +/*===============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + const rec_t* rec, /*!< in: record which should be + modified; NOTE: as this is a secondary + index, we always have to modify the + clustered index record first: see the + comment below */ + dict_index_t* index, /*!< in: secondary index */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Like lock_clust_rec_read_check_and_lock(), but reads a +secondary index record. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_sec_rec_read_check_and_lock( +/*=============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: secondary index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /*!< in: query thread */ +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. 
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_clust_rec_read_check_and_lock( +/*===============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /*!< in: query thread */ +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Acquire a table lock. +@param table table to be locked +@param fktable pointer to table, in case of a FOREIGN key check +@param mode lock mode +@param thr SQL execution thread +@retval DB_SUCCESS if the lock was acquired +@retval DB_DEADLOCK if a deadlock occurred, or fktable && *fktable != table +@retval DB_LOCK_WAIT if lock_wait() must be invoked */ +dberr_t lock_table(dict_table_t *table, dict_table_t *const*fktable, + lock_mode mode, que_thr_t *thr) + MY_ATTRIBUTE((warn_unused_result)); + +/** Create a table lock object for a resurrected transaction. +@param table table to be X-locked +@param trx transaction +@param mode LOCK_X or LOCK_IX */ +void lock_table_resurrect(dict_table_t *table, trx_t *trx, lock_mode mode); + +/** Sets a lock on a table based on the given mode. +@param table table to lock +@param trx transaction +@param mode LOCK_X or LOCK_S +@param no_wait whether to skip handling DB_LOCK_WAIT +@return error code */ +dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode, + bool no_wait= false) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Exclusively lock the data dictionary tables. 
+@param trx dictionary transaction +@return error code +@retval DB_SUCCESS on success */ +dberr_t lock_sys_tables(trx_t *trx); + +/*************************************************************//** +Removes a granted record lock of a transaction from the queue and grants +locks to other transactions waiting in the queue if they now are entitled +to a lock. */ +void +lock_rec_unlock( +/*============*/ + trx_t* trx, /*!< in/out: transaction that has + set a record lock */ + const page_id_t id, /*!< in: page containing rec */ + const rec_t* rec, /*!< in: record */ + lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */ + +/** Release the explicit locks of a committing transaction, +and release possible other transactions waiting because of these locks. */ +void lock_release(trx_t* trx); + +/** Release the explicit locks of a committing transaction while +dict_sys.latch is exclusively locked, +and release possible other transactions waiting because of these locks. */ +void lock_release_on_drop(trx_t *trx); + +/** Release non-exclusive locks on XA PREPARE, +and release possible other transactions waiting because of these locks. */ +void lock_release_on_prepare(trx_t *trx); + +/** Release locks on a table whose creation is being rolled back */ +ATTRIBUTE_COLD void lock_release_on_rollback(trx_t *trx, dict_table_t *table); + +/**********************************************************************//** +Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED, +if none found. +@return bit index == heap number of the record, or ULINT_UNDEFINED if +none found */ +ulint +lock_rec_find_set_bit( +/*==================*/ + const lock_t* lock); /*!< in: record lock with at least one + bit set */ + +/*********************************************************************//** +Checks if a lock request lock1 has to wait for request lock2. +@return whether lock1 has to wait for lock2 to be removed */ +bool +lock_has_to_wait( +/*=============*/ + const lock_t* lock1, /*!< in: waiting lock */ + const lock_t* lock2); /*!< in: another lock; NOTE that it is + assumed that this has a lock bit set + on the same record as in lock1 if the + locks are record locks */ +/*********************************************************************//** +Reports that a transaction id is insensible, i.e., in the future. */ +ATTRIBUTE_COLD +void +lock_report_trx_id_insanity( +/*========================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ + trx_id_t max_trx_id); /*!< in: trx_sys.get_max_trx_id() */ +/*********************************************************************//** +Prints info of locks for all transactions. +@return FALSE if not able to acquire lock_sys.latch (and display info) */ +ibool +lock_print_info_summary( +/*====================*/ + FILE* file, /*!< in: file where to print */ + ibool nowait) /*!< in: whether to wait for lock_sys.latch */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Prints transaction lock wait and MVCC state. +@param[in,out] file file where to print +@param[in] trx transaction +@param[in] now current my_hrtime_coarse() */ +void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx, + my_hrtime_t now); + +/*********************************************************************//** +Prints info of locks for each transaction. This function will release +lock_sys.latch, which the caller must be holding in exclusive mode. 
*/
+void
+lock_print_info_all_transactions(
+/*=============================*/
+	FILE*	file);	/*!< in: file where to print */
+
+/*********************************************************************//**
+Return the number of table locks for a transaction.
+The caller must be holding lock_sys.latch. */
+ulint
+lock_number_of_tables_locked(
+/*=========================*/
+	const trx_lock_t*	trx_lock)	/*!< in: transaction locks */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Check if there are any locks on a table.
+@return true if table has either table or record locks. */
+bool lock_table_has_locks(dict_table_t *table);
+
+/** Wait for a lock to be released.
+@retval DB_DEADLOCK if this transaction was chosen as the deadlock victim
+@retval DB_INTERRUPTED if the execution was interrupted by the user
+@retval DB_LOCK_WAIT_TIMEOUT if the lock wait timed out
+@retval DB_SUCCESS if the lock was granted */
+dberr_t lock_wait(que_thr_t *thr);
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+void
+lock_unlock_table_autoinc(
+/*======================*/
+	trx_t*	trx);	/*!< in/out: transaction */
+
+/** Handle a pending lock wait (DB_LOCK_WAIT) in a semi-consistent read
+while holding a clustered index leaf page latch.
+@param trx	transaction that is or was waiting for a lock
+@retval DB_SUCCESS if the lock was granted
+@retval DB_DEADLOCK if the transaction must be aborted due to a deadlock
+@retval DB_LOCK_WAIT if a lock wait would be necessary; the pending
+	lock request was released */
+dberr_t lock_trx_handle_wait(trx_t *trx);
+
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return true if ok */
+bool
+lock_check_trx_id_sanity(
+/*=====================*/
+	trx_id_t	trx_id,		/*!< in: trx id */
+	const rec_t*	rec,		/*!< in: user record */
+	dict_index_t*	index,		/*!< in: index */
+	const rec_offs*	offsets);	/*!< in: rec_get_offsets(rec, index) */
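The acquisition/wait split is visible in all of these contracts: an
acquisition function such as lock_table() enqueues a waiting request and
returns DB_LOCK_WAIT, and the caller then blocks in lock_wait(). A hedged
caller-side sketch assembled only from the contracts documented above; the
wrapper function itself is illustrative and does not exist in the source:

// Illustrative caller-side pattern; `table` and `thr` come from the caller.
dberr_t lock_table_with_wait(dict_table_t *table, que_thr_t *thr)
{
	dberr_t err = lock_table(table, nullptr, LOCK_IS, thr);

	if (err == DB_LOCK_WAIT) {
		/* A waiting request was enqueued; block until it is
		granted, times out, is interrupted, or this transaction
		is chosen as a deadlock victim. */
		err = lock_wait(thr);
	}

	return err;	/* DB_SUCCESS, DB_DEADLOCK, DB_LOCK_WAIT_TIMEOUT,
			or DB_INTERRUPTED */
}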
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Check if the transaction holds any locks on the sys tables
+or its records.
+@return the strongest lock found on any sys table or 0 for none */
+const lock_t*
+lock_trx_has_sys_table_locks(
+/*=========================*/
+	const trx_t*	trx)	/*!< in: transaction to check */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if the transaction holds an explicit exclusive lock on a record.
+@param[in]	trx	transaction
+@param[in]	table	table
+@param[in]	id	leaf page identifier
+@param[in]	heap_no	heap number identifying the record
+@return whether an explicit X-lock is held */
+bool lock_trx_has_expl_x_lock(const trx_t &trx, const dict_table_t &table,
+                              page_id_t id, ulint heap_no);
+#endif /* UNIV_DEBUG */
+
+/** Lock operation struct */
+struct lock_op_t{
+	dict_table_t*	table;	/*!< table to be locked */
+	lock_mode	mode;	/*!< lock mode */
+};
+
+/** The lock system struct */
+class lock_sys_t
+{
+  friend struct LockGuard;
+  friend struct LockMultiGuard;
+  friend struct TMLockGuard;
+  friend struct TMLockMutexGuard;
+  friend struct TMLockTrxGuard;
+
+  /** Hash table latch */
+  struct hash_latch
+#ifdef SUX_LOCK_GENERIC
+  : private rw_lock
+  {
+    /** Wait for an exclusive lock */
+    void wait();
+    /** Try to acquire a lock */
+    bool try_acquire() { return write_trylock(); }
+    /** Acquire a lock */
+    void acquire() { if (!try_acquire()) wait(); }
+    /** Release a lock */
+    void release();
+    /** @return whether any lock is being held or waited for by any thread */
+    bool is_locked_or_waiting() const
+    { return rw_lock::is_locked_or_waiting(); }
+    /** @return whether this latch is possibly held by any thread */
+    bool is_locked() const { return rw_lock::is_locked(); }
+#else
+  {
+  private:
+    srw_spin_lock_low lock;
+  public:
+    /** Try to acquire a lock */
+    bool try_acquire() { return lock.wr_lock_try(); }
+    /** Acquire a lock */
+    void acquire() { lock.wr_lock(); }
+    /** Release a lock */
+    void release() { lock.wr_unlock(); }
+    /** @return whether any lock may be held by any thread */
+    bool is_locked_or_waiting() const noexcept
+    { return lock.is_locked_or_waiting(); }
+    /** @return whether this latch is possibly held by any thread */
+    bool is_locked() const noexcept { return lock.is_locked(); }
+#endif
+  };
+
+public:
+  struct hash_table
+  {
+    /** Number of consecutive array[] elements occupied by a hash_latch */
+    static constexpr size_t LATCH= sizeof(void*) >= sizeof(hash_latch) ? 1 : 2;
+    static_assert(sizeof(hash_latch) <= LATCH * sizeof(void*), "allocation");
+
+    /** Number of array[] elements per hash_latch.
+    Must be LATCH less than a power of 2. */
+    static constexpr size_t ELEMENTS_PER_LATCH= (64 / sizeof(void*)) - LATCH;
+    static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+      ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
+
+    /** number of payload elements in array[]. Protected by lock_sys.latch. */
+    ulint n_cells;
+    /** the hash table, with pad(n_cells) elements, aligned to L1 cache size;
+    in any hash chain, lock_t::is_waiting() entries must not precede
+    granted locks */
+    hash_cell_t *array;
+
+    /** Create the hash table.
+    @param n  the lower bound of n_cells */
+    void create(ulint n);
+
+    /** Resize the hash table.
+    @param n  the lower bound of n_cells */
+    void resize(ulint n);
+
+    /** Free the hash table. */
+    void free() { aligned_free(array); array= nullptr; }
+
+    /** @return the index of an array element */
+    inline ulint calc_hash(ulint fold) const;
+
+    /** @return raw array index converted to padded index */
+    static ulint pad(ulint h)
+    {
+      ulint latches= LATCH * (h / ELEMENTS_PER_LATCH);
+      ulint empty_slots= (h / ELEMENTS_PER_LATCH) * EMPTY_SLOTS_PER_LATCH;
+      return LATCH + latches + empty_slots + h;
+    }
+
+    /** Get a latch. */
+    static hash_latch *latch(hash_cell_t *cell)
+    {
+      void *l= ut_align_down(cell, sizeof *cell *
+                             (ELEMENTS_PER_LATCH + LATCH));
+      return static_cast<hash_latch*>(l);
+    }
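pad() interleaves one latch slot ahead of every ELEMENTS_PER_LATCH payload
cells, so a latch shares a cache line with the cells it protects. A
standalone check of the arithmetic under stated assumptions (8-byte
pointers, 64-byte cache line, hence LATCH == 1 and no empty slots; the
real constants are build-dependent):

#include <cassert>
#include <cstddef>

// Assumed build parameters: sizeof(void*) == 8, 64-byte cache line.
static constexpr size_t LATCH= 1;
static constexpr size_t ELEMENTS_PER_LATCH= (64 / 8) - LATCH;  // 7
static constexpr size_t EMPTY_SLOTS_PER_LATCH= 0;

static size_t pad(size_t h)  // mirrors lock_sys_t::hash_table::pad()
{
  size_t latches= LATCH * (h / ELEMENTS_PER_LATCH);
  size_t empty_slots= (h / ELEMENTS_PER_LATCH) * EMPTY_SLOTS_PER_LATCH;
  return LATCH + latches + empty_slots + h;
}

int main()
{
  assert(pad(0) == 1);  // slot 0 holds the first latch
  assert(pad(6) == 7);  // last payload cell of the first cache line
  assert(pad(7) == 9);  // skips slot 8, which holds the second latch
}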
+ /** Get a hash table cell. */
+ inline hash_cell_t *cell_get(ulint fold) const;
+
+#ifdef UNIV_DEBUG
+ void assert_locked(const page_id_t id) const;
+#else
+ void assert_locked(const page_id_t) const {}
+#endif
+
+ private:
+ /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+ static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+ /** @return the index of an array element */
+ static ulint calc_hash(ulint fold, ulint n_cells)
+ {
+ return pad(hash(fold, n_cells));
+ }
+ };
+
+private:
+ bool m_initialised;
+
+ /** mutex protecting the locks */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock latch;
+#ifdef UNIV_DEBUG
+ /** The owner of exclusive latch (0 if none); protected by latch */
+ std::atomic<pthread_t> writer{0};
+ /** Number of shared latches */
+ std::atomic<ulint> readers{0};
+#endif
+#ifdef SUX_LOCK_GENERIC
+protected:
+ /** mutex for hash_latch::wait() */
+ pthread_mutex_t hash_mutex;
+ /** condition variable for hash_latch::wait() */
+ pthread_cond_t hash_cond;
+#endif
+public:
+ /** record locks */
+ hash_table rec_hash;
+ /** predicate locks for SPATIAL INDEX */
+ hash_table prdt_hash;
+ /** page locks for SPATIAL INDEX */
+ hash_table prdt_page_hash;
+
+ /** mutex covering lock waits; @see trx_lock_t::wait_lock */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t wait_mutex;
+private:
+ /** The increment of wait_count for a wait. Anything smaller is a
+ pending wait count. */
+ static constexpr uint64_t WAIT_COUNT_STEP= 1U << 19;
+ /** pending waits (in the bits below WAIT_COUNT_STEP) and cumulative
+ number of lock waits (in multiples of WAIT_COUNT_STEP); protected by
+ wait_mutex */
+ uint64_t wait_count;
+ /** Cumulative wait time; protected by wait_mutex */
+ uint64_t wait_time;
+ /** Longest wait time; protected by wait_mutex */
+ uint64_t wait_time_max;
+public:
+ /** number of deadlocks detected; protected by wait_mutex */
+ ulint deadlocks;
+ /** number of lock wait timeouts; protected by wait_mutex */
+ ulint timeouts;
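+
+ /* Example of the wait_count encoding (illustrative; not part of the
+ original source): after two waits have started and one has finished,
+ wait_count == 2 * WAIT_COUNT_STEP + 1, so get_wait_cumulative()
+ returns 2 (waits ever started) and get_wait_pending() returns 1
+ (still waiting), both decoded from the single packed counter. */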
+ /**
+ Constructor.
+
+ Some members may require late initialisation, thus we just mark object as
+ uninitialised. Real initialisation happens in create().
+ */
+ lock_sys_t(): m_initialised(false) {}
+
+
+ bool is_initialised() const { return m_initialised; }
+
+#ifdef UNIV_PFS_RWLOCK
+ /** Acquire exclusive lock_sys.latch */
+ ATTRIBUTE_NOINLINE
+ void wr_lock(const char *file, unsigned line);
+ /** Release exclusive lock_sys.latch */
+ ATTRIBUTE_NOINLINE void wr_unlock();
+ /** Acquire shared lock_sys.latch */
+ ATTRIBUTE_NOINLINE void rd_lock(const char *file, unsigned line);
+ /** Release shared lock_sys.latch */
+ ATTRIBUTE_NOINLINE void rd_unlock();
+#else
+ /** Acquire exclusive lock_sys.latch */
+ void wr_lock()
+ {
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ ut_ad(!is_writer());
+ latch.wr_lock();
+ ut_ad(!writer.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ }
+ /** Release exclusive lock_sys.latch */
+ void wr_unlock()
+ {
+ ut_ad(writer.exchange(0, std::memory_order_relaxed) ==
+ pthread_self());
+ latch.wr_unlock();
+ }
+ /** Acquire shared lock_sys.latch */
+ void rd_lock()
+ {
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ ut_ad(!is_writer());
+ latch.rd_lock();
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+ }
+ /** Release shared lock_sys.latch */
+ void rd_unlock()
+ {
+ ut_ad(!is_writer());
+ ut_ad(readers.fetch_sub(1, std::memory_order_relaxed));
+ latch.rd_unlock();
+ }
+#endif
+ /** Try to acquire exclusive lock_sys.latch
+ @return whether the latch was acquired */
+ bool wr_lock_try()
+ {
+ ut_ad(!is_writer());
+ if (!latch.wr_lock_try()) return false;
+ ut_ad(!writer.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ return true;
+ }
+ /** Try to acquire shared lock_sys.latch
+ @return whether the latch was acquired */
+ bool rd_lock_try()
+ {
+ ut_ad(!is_writer());
+ if (!latch.rd_lock_try()) return false;
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+ return true;
+ }
+
+ /** Assert that wr_lock() has been invoked by this thread */
+ void assert_locked() const { ut_ad(is_writer()); }
+ /** Assert that wr_lock() has not been invoked by this thread */
+ void assert_unlocked() const { ut_ad(!is_writer()); }
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread is the lock_sys.latch writer */
+ bool is_writer() const
+ {
+# ifdef SUX_LOCK_GENERIC
+ return writer.load(std::memory_order_relaxed) == pthread_self();
+# else
+ return writer.load(std::memory_order_relaxed) == pthread_self() ||
+ (xtest() && !latch.is_locked_or_waiting());
+# endif
+ }
+ /** Assert that a lock shard is exclusively latched (by some thread) */
+ void assert_locked(const lock_t &lock) const;
+ /** Assert that a table lock shard is exclusively latched by this thread */
+ void assert_locked(const dict_table_t &table) const;
+ /** Assert that a hash table cell is exclusively latched (by some thread) */
+ void assert_locked(const hash_cell_t &cell) const;
+#else
+ void assert_locked(const lock_t &) const {}
+ void assert_locked(const dict_table_t &) const {}
+ void assert_locked(const hash_cell_t &) const {}
+#endif
+
+ /**
+ Creates the lock system at database start.
+
+ @param[in] n_cells number of slots in lock hash table
+ */
+ void create(ulint n_cells);
+
+
+ /**
+ Resize the lock hash table.
+
+ @param[in] n_cells number of slots in lock hash table
+ */
+ void resize(ulint n_cells);
+
+
+ /** Closes the lock system at database shutdown. */
+ void close();
+
+
+ /** Check for deadlocks while holding only lock_sys.wait_mutex. */
+ void deadlock_check();
+
+ /** Cancel a waiting lock request.
+ @tparam check_victim whether to check for DB_DEADLOCK
+ @param trx active transaction
+ @param lock waiting lock request
+ @retval DB_SUCCESS if no lock existed
+ @retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
+ @retval DB_LOCK_WAIT if the lock was canceled */
+ template<bool check_victim>
+ static dberr_t cancel(trx_t *trx, lock_t *lock);
+
+ /** Note that a record lock wait started */
+ inline void wait_start();
+
+ /** Note that a record lock wait resumed */
+ inline void wait_resume(THD *thd, my_hrtime_t start, my_hrtime_t now);
+
+ /** @return pending number of lock waits */
+ ulint get_wait_pending() const
+ {
+ return static_cast<ulint>(wait_count & (WAIT_COUNT_STEP - 1));
+ }
+ /** @return cumulative number of lock waits */
+ ulint get_wait_cumulative() const
+ { return static_cast<ulint>(wait_count / WAIT_COUNT_STEP); }
+ /** Cumulative wait time; protected by wait_mutex */
+ uint64_t get_wait_time_cumulative() const { return wait_time; }
+ /** Longest wait time; protected by wait_mutex */
+ uint64_t get_wait_time_max() const { return wait_time_max; }
+
+ /** Get the lock hash table for a mode */
+ hash_table &hash_get(ulint mode)
+ {
+ if (UNIV_LIKELY(!(mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE))))
+ return rec_hash;
+ return (mode & LOCK_PREDICATE) ? prdt_hash : prdt_page_hash;
+ }
+
+ /** Get the lock hash table for a predicate mode */
+ hash_table &prdt_hash_get(bool page)
+ { return page ? prdt_page_hash : prdt_hash; }
+
+ /** Get the first lock on a page.
+ @param cell hash table cell
+ @param id page identifier
+ @return first lock
+ @retval nullptr if none exists */
+ static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id);
+
+ /** Get the first explicit lock request on a record.
+ @param cell first lock hash table cell
+ @param id page identifier
+ @param heap_no record identifier in page
+ @return first lock
+ @retval nullptr if none exists */
+ static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id,
+ ulint heap_no);
+
+ /** Remove locks on a discarded SPATIAL INDEX page.
+ @param id page to be discarded
+ @param all whether to also remove locks from lock_sys.prdt_hash */
+ void prdt_page_free_from_discard(const page_id_t id, bool all= false);
+
+ /** Cancel possible lock waiting for a transaction */
+ static void cancel_lock_wait_for_trx(trx_t *trx);
+#ifdef WITH_WSREP
+ /** Cancel lock waiting for a wsrep BF abort. */
+ static void cancel_lock_wait_for_wsrep_bf_abort(trx_t *trx);
+#endif /* WITH_WSREP */
+};
+
+/** The lock system */
+extern lock_sys_t lock_sys;
+
+/** @return the index of an array element */
+inline ulint lock_sys_t::hash_table::calc_hash(ulint fold) const
+{
+ ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ return calc_hash(fold, n_cells);
+}
+
+/** Get a hash table cell. */
+inline hash_cell_t *lock_sys_t::hash_table::cell_get(ulint fold) const
+{
+ ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ return &array[calc_hash(fold)];
+}
+
+/** Get the first lock on a page.
+@param cell hash table cell
+@param id page identifier
+@return first lock
+@retval nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id)
+{
+ lock_sys.assert_locked(cell);
+ for (auto lock= static_cast<lock_t*>(cell.node); lock; lock= lock->hash)
+ {
+ ut_ad(!lock->is_table());
+ if (lock->un_member.rec_lock.page_id == id)
+ return lock;
+ }
+ return nullptr;
+}
+
+/** lock_sys.latch exclusive guard */
+struct LockMutexGuard
+{
+ LockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+ { lock_sys.wr_lock(SRW_LOCK_ARGS(file, line)); }
+ ~LockMutexGuard() { lock_sys.wr_unlock(); }
+};
+
+/** lock_sys latch guard for 1 page_id_t */
+struct LockGuard
+{
+ LockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+ ~LockGuard()
+ {
+ lock_sys_t::hash_table::latch(cell_)->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+ }
+ /** @return the hash array cell */
+ hash_cell_t &cell() const { return *cell_; }
+private:
+ /** The hash array cell */
+ hash_cell_t *cell_;
+};
+
+/** lock_sys latch guard for 2 page_id_t */
+struct LockMultiGuard
+{
+ LockMultiGuard(lock_sys_t::hash_table &hash,
+ const page_id_t id1, const page_id_t id2);
+ ~LockMultiGuard();
+
+ /** @return the first hash array cell */
+ hash_cell_t &cell1() const { return *cell1_; }
+ /** @return the second hash array cell */
+ hash_cell_t &cell2() const { return *cell2_; }
+private:
+ /** The first hash array cell */
+ hash_cell_t *cell1_;
+ /** The second hash array cell */
+ hash_cell_t *cell2_;
+};
+
+/** lock_sys.latch exclusive guard using transactional memory */
+struct TMLockMutexGuard
+{
+ TRANSACTIONAL_INLINE
+ TMLockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.wr_lock(SRW_LOCK_ARGS(file, line));
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockMutexGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided()) xend(); else
+#endif
+ lock_sys.wr_unlock();
+ }
+
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept
+ { return !lock_sys.latch.is_locked_or_waiting(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** lock_sys latch guard for 1 page_id_t, using transactional memory */
+struct TMLockGuard
+{
+ TRANSACTIONAL_TARGET
+ TMLockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+ TRANSACTIONAL_INLINE ~TMLockGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (elided)
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys_t::hash_table::latch(cell_)->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+ }
+ /** @return the hash array cell */
+ hash_cell_t &cell() const { return *cell_; }
+private:
+ /** The hash array cell */
+ hash_cell_t *cell_;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ /** whether the latches were elided */
+ bool elided;
+#endif
+};
+
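+/* Usage sketch (illustrative; not part of the original header): the guards
+above encapsulate the two-level latching protocol. For example,
+
+ {
+ LockGuard g{lock_sys.rec_hash, page_id};
+ // g.cell() is the latched hash cell; the lock queue of page_id
+ // may be inspected or modified here
+ }
+
+acquires lock_sys.latch in shared mode plus the one hash_latch covering
+the cell of page_id, and the destructor releases both in reverse order. */
+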
+/** guard for shared lock_sys.latch and trx_t::mutex using
+transactional memory */
+struct TMLockTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE
+#ifndef UNIV_PFS_RWLOCK
+ TMLockTrxGuard(trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) trx
+#else
+ TMLockTrxGuard(const char *file, unsigned line, trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) SRW_LOCK_CALL, trx
+#endif
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (!lock_sys.latch.is_write_locked() && was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.rd_lock(SRW_LOCK_ARGS(file, line));
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys.rd_unlock();
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** guard for trx_t::mutex using transactional memory */
+struct TMTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE TMTrxGuard(trx_t &trx) : trx(trx)
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE ~TMTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/*********************************************************************//**
+Creates a new record lock and inserts it into the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+@return created lock */
+UNIV_INLINE
+lock_t*
+lock_rec_create(
+/*============*/
+ lock_t* c_lock, /*!< conflicting lock */
+ unsigned type_mode,/*!< in: lock mode and wait flag */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in,out: transaction */
+ bool caller_owns_trx_mutex);
+ /*!< in: true if caller owns
+ trx mutex */
+
+/** Remove a record lock request, waiting or granted, on a discarded page
+@param lock_hash hash table
+@param in_lock lock object */
+void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock);
+
+/** Create a new record lock and insert it into the lock queue,
+without checking for deadlocks or conflicts.
+@param[in] c_lock conflicting lock, or NULL
+@param[in] type_mode lock mode and wait flag
+@param[in] page_id index page number
+@param[in] page R-tree index page, or NULL
+@param[in] heap_no record heap number in the index page
+@param[in] index the index tree
+@param[in,out] trx transaction
+@param[in] holds_trx_mutex whether the caller holds trx->mutex
+@return created lock */
+lock_t*
+lock_rec_create_low(
+ lock_t* c_lock,
+ unsigned type_mode,
+ const page_id_t page_id,
+ const page_t* page,
+ ulint heap_no,
+ dict_index_t* index,
+ trx_t* trx,
+ bool holds_trx_mutex);
+
+/** Enqueue a waiting request for a lock which cannot be granted immediately.
+Check for deadlocks.
+@param[in] c_lock conflicting lock
+@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X)
+ possibly ORed with LOCK_GAP or
+ LOCK_REC_NOT_GAP, ORed with
+ LOCK_INSERT_INTENTION if this
+ waiting lock request is set
+ when performing an insert of
+ an index record
+@param[in] id page identifier
+@param[in] page leaf page in the index
+@param[in] heap_no record heap number in the block
+@param[in] index index tree
+@param[in,out] thr query thread
+@param[in] prdt minimum bounding box (spatial index)
+@retval DB_LOCK_WAIT if the waiting lock was enqueued
+@retval DB_DEADLOCK if this transaction was chosen as the victim */
+dberr_t
+lock_rec_enqueue_waiting(
+ lock_t* c_lock,
+ unsigned type_mode,
+ const page_id_t id,
+ const page_t* page,
+ ulint heap_no,
+ dict_index_t* index,
+ que_thr_t* thr,
+ lock_prdt_t* prdt);
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+void
+lock_rtr_move_rec_list(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ move to */
+ const buf_block_t* block, /*!< in: index page */
+ rtr_rec_move_t* rec_move, /*!< in: recording records
+ moved */
+ ulint num_move); /*!< in: num of rec to move */
+
+#include "lock0lock.inl"
+
+#endif
diff --git a/storage/innobase/include/lock0lock.inl b/storage/innobase/include/lock0lock.inl
new file mode 100644
index 00000000..1b9255ff
--- /dev/null
+++ b/storage/innobase/include/lock0lock.inl
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.ic
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "page0page.h"
+
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ const page_t* page = block->page.frame;
+
+ if (page_is_comp(page)) {
+ return(rec_get_heap_no_new(
+ page
+ + rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE)));
+ } else {
+ return(rec_get_heap_no_old(
+ page
+ + rec_get_next_offs(page + PAGE_OLD_INFIMUM,
+ FALSE)));
+ }
+}
+
+/*********************************************************************//**
+Creates a new record lock and inserts it into the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+@return created lock */
+UNIV_INLINE
+lock_t*
+lock_rec_create(
+/*============*/
+ lock_t* c_lock, /*!< conflicting lock */
+ unsigned type_mode,/*!< in: lock mode and wait flag */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in,out: transaction */
+ bool caller_owns_trx_mutex)
+ /*!< in: true if caller owns
+ trx mutex */
+{
+ return lock_rec_create_low(
+ c_lock,
+ type_mode, block->page.id(), block->page.frame, heap_no,
+ index, trx, caller_owns_trx_mutex);
+}
diff --git a/storage/innobase/include/lock0prdt.h b/storage/innobase/include/lock0prdt.h
new file mode 100644
index 00000000..db8e3392
--- /dev/null
+++ b/storage/innobase/include/lock0prdt.h
@@ -0,0 +1,192 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0prdt.h
+The predicate lock system
+
+Created 9/7/2013 Jimmy Yang
+*******************************************************/
+#ifndef lock0prdt_h
+#define lock0prdt_h
+
+#include "lock0lock.h"
+
+/* Predicate lock data */
+typedef struct lock_prdt {
+ void* data; /* Predicate data */
+ uint16 op; /* Predicate operator */
+} lock_prdt_t;
+
+/*********************************************************************//**
+Acquire a predicate lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_lock(
+/*===========*/
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ lock_prdt_t* prdt, /*!< in: Predicate for the lock */
+ dict_index_t* index, /*!< in: secondary index */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned type_mode,
+ /*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */
+ que_thr_t* thr); /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+
+/*********************************************************************//**
+Acquire a "Page" lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_place_prdt_page_lock(
+ const page_id_t page_id, /*!< in: page identifier */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr); /*!< in: query thread */
+
+/*********************************************************************//**
+Initialize a predicate lock from an MBR */
+void
+lock_init_prdt_from_mbr(
+/*====================*/
+ lock_prdt_t* prdt, /*!< in/out: predicate to be initialized */
+ rtr_mbr_t* mbr, /*!< in: Minimum Bounding Rectangle */
+ ulint mode, /*!< in: Search mode */
+ mem_heap_t* heap); /*!< in: heap for allocating memory */
+
+/*********************************************************************//**
+Get predicate lock's minimum bounding box
+@return the minimum bounding box */
+lock_prdt_t*
+lock_get_prdt_from_lock(
+/*====================*/
+ const lock_t* lock); /*!< in: the lock */
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+request lock2.
+@return true if new lock has to wait for lock2 to be removed */
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+ LOCK_INSERT_INTENTION */
+ lock_prdt_t* prdt, /*!< in: lock predicate to check */
+ const lock_t* lock2); /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+
+/**************************************************************//**
+Update predicate lock when page splits */
+void
+lock_prdt_update_split(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: the new half page */
+ lock_prdt_t* prdt, /*!< in: MBR on the old page */
+ lock_prdt_t* new_prdt, /*!< in: MBR on the new page */
+ const page_id_t page_id); /*!< in: page number */
+
+/**************************************************************//**
+Adjust locks from an ancestor page of an R-tree at the appropriate level. */
+void
+lock_prdt_update_parent(
+/*====================*/
+ buf_block_t* left_block, /*!< in/out: page to be split */
+ buf_block_t* right_block, /*!< in/out: the new half page */
+ lock_prdt_t* left_prdt, /*!< in: MBR on the old page */
+ lock_prdt_t* right_prdt, /*!< in: MBR on the new page */
+ const page_id_t page_id); /*!< in: parent page */
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a predicate record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_insert_check_and_lock(
+/*============================*/
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ lock_prdt_t* prdt); /*!< in: Minimum Bound Rectangle */
+
+/*********************************************************************//**
+Append a predicate to the lock */
+void
+lock_prdt_set_prdt(
+/*===============*/
+ lock_t* lock, /*!< in: lock */
+ const lock_prdt_t* prdt); /*!< in: Predicate */
+
+#if 0
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+request lock2.
+@return true if new lock has to wait for lock2 to be removed */
+UNIV_INLINE
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+ LOCK_INSERT_INTENTION */
+ lock_prdt_t* prdt, /*!< in: lock predicate to check */
+ const lock_t* lock2); /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+
+/*********************************************************************//**
+Get predicate lock's minimum bounding box
+@return the minimum bounding box */
+UNIV_INLINE
+rtr_mbr_t*
+prdt_get_mbr_from_prdt(
+/*===================*/
+ const lock_prdt_t* prdt); /*!< in: the lock predicate */
+
+
+#endif
+/*************************************************************//**
+Moves the locks of a record to another record and resets the lock bits of
+the donating record. */
+void
+lock_prdt_rec_move(
+/*===============*/
+ const buf_block_t* receiver, /*!< in: buffer block containing
+ the receiving record */
+ const page_id_t donator); /*!< in: page from which the
+ locks are moved */
+
+/** Check whether there are R-tree page locks on a page
+@param[in] trx trx to test the lock
+@param[in] page_id page identifier
+@return true if there is none */
+bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id);
+
+#endif
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
new file mode 100644
index 00000000..b0a5f7aa
--- /dev/null
+++ b/storage/innobase/include/lock0priv.h
@@ -0,0 +1,582 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.h
+Lock module internal structures and methods.
+
+Created July 12, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0priv_h
+#define lock0priv_h
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+/* If you need to access members of the structures defined in this
+file, please write appropriate functions that retrieve them and put
+those functions in lock/ */
+#error Do not include lock0priv.h outside of the lock/ module
+#endif
+
+#include "hash0hash.h"
+#include "rem0types.h"
+#include "trx0trx.h"
+
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+/** Print the table lock into the given output stream
+@param[in,out] out the output stream
+@return the given output stream. */
+inline
+std::ostream& lock_table_t::print(std::ostream& out) const
+{
+ out << "[lock_table_t: name=" << table->name << "]";
+ return(out);
+}
+
+/** The global output operator is overloaded to conveniently
+print the lock_table_t object into the given output stream.
+@param[in,out] out the output stream
+@param[in] lock the table lock
+@return the given output stream */
+inline
+std::ostream&
+operator<<(std::ostream& out, const lock_table_t& lock)
+{
+ return(lock.print(out));
+}
+
+inline
+std::ostream&
+ib_lock_t::print(std::ostream& out) const
+{
+ static_assert(LOCK_MODE_MASK == 7, "compatibility");
+ static_assert(LOCK_IS == 0, "compatibility");
+ static_assert(LOCK_IX == 1, "compatibility");
+ static_assert(LOCK_S == 2, "compatibility");
+ static_assert(LOCK_X == 3, "compatibility");
+ static_assert(LOCK_AUTO_INC == 4, "compatibility");
+ static_assert(LOCK_NONE == 5, "compatibility");
+ static_assert(LOCK_NONE_UNSET == 7, "compatibility");
+ const char *const modes[8]=
+ { "IS", "IX", "S", "X", "AUTO_INC", "NONE", "?", "NONE_UNSET" };
+
+ out << "[lock_t: type_mode=" << type_mode << "(" << type_string()
+ << " | LOCK_" << modes[mode()];
+
+ if (is_record_not_gap())
+ out << " | LOCK_REC_NOT_GAP";
+ if (is_waiting())
+ out << " | LOCK_WAIT";
+
+ if (is_gap())
+ out << " | LOCK_GAP";
+
+ if (is_insert_intention())
+ out << " | LOCK_INSERT_INTENTION";
+
+ out << ")";
+
+ if (is_table())
+ out << un_member.tab_lock;
+ else
+ out << un_member.rec_lock;
+
+ out << "]";
+ return out;
+}
+
+inline
+std::ostream&
+operator<<(std::ostream& out, const ib_lock_t& lock)
+{
+ return(lock.print(out));
+}
+
+#ifdef UNIV_DEBUG
+extern ibool lock_print_waits;
+#endif /* UNIV_DEBUG */
+
+/* An explicit record lock affects both the record and the gap before it.
+An implicit x-lock does not affect the gap, it only locks the index
+record from read or update.
+
+If a transaction has modified or inserted an index record, then
+it owns an implicit x-lock on the record. On a secondary index record,
+a transaction has an implicit x-lock also if it has modified the
+clustered index record, the max trx id of the page where the secondary
+index record resides is >= trx id of the transaction (or database recovery
+is running), and there are no explicit non-gap lock requests on the
+secondary index record.
+
+This complicated definition for a secondary index comes from the
+implementation: we want to be able to determine if a secondary index
+record has an implicit x-lock, just by looking at the present clustered
+index record, not at the historical versions of the record. The
+complicated definition can be explained to the user so that there is
+nondeterminism in the access path when a query is answered: we may,
+or may not, access the clustered index record and thus may, or may not,
+bump into an x-lock set there.
+
+Different transactions can have conflicting locks set on the gap at the
+same time. The locks on the gap are purely inhibitive: an insert cannot
+be made, or a select cursor may have to wait if a different transaction
+has a conflicting lock on the gap. An x-lock on the gap does not give
+the right to insert into the gap.
+
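+For example (an illustrative scenario, not part of the original comment):
+let records A and C have heap numbers 2 and 3, and let transaction T1 hold
+an ordinary next-key x-lock on C. The lock bit for heap number 3 then
+covers both C and the gap between A and C: a transaction T2 can neither
+modify C nor insert a record B into that gap. T2 may nevertheless be
+granted its own gap-type lock on the same gap, because gap locks are
+purely inhibitive and do not conflict with each other.
+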
+An explicit lock can be placed on a user record or the supremum record of
+a page. The locks on the supremum record are always thought to be of the gap
+type, though the gap bit is not set. When we perform an update of a record
+where the size of the record changes, we may temporarily store its explicit
+locks on the infimum record of the page, though the infimum otherwise never
+carries locks.
+
+A waiting record lock can also be of the gap type. A waiting lock request
+can be granted when there is no conflicting mode lock request by another
+transaction ahead of it in the explicit lock queue.
+
+In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP.
+It only locks the record it is placed on, not the gap before the record.
+This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation
+level.
+
+-------------------------------------------------------------------------
+RULE 1: If there is an implicit x-lock on a record, and there are non-gap
+-------
+lock requests waiting in the queue, then the transaction holding the implicit
+x-lock also has an explicit non-gap record x-lock. Therefore, as locks are
+released, we can grant locks to waiting lock requests purely by looking at
+the explicit lock requests in the queue.
+
+RULE 3: Different transactions cannot have conflicting granted non-gap locks
+-------
+on a record at the same time. However, they can have conflicting granted gap
+locks.
+
+RULE 4: If there is a waiting lock request in a queue, no lock request,
+-------
+gap or not, can be inserted ahead of it in the queue. In record deletes
+and page splits new gap type locks can be created by the database manager
+for a transaction, and without rule 4, the waits-for graph of transactions
+might become cyclic without the database noticing it, as the deadlock check
+is only performed when a transaction itself requests a lock!
+-------------------------------------------------------------------------
+
+An insert is allowed to a gap if there are no explicit lock requests by
+other transactions on the next record. It does not matter if these lock
+requests are granted or waiting, gap bit set or not, with the exception
+that a gap type request set by another transaction to wait for
+its turn to do an insert is ignored. On the other hand, an
+implicit x-lock by another transaction does not prevent an insert, which
+allows for more concurrency when using an Oracle-style sequence number
+generator for the primary key with many transactions doing inserts
+concurrently.
+
+A modify of a record is allowed if the transaction has an x-lock on the
+record, or if other transactions do not have any non-gap lock requests on the
+record.
+
+A read of a single user record with a cursor is allowed if the transaction
+has a non-gap explicit, or an implicit lock on the record, or if the other
+transactions have no x-lock requests on the record. At a page supremum a
+read is always allowed.
+
+In summary, an implicit lock is seen as a granted x-lock only on the
+record, not on the gap. An explicit lock with no gap bit set is a lock
+both on the record and the gap. If the gap bit is set, the lock is only
+on the gap. Different transactions cannot own conflicting locks on the
+record at the same time, but they may own conflicting locks on the gap.
+Granted locks on a record give an access right to the record, but gap type
+locks just inhibit operations.
+
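+These rules condense into the following informal sketch (an approximation
+added for illustration; the authoritative check is lock_rec_has_to_wait()
+in lock0lock.cc, which handles further special cases):
+
+ has_to_wait(request, held) :=
+ the two modes conflict per the compatibility matrix below
+ AND NOT (request is gap-only AND held is LOCK_REC_NOT_GAP)
+ AND NOT (request is LOCK_REC_NOT_GAP AND held is gap-only)
+ AND NOT (request is LOCK_INSERT_INTENTION AND held is a waiting
+ gap-type request enqueued for another insert)
+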
+NOTE: Finding out if some transaction has an implicit x-lock on a secondary
+index record can be cumbersome. We may have to look at previous versions of
+the corresponding clustered index record to find out if a delete marked
+secondary index record was delete marked by an active transaction, not by
+a committed one.
+
+FACT A: If a transaction has inserted a row, it can delete it any time
+without need to wait for locks.
+
+PROOF: The transaction has an implicit x-lock on every index record inserted
+for the row, and can thus modify each record without the need to wait. Q.E.D.
+
+FACT B: If a transaction has read some result set with a cursor, it can read
+it again, and retrieves the same result set, if it has not modified the
+result set in the meantime. Hence, there is no phantom problem. If the
+biggest record, in the alphabetical order, touched by the cursor is removed,
+a lock wait may occur, otherwise not.
+
+PROOF: When a read cursor proceeds, it sets an s-lock on each user record
+it passes, and a gap type s-lock on each page supremum. The cursor must
+wait until it has these locks granted. Then no other transaction can
+have a granted x-lock on any of the user records, and therefore cannot
+modify the user records. Neither can any other transaction insert into
+the gaps which were passed over by the cursor. Page splits and merges,
+and removal of obsolete versions of records do not affect this, because
+when a user record or a page supremum is removed, the next record inherits
+its locks as gap type locks, and therefore blocks inserts to the same gap.
+Also, if a page supremum is inserted, it inherits its locks from the successor
+record. When the cursor is positioned again at the start of the result set,
+the records it will touch on its course are either records it touched
+during the last pass or new inserted page supremums. It can immediately
+access all these records, and when it arrives at the biggest record, it
+notices that the result set is complete. If the biggest record was removed,
+lock wait can occur because the next record only inherits a gap type lock,
+and a wait may be needed. Q.E.D. */
+
+/* If an index record should be changed or a new one inserted, we must check
+the lock on the record or the next. When a read cursor starts reading,
+we will set a record level s-lock on each record it passes, except on the
+initial record on which the cursor is positioned before we start to fetch
+records. Our index tree search has the convention that the B-tree
+cursor is positioned BEFORE the first possibly matching record in
+the search. Optimizations are possible here: if the record is searched
+on an equality condition to a unique key, we could actually set a special
+lock on the record, a lock which would not prevent any insert before
+this record. In the next key locking an x-lock set on a record also
+prevents inserts just before that record.
+ There are special infimum and supremum records on each page.
+A supremum record can be locked by a read cursor. This record cannot be
+updated but the lock prevents insert of a user record to the end of
+the page.
+ Next key locks will prevent the phantom problem where new rows
+could appear to SELECT result sets after the select operation has been
+performed. Prevention of phantoms ensures the serializability of
+transactions.
+ What should we check if an insert of a new record is wanted?
+Only the lock on the next record on the same page, because also the
+supremum record can carry a lock. An s-lock prevents insertion, but
+what about an x-lock?
+If it was set by a searched update, then there is implicitly an s-lock,
+too, and the insert should be prevented.
+What if our transaction owns an x-lock on the next record, but there is
+a waiting s-lock request on the next record? If this s-lock was placed
+by a read cursor moving in the ascending order in the index, we cannot
+do the insert immediately, because when we finally commit our transaction,
+the read cursor should see also the new inserted record. So we should
+move the read cursor backward from the next record for it to pass over
+the new inserted record. This move backward may be too cumbersome to
+implement. If we in this situation just enqueue a second x-lock request
+for our transaction on the next record, then the deadlock mechanism
+notices a deadlock between our transaction and the s-lock request
+transaction. This seems to be an ok solution.
+ We could have the convention that granted explicit record locks,
+lock the corresponding records from changing, and also lock the gaps
+before them from inserting. A waiting explicit lock request locks the gap
+before from inserting. Implicit record x-locks, which we derive from the
+transaction id in the clustered index record, only lock the record itself
+from modification, not the gap before it from inserting.
+ How should we store update locks? If the search is done by a unique
+key, we could just modify the record trx id. Otherwise, we could put a record
+x-lock on the record. If the update changes ordering fields of the
+clustered index record, the inserted new record needs no record lock in
+lock table, the trx id is enough. The same holds for a secondary index
+record. Searched delete is similar to update.
+
+PROBLEM:
+What about waiting lock requests? If a transaction is waiting to make an
+update to a record which another modified, how does the other transaction
+know to send the end-lock-wait signal to the waiting transaction? If we have
+the convention that a transaction may wait for just one lock at a time, how
+do we preserve it if lock wait ends?
+
+PROBLEM:
+Checking the trx id label of a secondary index record. In the case of a
+modification, not an insert, is this necessary? A secondary index record
+is modified only by setting or resetting its deleted flag. A secondary index
+record contains fields to uniquely determine the corresponding clustered
+index record. A secondary index record is therefore only modified if we
+also modify the clustered index record, and the trx id checking is done
+on the clustered index record, before we come to modify the secondary index
+record. So, in the case of delete marking or unmarking a secondary index
+record, we do not have to care about trx ids, only the locks in the lock
+table must be checked. In the case of a select from a secondary index, the
+trx id is relevant, and in this case we may have to search the clustered
+index record.
+
+PROBLEM: How to update record locks when page is split or merged, or
+--------------------------------------------------------------------
+a record is deleted or updated?
+If the size of fields in a record changes, we perform the update by
+a delete followed by an insert. How can we retain the locks set or
+waiting on the record? Because a record lock is indexed in the bitmap
+by the heap number of the record, when we remove the record from the
+record list, it is possible still to keep the lock bits. If the page
+is reorganized, we could make a table of old and new heap numbers,
+and permute the bitmaps in the locks accordingly. We can add to the
+table a row telling where the updated record ended. If the update does
+not require a reorganization of the page, we can simply move the lock
+bits for the updated record to the position determined by its new heap
+number (we may have to allocate a new lock, if we run out of the bitmap
+in the old one).
+ A more complicated case is the one where the reinsertion of the
+updated record is done pessimistically, because the structure of the
+tree may change.
+
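+As an illustration of that permutation (a sketch added for clarity, not
+part of the original comment): if new_heap_no[] maps each old heap number
+to its new one, then
+
+ for (old= 0; old < n_old; old++)
+ if (bit old is set in the old lock bitmap)
+ set bit new_heap_no[old] in the new bitmap
+
+carries every granted and waiting request across the reorganization.
+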
+PROBLEM: If a supremum record is removed in a page merge, or a record
+---------------------------------------------------------------------
+removed in a purge, what to do to the waiting lock requests? In a split to
+the right, we just move the lock requests to the new supremum. If a record
+is removed, we could move the waiting lock request to its inheritor, the
+next record in the index. But, the next record may already have lock
+requests on its own queue. A new deadlock check should be made then. Maybe
+it is easier just to release the waiting transactions. They can then enqueue
+new lock requests on appropriate records.
+
+PROBLEM: When a record is inserted, what locks should it inherit from the
+-------------------------------------------------------------------------
+upper neighbor? An insert of a new supremum record in a page split is
+always possible, but an insert of a new user record requires that the upper
+neighbor does not have any lock requests by other transactions, granted or
+waiting, in its lock queue. Solution: We can copy the locks as gap type
+locks, so that also the waiting locks are transformed to granted gap type
+locks on the inserted record. */
+
+/* LOCK COMPATIBILITY MATRIX
+ * IS IX S X AI
+ * IS + + + - +
+ * IX + + - - +
+ * S + - + - -
+ * X - - - - -
+ * AI + + - - -
+ *
+ * Note that for rows, InnoDB only acquires S or X locks.
+ * For tables, InnoDB normally acquires IS or IX locks.
+ * S or X table locks are only acquired for LOCK TABLES.
+ * Auto-increment (AI) locks are needed because of
+ * statement-level MySQL binlog.
+ * See also lock_mode_compatible().
+ */
+static const byte lock_compatibility_matrix[5][5] = {
+ /** IS IX S X AI */
+ /* IS */ { TRUE, TRUE, TRUE, FALSE, TRUE},
+ /* IX */ { TRUE, TRUE, FALSE, FALSE, TRUE},
+ /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE},
+ /* X */ { FALSE, FALSE, FALSE, FALSE, FALSE},
+ /* AI */ { TRUE, TRUE, FALSE, FALSE, FALSE}
+};
+
+/* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column)
+ * IS IX S X AI
+ * IS + - - - -
+ * IX + + - - -
+ * S + - + - -
+ * X + + + + +
+ * AI - - - - +
+ * See lock_mode_stronger_or_eq().
+ */
+static const byte lock_strength_matrix[5][5] = {
+ /** IS IX S X AI */
+ /* IS */ { TRUE, FALSE, FALSE, FALSE, FALSE},
+ /* IX */ { TRUE, TRUE, FALSE, FALSE, FALSE},
+ /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE},
+ /* X */ { TRUE, TRUE, TRUE, TRUE, TRUE},
+ /* AI */ { FALSE, FALSE, FALSE, FALSE, TRUE}
+};
+
+#define PRDT_HEAPNO PAGE_HEAP_NO_INFIMUM
+/** Record locking request status */
+enum lock_rec_req_status {
+ /** Failed to acquire a lock */
+ LOCK_REC_FAIL,
+ /** Succeeded in acquiring a lock (implicit or already acquired) */
+ LOCK_REC_SUCCESS,
+ /** Explicitly created a new lock */
+ LOCK_REC_SUCCESS_CREATED
+};
+
+#ifdef UNIV_DEBUG
+/** The count of the types of locks. */
+static const ulint lock_types = UT_ARR_SIZE(lock_compatibility_matrix);
+#endif /* UNIV_DEBUG */
+
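+/* Reading the matrices (an illustrative note, not part of the original
+source): lock_compatibility_matrix[LOCK_IX][LOCK_S] is FALSE, so a new IX
+request must wait behind a granted S lock on the same table, while
+lock_strength_matrix[LOCK_X][LOCK_AUTO_INC] is TRUE, so a transaction
+already holding an X table lock need not enqueue a separate AUTO_INC
+lock; see lock_table_has() below. */
+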
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return previous lock on the same record, NULL if none exists */
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+ const lock_t* in_lock,/*!< in: record lock */
+ ulint heap_no);/*!< in: heap number of the record */
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_on_page_const(
+/*============================*/
+ const lock_t* lock); /*!< in: a record lock */
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if the bit is set; FALSE if it is not set or if i is out of
+range (for example, i == ULINT_UNDEFINED) */
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+ const lock_t* lock, /*!< in: record lock */
+ ulint i); /*!< in: index of the bit */
+
+/*********************************************************************//**
+Gets the number of bits in a record lock bitmap.
+@return number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+ const lock_t* lock); /*!< in: record lock */
+
+/**********************************************************************//**
+Sets the nth bit of a record lock to TRUE. */
+inline
+void
+lock_rec_set_nth_bit(
+/*=================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i); /*!< in: index of the bit */
+
+/** Reset the nth bit of a record lock.
+@param[in,out] lock record lock
+@param[in] i index of the bit that will be reset
+@return previous value of the bit */
+inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
+{
+ ut_ad(!lock->is_table());
+#ifdef SUX_LOCK_GENERIC
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte* b = reinterpret_cast<byte*>(&lock[1]) + (i >> 3);
+ byte mask = byte(1U << (i & 7));
+ byte bit = *b & mask;
+ *b &= byte(~mask);
+
+ if (bit != 0) {
+ ut_d(auto n=)
+ lock->trx->lock.n_rec_locks--;
+ ut_ad(n);
+ }
+
+ return(bit);
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+ lock_t* lock); /*!< in: a record lock */
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+ ulint heap_no,/*!< in: heap number of the record */
+ lock_t* lock); /*!< in: lock */
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+ ulint heap_no,/*!< in: heap number of the record */
+ const lock_t* lock); /*!< in: lock */
+
+/** Get the first explicit lock request on a record.
+@param cell first lock hash table cell
+@param id page identifier
+@param heap_no record identifier in page
+@return first lock
+@retval nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id,
+ ulint heap_no)
+{
+ lock_sys.assert_locked(cell);
+
+ for (lock_t *lock= static_cast<lock_t*>(cell.node); lock; lock= lock->hash)
+ {
+ ut_ad(!lock->is_table());
+ if (lock->un_member.rec_lock.page_id == id &&
+ lock_rec_get_nth_bit(lock, heap_no))
+ return lock;
+ }
+ return nullptr;
+}
+
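+/* Usage sketch (hypothetical, not part of the original header): together
+with lock_rec_get_next(), get_first() lets a caller that has latched the
+cell scan every explicit lock request on one record, e.g.
+
+ for (lock_t *lock= lock_sys_t::get_first(cell, id, heap_no);
+ lock; lock= lock_rec_get_next(heap_no, lock))
+ n++;
+
+counts the length of the queue for that heap number. */
+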
+/*********************************************************************//**
+Calculates if lock mode 1 is compatible with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2); /*!< in: lock mode */
+
+/*********************************************************************//**
+Calculates if lock mode 1 is stronger or equal to lock mode 2.
+@return nonzero if mode1 stronger or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2); /*!< in: lock mode */
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
+@return lock or NULL */
+UNIV_INLINE
+const lock_t*
+lock_table_has(
+/*===========*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_table_t* table, /*!< in: table */
+ enum lock_mode mode); /*!< in: lock mode */
+
+#include "lock0priv.inl"
+
+#endif /* lock0priv_h */
diff --git a/storage/innobase/include/lock0priv.inl b/storage/innobase/include/lock0priv.inl
new file mode 100644
index 00000000..3b4ebcc8
--- /dev/null
+++ b/storage/innobase/include/lock0priv.inl
@@ -0,0 +1,255 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.ic
+Lock module internal inline methods.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+/* This file contains only methods which are used in
+lock/lock0* files, other than lock/lock0lock.cc.
+I.e. lock/lock0lock.cc contains more internal inline
+methods but they are used only in that file. */
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+#error Do not include lock0priv.ic outside of the lock/ module
+#endif
+
+#include "row0row.h"
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+
+ return(row_get_rec_trx_id(rec, index, offsets));
+}
+
+/*********************************************************************//**
+Gets the number of bits in a record lock bitmap.
+@return number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ return(lock->un_member.rec_lock.n_bits);
+}
+
+/**********************************************************************//**
+Sets the nth bit of a record lock to TRUE. */
+inline
+void
+lock_rec_set_nth_bit(
+/*=================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ ulint byte_index;
+ ulint bit_index;
+
+ ut_ad(!lock->is_table());
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte_index = i / 8;
+ bit_index = i % 8;
+
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+ ((byte*) &lock[1])[byte_index] |= static_cast<byte>(1 << bit_index);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+#ifdef SUX_LOCK_GENERIC
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
+ lock->trx->lock.n_rec_locks++;
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+ lock_t* lock) /*!< in: a record lock */
+{
+ return const_cast<lock_t*>(lock_rec_get_next_on_page_const(lock));
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+ ulint heap_no,/*!< in: heap number of the record */
+ lock_t* lock) /*!< in: lock */
+{
+ do {
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock && !lock_rec_get_nth_bit(lock, heap_no));
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+ ulint heap_no,/*!< in: heap number of the record */
+ const lock_t* lock) /*!< in: lock */
+{
+ return lock_rec_get_next(heap_no, const_cast<lock_t*>(lock));
}
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if the bit is set; FALSE if it is not set or if i is out of
+range (for example, i == ULINT_UNDEFINED) */
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+ const lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ const byte* b;
+
+ ut_ad(!lock->is_table());
+
+ if (i >= lock->un_member.rec_lock.n_bits) {
+
+ return(FALSE);
+ }
+
+ b = ((const byte*) &lock[1]) + (i / 8);
+
+ return(1 & *b >> (i % 8));
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_on_page_const(
+/*============================*/
+ const lock_t* lock) /*!< in: a record lock */
+{
+ ut_ad(!lock->is_table());
+
+ const page_id_t page_id{lock->un_member.rec_lock.page_id};
+
+ while (!!(lock= static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))))
+ if (lock->un_member.rec_lock.page_id == page_id)
+ break;
+ return lock;
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is compatible with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad((ulint) mode1 < lock_types);
+ ut_ad((ulint) mode2 < lock_types);
+
+ return(lock_compatibility_matrix[mode1][mode2]);
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is stronger or equal to lock mode 2.
+@return nonzero if mode1 stronger or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad((ulint) mode1 < lock_types);
+ ut_ad((ulint) mode2 < lock_types);
+
+ return(lock_strength_matrix[mode1][mode2]);
+}
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
+@return lock or NULL */ +UNIV_INLINE +const lock_t* +lock_table_has( +/*===========*/ + const trx_t* trx, /*!< in: transaction */ + const dict_table_t* table, /*!< in: table */ + lock_mode in_mode)/*!< in: lock mode */ +{ + /* Look for stronger locks the same trx already has on the table */ + + for (lock_list::const_iterator it = trx->lock.table_locks.begin(), + end = trx->lock.table_locks.end(); it != end; ++it) { + + const lock_t* lock = *it; + + if (lock == NULL) { + continue; + } + + ut_ad(trx == lock->trx); + ut_ad(lock->is_table()); + ut_ad(lock->un_member.tab_lock.table); + + if (table == lock->un_member.tab_lock.table + && lock_mode_stronger_or_eq(lock->mode(), in_mode)) { + ut_ad(!lock->is_waiting()); + return(lock); + } + } + + return(NULL); +} diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h new file mode 100644 index 00000000..0d00b4b3 --- /dev/null +++ b/storage/innobase/include/lock0types.h @@ -0,0 +1,251 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0types.h +The transaction lock system global types + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0types.h" +#include "buf0types.h" +#include "ut0lst.h" + +#ifndef lock0types_h +#define lock0types_h + +#define lock_t ib_lock_t + +struct lock_t; +struct lock_table_t; + +/* Basic lock modes */ +enum lock_mode { + LOCK_IS = 0, /* intention shared */ + LOCK_IX, /* intention exclusive */ + LOCK_S, /* shared */ + LOCK_X, /* exclusive */ + LOCK_AUTO_INC, /* locks the auto-inc counter of a table + in an exclusive mode */ + LOCK_NONE, /* this is used elsewhere to note consistent read */ + LOCK_NUM = LOCK_NONE, /* number of lock modes */ + LOCK_NONE_UNSET = 7 +}; + +/** A table lock */ +struct lock_table_t { + dict_table_t* table; /*!< database table in dictionary + cache */ + UT_LIST_NODE_T(ib_lock_t) + locks; /*!< list of locks on the same + table */ + /** Print the table lock into the given output stream + @param[in,out] out the output stream + @return the given output stream. */ + std::ostream& print(std::ostream& out) const; +}; + +/** Record lock for a page */ +struct lock_rec_t { + /** page identifier */ + page_id_t page_id; + ib_uint32_t n_bits; /*!< number of bits in the lock + bitmap; NOTE: the lock bitmap is + placed immediately after the + lock struct */ + + /** Print the record lock into the given output stream + @param[in,out] out the output stream + @return the given output stream. 
*/ + std::ostream& print(std::ostream& out) const; +}; + +/** Print the record lock into the given output stream +@param[in,out] out the output stream +@return the given output stream. */ +inline std::ostream &lock_rec_t::print(std::ostream &out) const +{ + out << "[lock_rec_t: space=" << page_id.space() + << ", page_no=" << page_id.page_no() + << ", n_bits=" << n_bits << "]"; + return out; +} + +inline +std::ostream& +operator<<(std::ostream& out, const lock_rec_t& lock) +{ + return(lock.print(out)); +} + +#define LOCK_MODE_MASK 0x7 /*!< mask used to extract mode from the + type_mode field in a lock */ +/** Lock types */ +/* @{ */ +/** table lock (record lock if the flag is not set) */ +#define LOCK_TABLE 8U + +#define LOCK_WAIT 256U /*!< Waiting lock flag; when set, it + means that the lock has not yet been + granted, it is just waiting for its + turn in the wait queue */ +/* Precise modes */ +#define LOCK_ORDINARY 0 /*!< this flag denotes an ordinary + next-key lock in contrast to LOCK_GAP + or LOCK_REC_NOT_GAP */ +#define LOCK_GAP 512U /*!< when this bit is set, it means that the + lock holds only on the gap before the record; + for instance, an x-lock on the gap does not + give permission to modify the record on which + the bit is set; locks of this type are created + when records are removed from the index chain + of records */ +#define LOCK_REC_NOT_GAP 1024U /*!< this bit means that the lock is only on + the index record and does NOT block inserts + to the gap before the index record; this is + used in the case when we retrieve a record + with a unique key, and is also used in + locking plain SELECTs (not part of UPDATE + or DELETE) when the user has set the READ + COMMITTED isolation level */ +#define LOCK_INSERT_INTENTION 2048U/*!< this bit is set when we place a waiting + gap type record lock request in order to let + an insert of an index record to wait until + there are no conflicting locks by other + transactions on the gap; note that this flag + remains set when the waiting lock is granted, + or if the lock is inherited to a neighboring + record */ +#define LOCK_PREDICATE 8192U /*!< Predicate lock */ +#define LOCK_PRDT_PAGE 16384U /*!< Page lock */ + + +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_MODE_MASK +# error +#endif +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_TYPE_MASK +# error +#endif +/* @} */ + +/** +Checks if the `mode` is LOCK_S or LOCK_X (possibly ORed with LOCK_WAIT or +LOCK_REC) which means the lock is a +Next Key Lock, a.k.a. LOCK_ORDINARY, as opposed to Predicate Lock, +GAP lock, Insert Intention or Record Lock. +@param mode A mode and flags, of a lock. 
+@return true if the only bits set in `mode` are LOCK_S or LOCK_X and optionally
+LOCK_WAIT or LOCK_REC */
+static inline bool lock_mode_is_next_key_lock(ulint mode)
+{
+  static_assert(LOCK_ORDINARY == 0, "LOCK_ORDINARY must be 0 (no flags)");
+  ut_ad((mode & LOCK_TABLE) == 0);
+  mode&= ~LOCK_WAIT;
+  ut_ad((mode & LOCK_WAIT) == 0);
+  ut_ad(((mode & ~(LOCK_MODE_MASK)) == LOCK_ORDINARY) ==
+        (mode == LOCK_S || mode == LOCK_X));
+  return (mode & ~(LOCK_MODE_MASK)) == LOCK_ORDINARY;
+}
+
+/** Lock struct; protected by lock_sys.latch */
+struct ib_lock_t
+{
+  /** the owner of the lock */
+  trx_t *trx;
+  /** other locks of the transaction; protected by
+  lock_sys.is_writer() and trx->mutex_is_owner(); @see trx_lock_t::trx_locks */
+  UT_LIST_NODE_T(ib_lock_t) trx_locks;
+
+  dict_index_t*	index;	/*!< index for a record lock */
+
+  ib_lock_t*	hash;	/*!< hash chain node for a record
+			lock. The link node in a singly linked
+			list, used during hashing. */
+
+  /** time(NULL) of the lock request creation.
+  Used for computing wait_time and diagnostics only.
+  Note: bogus durations may be reported
+  when the system time is adjusted! */
+  time_t requested_time;
+  /** Cumulated wait time in seconds.
+  Note: may be bogus when the system time is adjusted! */
+  ulint wait_time;
+
+  union {
+    lock_table_t tab_lock;	/*!< table lock */
+    lock_rec_t rec_lock;	/*!< record lock */
+  } un_member;			/*!< lock details */
+
+  ib_uint32_t type_mode;	/*!< lock type, mode, LOCK_GAP or
+				LOCK_REC_NOT_GAP,
+				LOCK_INSERT_INTENTION,
+				wait flag, ORed */
+
+  bool is_waiting() const
+  {
+    return(type_mode & LOCK_WAIT);
+  }
+
+  bool is_gap() const
+  {
+    return(type_mode & LOCK_GAP);
+  }
+
+  bool is_record_not_gap() const
+  {
+    return(type_mode & LOCK_REC_NOT_GAP);
+  }
+
+  /** @return true if the lock is a Next Key Lock */
+  bool is_next_key_lock() const
+  {
+    return !(type_mode & LOCK_TABLE) &&
+      lock_mode_is_next_key_lock(type_mode);
+  }
+
+  bool is_insert_intention() const
+  {
+    return(type_mode & LOCK_INSERT_INTENTION);
+  }
+
+  bool is_table() const { return type_mode & LOCK_TABLE; }
+
+  enum lock_mode mode() const
+  {
+    return(static_cast<lock_mode>(type_mode & LOCK_MODE_MASK));
+  }
+
+  bool is_rec_granted_exclusive_not_gap() const
+  {
+    return (type_mode & (LOCK_MODE_MASK | LOCK_GAP)) == LOCK_X;
+  }
+
+  /** Print the lock object into the given output stream.
+  @param[in,out]	out	the output stream
+  @return the given output stream. */
+  std::ostream& print(std::ostream& out) const;
+
+  const char* type_string() const
+  { return is_table() ? "LOCK_TABLE" : "LOCK_REC"; }
+};
+
+typedef UT_LIST_BASE_NODE_T(ib_lock_t) trx_lock_list_t;
+
+#endif /* lock0types_h */
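type_mode packs the basic mode (low 3 bits, LOCK_MODE_MASK) together with
the table/record flag and the gap/wait flags. A standalone sketch of how
the predicates above decode a hypothetical record-lock word (constants
copied from this header; the combination is invented):

    #include <cassert>

    int main()
    {
      const unsigned LOCK_MODE_MASK= 0x7, LOCK_X= 3, LOCK_TABLE= 8,
                     LOCK_WAIT= 256, LOCK_GAP= 512, LOCK_REC_NOT_GAP= 1024;

      /* a waiting exclusive record-only lock */
      unsigned type_mode= LOCK_X | LOCK_REC_NOT_GAP | LOCK_WAIT;

      assert((type_mode & LOCK_MODE_MASK) == LOCK_X); /* ib_lock_t::mode() */
      assert(type_mode & LOCK_WAIT);                  /* is_waiting() */
      assert(!(type_mode & LOCK_TABLE));              /* a record lock */
      /* is_rec_granted_exclusive_not_gap() looks only at mode and LOCK_GAP */
      assert((type_mode & (LOCK_MODE_MASK | LOCK_GAP)) == LOCK_X);
      return 0;
    }

diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h
new file mode 100644
index 00000000..22c0c963
--- /dev/null
+++ b/storage/innobase/include/log0crypt.h
@@ -0,0 +1,115 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (C) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.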
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file include/log0crypt.h
+Innodb log encrypt/decrypt
+
+Created 11/25/2013 Minli Zhu
+Modified Jan Lindström jan.lindstrom@mariadb.com
+MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation.
+*******************************************************/
+#pragma once
+
+#include "log0log.h"
+
+/** Initialize the redo log encryption key and random parameters
+when creating a new redo log.
+The random parameters will be persisted in the log header.
+@see log_crypt_write_header()
+@see log_crypt_read_header()
+@return whether the operation succeeded */
+bool log_crypt_init();
+
+/** Add the encryption information to the log header buffer.
+@param buf  part of log header buffer */
+void log_crypt_write_header(byte *buf);
+
+/** Read the encryption information from a redo log checkpoint buffer.
+@param buf  part of checkpoint buffer
+@return whether the operation was successful */
+bool log_crypt_read_header(const byte *buf);
+
+/** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info.
+@param[in]	buf	checkpoint buffer
+@return whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_checkpoint(const byte* buf);
+
+/** Decrypt a MariaDB 10.1 redo log block.
+@param[in,out]	buf		log block
+@param[in]	start_lsn	server start LSN
+@return whether the decryption was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn);
+
+/** Read the checkpoint crypto (version, msg and iv) info.
+@param[in]	buf	checkpoint buffer
+@return whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_read_checkpoint_buf(const byte* buf);
+
+/** Decrypt log blocks.
+@param[in,out]	buf	log blocks to decrypt
+@param[in]	lsn	log sequence number of the start of the buffer
+@param[in]	size	size of the buffer, in bytes
+@return whether the operation succeeded */
+ATTRIBUTE_COLD bool log_decrypt(byte* buf, lsn_t lsn, ulint size);
+
+/** Decrypt part of a log record.
+@param iv    initialization vector
+@param buf   buffer for the decrypted data
+@param data  the encrypted data
+@param len   length of the data, in bytes
+@return buf */
+byte *log_decrypt_buf(const byte *iv, byte *buf, const byte *data, uint len);
+
+/** Decrypt a log snippet.
+@param iv   initialization vector
+@param buf  buffer to be replaced with decrypted contents
+@param end  pointer past the end of buf */
+void log_decrypt_buf(const byte *iv, byte *buf, const byte *const end);
+
+/** Encrypt or decrypt a temporary file block.
+@param[in]	src	block to encrypt or decrypt
+@param[in]	size	size of the block
+@param[out]	dst	destination block
+@param[in]	offs	offset to block
+@param[in]	encrypt	true=encrypt; false=decrypt
+@return whether the operation succeeded */
+bool log_tmp_block_encrypt(
+	const byte*	src,
+	ulint		size,
+	byte*		dst,
+	uint64_t	offs,
+	bool		encrypt = true)
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
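log_tmp_block_encrypt() is its own inverse apart from the `encrypt` flag;
the inline wrapper declared just below simply flips it. A hypothetical
round trip over one temporary-file block (sketch only: block size and
offset are invented, and the encryption subsystem is assumed to be
initialised; error handling elided):

    byte src[4096], crypt[4096], clear[4096];
    const uint64_t offs= 4096 * 7;   /* hypothetical block offset */

    if (log_tmp_block_encrypt(src, sizeof src, crypt, offs)
        && log_tmp_block_decrypt(crypt, sizeof crypt, clear, offs))
      ut_ad(!memcmp(src, clear, sizeof src));   /* round trip is lossless */

+/** Decrypt a temporary file block.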
+@param[in]	src	block to decrypt
+@param[in]	size	size of the block
+@param[out]	dst	destination block
+@param[in]	offs	offset to block
+@return whether the operation succeeded */
+inline
+bool
+log_tmp_block_decrypt(
+	const byte*	src,
+	ulint		size,
+	byte*		dst,
+	uint64_t	offs)
+{
+	return(log_tmp_block_encrypt(src, size, dst, offs, false));
+}
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
new file mode 100644
index 00000000..f873eabf
--- /dev/null
+++ b/storage/innobase/include/log0log.h
@@ -0,0 +1,529 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.h
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0types.h"
+#include "os0file.h"
+#include "span.h"
+#include "my_atomic_wrapper.h"
+#include "srw_lock.h"
+#include <string>
+
+using st_::span;
+
+static const char LOG_FILE_NAME_PREFIX[] = "ib_logfile";
+static const char LOG_FILE_NAME[] = "ib_logfile0";
+
+/** Composes full path for a redo log file
+@param[in]	filename	name of the redo log file
+@return path with log file name*/
+std::string get_log_file_path(const char *filename= LOG_FILE_NAME);
+
+/** Delete log file.
+@param[in]	suffix	suffix of the file name */
+static inline void delete_log_file(const char* suffix)
+{
+  auto path = get_log_file_path(LOG_FILE_NAME_PREFIX).append(suffix);
+  os_file_delete_if_exists_func(path.c_str(), nullptr);
+}
+
+struct completion_callback;
+
+/** Ensure that the log has been written to the log file up to a given
+log entry (such as that of a transaction commit). Start a new write, or
+wait and check if an already running write is covering the request.
+@param lsn      log sequence number that should be included in the file write
+@param durable  whether the write needs to be durable
+@param callback log write completion callback */
+void log_write_up_to(lsn_t lsn, bool durable,
+                     const completion_callback *callback= nullptr);
+
+/** Write to the log file up to the last log entry.
+@param durable  whether to wait for a durable write to complete */
+void log_buffer_flush_to_disk(bool durable= true);
+
+
+/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */
+ATTRIBUTE_COLD void log_write_and_flush_prepare();
+
+/** Durably write the log up to log_sys.get_lsn(). */
+ATTRIBUTE_COLD void log_write_and_flush();
+
+/** Make a checkpoint */
+ATTRIBUTE_COLD void log_make_checkpoint();
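A hypothetical durable-commit call sequence using the API above (the
mini-transaction object `mtr` is assumed; real callers live in trx0trx.cc
and mtr0mtr.cc):

    /* sketch only: make everything up to this commit durable */
    const lsn_t commit_lsn= mtr.commit_lsn();  /* assumed mtr_t object */
    log_write_up_to(commit_lsn, true);         /* waits for a durable write */

    /* fire-and-forget variant: write out the whole log buffer without
       waiting for durability */
    log_buffer_flush_to_disk(false);

+/** Make a checkpoint at the latest lsn on shutdown.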
*/
+ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown();
+
+/**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+ATTRIBUTE_COLD void log_check_margins();
+
+/******************************************************//**
+Prints info of the log. */
+void
+log_print(
+/*======*/
+	FILE*	file);	/*!< in: file where to print */
+
+/** Offsets of a log file header */
+/* @{ */
+/** Log file header format identifier (32-bit unsigned big-endian integer).
+This used to be called LOG_GROUP_ID and always written as 0,
+because InnoDB never supported more than one copy of the redo log. */
+#define LOG_HEADER_FORMAT	0
+/** LSN of the start of data in this log file (with format version 1;
+in format version 0, it was called LOG_FILE_START_LSN and at offset 4). */
+#define LOG_HEADER_START_LSN	8
+/** A null-terminated string which will contain either the string 'ibbackup'
+and the creation time if the log file was created by mysqlbackup --restore,
+or the MySQL version that created the redo log file. */
+#define LOG_HEADER_CREATOR	16
+/** End of the log file creator field. */
+#define LOG_HEADER_CREATOR_END	48
+/* @} */
+
+struct log_t;
+
+/** File abstraction */
+class log_file_t
+{
+  friend log_t;
+  os_file_t m_file{OS_FILE_CLOSED};
+public:
+  log_file_t()= default;
+  log_file_t(os_file_t file) noexcept : m_file(file) {}
+
+  /** Open a file
+  @return file size in bytes
+  @retval 0 if not readable */
+  os_offset_t open(bool read_only) noexcept;
+  bool is_opened() const noexcept { return m_file != OS_FILE_CLOSED; }
+
+  dberr_t close() noexcept;
+  dberr_t read(os_offset_t offset, span<byte> buf) noexcept;
+  void write(os_offset_t offset, span<const byte> buf) noexcept;
+  bool flush() const noexcept { return os_file_flush(m_file); }
+#ifdef HAVE_PMEM
+  byte *mmap(bool read_only, const struct stat &st) noexcept;
+#endif
+};
+
+/** Redo log buffer */
+struct log_t
+{
+  /** The original (not version-tagged) InnoDB redo log format */
+  static constexpr uint32_t FORMAT_3_23= 0;
+  /** The MySQL 5.7.9/MariaDB 10.2.2 log format */
+  static constexpr uint32_t FORMAT_10_2= 1;
+  /** The MariaDB 10.3.2 log format. */
+  static constexpr uint32_t FORMAT_10_3= 103;
+  /** The MariaDB 10.4.0 log format.
*/
+  static constexpr uint32_t FORMAT_10_4= 104;
+  /** Encrypted MariaDB redo log */
+  static constexpr uint32_t FORMAT_ENCRYPTED= 1U << 31;
+  /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */
+  static constexpr uint32_t FORMAT_ENC_10_4= FORMAT_10_4 | FORMAT_ENCRYPTED;
+  /** The MariaDB 10.5.1 physical redo log format */
+  static constexpr uint32_t FORMAT_10_5= 0x50485953;
+  /** The MariaDB 10.5.1 physical format (only with innodb_encrypt_log=ON) */
+  static constexpr uint32_t FORMAT_ENC_10_5= FORMAT_10_5 | FORMAT_ENCRYPTED;
+  /** The MariaDB 10.8.0 variable-block-size redo log format */
+  static constexpr uint32_t FORMAT_10_8= 0x50687973;
+  /** The MariaDB 10.8.0 format with innodb_encrypt_log=ON */
+  static constexpr uint32_t FORMAT_ENC_10_8= FORMAT_10_8 | FORMAT_ENCRYPTED;
+
+  /** Location of the first checkpoint block */
+  static constexpr size_t CHECKPOINT_1= 4096;
+  /** Location of the second checkpoint block */
+  static constexpr size_t CHECKPOINT_2= 8192;
+  /** Start of record payload */
+  static constexpr lsn_t START_OFFSET= 12288;
+
+  /** smallest possible log sequence number in the current format
+  (used to be 2048 before FORMAT_10_8). */
+  static constexpr lsn_t FIRST_LSN= START_OFFSET;
+
+private:
+  /** The log sequence number of the last change of durable InnoDB files */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+  std::atomic<lsn_t> lsn;
+  /** the first guaranteed-durable log sequence number */
+  std::atomic<lsn_t> flushed_to_disk_lsn;
+  /** log sequence number when log resizing was initiated, or 0 */
+  std::atomic<lsn_t> resize_lsn;
+  /** set when there may be need to flush the log buffer, or
+  preflush buffer pool pages, or initiate a log checkpoint.
+  This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */
+  std::atomic<bool> check_flush_or_checkpoint_;
+
+
+#if defined(__aarch64__)
+/* On ARM, we do more spinning */
+typedef srw_spin_lock log_rwlock_t;
+#define LSN_LOCK_ATTR MY_MUTEX_INIT_FAST
+#else
+typedef srw_lock log_rwlock_t;
+#define LSN_LOCK_ATTR nullptr
+#endif
+
+public:
+  /** rw-lock protecting buf */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock_t latch;
+private:
+  /** Last written LSN */
+  lsn_t write_lsn;
+public:
+  /** log record buffer, written to by mtr_t::commit() */
+  byte *buf;
+  /** buffer for writing data to ib_logfile0, or nullptr if is_pmem()
+  In write_buf(), buf and flush_buf are swapped */
+  byte *flush_buf;
+  /** number of std::swap(buf, flush_buf) and writes from buf to log;
+  protected by latch.wr_lock() */
+  ulint write_to_log;
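The constants above fix the on-disk layout of ib_logfile0; a sketch of the
start of the file, derived only from the offsets defined in this struct:

    offset 0      log file header (LOG_HEADER_FORMAT, LOG_HEADER_START_LSN,
                  LOG_HEADER_CREATOR, ...)
    offset 4096   first checkpoint block  (CHECKPOINT_1)
    offset 8192   second checkpoint block (CHECKPOINT_2)
    offset 12288  start of log record payload (START_OFFSET)

Note that the smallest valid LSN in this format, FIRST_LSN, coincides with
START_OFFSET, so a freshly created log maps LSNs to file offsets one-to-one
until the first wrap-around.

+  /** Log sequence number when a log file overwrite (broken crash recovery)
+  was noticed. Protected by latch.wr_lock().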
*/
+  lsn_t overwrite_warned;
+
+  /** innodb_log_buffer_size (size of buf,flush_buf if !is_pmem(), in bytes) */
+  size_t buf_size;
+
+private:
+  /** Log file being constructed during resizing; protected by latch */
+  log_file_t resize_log;
+  /** size of resize_log; protected by latch */
+  lsn_t resize_target;
+  /** Buffer for writing to resize_log; @see buf */
+  byte *resize_buf;
+  /** Buffer for writing to resize_log; @see flush_buf */
+  byte *resize_flush_buf;
+
+  /** spin lock protecting lsn, buf_free in append_prepare() */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) pthread_mutex_t lsn_lock;
+  void init_lsn_lock() { pthread_mutex_init(&lsn_lock, LSN_LOCK_ATTR); }
+  void lock_lsn() { pthread_mutex_lock(&lsn_lock); }
+  void unlock_lsn() { pthread_mutex_unlock(&lsn_lock); }
+  void destroy_lsn_lock() { pthread_mutex_destroy(&lsn_lock); }
+
+public:
+  /** first free offset within buf use; protected by lsn_lock */
+  Atomic_relaxed<size_t> buf_free;
+  /** number of write requests (to buf); protected by exclusive lsn_lock */
+  ulint write_to_buf;
+  /** number of waits in append_prepare(); protected by lsn_lock */
+  ulint waits;
+  /** recommended maximum size of buf, after which the buffer is flushed */
+  size_t max_buf_free;
+
+  /** log file size in bytes, including the header */
+  lsn_t file_size;
+private:
+  /** the log sequence number at the start of the log file */
+  lsn_t first_lsn;
+#if defined __linux__ || defined _WIN32
+  /** The physical block size of the storage */
+  uint32_t block_size;
+#endif
+public:
+  /** format of the redo log: e.g., FORMAT_10_8 */
+  uint32_t format;
+  /** Log file */
+  log_file_t log;
+#if defined __linux__ || defined _WIN32
+  /** whether file system caching is enabled for the log */
+  my_bool log_buffered;
+# ifdef _WIN32
+  static constexpr bool log_maybe_unbuffered= true;
+# else
+  /** whether file system caching may be disabled */
+  bool log_maybe_unbuffered;
+# endif
+#endif
+
+  /** Fields involved in checkpoints @{ */
+  lsn_t log_capacity;	/*!< capacity of the log; if
+			the checkpoint age exceeds this, it is
+			a serious error because it is possible
+			we will then overwrite log and spoil
+			crash recovery */
+  lsn_t max_modified_age_async;
+			/*!< when this recommended
+			value for lsn -
+			buf_pool.get_oldest_modification()
+			is exceeded, we start an
+			asynchronous preflush of pool pages */
+  lsn_t max_checkpoint_age;
+			/*!< this is the maximum allowed value
+			for lsn - last_checkpoint_lsn when a
+			new query step is started */
+  /** latest completed checkpoint (protected by latch.wr_lock()) */
+  Atomic_relaxed<lsn_t> last_checkpoint_lsn;
+  /** next checkpoint LSN (protected by log_sys.latch) */
+  lsn_t next_checkpoint_lsn;
+  /** next checkpoint number (protected by latch.wr_lock()) */
+  ulint next_checkpoint_no;
+  /** whether a checkpoint is pending */
+  Atomic_relaxed<bool> checkpoint_pending;
+
+  /** buffer for checkpoint header */
+  byte *checkpoint_buf;
+  /* @} */
+
+  bool is_initialised() const noexcept { return max_buf_free != 0; }
+
+#ifdef HAVE_PMEM
+  bool is_pmem() const noexcept { return !flush_buf; }
+#else
+  static constexpr bool is_pmem() { return false; }
+#endif
+
+  bool is_opened() const noexcept { return log.is_opened(); }
+
+  /** @return LSN at which log resizing was started and is still in progress
+  @retval 0 if no log resizing is in progress */
+  lsn_t resize_in_progress() const noexcept
+  { return resize_lsn.load(std::memory_order_relaxed); }
+
+  /** Status of resize_start() */
+  enum resize_start_status {
+    RESIZE_NO_CHANGE, RESIZE_IN_PROGRESS, RESIZE_STARTED,
RESIZE_FAILED + }; + + /** Start resizing the log and release the exclusive latch. + @param size requested new file_size + @return whether the resizing was started successfully */ + resize_start_status resize_start(os_offset_t size) noexcept; + + /** Abort any resize_start(). */ + void resize_abort() noexcept; + + /** Replicate a write to the log. + @param lsn start LSN + @param end end of the mini-transaction + @param len length of the mini-transaction + @param seq offset of the sequence bit from the end */ + inline void resize_write(lsn_t lsn, const byte *end, + size_t len, size_t seq) noexcept; + + /** Write resize_buf to resize_log. + @param length the used length of resize_buf */ + ATTRIBUTE_COLD void resize_write_buf(size_t length) noexcept; + + /** Rename a log file after resizing. + @return whether an error occurred */ + static bool resize_rename() noexcept; + +#ifdef HAVE_PMEM + /** @return pointer for writing to resize_buf + @retval nullptr if no PMEM based resizing is active */ + inline byte *resize_buf_begin(lsn_t lsn) const noexcept; + /** @return end of resize_buf */ + inline const byte *resize_buf_end() const noexcept + { return resize_buf + resize_target; } + + /** Initialise the redo log subsystem. */ + void create_low(); + /** Initialise the redo log subsystem. + @return whether the initialisation succeeded */ + bool create() { create_low(); return true; } + + /** Attach a log file. + @return whether the memory allocation succeeded */ + bool attach(log_file_t file, os_offset_t size); +#else + /** Initialise the redo log subsystem. + @return whether the initialisation succeeded */ + bool create(); + /** Attach a log file. */ + void attach_low(log_file_t file, os_offset_t size); + bool attach(log_file_t file, os_offset_t size) + { attach_low(file, size); return true; } +#endif + +#if defined __linux__ || defined _WIN32 + /** Try to enable or disable file system caching (update log_buffered) */ + void set_buffered(bool buffered); +#endif + + void close_file(); + + /** Calculate the checkpoint safety margins. */ + static void set_capacity(); + + /** Write a log file header. + @param buf log header buffer + @param lsn log sequence number corresponding to log_sys.START_OFFSET + @param encrypted whether the log is encrypted */ + static void header_write(byte *buf, lsn_t lsn, bool encrypted); + + lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const + { return lsn.load(order); } + + lsn_t get_flushed_lsn(std::memory_order order= std::memory_order_acquire) + const noexcept + { return flushed_to_disk_lsn.load(order); } + + /** Initialize the LSN on initial log file creation. */ + lsn_t init_lsn() noexcept + { + latch.wr_lock(SRW_LOCK_CALL); + const lsn_t lsn{get_lsn()}; + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + write_lsn= lsn; + latch.wr_unlock(); + return lsn; + } + + void set_recovered_lsn(lsn_t lsn) noexcept + { +#ifndef SUX_LOCK_GENERIC + ut_ad(latch.is_write_locked()); +#endif /* SUX_LOCK_GENERIC */ + write_lsn= lsn; + this->lsn.store(lsn, std::memory_order_relaxed); + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + } + +#ifdef HAVE_PMEM + /** Persist the log. 
+  @param lsn  desired new value of flushed_to_disk_lsn */
+  inline void persist(lsn_t lsn) noexcept;
+#endif
+
+  bool check_flush_or_checkpoint() const
+  {
+    return UNIV_UNLIKELY
+      (check_flush_or_checkpoint_.load(std::memory_order_relaxed));
+  }
+  void set_check_flush_or_checkpoint(bool flag= true)
+  { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); }
+
+  /** Make previous write_buf() durable and update flushed_to_disk_lsn. */
+  bool flush(lsn_t lsn) noexcept;
+
+  /** Shut down the redo log subsystem. */
+  void close();
+
+#if defined __linux__ || defined _WIN32
+  /** @return the physical block size of the storage */
+  size_t get_block_size() const noexcept
+  { ut_ad(block_size); return block_size; }
+  /** Set the log block size for file I/O. */
+  void set_block_size(uint32_t size) noexcept { block_size= size; }
+#else
+  /** @return the physical block size of the storage */
+  static size_t get_block_size() { return 512; }
+#endif
+
+private:
+  /** Wait in append_prepare() for buffer to become available
+  @param ex  whether log_sys.latch is exclusively locked */
+  ATTRIBUTE_COLD static void append_prepare_wait(bool ex) noexcept;
+public:
+  /** Reserve space in the log buffer for appending data.
+  @tparam pmem  log_sys.is_pmem()
+  @param size   total length of the data to append(), in bytes
+  @param ex     whether log_sys.latch is exclusively locked
+  @return the start LSN and the buffer position for append() */
+  template<bool pmem>
+  inline std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept;
+
+  /** Append a string of bytes to the redo log.
+  @param d     destination
+  @param s     string of bytes
+  @param size  length of str, in bytes */
+  void append(byte *&d, const void *s, size_t size) noexcept
+  {
+#ifndef SUX_LOCK_GENERIC
+    ut_ad(latch.is_locked());
+#endif
+    ut_ad(d + size <= buf + (is_pmem() ? file_size : buf_size));
+    memcpy(d, s, size);
+    d+= size;
+  }
+
+  /** Set the log file format. */
+  void set_latest_format(bool encrypted) noexcept
+  { format= encrypted ? FORMAT_ENC_10_8 : FORMAT_10_8; }
+  /** @return whether the redo log is encrypted */
+  bool is_encrypted() const noexcept { return format & FORMAT_ENCRYPTED; }
+  /** @return whether the redo log is in the latest format */
+  bool is_latest() const noexcept
+  { return (~FORMAT_ENCRYPTED & format) == FORMAT_10_8; }
+
+  /** @return capacity in bytes */
+  lsn_t capacity() const noexcept { return file_size - START_OFFSET; }
+
+  /** Set the LSN of the log file at file creation. */
+  void set_first_lsn(lsn_t lsn) noexcept { write_lsn= first_lsn= lsn; }
+  /** @return the first LSN of the log file */
+  lsn_t get_first_lsn() const noexcept { return first_lsn; }
+
+  /** Determine the sequence bit at a log sequence number */
+  byte get_sequence_bit(lsn_t lsn) const noexcept
+  {
+    ut_ad(lsn >= first_lsn);
+    return !(((lsn - first_lsn) / capacity()) & 1);
+  }
+
+  /** Calculate the offset of a log sequence number.
+  @param lsn  log sequence number
+  @return byte offset within ib_logfile0 */
+  lsn_t calc_lsn_offset(lsn_t lsn) const noexcept
+  {
+    ut_ad(lsn >= first_lsn);
+    return START_OFFSET + (lsn - first_lsn) % capacity();
+  }
+
+  /** Write checkpoint information and invoke latch.wr_unlock().
+  @param end_lsn  start LSN of the FILE_CHECKPOINT mini-transaction */
+  inline void write_checkpoint(lsn_t end_lsn) noexcept;
+
+  /** Write buf to ib_logfile0.
+  @tparam release_latch whether to invoke latch.wr_unlock()
+  @return the current log sequence number */
+  template<bool release_latch> inline lsn_t write_buf() noexcept;
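calc_lsn_offset() and get_sequence_bit() above implement the circular
mapping of LSNs onto the single log file. A standalone worked example
with invented numbers (first_lsn = 12288, a 1 MiB file, so
capacity() = 1048576 - 12288):

    #include <cassert>
    #include <cstdint>

    int main()
    {
      const uint64_t START_OFFSET= 12288, first_lsn= 12288,
                     file_size= 1 << 20, capacity= file_size - START_OFFSET;

      auto offset= [&](uint64_t lsn)
      { return START_OFFSET + (lsn - first_lsn) % capacity; };
      auto seq_bit= [&](uint64_t lsn)
      { return !(((lsn - first_lsn) / capacity) & 1); };

      assert(offset(first_lsn) == START_OFFSET);  /* start of payload */
      /* one full capacity later, the offset wraps back to the start... */
      assert(offset(first_lsn + capacity) == START_OFFSET);
      /* ...and the sequence bit flips on each wrap, which lets recovery
         tell freshly written log apart from older contents */
      assert(seq_bit(first_lsn) == 1 && seq_bit(first_lsn + capacity) == 0);
      return 0;
    }

+  /** Create the log.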
*/
+  void create(lsn_t lsn) noexcept;
+};
+
+/** Redo log system */
+extern log_t log_sys;
+
+/** Wait for a log checkpoint if needed.
+NOTE that this function may only be called while not holding
+any synchronization objects except dict_sys.latch. */
+void log_free_check();
+
+/** Release the latches that protect log resizing. */
+void log_resize_release();
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
new file mode 100644
index 00000000..6d75e15a
--- /dev/null
+++ b/storage/innobase/include/log0recv.h
@@ -0,0 +1,491 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.h
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "ut0new.h"
+#include "buf0types.h"
+#include "log0log.h"
+#include "mtr0types.h"
+
+#include <deque>
+#include <map>
+
+/** @return whether recovery is currently running. */
+#define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on)
+
+ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Apply any buffered redo log to a page.
+@param space  tablespace
+@param bpage  buffer pool page
+@return whether the page was recovered correctly */
+bool recv_recover_page(fil_space_t* space, buf_page_t* bpage);
+
+/** Start recovering from a redo log checkpoint.
+@return error code or DB_SUCCESS */
+dberr_t recv_recovery_from_checkpoint_start();
+
+/** Report an operation to create, delete, or rename a file during backup.
+@param[in]	space_id	tablespace identifier
+@param[in]	type		file operation redo log type
+@param[in]	name		file name (not NUL-terminated)
+@param[in]	len		length of name, in bytes
+@param[in]	new_name	new file name (NULL if not rename)
+@param[in]	new_len		length of new_name, in bytes (0 if NULL) */
+extern void (*log_file_op)(uint32_t space_id, int type,
+			   const byte* name, ulint len,
+			   const byte* new_name, ulint new_len);
+
+/** Report an operation which does undo log tablespace truncation
+during backup
+@param space_id	undo tablespace identifier */
+extern void (*undo_space_trunc)(uint32_t space_id);
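The function pointers above are hooks for backup tools such as mariabackup,
which observe file operations in the parsed redo log. A hypothetical
handler (the name and the bookkeeping are invented; only the signature is
taken from the declaration above):

    /* sketch: a hypothetical backup hook */
    static void backup_file_op(uint32_t space_id, int type,
                               const byte *name, ulint len,
                               const byte *new_name, ulint new_len)
    {
      /* record (space_id, type, name[, new_name]) so that the backup can
         replay CREATE/DELETE/RENAME when data files are copied */
    }

    /* installed once, before the backup starts scanning the log: */
    log_file_op= backup_file_op;

+/** Report an operation which does INIT_PAGE for page0 during backup.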
+@param space_id	tablespace identifier */
+extern void (*first_page_init)(uint32_t space_id);
+
+/** Stored redo log record */
+struct log_rec_t
+{
+  log_rec_t(lsn_t lsn) : next(nullptr), lsn(lsn) { ut_ad(lsn); }
+  log_rec_t()= delete;
+  log_rec_t(const log_rec_t&)= delete;
+  log_rec_t &operator=(const log_rec_t&)= delete;
+
+  /** next record */
+  log_rec_t *next;
+  /** mtr_t::commit_lsn() of the mini-transaction */
+  const lsn_t lsn;
+};
+
+struct recv_dblwr_t
+{
+  /** Add a page frame to the doublewrite recovery buffer. */
+  void add(byte *page) { pages.push_front(page); }
+
+  /** Validate the page.
+  @param page_id  page identifier
+  @param page     page contents
+  @param space    the tablespace of the page (not available for page 0)
+  @param tmp_buf  2*srv_page_size for decrypting and decompressing any
+  page_compressed or encrypted pages
+  @return whether the page is valid */
+  bool validate_page(const page_id_t page_id, const byte *page,
+                     const fil_space_t *space, byte *tmp_buf);
+
+  /** Find a doublewrite copy of a page.
+  @param page_id  page identifier
+  @param space    tablespace (not available for page_id.page_no()==0)
+  @param tmp_buf  2*srv_page_size for decrypting and decompressing any
+  page_compressed or encrypted pages
+  @return page frame
+  @retval NULL if no valid page for page_id was found */
+  byte* find_page(const page_id_t page_id, const fil_space_t *space= NULL,
+                  byte *tmp_buf= NULL);
+
+  /** Restore the first page of the given tablespace from
+  doublewrite buffer.
+  @param space_id  tablespace identifier
+  @param name      tablespace filepath
+  @param file      tablespace file handle
+  @return whether the operation failed */
+  bool restore_first_page(uint32_t space_id, const char *name, os_file_t file);
+
+  typedef std::deque<byte*, ut_allocator<byte*> > list;
+
+  /** Recovered doublewrite buffer page frames */
+  list pages;
+};
+
+/** recv_sys.pages entry; protected by recv_sys.mutex */
+struct page_recv_t
+{
+  /** Recovery status: 0=not in progress, 1=log is being applied,
+  -1=log has been applied and the entry may be erased.
+  Transitions from 1 to -1 are NOT protected by recv_sys.mutex. */
+  Atomic_relaxed<int8_t> being_processed{0};
+  /** Whether reading the page will be skipped */
+  bool skip_read= false;
+  /** Latest written byte offset when applying the log records.
+  @see mtr_t::m_last_offset */
+  uint16_t last_offset= 1;
+  /** log records for a page */
+  class recs_t
+  {
+    /** The first log record */
+    log_rec_t *head= nullptr;
+    /** The last log record */
+    log_rec_t *tail= nullptr;
+    friend struct page_recv_t;
+  public:
+    /** Append a redo log snippet for the page
+    @param recs  log snippet */
+    void append(log_rec_t* recs)
+    {
+      if (tail)
+        tail->next= recs;
+      else
+        head= recs;
+      tail= recs;
+    }
+    /** Remove the last records for the page
+    @param start_lsn  start of the removed log */
+    ATTRIBUTE_COLD void rewind(lsn_t start_lsn);
+
+    /** @return the last log snippet */
+    const log_rec_t* last() const { return tail; }
+    /** @return the last log snippet */
+    log_rec_t* last() { return tail; }
+
+    class iterator
+    {
+      log_rec_t *cur;
+    public:
+      iterator(log_rec_t* rec) : cur(rec) {}
+      log_rec_t* operator*() const { return cur; }
+      iterator &operator++() { cur= cur->next; return *this; }
+      bool operator!=(const iterator& i) const { return cur != i.cur; }
+    };
+    iterator begin() { return head; }
+    iterator end() { return NULL; }
+    bool empty() const { ut_ad(!head == !tail); return !head; }
+    /** Clear and free the records; @see recv_sys_t::add() */
+    void clear();
+  } log;
+
+  /** Trim old log records for a page.
+  @param start_lsn  oldest log sequence number to preserve
+  @return whether all the log for the page was trimmed */
+  inline bool trim(lsn_t start_lsn);
+  /** Ignore any earlier redo log records for this page. */
+  inline void will_not_read();
+};
+
+/** A page initialization operation that was parsed from the redo log */
+struct recv_init
+{
+  /** log sequence number of the page initialization */
+  lsn_t lsn;
+  /** Whether btr_page_create() avoided a read of the page.
+  At the end of the last recovery batch, mark_ibuf_exist()
+  will mark pages for which this flag is set. */
+  bool created;
+};
+
+/** Recovery system data structure */
+struct recv_sys_t
+{
+  using init= recv_init;
+
+  /** mutex protecting this as well as some of page_recv_t */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+private:
+  /** set when finding a corrupt log block or record, or there is a
+  log parsing buffer overflow */
+  bool found_corrupt_log;
+  /** set when an inconsistency with the file system contents is detected
+  during log scan or apply */
+  bool found_corrupt_fs;
+public:
+  /** @return maximum guaranteed size of a mini-transaction on recovery */
+  static constexpr size_t MTR_SIZE_MAX{1U << 20};
+
+  /** whether we are applying redo log records during crash recovery */
+  bool recovery_on;
+  /** whether recv_recover_page(), invoked from buf_page_t::read_complete(),
+  should apply log records*/
+  bool apply_log_recs;
+  /** number of bytes in log_sys.buf */
+  size_t len;
+  /** start offset of non-parsed log records in log_sys.buf */
+  size_t offset;
+  /** log sequence number of the first non-parsed record */
+  lsn_t lsn;
+  /** log sequence number of the last parsed mini-transaction */
+  lsn_t scanned_lsn;
+  /** log sequence number at the end of the FILE_CHECKPOINT record, or 0 */
+  lsn_t file_checkpoint;
+  /** the time when progress was last reported */
+  time_t progress_time;
+
+  using map = std::map<const page_id_t, page_recv_t,
+                       std::less<const page_id_t>,
+                       ut_allocator<std::pair<const page_id_t, page_recv_t>>>;
+  /** buffered records waiting to be applied to pages */
+  map pages;
+
+private:
+  /** iterator to pages, used by parse() */
+  map::iterator pages_it;
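page_recv_t::recs_t above is a minimal intrusive singly-linked list of
per-page log snippets. A standalone sketch of the same append/iterate
pattern (simplified node type; the LSNs are invented):

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    struct rec { rec *next; uint64_t lsn; };

    int main()
    {
      rec a{nullptr, 100}, b{nullptr, 200};
      rec *head= nullptr, *tail= nullptr;

      for (rec *r : {&a, &b}) {      /* like recs_t::append() */
        if (tail) tail->next= r; else head= r;
        tail= r;
      }

      uint64_t last= 0;              /* like recs_t::begin()..end() */
      for (rec *r= head; r; r= r->next) {
        assert(r->lsn > last);       /* appended in parse order, i.e.
                                        ascending LSN */
        last= r->lsn;
      }
      return last != 200;
    }

+  /** Process a record that indicates that a tablespace size is being shrunk.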
+  @param page_id  first page that is not in the file
+  @param lsn      log sequence number of the shrink operation */
+  inline void trim(const page_id_t page_id, lsn_t lsn);
+
+  /** Undo tablespaces for which truncate has been logged
+  (indexed by page_id_t::space() - srv_undo_space_id_start) */
+  struct trunc
+  {
+    /** log sequence number of FILE_CREATE, or 0 if none */
+    lsn_t lsn;
+    /** truncated size of the tablespace, or 0 if not truncated */
+    unsigned pages;
+  } truncated_undo_spaces[127];
+
+public:
+  /** The contents of the doublewrite buffer */
+  recv_dblwr_t dblwr;
+
+  __attribute__((warn_unused_result))
+  inline dberr_t read(os_offset_t offset, span<byte> buf);
+  inline size_t files_size();
+  void close_files();
+
+  /** Advance pages_it if it matches the iterator */
+  void pages_it_invalidate(const map::iterator &p)
+  {
+    mysql_mutex_assert_owner(&mutex);
+    if (pages_it == p)
+      pages_it++;
+  }
+  /** Invalidate pages_it if it points to the given tablespace */
+  void pages_it_invalidate(uint32_t space_id)
+  {
+    mysql_mutex_assert_owner(&mutex);
+    if (pages_it != pages.end() && pages_it->first.space() == space_id)
+      pages_it= pages.end();
+  }
+
+private:
+  /** Attempt to initialize a page based on redo log records.
+  @param p     iterator
+  @param mtr   mini-transaction
+  @param b     pre-allocated buffer pool block
+  @param init  page initialization
+  @return the recovered block
+  @retval nullptr if the page cannot be initialized based on log records
+  @retval -1      if the page cannot be recovered due to corruption */
+  inline buf_block_t *recover_low(const map::iterator &p, mtr_t &mtr,
+                                  buf_block_t *b, init &init);
+  /** Attempt to initialize a page based on redo log records.
+  @param page_id  page identifier
+  @return the recovered block
+  @retval nullptr if the page cannot be initialized based on log records
+  @retval -1      if the page cannot be recovered due to corruption */
+  ATTRIBUTE_COLD buf_block_t *recover_low(const page_id_t page_id);
+
+  /** All found log files (multiple ones are possible if we are upgrading
+  from before MariaDB Server 10.5.1) */
+  std::vector<log_file_t> files;
+
+  /** Base node of the redo block list.
+  List elements are linked via buf_block_t::unzip_LRU. */
+  UT_LIST_BASE_NODE_T(buf_block_t) blocks;
+
+  /** Allocate a block from the buffer pool for recv_sys.pages */
+  ATTRIBUTE_COLD buf_block_t *add_block();
+
+  /** Wait for buffer pool to become available.
+  @param pages  number of buffer pool pages needed */
+  ATTRIBUTE_COLD void wait_for_pool(size_t pages);
+
+  /** Free log for processed pages. */
+  void garbage_collect();
+
+  /** Apply a recovery batch.
+  @param space_id   current tablespace identifier
+  @param space      current tablespace
+  @param free_block spare buffer block
+  @param last_batch whether it is possible to write more redo log
+  @return whether the caller must provide a new free_block */
+  bool apply_batch(uint32_t space_id, fil_space_t *&space,
+                   buf_block_t *&free_block, bool last_batch);
+
+public:
+  /** Apply buffered log to persistent data pages.
+  @param last_batch whether it is possible to write more redo log */
+  void apply(bool last_batch);
+
+#ifdef UNIV_DEBUG
+  /** whether all redo log in the current batch has been applied */
+  bool after_apply= false;
+#endif
+  /** Initialize the redo log recovery subsystem. */
+  void create();
+
+  /** Free most recovery data structures. */
+  void debug_free();
+
+  /** Clean up after create() */
+  void close();
+
+  bool is_initialised() const { return scanned_lsn != 0; }
+
+  /** Find the latest checkpoint.
+  @return error code or DB_SUCCESS */
+  dberr_t find_checkpoint();
+
+  /** Register a redo log snippet for a page.
+  @param it        page iterator
+  @param start_lsn start LSN of the mini-transaction
+  @param lsn       @see mtr_t::commit_lsn()
+  @param l         redo log snippet
+  @param len       length of l, in bytes
+  @return whether we ran out of memory */
+  bool add(map::iterator it, lsn_t start_lsn, lsn_t lsn,
+           const byte *l, size_t len);
+
+  /** Parsing result */
+  enum parse_mtr_result {
+    /** a record was successfully parsed */
+    OK,
+    /** the log ended prematurely (need to read more) */
+    PREMATURE_EOF,
+    /** the end of the log was reached */
+    GOT_EOF,
+    /** parse(l, false) ran out of memory */
+    GOT_OOM
+  };
+
+private:
+  /** Parse and register one log_t::FORMAT_10_8 mini-transaction.
+  @tparam store     whether to store the records
+  @param l          log data source
+  @param if_exists  if store: whether to check if the tablespace exists */
+  template<typename source,bool store>
+  inline parse_mtr_result parse(source &l, bool if_exists) noexcept;
+
+  /** Rewind a mini-transaction when parse() runs out of memory.
+  @param l      log data source
+  @param begin  start of the mini-transaction */
+  template<typename source>
+  ATTRIBUTE_COLD void rewind(source &l, source &begin) noexcept;
+
+  /** Report progress in terms of LSN or pages remaining */
+  ATTRIBUTE_COLD void report_progress() const;
+public:
+  /** Parse and register one log_t::FORMAT_10_8 mini-transaction,
+  handling log_sys.is_pmem() buffer wrap-around.
+  @tparam store     whether to store the records
+  @param if_exists  if store: whether to check if the tablespace exists */
+  template<bool store>
+  static parse_mtr_result parse_mtr(bool if_exists) noexcept;
+
+  /** Parse and register one log_t::FORMAT_10_8 mini-transaction,
+  handling log_sys.is_pmem() buffer wrap-around.
+  @tparam store     whether to store the records
+  @param if_exists  if store: whether to check if the tablespace exists */
+  template<bool store>
+  static parse_mtr_result parse_pmem(bool if_exists) noexcept
+#ifdef HAVE_PMEM
+  ;
+#else
+  { return parse_mtr<store>(if_exists); }
+#endif
+
+  /** Erase log records for a page. */
+  void erase(map::iterator p);
+
+  /** Clear a fully processed set of stored redo log records. */
+  void clear();
+
+  /** Determine whether redo log recovery progress should be reported.
+  @param time  the current time
+  @return whether progress should be reported
+  (the last report was at least 15 seconds ago) */
+  bool report(time_t time);
+
+  /** The alloc() memory alignment, in bytes */
+  static constexpr size_t ALIGNMENT= sizeof(size_t);
+
+  /** Free a redo log snippet.
+  @param data  buffer allocated in add() */
+  inline void free(const void *data);
+
+  /** Remove records for a corrupted page.
+  This function should only be called when innodb_force_recovery is set.
+  @param page_id  corrupted page identifier */
+  ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id);
+
+  /** Flag data file corruption during recovery. */
+  ATTRIBUTE_COLD void set_corrupt_fs();
+  /** Flag log file corruption during recovery. */
+  ATTRIBUTE_COLD void set_corrupt_log();
+
+  /** @return whether data file corruption was found */
+  bool is_corrupt_fs() const { return UNIV_UNLIKELY(found_corrupt_fs); }
+  /** @return whether log file corruption was found */
+  bool is_corrupt_log() const { return UNIV_UNLIKELY(found_corrupt_log); }
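A heavily simplified, hypothetical driver loop around parse_mtr() above
(sketch only: the real loop in log0recv.cc also refills log_sys.buf,
switches between storing and applying, and uses parse_pmem() for
memory-mapped logs; the refill step is elided here):

    bool eof= false;
    while (!eof)
      switch (recv_sys_t::parse_mtr<true>(false)) {
      case recv_sys_t::OK:            /* one mini-transaction stored */
        continue;
      case recv_sys_t::PREMATURE_EOF: /* refill log_sys.buf, then retry */
        continue;
      case recv_sys_t::GOT_OOM:       /* apply stored log to free memory */
        recv_sys.apply(false);
        continue;
      case recv_sys_t::GOT_EOF:
        eof= true;                    /* end of the durable log */
      }

+  /** Attempt to initialize a page based on redo log records.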
+ @param page_id page identifier + @return the recovered block + @retval nullptr if the page cannot be initialized based on log records + @retval -1 if the page cannot be recovered due to corruption */ + buf_block_t *recover(const page_id_t page_id) + { + return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr; + } + + /** Try to recover a tablespace that was not readable earlier + @param p iterator + @param name tablespace file name + @param free_block spare buffer block + @return recovered tablespace + @retval nullptr if recovery failed */ + fil_space_t *recover_deferred(const map::iterator &p, + const std::string &name, + buf_block_t *&free_block); +}; + +/** The recovery system */ +extern recv_sys_t recv_sys; + +/** If the following is TRUE, the buffer pool file pages must be invalidated +after recovery and no ibuf operations are allowed; this will be set if +recv_sys.pages becomes too full, and log records must be merged +to file pages already before the recovery is finished: in this case no +ibuf operations are allowed, as they could modify the pages read in the +buffer pool before the pages have been recovered to the up-to-date state. + +TRUE means that recovery is running and no operations on the log files +are allowed yet: the variable name is misleading. */ +extern bool recv_no_ibuf_operations; +/** TRUE when recv_init_crash_recovery() has been called. */ +extern bool recv_needed_recovery; +#ifdef UNIV_DEBUG +/** whether writing to the redo log is forbidden; +protected by exclusive log_sys.latch. */ +extern bool recv_no_log_write; +#endif /* UNIV_DEBUG */ + +/** TRUE if buf_page_is_corrupted() should check if the log sequence +number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by +recv_recovery_from_checkpoint_start(). */ +extern bool recv_lsn_checks_on; diff --git a/storage/innobase/include/log0types.h b/storage/innobase/include/log0types.h new file mode 100644 index 00000000..df87968d --- /dev/null +++ b/storage/innobase/include/log0types.h @@ -0,0 +1,38 @@ +/***************************************************************************** + +Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0types.h +Log types + +Created 2013-03-15 Sunny Bains +*******************************************************/ + +#ifndef log0types_h +#define log0types_h + +#include "univ.i" + +/* Type used for all log sequence number storage and arithmetics */ +typedef ib_uint64_t lsn_t; + +#define LSN_MAX IB_UINT64_MAX + +#define LSN_PF UINT64PF + +#endif /* log0types_h */ diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h new file mode 100644 index 00000000..79cbd7d1 --- /dev/null +++ b/storage/innobase/include/mach0data.h @@ -0,0 +1,375 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/mach0data.h +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef mach0data_h +#define mach0data_h + +#include "univ.i" +#include "mtr0types.h" + +#ifndef UNIV_INNOCHECKSUM + +/* The data and all fields are always stored in a database file +in the same format: ascii, big-endian, ... . +All data in the files MUST be accessed using the functions in this +module. */ + +/*******************************************************//** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /*!< in: pointer to byte where to store */ + ulint n); /*!< in: ulint integer to be stored, >= 0, < 256 */ +/** The following function is used to fetch data from one byte. +@param[in] b pointer to a byte to read +@return ulint integer, >= 0, < 256 */ +UNIV_INLINE +uint8_t +mach_read_from_1( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lower address. */ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /*!< in: pointer to two bytes where to store */ + ulint n); /*!< in: ulint integer to be stored, >= 0, < 64k */ +#endif /* !UNIV_INNOCHECKSUM */ +/** The following function is used to fetch data from 2 consecutive +bytes. The most significant byte is at the lowest address. 
+@param[in] b pointer to 2 bytes where to store +@return 2-byte integer, >= 0, < 64k */ +UNIV_INLINE +uint16_t +mach_read_from_2( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); + +#ifndef UNIV_INNOCHECKSUM +/********************************************************//** +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. +@return 16-bit integer in canonical format */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + ulint n) /*!< in: integer in machine-dependent format */ + MY_ATTRIBUTE((const)); +/********************************************************//** +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. +@return integer in machine-dependent format */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + uint16 n) /*!< in: 16-bit integer in canonical format */ + MY_ATTRIBUTE((const)); +/*******************************************************//** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /*!< in: pointer to 3 bytes where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/** The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 3 bytes to read +@return 32 bit integer */ +UNIV_INLINE +uint32_t +mach_read_from_3( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /*!< in: pointer to four bytes where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/** The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 4 bytes to read +@return 32 bit integer */ +UNIV_INLINE +uint32_t +mach_read_from_4( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a ulint in a compressed form (1..5 bytes). +@return stored size in bytes */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + byte* b, /*!< in: pointer to memory where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/*********************************************************//** +Returns the size of an ulint when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + ulint n) /*!< in: ulint integer to be stored */ + MY_ATTRIBUTE((const)); +/** Read a 32-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint32_t +mach_read_next_compressed( + const byte** b); +/*******************************************************//** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. 
*/ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /*!< in: pointer to 6 bytes where to store */ + ib_uint64_t id); /*!< in: 48-bit integer */ +/********************************************************//** +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. +@return 48-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_6( +/*=============*/ + const byte* b) /*!< in: pointer to 6 bytes */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /*!< in: pointer to 7 bytes where to store */ + ib_uint64_t n); /*!< in: 56-bit integer */ +/********************************************************//** +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. +@return 56-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_7( +/*=============*/ + const byte* b) /*!< in: pointer to 7 bytes */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + void* b, /*!< in: pointer to 8 bytes where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/********************************************************//** +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. +@return 64-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_8( +/*=============*/ + const byte* b) /*!< in: pointer to 8 bytes */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a 64-bit integer in a compressed form (5..9 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_compressed( +/*======================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/** Read a 64-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_next_compressed( + const byte** b); +/*********************************************************//** +Writes a 64-bit integer in a compressed form (1..11 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_much_compressed( +/*===========================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_much_compressed( +/*==========================*/ + const byte* b) /*!< in: pointer to memory from where to read */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************//** +Reads a double. It is stored in a little-endian format. 
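+On big-endian hosts the byte order is reversed while copying; on
+little-endian hosts the bytes are copied as they are.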
+@return double read */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + const byte* b) /*!< in: pointer to memory from where to read */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a double. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /*!< in: pointer to memory where to write */ + double d); /*!< in: double */ +/*********************************************************//** +Reads a float. It is stored in a little-endian format. +@return float read */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + const byte* b) /*!< in: pointer to memory from where to read */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /*!< in: pointer to memory where to write */ + float d); /*!< in: float */ +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + const byte* buf, /*!< in: from where to read */ + ulint buf_size) /*!< in: from how many bytes to read */ + MY_ATTRIBUTE((warn_unused_result)); + + +/** Reads a 64 bit stored in big endian format +@param buf From where to read +@return uint64_t */ +UNIV_INLINE +uint64_t +mach_read_uint64_little_endian(const byte* buf) +{ +#ifdef WORDS_BIGENDIAN + return + uint64_t(buf[0]) | uint64_t(buf[1]) << 8 | + uint64_t(buf[2]) << 16 | uint64_t(buf[3]) << 24 | + uint64_t(buf[4]) << 32 | uint64_t(buf[5]) << 40 | + uint64_t(buf[6]) << 48 | uint64_t(buf[7]) << 56; +#else + uint64_t n; + memcpy(&n, buf, sizeof(uint64_t)); + return n; +#endif +} + +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint dest_size, /*!< in: into how many bytes to write */ + ulint n); /*!< in: unsigned long int to write */ +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + const byte* buf) /*!< in: from where to read */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint n); /*!< in: unsigned long int to write */ +/*********************************************************//** +Convert integral type from storage byte order (big endian) to +host byte order. +@return integer value */ +UNIV_INLINE +ib_uint64_t +mach_read_int_type( +/*===============*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + ibool unsigned_type); /*!< in: signed or unsigned flag */ + +/************************************************************* +Convert a ulonglong integer from host byte order to (big-endian) +storage byte order. 
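+For signed values, the sign bit of the most significant byte is flipped,
+so that the stored bytes compare correctly as unsigned strings.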
*/ +UNIV_INLINE +void +mach_write_ulonglong( +/*=================*/ + byte* dest, /*!< in: where to write */ + ulonglong src, /*!< in: where to read from */ + ulint len, /*!< in: length of dest */ + bool usign); /*!< in: signed or unsigned flag */ + +#endif /* !UNIV_INNOCHECKSUM */ + +#include "mach0data.inl" + +#endif diff --git a/storage/innobase/include/mach0data.inl b/storage/innobase/include/mach0data.inl new file mode 100644 index 00000000..2f970fd2 --- /dev/null +++ b/storage/innobase/include/mach0data.inl @@ -0,0 +1,837 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/mach0data.ic +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef UNIV_INNOCHECKSUM + +#include "mtr0types.h" +#include "ut0byte.h" + +/*******************************************************//** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /*!< in: pointer to byte where to store */ + ulint n) /*!< in: ulint integer to be stored, >= 0, < 256 */ +{ + ut_ad((n & ~0xFFUL) == 0); + + b[0] = (byte) n; +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/*******************************************************//** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /*!< in: pointer to two bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad((n & ~0xFFFFUL) == 0); + + b[0] = (byte)(n >> 8); + b[1] = (byte)(n); +} + +/** The following function is used to fetch data from one byte. +@param[in] b pointer to a byte to read +@return ulint integer, >= 0, < 256 */ +UNIV_INLINE +uint8_t +mach_read_from_1( + const byte* b) +{ + return(uint8_t(*b)); +} + +/** The following function is used to fetch data from 2 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 2 bytes to read +@return 2-byte integer, >= 0, < 64k */ +UNIV_INLINE +uint16_t +mach_read_from_2( + const byte* b) +{ + return(uint16_t(uint16_t(b[0]) << 8 | b[1])); +} + +#ifndef UNIV_INNOCHECKSUM + +/********************************************************//** +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. 
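+The canonical format is simply the big-endian storage format produced by
+mach_write_to_2().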
+@return 16-bit integer in canonical format */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + ulint n) /*!< in: integer in machine-dependent format */ +{ + uint16 ret; + ut_ad(2 == sizeof ret); + mach_write_to_2((byte*) &ret, n); + return(ret); +} +/********************************************************//** +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. +@return integer in machine-dependent format */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + uint16 n) /*!< in: 16-bit integer in canonical format */ +{ + ut_ad(2 == sizeof n); + return(mach_read_from_2((const byte*) &n)); +} + +/*******************************************************//** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /*!< in: pointer to 3 bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad((n & ~0xFFFFFFUL) == 0); + + b[0] = (byte)(n >> 16); + b[1] = (byte)(n >> 8); + b[2] = (byte)(n); +} + +/** The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 3 bytes to read +@return uint32_t integer */ +UNIV_INLINE +uint32_t +mach_read_from_3( + const byte* b) +{ + return( (static_cast(b[0]) << 16) + | (static_cast(b[1]) << 8) + | static_cast(b[2]) + ); +} +#endif /* !UNIV_INNOCHECKSUM */ + +/*******************************************************//** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /*!< in: pointer to four bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + b[0] = (byte)(n >> 24); + b[1] = (byte)(n >> 16); + b[2] = (byte)(n >> 8); + b[3] = (byte) n; +} + +/** The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 4 bytes to read +@return 32 bit integer */ +UNIV_INLINE +uint32_t +mach_read_from_4( + const byte* b) +{ + return( (static_cast(b[0]) << 24) + | (static_cast(b[1]) << 16) + | (static_cast(b[2]) << 8) + | static_cast(b[3]) + ); +} + +#ifndef UNIV_INNOCHECKSUM + +/*********************************************************//** +Writes a ulint in a compressed form where the first byte codes the +length of the stored ulint. We look at the most significant bits of +the byte. If the most significant bit is zero, it means 1-byte storage, +else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0, +it means 3-byte storage, else if 4th is 0, it means 4-byte storage, +else the storage is 5-byte. 
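+For example, n = 1000 (0x3E8) does not fit in 7 bits, so it is stored in
+two bytes as 0x8000 | 0x03E8, that is, as the bytes 0x83 0xE8.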
+@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + byte* b, /*!< in: pointer to memory where to store */ + ulint n) /*!< in: ulint integer (< 2^32) to be stored */ +{ + if (n < 0x80) { + /* 0nnnnnnn (7 bits) */ + mach_write_to_1(b, n); + return(1); + } else if (n < 0x4000) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + mach_write_to_2(b, n | 0x8000); + return(2); + } else if (n < 0x200000) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + mach_write_to_3(b, n | 0xC00000); + return(3); + } else if (n < 0x10000000) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + mach_write_to_4(b, n | 0xE0000000); + return(4); + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + mach_write_to_1(b, 0xF0); + mach_write_to_4(b + 1, n); + return(5); + } +} + +/*********************************************************//** +Returns the size of a ulint when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + ulint n) /*!< in: ulint integer (< 2^32) to be stored */ +{ + if (n < 0x80) { + /* 0nnnnnnn (7 bits) */ + return(1); + } else if (n < 0x4000) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + return(2); + } else if (n < 0x200000) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + return(3); + } else if (n < 0x10000000) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + return(4); + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + return(5); + } +} + +/*********************************************************//** +Reads a ulint in a compressed form. +@return read integer (< 2^32) */ +UNIV_INLINE +ulint +mach_read_compressed( +/*=================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ulint val; + + val = mach_read_from_1(b); + + if (val < 0x80) { + /* 0nnnnnnn (7 bits) */ + } else if (val < 0xC0) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + val = mach_read_from_2(b) & 0x3FFF; + ut_ad(val > 0x7F); + } else if (val < 0xE0) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + val = mach_read_from_3(b) & 0x1FFFFF; + ut_ad(val > 0x3FFF); + } else if (val < 0xF0) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + val = mach_read_from_4(b) & 0xFFFFFFF; + ut_ad(val > 0x1FFFFF); + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + ut_ad(val == 0xF0); + val = mach_read_from_4(b + 1); + ut_ad(val > 0xFFFFFFF); + } + + return(val); +} + +/** Read a 32-bit integer in a compressed form. 
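+This is the streaming variant of mach_read_compressed(): in addition to
+decoding the value, it advances the read pointer past the encoded bytes.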
+@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint32_t +mach_read_next_compressed( + const byte** b) +{ + ulint val = mach_read_from_1(*b); + + if (val < 0x80) { + /* 0nnnnnnn (7 bits) */ + ++*b; + } else if (val < 0xC0) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + val = mach_read_from_2(*b) & 0x3FFF; + ut_ad(val > 0x7F); + *b += 2; + } else if (val < 0xE0) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + val = mach_read_from_3(*b) & 0x1FFFFF; + ut_ad(val > 0x3FFF); + *b += 3; + } else if (val < 0xF0) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + val = mach_read_from_4(*b) & 0xFFFFFFF; + ut_ad(val > 0x1FFFFF); + *b += 4; + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + ut_ad(val == 0xF0); + val = mach_read_from_4(*b + 1); + ut_ad(val > 0xFFFFFFF); + *b += 5; + } + + return(static_cast(val)); +} + +/*******************************************************//** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + void* b, /*!< in: pointer to 8 bytes where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + mach_write_to_4(static_cast(b), (ulint) (n >> 32)); + mach_write_to_4(static_cast(b) + 4, (ulint) n); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/********************************************************//** +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. +@return 64-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_8( +/*=============*/ + const byte* b) /*!< in: pointer to 8 bytes */ +{ + ib_uint64_t u64; + + u64 = mach_read_from_4(b); + u64 <<= 32; + u64 |= mach_read_from_4(b + 4); + + return(u64); +} + +#ifndef UNIV_INNOCHECKSUM + +/*******************************************************//** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /*!< in: pointer to 7 bytes where to store */ + ib_uint64_t n) /*!< in: 56-bit integer */ +{ + mach_write_to_3(b, (ulint) (n >> 32)); + mach_write_to_4(b + 3, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. +@return 56-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_7( +/*=============*/ + const byte* b) /*!< in: pointer to 7 bytes */ +{ + return(ut_ull_create(mach_read_from_3(b), mach_read_from_4(b + 3))); +} + +/*******************************************************//** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /*!< in: pointer to 6 bytes where to store */ + ib_uint64_t n) /*!< in: 48-bit integer */ +{ + mach_write_to_2(b, (ulint) (n >> 32)); + mach_write_to_4(b + 2, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. 
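+The result is assembled with ut_ull_create() from a 2-byte high part and
+a 4-byte low part.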
+@return 48-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_6( +/*=============*/ + const byte* b) /*!< in: pointer to 6 bytes */ +{ + return(ut_ull_create(mach_read_from_2(b), mach_read_from_4(b + 2))); +} + +/*********************************************************//** +Writes a 64-bit integer in a compressed form (5..9 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_compressed( +/*======================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ulint size = mach_write_compressed(b, (ulint) (n >> 32)); + mach_write_to_4(b + size, (ulint) n); + + return(size + 4); +} + +/** Read a 64-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_next_compressed( + const byte** b) +{ + ib_uint64_t val; + + val = mach_read_next_compressed(b); + val <<= 32; + val |= mach_read_from_4(*b); + *b += 4; + return(val); +} + +/*********************************************************//** +Writes a 64-bit integer in a compressed form (1..11 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_much_compressed( +/*===========================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ulint size; + + if (!(n >> 32)) { + return(mach_write_compressed(b, (ulint) n)); + } + + *b = (byte)0xFF; + size = 1 + mach_write_compressed(b + 1, (ulint) (n >> 32)); + + size += mach_write_compressed(b + size, (ulint) n & 0xFFFFFFFF); + + return(size); +} + +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_much_compressed( +/*==========================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ib_uint64_t n; + + if (*b != 0xFF) { + return(mach_read_compressed(b)); + } + + b++; + n = mach_read_next_compressed(&b); + n <<= 32; + n |= mach_read_compressed(b); + + return(n); +} + +/** Read a 64-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint64_t +mach_read_next_much_compressed( + const byte** b) +{ + ib_uint64_t val = mach_read_from_1(*b); + + if (val < 0x80) { + /* 0nnnnnnn (7 bits) */ + ++*b; + } else if (val < 0xC0) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + val = mach_read_from_2(*b) & 0x3FFF; + ut_ad(val > 0x7F); + *b += 2; + } else if (val < 0xE0) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + val = mach_read_from_3(*b) & 0x1FFFFF; + ut_ad(val > 0x3FFF); + *b += 3; + } else if (val < 0xF0) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + val = mach_read_from_4(*b) & 0xFFFFFFF; + ut_ad(val > 0x1FFFFF); + *b += 4; + } else if (val == 0xF0) { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + val = mach_read_from_4(*b + 1); + ut_ad(val > 0xFFFFFFF); + *b += 5; + } else { + /* 11111111 followed by up to 64 bits */ + ut_ad(val == 0xFF); + ++*b; + val = mach_read_next_compressed(b); + ut_ad(val > 0); + val <<= 32; + val |= mach_read_next_compressed(b); + } + + return(val); +} + +/*********************************************************//** +Reads a double. It is stored in a little-endian format. 
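+The bytes are copied one at a time, so b does not have to be aligned.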
+@return double read */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + double d; + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(double) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/*********************************************************//** +Writes a double. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /*!< in: pointer to memory where to write */ + double d) /*!< in: double */ +{ + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(double) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/*********************************************************//** +Reads a float. It is stored in a little-endian format. +@return float read */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + float d; + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(float) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/*********************************************************//** +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /*!< in: pointer to memory where to write */ + float d) /*!< in: float */ +{ + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(float) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + const byte* buf, /*!< in: from where to read */ + ulint buf_size) /*!< in: from how many bytes to read */ +{ + ulint n = 0; + const byte* ptr; + + ut_ad(buf_size > 0); + + ptr = buf + buf_size; + + for (;;) { + ptr--; + + n = n << 8; + + n += (ulint)(*ptr); + + if (ptr == buf) { + break; + } + } + + return(n); +} + +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint dest_size, /*!< in: into how many bytes to write */ + ulint n) /*!< in: unsigned long int to write */ +{ + byte* end; + + ut_ad(dest_size <= sizeof(ulint)); + ut_ad(dest_size > 0); + + end = dest + dest_size; + + for (;;) { + *dest = (byte)(n & 0xFF); + + n = n >> 8; + + dest++; + + if (dest == end) { + break; + } + } + + ut_ad(n == 0); +} + +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + const byte* buf) /*!< in: from where to read */ +{ + return((ulint)(buf[0]) | ((ulint)(buf[1]) << 8)); +} + +/*********************************************************//** +Writes a ulint in the little-endian format. 
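+For example, n = 0x1234 is stored as dest[0] = 0x34, dest[1] = 0x12.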
*/ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint n) /*!< in: unsigned long int to write */ +{ + ut_ad(n < 256 * 256); + + *dest = (byte)(n & 0xFFUL); + + n = n >> 8; + dest++; + + *dest = (byte)(n & 0xFFUL); +} + +/*********************************************************//** +Convert integral type from storage byte order (big endian) to +host byte order. +@return integer value */ +UNIV_INLINE +ib_uint64_t +mach_read_int_type( +/*===============*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + ibool unsigned_type) /*!< in: signed or unsigned flag */ +{ + /* XXX this can be optimized on big-endian machines */ + + uintmax_t ret; + uint i; + + if (unsigned_type || (src[0] & 0x80)) { + + ret = 0x0000000000000000ULL; + } else { + + ret = 0xFFFFFFFFFFFFFF00ULL; + } + + if (unsigned_type) { + + ret |= src[0]; + } else { + + ret |= src[0] ^ 0x80; + } + + for (i = 1; i < len; i++) { + ret <<= 8; + ret |= src[i]; + } + + return(ret); +} +/*********************************************************//** +Swap byte ordering. */ +UNIV_INLINE +void +mach_swap_byte_order( +/*=================*/ + byte* dest, /*!< out: where to write */ + const byte* from, /*!< in: where to read from */ + ulint len) /*!< in: length of src */ +{ + ut_ad(len > 0); + ut_ad(len <= 8); + + dest += len; + + switch (len & 0x7) { + case 0: *--dest = *from++; /* fall through */ + case 7: *--dest = *from++; /* fall through */ + case 6: *--dest = *from++; /* fall through */ + case 5: *--dest = *from++; /* fall through */ + case 4: *--dest = *from++; /* fall through */ + case 3: *--dest = *from++; /* fall through */ + case 2: *--dest = *from++; /* fall through */ + case 1: *--dest = *from; + } +} + +/************************************************************* +Convert a ulonglong integer from host byte order to (big-endian) +storage byte order. */ +UNIV_INLINE +void +mach_write_ulonglong( +/*=================*/ + byte* dest, /*!< in: where to write */ + ulonglong src, /*!< in: where to read from */ + ulint len, /*!< in: length of dest */ + bool usign) /*!< in: signed or unsigned flag */ +{ + byte* ptr = reinterpret_cast(&src); + + ut_ad(len <= sizeof(ulonglong)); + +#ifdef WORDS_BIGENDIAN + memcpy(dest, ptr + (sizeof(src) - len), len); +#else + mach_swap_byte_order(dest, reinterpret_cast(ptr), len); +#endif /* WORDS_BIGENDIAN */ + + if (!usign) { + *dest ^= 0x80; + } +} + +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/mariadb_stats.h b/storage/innobase/include/mariadb_stats.h new file mode 100644 index 00000000..e9051c0c --- /dev/null +++ b/storage/innobase/include/mariadb_stats.h @@ -0,0 +1,119 @@ +/***************************************************************************** + +Copyright (c) 2023, MariaDB Foundation + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef mariadb_stats_h
+#define mariadb_stats_h
+
+/* Include file to handle mariadbd handler specific stats */
+
+#include "ha_handler_stats.h"
+#include "my_rdtsc.h"
+
+/* Threads that are not active point to this structure */
+extern thread_local ha_handler_stats mariadb_dummy_stats;
+
+/* Points to either THD->handler_stats or mariadb_dummy_stats */
+extern thread_local ha_handler_stats *mariadb_stats;
+
+/*
+  Returns true if MariaDB wants engine statistics
+*/
+
+inline bool mariadb_stats_active()
+{
+  return mariadb_stats->active != 0;
+}
+
+inline bool mariadb_stats_active(ha_handler_stats *stats)
+{
+  return stats->active != 0;
+}
+
+/* The following functions increment different engine statistics */
+
+inline void mariadb_increment_pages_accessed()
+{
+  mariadb_stats->pages_accessed++;
+}
+
+inline void mariadb_increment_pages_updated(ulonglong count)
+{
+  mariadb_stats->pages_updated+= count;
+}
+
+inline void mariadb_increment_pages_read()
+{
+  mariadb_stats->pages_read_count++;
+}
+
+inline void mariadb_increment_undo_records_read()
+{
+  mariadb_stats->undo_records_read++;
+}
+
+/*
+  The following must be identical to the code of measure() in
+  sql_analyze_stmt.h
+
+  One should only call this if mariadb_stats_active() is true.
+*/
+
+inline ulonglong mariadb_measure()
+{
+#if (MY_TIMER_ROUTINE_CYCLES)
+  return my_timer_cycles();
+#else
+  return my_timer_microseconds();
+#endif
+}
+
+/*
+  Call this only if start_time != 0
+  See buf0rea.cc for an example of how to use it efficiently
+*/
+
+inline void mariadb_increment_pages_read_time(ulonglong start_time)
+{
+  ha_handler_stats *stats= mariadb_stats;
+  ulonglong end_time= mariadb_measure();
+  /* Check that we only call this if active, see example! */
+  DBUG_ASSERT(start_time);
+  DBUG_ASSERT(mariadb_stats_active(stats));
+
+  stats->pages_read_time+= (end_time - start_time);
+}
+
+
+/*
+  Helper class to set mariadb_stats temporarily for one call in handler.cc
+*/
+
+class mariadb_set_stats
+{
+public:
+  uint flag;
+  mariadb_set_stats(ha_handler_stats *stats)
+  {
+    mariadb_stats= stats ? stats : &mariadb_dummy_stats;
+  }
+  ~mariadb_set_stats()
+  {
+    mariadb_stats= &mariadb_dummy_stats;
+  }
+};
+
+#endif /* mariadb_stats_h */
diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h
new file mode 100644
index 00000000..959147a6
--- /dev/null
+++ b/storage/innobase/include/mem0mem.h
@@ -0,0 +1,345 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mem0mem.h +The memory management + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +#ifndef mem0mem_h +#define mem0mem_h + +#include "ut0mem.h" +#include "ut0rnd.h" +#include "mach0data.h" + +#include + +/* -------------------- MEMORY HEAPS ----------------------------- */ + +/** A block of a memory heap consists of the info structure +followed by an area of memory */ +typedef struct mem_block_info_t mem_block_t; + +/** A memory heap is a nonempty linear list of memory blocks */ +typedef mem_block_t mem_heap_t; + +/** Types of allocation for memory heaps: DYNAMIC means allocation from the +dynamic memory pool of the C compiler, BUFFER means allocation from the +buffer pool; the latter method is used for very big heaps */ + +#define MEM_HEAP_DYNAMIC 0 /* the most common type */ +#define MEM_HEAP_BUFFER 1 +#define MEM_HEAP_BTR_SEARCH 2 /* this flag can optionally be + ORed to MEM_HEAP_BUFFER, in which + case heap->free_block is used in + some cases for memory allocations, + and if it's NULL, the memory + allocation functions can return + NULL. */ + +/** Different type of heaps in terms of which datastructure is using them */ +#define MEM_HEAP_FOR_BTR_SEARCH (MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER) +#define MEM_HEAP_FOR_LOCK_HEAP (MEM_HEAP_BUFFER) + +/** The following start size is used for the first block in the memory heap if +the size is not specified, i.e., 0 is given as the parameter in the call of +create. The standard size is the maximum (payload) size of the blocks used for +allocations of small buffers. */ + +#define MEM_BLOCK_START_SIZE 64 +#define MEM_BLOCK_STANDARD_SIZE \ + (srv_page_size >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF) + +/** If a memory heap is allowed to grow into the buffer pool, the following +is the maximum size for a single allocated buffer: */ +#define MEM_MAX_ALLOC_IN_BUF (srv_page_size - 200 + REDZONE_SIZE) + +/** Space needed when allocating for a user a field of length N. +The space is allocated only in multiples of UNIV_MEM_ALIGNMENT. */ +#define MEM_SPACE_NEEDED(N) UT_CALC_ALIGN((N), UNIV_MEM_ALIGNMENT) + +#ifdef UNIV_DEBUG +/** Macro for memory heap creation. +@param[in] size Desired start block size. */ +# define mem_heap_create(size) \ + mem_heap_create_func((size), __FILE__, __LINE__, MEM_HEAP_DYNAMIC) + +/** Macro for memory heap creation. +@param[in] size Desired start block size. +@param[in] type Heap type */ +# define mem_heap_create_typed(size, type) \ + mem_heap_create_func((size), __FILE__, __LINE__, (type)) + +#else /* UNIV_DEBUG */ +/** Macro for memory heap creation. +@param[in] size Desired start block size. */ +# define mem_heap_create(size) mem_heap_create_func((size), MEM_HEAP_DYNAMIC) + +/** Macro for memory heap creation. +@param[in] size Desired start block size. +@param[in] type Heap type */ +# define mem_heap_create_typed(size, type) \ + mem_heap_create_func((size), (type)) + +#endif /* UNIV_DEBUG */ + +/** Creates a memory heap. +NOTE: Use the corresponding macros instead of this function. +A single user buffer of 'size' will fit in the block. +0 creates a default size block. +@param[in] size Desired start block size. 
+@param[in] file_name File name where created +@param[in] line Line where created +@param[in] type Heap type +@return own: memory heap, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +mem_heap_t* +mem_heap_create_func( + ulint size, +#ifdef UNIV_DEBUG + const char* file_name, + unsigned line, +#endif /* UNIV_DEBUG */ + ulint type); + +/** Frees the space occupied by a memory heap. +NOTE: Use the corresponding macro instead of this function. +@param[in] heap Heap to be freed */ +UNIV_INLINE +void +mem_heap_free( + mem_heap_t* heap); + +/** Allocates and zero-fills n bytes of memory from a memory heap. +@param[in] heap memory heap +@param[in] n number of bytes; if the heap is allowed to grow into +the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF +@return allocated, zero-filled storage */ +UNIV_INLINE +void* +mem_heap_zalloc( + mem_heap_t* heap, + ulint n); + +/** Allocates n bytes of memory from a memory heap. +@param[in] heap memory heap +@param[in] n number of bytes; if the heap is allowed to grow into +the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF +@return allocated storage, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +void* +mem_heap_alloc( + mem_heap_t* heap, + ulint n); + +/** Returns a pointer to the heap top. +@param[in] heap memory heap +@return pointer to the heap top */ +UNIV_INLINE +byte* +mem_heap_get_heap_top( + mem_heap_t* heap); + +/** Frees the space in a memory heap exceeding the pointer given. +The pointer must have been acquired from mem_heap_get_heap_top. +The first memory block of the heap is not freed. +@param[in] heap heap from which to free +@param[in] old_top pointer to old top of heap */ +UNIV_INLINE +void +mem_heap_free_heap_top( + mem_heap_t* heap, + byte* old_top); + +/** Empties a memory heap. +The first memory block of the heap is not freed. +@param[in] heap heap to empty */ +UNIV_INLINE +void +mem_heap_empty( + mem_heap_t* heap); + +/** Returns a pointer to the topmost element in a memory heap. +The size of the element must be given. +@param[in] heap memory heap +@param[in] n size of the topmost element +@return pointer to the topmost element */ +UNIV_INLINE +void* +mem_heap_get_top( + mem_heap_t* heap, + ulint n); + +/*****************************************************************//** +Frees the topmost element in a memory heap. +The size of the element must be given. */ +UNIV_INLINE +void +mem_heap_free_top( +/*==============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: size of the topmost element */ +/*****************************************************************//** +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap); /*!< in: heap */ + +/**********************************************************************//** +Duplicates a NUL-terminated string. +@return own: a copy of the string, must be deallocated with ut_free */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + const char* str); /*!< in: string to be copied */ +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string. +@return own: a copy of the string, must be deallocated with ut_free */ +UNIV_INLINE +char* +mem_strdupl( +/*========*/ + const char* str, /*!< in: string to be copied */ + ulint len); /*!< in: length of str, in bytes */ + +/** Duplicate a block of data, allocated from a memory heap. 
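+If data is NULL (which is only allowed when len is 0), nothing is
+allocated and NULL is returned.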
+@param[in] heap memory heap where string is allocated +@param[in] data block of data to be copied +@param[in] len length of data, in bytes +@return own: a copy of data */ +inline +void* +mem_heap_dup(mem_heap_t* heap, const void* data, size_t len) +{ + ut_ad(data || !len); + return UNIV_LIKELY(data != NULL) + ? memcpy(mem_heap_alloc(heap, len), data, len) + : NULL; +} + +/** Duplicate a NUL-terminated string, allocated from a memory heap. +@param[in] heap memory heap where string is allocated +@param[in] str string to be copied +@return own: a copy of the string */ +inline +char* +mem_heap_strdup(mem_heap_t* heap, const char* str) +{ + return(static_cast(mem_heap_dup(heap, str, strlen(str) + 1))); +} + +/** Duplicate a string, allocated from a memory heap. +@param[in] heap memory heap where string is allocated +@param[in] str string to be copied +@param[in] len length of str, in bytes +@return own: a NUL-terminated copy of str */ +inline +char* +mem_heap_strdupl(mem_heap_t* heap, const char* str, size_t len) +{ + char* s = static_cast(mem_heap_alloc(heap, len + 1)); + s[len] = 0; + return(static_cast(memcpy(s, str, len))); +} + +/**********************************************************************//** +Concatenate two strings and return the result, using a memory heap. +@return own: the result */ +char* +mem_heap_strcat( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* s1, /*!< in: string 1 */ + const char* s2); /*!< in: string 2 */ + +/****************************************************************//** +A simple sprintf replacement that dynamically allocates the space for the +formatted string from the given heap. This supports a very limited set of +the printf syntax: types 's' and 'u' and length modifier 'l' (which is +required for the 'u' type). +@return heap-allocated formatted string */ +char* +mem_heap_printf( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap */ + const char* format, /*!< in: format string */ + ...) MY_ATTRIBUTE ((format (printf, 2, 3))); + +#ifdef UNIV_DEBUG +/** Validates the contents of a memory heap. +Asserts that the memory heap is consistent +@param[in] heap Memory heap to validate */ +void +mem_heap_validate( + const mem_heap_t* heap); + +#endif /* UNIV_DEBUG */ + +/*#######################################################################*/ + +/** The info structure stored at the beginning of a heap block */ +struct mem_block_info_t { +#ifdef UNIV_DEBUG + char file_name[8];/* file name where the mem heap was created */ + unsigned line; /*!< line number where the mem heap was created */ +#endif /* UNIV_DEBUG */ + UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in the + the list this is the base node of the list of blocks; + in subsequent blocks this is undefined */ + UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next + and prev in the list. The first block allocated + to the heap is also the first block in this list, + though it also contains the base node of the list. */ + ulint len; /*!< physical length of this block in bytes */ + ulint total_size; /*!< physical length in bytes of all blocks + in the heap. This is defined only in the base + node and is set to ULINT_UNDEFINED in others. 
*/ + ulint type; /*!< type of heap: MEM_HEAP_DYNAMIC, or + MEM_HEAP_BUF possibly ORed to MEM_HEAP_BTR_SEARCH */ + ulint free; /*!< offset in bytes of the first free position for + user data in the block */ + ulint start; /*!< the value of the struct field 'free' at the + creation of the block */ + + void* free_block; + /* if the MEM_HEAP_BTR_SEARCH bit is set in type, + and this is the heap root, this can contain an + allocated buffer frame, which can be appended as a + free block to the heap, if we need more space; + otherwise, this is NULL */ + void* buf_block; + /* if this block has been allocated from the buffer + pool, this contains the buf_block_t handle; + otherwise, this is NULL */ +}; + +/* Header size for a memory heap block */ +#define MEM_BLOCK_HEADER_SIZE UT_CALC_ALIGN(sizeof(mem_block_info_t),\ + UNIV_MEM_ALIGNMENT) + +#include "mem0mem.inl" +#endif diff --git a/storage/innobase/include/mem0mem.inl b/storage/innobase/include/mem0mem.inl new file mode 100644 index 00000000..9906daf3 --- /dev/null +++ b/storage/innobase/include/mem0mem.inl @@ -0,0 +1,468 @@ +/***************************************************************************** + +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/mem0mem.ic +The memory management + +Created 6/8/1994 Heikki Tuuri +*************************************************************************/ + +#include "ut0new.h" + +#ifdef UNIV_DEBUG +# define mem_heap_create_block(heap, n, type, file_name, line) \ + mem_heap_create_block_func(heap, n, file_name, line, type) +# define mem_heap_create_at(N, file_name, line) \ + mem_heap_create_func(N, file_name, line, MEM_HEAP_DYNAMIC) +#else /* UNIV_DEBUG */ +# define mem_heap_create_block(heap, n, type, file_name, line) \ + mem_heap_create_block_func(heap, n, type) +# define mem_heap_create_at(N, file_name, line) \ + mem_heap_create_func(N, MEM_HEAP_DYNAMIC) +#endif /* UNIV_DEBUG */ +/***************************************************************//** +Creates a memory heap block where data can be allocated. 
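+Depending on the heap type, the block is allocated either from the C heap
+(MEM_HEAP_DYNAMIC) or from the buffer pool (MEM_HEAP_BUFFER).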
+@return own: memory heap block, NULL if did not succeed (only possible +for MEM_HEAP_BTR_SEARCH type heaps) */ +mem_block_t* +mem_heap_create_block_func( +/*=======================*/ + mem_heap_t* heap, /*!< in: memory heap or NULL if first block + should be created */ + ulint n, /*!< in: number of bytes needed for user data */ +#ifdef UNIV_DEBUG + const char* file_name,/*!< in: file name where created */ + unsigned line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint type); /*!< in: type of heap: MEM_HEAP_DYNAMIC or + MEM_HEAP_BUFFER */ + +/******************************************************************//** +Frees a block from a memory heap. */ +void +mem_heap_block_free( +/*================*/ + mem_heap_t* heap, /*!< in: heap */ + mem_block_t* block); /*!< in: block to free */ + +/******************************************************************//** +Frees the free_block field from a memory heap. */ +void +mem_heap_free_block_free( +/*=====================*/ + mem_heap_t* heap); /*!< in: heap */ + +/***************************************************************//** +Adds a new block to a memory heap. +@param[in] heap memory heap +@param[in] n number of bytes needed +@return created block, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +mem_block_t* +mem_heap_add_block( + mem_heap_t* heap, + ulint n); + +UNIV_INLINE +void +mem_block_set_len(mem_block_t* block, ulint len) +{ + ut_ad(len > 0); + + block->len = len; +} + +UNIV_INLINE +ulint +mem_block_get_len(mem_block_t* block) +{ + return(block->len); +} + +UNIV_INLINE +void +mem_block_set_type(mem_block_t* block, ulint type) +{ + ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER) + || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH)); + + block->type = type; +} + +UNIV_INLINE +ulint +mem_block_get_type(mem_block_t* block) +{ + return(block->type); +} + +UNIV_INLINE +void +mem_block_set_free(mem_block_t* block, ulint free) +{ + ut_ad(free > 0); + ut_ad(free <= mem_block_get_len(block)); + + block->free = free; +} + +UNIV_INLINE +ulint +mem_block_get_free(mem_block_t* block) +{ + return(block->free); +} + +UNIV_INLINE +void +mem_block_set_start(mem_block_t* block, ulint start) +{ + ut_ad(start > 0); + + block->start = start; +} + +UNIV_INLINE +ulint +mem_block_get_start(mem_block_t* block) +{ + return(block->start); +} + +/** Allocates and zero-fills n bytes of memory from a memory heap. +@param[in] heap memory heap +@param[in] n number of bytes; if the heap is allowed to grow into +the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF +@return allocated, zero-filled storage */ +UNIV_INLINE +void* +mem_heap_zalloc( + mem_heap_t* heap, + ulint n) +{ + ut_ad(heap); + ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH)); + return(memset(mem_heap_alloc(heap, n), 0, n)); +} + +/** Allocates n bytes of memory from a memory heap. +@param[in] heap memory heap +@param[in] n number of bytes; if the heap is allowed to grow into +the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF +@return allocated storage, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +void* +mem_heap_alloc( + mem_heap_t* heap, + ulint n) +{ + mem_block_t* block; + byte* buf; + ulint free; + + block = UT_LIST_GET_LAST(heap->base); + + n += REDZONE_SIZE; + + ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF)); + + /* Check if there is enough space in block. 
If not, create a new + block to the heap */ + + if (mem_block_get_len(block) + < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) { + + block = mem_heap_add_block(heap, n); + + if (block == NULL) { + + return(NULL); + } + } + + free = mem_block_get_free(block); + + buf = (byte*) block + free; + + mem_block_set_free(block, free + MEM_SPACE_NEEDED(n)); + + buf = buf + REDZONE_SIZE; + MEM_MAKE_ADDRESSABLE(buf, n - REDZONE_SIZE); + return(buf); +} + +/** Returns a pointer to the heap top. +@param[in] heap memory heap +@return pointer to the heap top */ +UNIV_INLINE +byte* +mem_heap_get_heap_top( + mem_heap_t* heap) +{ + mem_block_t* block; + byte* buf; + + block = UT_LIST_GET_LAST(heap->base); + + buf = (byte*) block + mem_block_get_free(block); + + return(buf); +} + +/** Frees the space in a memory heap exceeding the pointer given. +The pointer must have been acquired from mem_heap_get_heap_top. +The first memory block of the heap is not freed. +@param[in] heap heap from which to free +@param[in] old_top pointer to old top of heap */ +UNIV_INLINE +void +mem_heap_free_heap_top( + mem_heap_t* heap, + byte* old_top) +{ + mem_block_t* block; + mem_block_t* prev_block; + + ut_d(mem_heap_validate(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + while (block != NULL) { + if (((byte*) block + mem_block_get_free(block) >= old_top) + && ((byte*) block <= old_top)) { + /* Found the right block */ + + break; + } + + /* Store prev_block value before freeing the current block + (the current block will be erased in freeing) */ + + prev_block = UT_LIST_GET_PREV(list, block); + + mem_heap_block_free(heap, block); + + block = prev_block; + } + + ut_ad(block); + + /* Set the free field of block */ + mem_block_set_free(block, + ulint(old_top - reinterpret_cast(block))); + + ut_ad(mem_block_get_start(block) <= mem_block_get_free(block)); + MEM_NOACCESS(old_top, (byte*) block + block->len - old_top); + + /* If free == start, we may free the block if it is not the first + one */ + + if ((heap != block) && (mem_block_get_free(block) + == mem_block_get_start(block))) { + mem_heap_block_free(heap, block); + } +} + +/** Empties a memory heap. +The first memory block of the heap is not freed. +@param[in] heap heap to empty */ +UNIV_INLINE +void +mem_heap_empty( + mem_heap_t* heap) +{ + mem_heap_free_heap_top(heap, (byte*) heap + mem_block_get_start(heap)); + + if (heap->free_block) { + mem_heap_free_block_free(heap); + } +} + +/** Returns a pointer to the topmost element in a memory heap. +The size of the element must be given. +@param[in] heap memory heap +@param[in] n size of the topmost element +@return pointer to the topmost element */ +UNIV_INLINE +void* +mem_heap_get_top( + mem_heap_t* heap, + ulint n) +{ + mem_block_t* block; + byte* buf; + + block = UT_LIST_GET_LAST(heap->base); + + buf = (byte*) block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n); + + return((void*) buf); +} + +/*****************************************************************//** +Frees the topmost element in a memory heap. The size of the element must be +given. 
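+The free offset of the last block is simply moved back by
+MEM_SPACE_NEEDED(n), so n must match the size used in the most recent
+allocation from this heap.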
*/ +UNIV_INLINE +void +mem_heap_free_top( +/*==============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n) /*!< in: size of the topmost element */ +{ + mem_block_t* block; + + n += REDZONE_SIZE; + + block = UT_LIST_GET_LAST(heap->base); + + /* Subtract the free field of block */ + mem_block_set_free(block, mem_block_get_free(block) + - MEM_SPACE_NEEDED(n)); + + /* If free == start, we may free the block if it is not the first + one */ + + if ((heap != block) && (mem_block_get_free(block) + == mem_block_get_start(block))) { + mem_heap_block_free(heap, block); + } else { + MEM_NOACCESS((byte*) block + mem_block_get_free(block), n); + } +} + +/** Creates a memory heap. +NOTE: Use the corresponding macros instead of this function. +A single user buffer of 'size' will fit in the block. +0 creates a default size block. +@param[in] size Desired start block size. +@param[in] file_name File name where created +@param[in] line Line where created +@param[in] type Heap type +@return own: memory heap, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +mem_heap_t* +mem_heap_create_func( + ulint size, +#ifdef UNIV_DEBUG + const char* file_name, + unsigned line, +#endif /* UNIV_DEBUG */ + ulint type) +{ + mem_block_t* block; + + if (!size) { + size = MEM_BLOCK_START_SIZE; + } + + block = mem_heap_create_block(NULL, size, type, file_name, line); + + if (block == NULL) { + + return(NULL); + } + + /* The first block should not be in buffer pool, + because it might be relocated to resize buffer pool. */ + ut_ad(block->buf_block == NULL); + + UT_LIST_INIT(block->base, &mem_block_t::list); + + /* Add the created block itself as the first block in the list */ + UT_LIST_ADD_FIRST(block->base, block); + + return(block); +} + +/** Frees the space occupied by a memory heap. +NOTE: Use the corresponding macro instead of this function. +@param[in] heap Heap to be freed */ +UNIV_INLINE +void +mem_heap_free( + mem_heap_t* heap) +{ + mem_block_t* block; + mem_block_t* prev_block; + + block = UT_LIST_GET_LAST(heap->base); + + if (heap->free_block) { + mem_heap_free_block_free(heap); + } + + while (block != NULL) { + /* Store the contents of info before freeing current block + (it is erased in freeing) */ + + prev_block = UT_LIST_GET_PREV(list, block); + + mem_heap_block_free(heap, block); + + block = prev_block; + } +} + +/*****************************************************************//** +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint size = heap->total_size; + + if (heap->free_block) { + size += srv_page_size; + } + + return(size); +} + +/**********************************************************************//** +Duplicates a NUL-terminated string. +@return own: a copy of the string, must be deallocated with ut_free */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + const char* str) /*!< in: string to be copied */ +{ + ulint len = strlen(str) + 1; + return(static_cast(memcpy(ut_malloc_nokey(len), str, len))); +} + +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string. 
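+len bytes are copied and a terminating NUL byte is appended.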
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+	const char*	str,	/*!< in: string to be copied */
+	ulint		len)	/*!< in: length of str, in bytes */
+{
+	char*	s = static_cast<char*>(ut_malloc_nokey(len + 1));
+	s[len] = 0;
+	return(static_cast<char*>(memcpy(s, str, len)));
+}
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
new file mode 100644
index 00000000..e2419309
--- /dev/null
+++ b/storage/innobase/include/mtr0log.h
@@ -0,0 +1,637 @@
+/*****************************************************************************
+
+Copyright (c) 2019, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+@file include/mtr0log.h
+Mini-transaction log record encoding and decoding
+*******************************************************/
+
+#pragma once
+#include "mtr0mtr.h"
+
+/** The smallest invalid page identifier for persistent tablespaces */
+constexpr page_id_t end_page_id{SRV_SPACE_ID_UPPER_BOUND, 0};
+
+/** The minimum 2-byte integer (0b10xxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_2BYTE= 1 << 7;
+/** The minimum 3-byte integer (0b110xxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_3BYTE= MIN_2BYTE + (1 << 14);
+/** The minimum 4-byte integer (0b1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_4BYTE= MIN_3BYTE + (1 << 21);
+/** Minimum 5-byte integer (0b11110000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_5BYTE= MIN_4BYTE + (1 << 28);
+
+/** Error from mlog_decode_varint() */
+constexpr uint32_t MLOG_DECODE_ERROR= ~0U;
+
+/** Decode the length of a variable-length encoded integer.
+@param first  first byte of the encoded integer
+@return the length, in bytes */
+inline uint8_t mlog_decode_varint_length(byte first)
+{
+  uint8_t len= 1;
+  for (; first & 0x80; len++, first= static_cast<byte>(first << 1));
+  return len;
+}
+
+/** Decode an integer in a redo log record.
+@param log    redo log record buffer
+@return the decoded integer
+@retval MLOG_DECODE_ERROR on error */
+template<typename byte_pointer>
+inline uint32_t mlog_decode_varint(const byte_pointer log)
+{
+  uint32_t i= *log;
+  if (i < MIN_2BYTE)
+    return i;
+  if (i < 0xc0)
+    return MIN_2BYTE + ((i & ~0x80) << 8 | log[1]);
+  if (i < 0xe0)
+    return MIN_3BYTE + ((i & ~0xc0) << 16 | uint32_t{log[1]} << 8 | log[2]);
+  if (i < 0xf0)
+    return MIN_4BYTE + ((i & ~0xe0) << 24 | uint32_t{log[1]} << 16 |
+                        uint32_t{log[2]} << 8 | log[3]);
+  if (i == 0xf0)
+  {
+    i= uint32_t{log[1]} << 24 | uint32_t{log[2]} << 16 |
+      uint32_t{log[3]} << 8 | log[4];
+    if (i <= ~MIN_5BYTE)
+      return MIN_5BYTE + i;
+  }
+  return MLOG_DECODE_ERROR;
+}
+
+/** Encode an integer in a redo log record.
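+This is the inverse of mlog_decode_varint(): values below MIN_2BYTE take
+a single byte, and each larger size class adds one more byte, up to five
+bytes in total.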
+@param log  redo log record buffer
+@param i    the integer to encode
+@return end of the encoded integer */
+inline byte *mlog_encode_varint(byte *log, size_t i)
+{
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+  if (i < MIN_2BYTE)
+  {
+  }
+  else if (i < MIN_3BYTE)
+  {
+    i-= MIN_2BYTE;
+    static_assert(MIN_3BYTE - MIN_2BYTE == 1 << 14, "compatibility");
+    *log++= 0x80 | static_cast<byte>(i >> 8);
+  }
+  else if (i < MIN_4BYTE)
+  {
+    i-= MIN_3BYTE;
+    static_assert(MIN_4BYTE - MIN_3BYTE == 1 << 21, "compatibility");
+    *log++= 0xc0 | static_cast<byte>(i >> 16);
+    goto last2;
+  }
+  else if (i < MIN_5BYTE)
+  {
+    i-= MIN_4BYTE;
+    static_assert(MIN_5BYTE - MIN_4BYTE == 1 << 28, "compatibility");
+    *log++= 0xe0 | static_cast<byte>(i >> 24);
+    goto last3;
+  }
+  else
+  {
+    ut_ad(i < MLOG_DECODE_ERROR);
+    i-= MIN_5BYTE;
+    *log++= 0xf0;
+    *log++= static_cast<byte>(i >> 24);
+last3:
+    *log++= static_cast<byte>(i >> 16);
+last2:
+    *log++= static_cast<byte>(i >> 8);
+  }
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+  *log++= static_cast<byte>(i);
+  return log;
+}
+
+/** Determine the length of a log record.
+@param log  start of log record
+@param end  end of the log record buffer
+@return the length of the record, in bytes
+@retval 0 if the log extends past the end
+@retval MLOG_DECODE_ERROR if the record is corrupted */
+inline uint32_t mlog_decode_len(const byte *log, const byte *end)
+{
+  ut_ad(log < end);
+  uint32_t i= *log;
+  if (!i)
+    return 0; /* end of mini-transaction */
+  if (~i & 15)
+    return (i & 15) + 1; /* 1..16 bytes */
+  if (UNIV_UNLIKELY(++log == end))
+    return 0; /* end of buffer */
+  i= *log;
+  if (UNIV_LIKELY(i < MIN_2BYTE)) /* 1 additional length byte: 16..143 bytes */
+    return 16 + i;
+  if (i < 0xc0) /* 2 additional length bytes: 144..16,527 bytes */
+  {
+    if (UNIV_UNLIKELY(log + 1 == end))
+      return 0; /* end of buffer */
+    return 16 + MIN_2BYTE + ((i & ~0xc0) << 8 | log[1]);
+  }
+  if (i < 0xe0) /* 3 additional length bytes: 16528..1065103 bytes */
+  {
+    if (UNIV_UNLIKELY(log + 2 == end))
+      return 0; /* end of buffer */
+    return 16 + MIN_3BYTE + ((i & ~0xe0) << 16 |
+                             static_cast<uint32_t>(log[1]) << 8 | log[2]);
+  }
+  /* 1,065,103 bytes per log record ought to be enough for everyone */
+  return MLOG_DECODE_ERROR;
+}
+
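
To make the 1-to-5-byte encoding concrete, here is a round-trip check at the encoding-class boundaries of mlog_encode_varint() and mlog_decode_varint(); a sketch that assumes this header and its InnoDB dependencies are on the include path:

    #include "mtr0log.h"
    #include <cassert>

    static void varint_roundtrip_demo()
    {
      /* one representative per encoding length, 1 to 5 bytes */
      const uint32_t samples[]= {0, MIN_2BYTE - 1, MIN_2BYTE, MIN_3BYTE,
                                 MIN_4BYTE, MIN_5BYTE, MLOG_DECODE_ERROR - 1};
      for (uint32_t v : samples)
      {
        byte buf[5];
        byte *end= mlog_encode_varint(buf, v);
        /* the first byte alone determines the encoded length */
        assert(end - buf == mlog_decode_varint_length(buf[0]));
        assert(mlog_decode_varint(buf) == v);
      }
    }
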
+/** Write 1, 2, 4, or 8 bytes to a file page.
+@param[in]      block   file page
+@param[in,out]  ptr     pointer in file page
+@param[in]      val     value to write
+@tparam l       number of bytes to write
+@tparam w       write request type
+@tparam V       type of val
+@return whether any log was written */
+template<unsigned l,mtr_t::write_type w,typename V>
+inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
+{
+  ut_ad(ut_align_down(ptr, srv_page_size) == block.page.frame);
+  static_assert(l == 1 || l == 2 || l == 4 || l == 8, "wrong length");
+  byte buf[l];
+
+  switch (l) {
+  case 1:
+    ut_ad(val == static_cast<byte>(val));
+    buf[0]= static_cast<byte>(val);
+    break;
+  case 2:
+    ut_ad(val == static_cast<uint16_t>(val));
+    mach_write_to_2(buf, static_cast<uint16_t>(val));
+    break;
+  case 4:
+    ut_ad(val == static_cast<uint32_t>(val));
+    mach_write_to_4(buf, static_cast<uint32_t>(val));
+    break;
+  case 8:
+    mach_write_to_8(buf, val);
+    break;
+  }
+  byte *p= static_cast<byte*>(ptr);
+  const byte *const end= p + l;
+  if (w != FORCED && is_logged())
+  {
+    const byte *b= buf;
+    while (*p++ == *b++)
+    {
+      if (p == end)
+      {
+        ut_ad(w == MAYBE_NOP);
+        return false;
+      }
+    }
+    p--;
+  }
+  ::memcpy(ptr, buf, l);
+  memcpy_low(block, static_cast<uint16_t>
+             (ut_align_offset(p, srv_page_size)), p, end - p);
+  return true;
+}
+
+/** Log an initialization of a string of bytes.
+@param[in]      b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write
+@param[in]      val     the data byte to write */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val)
+{
+  ut_ad(len);
+  set_modified(b);
+  if (!is_logged())
+    return;
+
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  size_t lenlen= (len < MIN_2BYTE ? 1 + 1 : len < MIN_3BYTE ? 2 + 1 : 3 + 1);
+  byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen, true, ofs);
+  l= mlog_encode_varint(l, len);
+  *l++= val;
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Initialize a string of bytes.
+@param[in,out]  b       buffer page
+@param[in]      ofs     byte offset from block->frame
+@param[in]      len     length of the data to write
+@param[in]      val     the data byte to write */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, ulint len, byte val)
+{
+  ut_ad(ofs <= ulint(srv_page_size));
+  ut_ad(ofs + len <= ulint(srv_page_size));
+  ::memset(ofs + b->page.frame, val, len);
+  memset(*b, ofs, len, val);
+}
+
+/** Log an initialization of a repeating string of bytes.
+@param[in]      b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write, in bytes
+@param[in]      str     the string to write
+@param[in]      size    size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len,
+                          const void *str, size_t size)
+{
+  ut_ad(size);
+  ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+  set_modified(b);
+  if (!is_logged())
+    return;
+
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+  byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen + size, true, ofs);
+  l= mlog_encode_varint(l, len);
+  ::memcpy(l, str, size);
+  l+= size;
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
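
The MAYBE_NOP branch above skips the longest matching prefix so that an unchanged page write is not logged at all. The same idea in standalone form (plain C++, names invented for illustration; note that mtr_t::write() trims only the prefix, while this sketch also trims the suffix):

    #include <cassert>
    #include <cstddef>
    #include <utility>

    /* Return the half-open byte range of dst that changes when src
       is written over it; an empty range means the write is a no-op. */
    static std::pair<std::size_t, std::size_t>
    changed_span(const unsigned char *dst, const unsigned char *src,
                 std::size_t len)
    {
      std::size_t begin= 0;
      while (begin < len && dst[begin] == src[begin])
        begin++;
      if (begin == len)
        return {0, 0};            /* identical: nothing to write or log */
      std::size_t end= len;
      while (end > begin + 1 && dst[end - 1] == src[end - 1])
        end--;
      return {begin, end};
    }

    int main()
    {
      const unsigned char dst[]= {1, 2, 3, 4, 5};
      const unsigned char src[]= {1, 2, 9, 4, 5};
      const auto span= changed_span(dst, src, 5);
      assert(span.first == 2 && span.second == 3); /* only byte 2 differs */
    }
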
+/** Initialize a repeating string of bytes.
+@param[in,out]  b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write, in bytes
+@param[in]      str     the string to write
+@param[in]      size    size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, size_t len,
+                          const void *str, size_t size)
+{
+  ut_ad(ofs <= ulint(srv_page_size));
+  ut_ad(ofs + len <= ulint(srv_page_size));
+  ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+  size_t s= 0;
+  while (s < len - size)
+  {
+    ::memcpy(ofs + s + b->page.frame, str, size);
+    s+= size;
+  }
+  ::memcpy(ofs + s + b->page.frame, str, len - s);
+  memset(*b, ofs, len, str, size);
+}
+
+/** Log a write of a byte string to a page.
+@param[in]      b       buffer page
+@param[in]      offset  byte offset from b->frame
+@param[in]      len     length of the data to write */
+inline void mtr_t::memcpy(const buf_block_t &b, ulint offset, ulint len)
+{
+  ut_ad(len);
+  ut_ad(offset <= ulint(srv_page_size));
+  ut_ad(offset + len <= ulint(srv_page_size));
+  memcpy_low(b, uint16_t(offset), &b.page.frame[offset], len);
+}
+
+/** Log a write of a byte string to a page.
+@param block   page
+@param offset  byte offset within page
+@param data    data to be written
+@param len     length of the data, in bytes */
+inline void mtr_t::memcpy_low(const buf_block_t &block, uint16_t offset,
+                              const void *data, size_t len)
+{
+  ut_ad(len);
+  set_modified(block);
+  if (!is_logged())
+    return;
+  if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5))
+  {
+    byte *end= log_write<WRITE>(block.page.id(), &block.page, len, true,
+                                offset);
+    ::memcpy(end, data, len);
+    m_log.close(end + len);
+  }
+  else
+  {
+    m_log.close(log_write<WRITE>(block.page.id(), &block.page, len, false,
+                                 offset));
+    m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+  }
+  m_last_offset= static_cast<uint16_t>(offset + len);
+}
+
+/** Log that a string of bytes was copied from the same page.
+@param[in]      b       buffer page
+@param[in]      d       destination offset within the page
+@param[in]      s       source offset within the page
+@param[in]      len     length of the data to copy */
+inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len)
+{
+  ut_ad(d >= 8);
+  ut_ad(s >= 8);
+  ut_ad(len);
+  ut_ad(s <= ulint(srv_page_size));
+  ut_ad(s + len <= ulint(srv_page_size));
+  ut_ad(s != d);
+  ut_ad(d <= ulint(srv_page_size));
+  ut_ad(d + len <= ulint(srv_page_size));
+
+  set_modified(b);
+  if (!is_logged())
+    return;
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+  /* The source offset is encoded relative to the destination offset,
+  with the sign in the least significant bit. */
+  if (s > d)
+    s= (s - d) << 1;
+  else
+    s= (d - s) << 1 | 1;
+  /* The source offset 0 is not possible. */
+  s-= 1 << 1;
+  size_t slen= (s < MIN_2BYTE ? 1 : s < MIN_3BYTE ? 2 : 3);
+  byte *l= log_write<MEMMOVE>(b.page.id(), &b.page, lenlen + slen, true, d);
+  l= mlog_encode_varint(l, len);
+  l= mlog_encode_varint(l, s);
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(d + len);
+}
+
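
The relative source-offset encoding used by memmove() above (the sign in the least significant bit, with offset 0 excluded so the code space stays dense) round-trips as follows; a minimal standalone sketch, not InnoDB code:

    #include <cassert>

    /* +x is encoded as (x-1)<<1, -x as (x-1)<<1 | 1 */
    static unsigned long encode_rel(long delta)
    {
      assert(delta != 0);
      return delta > 0 ? (static_cast<unsigned long>(delta) - 1) << 1
                       : ((static_cast<unsigned long>(-delta) - 1) << 1 | 1);
    }

    static long decode_rel(unsigned long enc)
    {
      const long x= static_cast<long>(enc >> 1) + 1;
      return (enc & 1) ? -x : x;
    }

    int main()
    {
      const long deltas[]= {1, -1, 2, -2, 37, -512};
      for (long delta : deltas)
        assert(decode_rel(encode_rel(delta)) == delta);
    }
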
+/**
+Write a log record.
+@tparam type   redo log record type
+@param id      persistent page identifier
+@param bpage   buffer pool page, or nullptr
+@param len     number of additional bytes to write
+@param alloc   whether to allocate the additional bytes
+@param offset  byte offset, or 0 if the record type does not allow one
+@return end of mini-transaction log, minus len */
+template<byte type>
+inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
+                              size_t len, bool alloc, size_t offset)
+{
+  static_assert(!(type & 15) && type != RESERVED &&
+                type <= FILE_CHECKPOINT, "invalid type");
+  ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
+  ut_ad(!bpage || bpage->id() == id);
+  ut_ad(id < end_page_id);
+  constexpr bool have_len= type != INIT_PAGE && type != FREE_PAGE;
+  constexpr bool have_offset= type == WRITE || type == MEMSET ||
+    type == MEMMOVE;
+  static_assert(!have_offset || have_len, "consistency");
+  ut_ad(have_len || len == 0);
+  ut_ad(have_len || !alloc);
+  ut_ad(have_offset || offset == 0);
+  ut_ad(offset + len <= srv_page_size);
+  static_assert(MIN_4BYTE >= UNIV_PAGE_SIZE_MAX, "consistency");
+  ut_ad(type == FREE_PAGE || type == OPTION || (type == EXTENDED && !bpage) ||
+        memo_contains_flagged(bpage, MTR_MEMO_MODIFY));
+  size_t max_len;
+  if (!have_len)
+    max_len= 1 + 5 + 5;
+  else if (!have_offset)
+    max_len= bpage && m_last == bpage
+      ? 1 + 3
+      : 1 + 3 + 5 + 5;
+  else if (bpage && m_last == bpage && m_last_offset <= offset)
+  {
+    /* Encode the offset relative from m_last_offset. */
+    offset-= m_last_offset;
+    max_len= 1 + 3 + 3;
+  }
+  else
+    max_len= 1 + 3 + 5 + 5 + 3;
+  byte *const log_ptr= m_log.open(alloc ? max_len + len : max_len);
+  byte *end= log_ptr + 1;
+  const byte same_page= max_len < 1 + 5 + 5 ? 0x80 : 0;
+  if (!same_page)
+  {
+    end= mlog_encode_varint(end, id.space());
+    end= mlog_encode_varint(end, id.page_no());
+    m_last= bpage;
+  }
+  if (have_offset)
+  {
+    byte* oend= mlog_encode_varint(end, offset);
+    if (oend + len > &log_ptr[16])
+    {
+      len+= oend - log_ptr - 15;
+      if (len >= MIN_3BYTE - 1)
+        len+= 2;
+      else if (len >= MIN_2BYTE)
+        len++;
+
+      *log_ptr= type | same_page;
+      end= mlog_encode_varint(log_ptr + 1, len);
+      if (!same_page)
+      {
+        end= mlog_encode_varint(end, id.space());
+        end= mlog_encode_varint(end, id.page_no());
+      }
+      end= mlog_encode_varint(end, offset);
+      return end;
+    }
+    else
+      end= oend;
+  }
+  else if (len >= 3 && end + len > &log_ptr[16])
+  {
+    len+= end - log_ptr - 15;
+    if (len >= MIN_3BYTE - 1)
+      len+= 2;
+    else if (len >= MIN_2BYTE)
+      len++;
+
+    end= log_ptr;
+    *end++= type | same_page;
+    end= mlog_encode_varint(end, len);
+
+    if (!same_page)
+    {
+      end= mlog_encode_varint(end, id.space());
+      end= mlog_encode_varint(end, id.page_no());
+    }
+    return end;
+  }
+
+  ut_ad(end + len >= &log_ptr[1] + !same_page);
+  ut_ad(end + len <= &log_ptr[16]);
+  ut_ad(end <= &log_ptr[max_len]);
+  *log_ptr= type | same_page | static_cast<byte>(end + len - log_ptr - 1);
+  ut_ad(*log_ptr & 15);
+  return end;
+}
+
+/** Write a byte string to a page.
+@param[in]      b       buffer page
+@param[in]      dest    destination within b.frame
+@param[in]      str     the data to write
+@param[in]      len     length of the data to write
+@tparam w       write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
+                          ulint len)
+{
+  ut_ad(ut_align_down(dest, srv_page_size) == b.page.frame);
+  char *d= static_cast<char*>(dest);
+  const char *s= static_cast<const char*>(str);
+  if (w != FORCED && is_logged())
+  {
+    ut_ad(len);
+    const char *const end= d + len;
+    while (*d++ == *s++)
+    {
+      if (d == end)
+      {
+        ut_ad(w == MAYBE_NOP);
+        return;
+      }
+    }
+    s--;
+    d--;
+    len= static_cast<ulint>(end - d);
+  }
+  ::memcpy(d, s, len);
+  memcpy(b, ut_align_offset(d, srv_page_size), len);
+}
+
+/** Write an EXTENDED log record.
+@param block  buffer pool page
+@param type   extended record subtype; @see mrec_ext_t */
+inline void mtr_t::log_write_extended(const buf_block_t &block, byte type)
+{
+  set_modified(block);
+  if (!is_logged())
+    return;
+  byte *l= log_write<EXTENDED>(block.page.id(), &block.page, 1, true);
+  *l++= type;
+  m_log.close(l);
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for partly initializing a B-tree or R-tree page.
+@param block  B-tree or R-tree page
+@param comp   false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+inline void mtr_t::page_create(const buf_block_t &block, bool comp)
+{
+  static_assert(false == INIT_ROW_FORMAT_REDUNDANT, "encoding");
+  static_assert(true == INIT_ROW_FORMAT_DYNAMIC, "encoding");
+  log_write_extended(block, comp);
+}
+
+/** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT.
+@param block     B-tree or R-tree page
+@param prev_rec  byte offset of the predecessor of the record to delete,
+                 starting from PAGE_OLD_INFIMUM */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec)
+{
+  ut_ad(!block.zip_size());
+  ut_ad(prev_rec < block.physical_size());
+  set_modified(block);
+  if (!is_logged())
+    return;
+  size_t len= (prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4);
+  byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
+  ut_d(byte *end= l + len);
+  *l++= DELETE_ROW_FORMAT_REDUNDANT;
+  l= mlog_encode_varint(l, prev_rec);
+  ut_ad(end == l);
+  m_log.close(l);
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record.
+@param block      B-tree or R-tree page
+@param prev_rec   byte offset of the predecessor of the record to delete,
+                  starting from PAGE_NEW_INFIMUM
+@param hdr_size   record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size  data payload size, in bytes */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec,
+                               size_t hdr_size, size_t data_size)
+{
+  ut_ad(!block.zip_size());
+  set_modified(block);
+  ut_ad(hdr_size < MIN_3BYTE);
+  ut_ad(prev_rec < block.physical_size());
+  ut_ad(data_size < block.physical_size());
+  if (!is_logged())
+    return;
+  size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
+  len+= hdr_size < MIN_2BYTE ? 1 : 2;
+  len+= data_size < MIN_2BYTE ? 1 : data_size < MIN_3BYTE ? 2 : 3;
+  byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
+  ut_d(byte *end= l + len);
+  *l++= DELETE_ROW_FORMAT_DYNAMIC;
+  l= mlog_encode_varint(l, prev_rec);
+  l= mlog_encode_varint(l, hdr_size);
+  l= mlog_encode_varint(l, data_size);
+  ut_ad(end == l);
+  m_log.close(l);
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for initializing an undo log page.
+@param block  undo page */
+inline void mtr_t::undo_create(const buf_block_t &block)
+{
+  log_write_extended(block, UNDO_INIT);
+}
+
+/** Write log for appending an undo log record.
+@param block  undo page
+@param data   record within the undo page
+@param len    length of the undo record, in bytes */
+inline void mtr_t::undo_append(const buf_block_t &block,
+                               const void *data, size_t len)
+{
+  ut_ad(len > 2);
+  set_modified(block);
+  if (!is_logged())
+    return;
+  const bool small= len + 1 < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+  byte *end= log_write<EXTENDED>(block.page.id(), &block.page, len + 1, small);
+  if (UNIV_LIKELY(small))
+  {
+    *end++= UNDO_APPEND;
+    ::memcpy(end, data, len);
+    m_log.close(end + len);
+  }
+  else
+  {
+    m_log.close(end);
+    *m_log.push<byte*>(1)= UNDO_APPEND;
+    m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+  }
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Trim the end of a tablespace.
+@param id  first page identifier that will not be in the file */
+inline void mtr_t::trim_pages(const page_id_t id)
+{
+  if (!is_logged())
+    return;
+  byte *l= log_write<EXTENDED>(id, nullptr, 1, true);
+  *l++= TRIM_PAGES;
+  m_log.close(l);
+  set_trim_pages();
+}
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
new file mode 100644
index 00000000..841cfab1
--- /dev/null
+++ b/storage/innobase/include/mtr0mtr.h
@@ -0,0 +1,780 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.h
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "fil0fil.h"
+#include "dyn0buf.h"
+#include "buf0buf.h"
+#include "small_vector.h"
+
+/** Start a mini-transaction. */
+#define mtr_start(m)		(m)->start()
+
+/** Commit a mini-transaction. */
+#define mtr_commit(m)		(m)->commit()
+
+/** Change the logging mode of a mini-transaction.
+@return old mode */
+#define mtr_set_log_mode(m, d)	(m)->set_log_mode((d))
+
+#ifdef UNIV_PFS_RWLOCK
+# define mtr_s_lock_index(i,m)	(m)->s_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_x_lock_index(i,m)	(m)->x_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_sx_lock_index(i,m)	(m)->u_lock(__FILE__, __LINE__, &(i)->lock)
+#else
+# define mtr_s_lock_index(i,m)	(m)->s_lock(&(i)->lock)
+# define mtr_x_lock_index(i,m)	(m)->x_lock(&(i)->lock)
+# define mtr_sx_lock_index(i,m)	(m)->u_lock(&(i)->lock)
+#endif
+
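
Before the class definition, a hedged sketch of the lifecycle that these macros wrap; the page acquisition, latching and memo registration that must precede a write (e.g. via buf_page_get_gen()) are elided, and the byte offset is arbitrary:

    #include "mtr0mtr.h"
    #include "mtr0log.h"

    static void mtr_usage_sketch(buf_block_t *block)
    {
      mtr_t mtr;
      mtr.start();            /* begin: collect latches and log records */
      if (block)
        /* block must already be latched and registered in the memo;
        this logs at most 4 bytes, skipping leading unchanged ones */
        mtr.write<4>(*block, block->page.frame + 24, 0x12345678U);
      mtr.commit();           /* append the log, release all latches */
    }
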
+/** Mini-transaction memo stack slot. */
+struct mtr_memo_slot_t
+{
+  /** pointer to the object */
+  void *object;
+  /** type of the stored object */
+  mtr_memo_type_t type;
+
+  /** Release the object */
+  void release() const;
+};
+
+/** Mini-transaction handle and buffer */
+struct mtr_t {
+  mtr_t();
+  ~mtr_t();
+
+  /** Start a mini-transaction. */
+  void start();
+
+  /** Commit the mini-transaction. */
+  void commit();
+
+  /** Release latches of unmodified buffer pages.
+  @param begin  first slot to release
+  @param end    last slot to release, or get_savepoint() */
+  void rollback_to_savepoint(ulint begin, ulint end);
+
+  /** Release latches of unmodified buffer pages.
+  @param begin  first slot to release */
+  void rollback_to_savepoint(ulint begin)
+  { rollback_to_savepoint(begin, m_memo.size()); }
+
+  /** Release the last acquired buffer page latch. */
+  void release_last_page()
+  { auto s= m_memo.size(); rollback_to_savepoint(s - 1, s); }
+
+  /** Commit a mini-transaction that is shrinking a tablespace.
+  @param space  tablespace that is being shrunk */
+  ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
+
+  /** Commit a mini-transaction that is deleting or renaming a file.
+  @param space  tablespace that is being renamed or deleted
+  @param name   new file name (nullptr=the file will be deleted)
+  @return whether the operation succeeded */
+  ATTRIBUTE_COLD bool commit_file(fil_space_t &space, const char *name);
+
+  /** Commit a mini-transaction that did not modify any pages,
+  but generated some redo log on a higher level, such as
+  FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
+  The caller must hold exclusive log_sys.latch.
+  This is to be used at log_checkpoint().
+  @param checkpoint_lsn  the log sequence number of a checkpoint, or 0
+  @return current LSN */
+  lsn_t commit_files(lsn_t checkpoint_lsn= 0);
+
+  /** @return mini-transaction savepoint (current size of m_memo) */
+  ulint get_savepoint() const
+  {
+    ut_ad(is_active());
+    return m_memo.size();
+  }
+
+  /** Get the block at a savepoint */
+  buf_block_t *at_savepoint(ulint savepoint) const
+  {
+    ut_ad(is_active());
+    const mtr_memo_slot_t &slot= m_memo[savepoint];
+    ut_ad(slot.type < MTR_MEMO_S_LOCK);
+    ut_ad(slot.object);
+    return static_cast<buf_block_t*>(slot.object);
+  }
+
+  /** Try to get a block at a savepoint.
+  @param savepoint  the savepoint right before the block was acquired
+  @return the block at the savepoint
+  @retval nullptr if no buffer block was registered at that savepoint */
+  buf_block_t *block_at_savepoint(ulint savepoint) const
+  {
+    ut_ad(is_active());
+    const mtr_memo_slot_t &slot= m_memo[savepoint];
+    return slot.type < MTR_MEMO_S_LOCK
+      ? static_cast<buf_block_t*>(slot.object)
+      : nullptr;
+  }
+
+  /** Retrieve a page that has already been latched.
+  @param id    page identifier
+  @param type  page latch type
+  @return block
+  @retval nullptr if the block had not been latched yet */
+  buf_block_t *get_already_latched(const page_id_t id, mtr_memo_type_t type)
+    const;
+
+  /** @return the logging mode */
+  mtr_log_t get_log_mode() const
+  {
+    static_assert(MTR_LOG_ALL == 0, "efficiency");
+    return static_cast<mtr_log_t>(m_log_mode);
+  }
+
+  /** @return whether log is to be written for changes */
+  bool is_logged() const
+  {
+    static_assert(MTR_LOG_ALL == 0, "efficiency");
+    static_assert(MTR_LOG_NONE & MTR_LOG_NO_REDO, "efficiency");
+    static_assert(!(MTR_LOG_NONE & MTR_LOG_SUB), "efficiency");
+    return !(m_log_mode & MTR_LOG_NONE);
+  }
+
+  /** Change the logging mode.
+  @param mode  logging mode
+  @return old mode */
+  mtr_log_t set_log_mode(mtr_log_t mode)
+  {
+    const mtr_log_t old_mode= get_log_mode();
+    m_log_mode= mode & 3;
+    return old_mode;
+  }
+
+  /** Set the log mode of a sub-mini-transaction
+  @param mtr  parent mini-transaction */
+  void set_log_mode_sub(const mtr_t &mtr)
+  {
+    ut_ad(mtr.m_log_mode == MTR_LOG_ALL || mtr.m_log_mode == MTR_LOG_NO_REDO);
+    m_log_mode= mtr.m_log_mode | MTR_LOG_SUB;
+    static_assert((MTR_LOG_SUB | MTR_LOG_NO_REDO) == MTR_LOG_NO_REDO, "");
+  }
+
+  /** Check if we are holding a block latch in exclusive mode
+  @param block  buffer pool block to search for */
+  bool have_x_latch(const buf_block_t &block) const;
+
+  /** Check if we are holding a block latch in S or U mode
+  @param block  buffer pool block to search for */
+  bool have_u_or_x_latch(const buf_block_t &block) const;
+
+  /** Copy the tablespaces associated with the mini-transaction
+  (needed for generating FILE_MODIFY records)
+  @param[in]	mtr	mini-transaction that may modify
+  the same set of tablespaces as this one */
+  void set_spaces(const mtr_t& mtr)
+  {
+    ut_ad(!m_user_space_id);
+    ut_ad(!m_user_space);
+
+    ut_d(m_user_space_id = mtr.m_user_space_id);
+    m_user_space = mtr.m_user_space;
+  }
+
+  /** Set the tablespace associated with the mini-transaction
+  (needed for generating a FILE_MODIFY record)
+  @param[in]	space_id	user or system tablespace ID
+  @return the tablespace */
+  fil_space_t* set_named_space_id(uint32_t space_id)
+  {
+    ut_ad(!m_user_space_id);
+    ut_d(m_user_space_id = space_id);
+    if (!space_id) {
+      return fil_system.sys_space;
+    } else {
+      ut_ad(m_user_space_id == space_id);
+      ut_ad(!m_user_space);
+      m_user_space = fil_space_get(space_id);
+      ut_ad(m_user_space);
+      return m_user_space;
+    }
+  }
+
+  /** Set the tablespace associated with the mini-transaction
+  (needed for generating a FILE_MODIFY record)
+  @param[in]	space	user or system tablespace */
+  void set_named_space(fil_space_t* space)
+  {
+    ut_ad(!m_user_space_id);
+    ut_d(m_user_space_id = space->id);
+    if (space->id) {
+      m_user_space = space;
+    }
+  }
+
+#ifdef UNIV_DEBUG
+  /** Check the tablespace associated with the mini-transaction
+  (needed for generating a FILE_MODIFY record)
+  @param[in]	space	tablespace
+  @return whether the mini-transaction is associated with the space */
+  bool is_named_space(uint32_t space) const;
+  /** Check the tablespace associated with the mini-transaction
+  (needed for generating a FILE_MODIFY record)
+  @param[in]	space	tablespace
+  @return whether the mini-transaction is associated with the space */
+  bool is_named_space(const fil_space_t* space) const;
+#endif /* UNIV_DEBUG */
+
+  /** Acquire a tablespace X-latch.
+  @param space_id  tablespace ID
+  @return the tablespace object (never NULL) */
+  fil_space_t *x_lock_space(uint32_t space_id);
+
+  /** Acquire a shared rw-latch. */
+  void s_lock(
+#ifdef UNIV_PFS_RWLOCK
+    const char *file, unsigned line,
+#endif
+    index_lock *lock)
+  {
+    lock->s_lock(SRW_LOCK_ARGS(file, line));
+    memo_push(lock, MTR_MEMO_S_LOCK);
+  }
+
+  /** Acquire an exclusive rw-latch. */
+  void x_lock(
+#ifdef UNIV_PFS_RWLOCK
+    const char *file, unsigned line,
+#endif
+    index_lock *lock)
+  {
+    lock->x_lock(SRW_LOCK_ARGS(file, line));
+    memo_push(lock, MTR_MEMO_X_LOCK);
+  }
+
+  /** Acquire an update latch.
*/ + void u_lock( +#ifdef UNIV_PFS_RWLOCK + const char *file, unsigned line, +#endif + index_lock *lock) + { + lock->u_lock(SRW_LOCK_ARGS(file, line)); + memo_push(lock, MTR_MEMO_SX_LOCK); + } + + /** Acquire an exclusive tablespace latch. + @param space tablespace */ + void x_lock_space(fil_space_t *space); + + /** Release an index latch. */ + void release(const index_lock &lock) { release(&lock); } + /** Release a latch to an unmodified page. */ + void release(const buf_block_t &block) { release(&block); } +private: + /** Release an unmodified object. */ + void release(const void *object); +public: + /** Mark the given latched page as modified. + @param block page that will be modified */ + void set_modified(const buf_block_t &block); + + /** Set the state to not-modified. This will not log the changes. + This is only used during redo log apply, to avoid logging the changes. */ + void discard_modifications() { m_modifications= false; } + + /** Get the LSN of commit(). + @return the commit LSN + @retval 0 if the transaction only modified temporary tablespaces */ + lsn_t commit_lsn() const { ut_ad(has_committed()); return m_commit_lsn; } + + /** Note that we are inside the change buffer code. */ + void enter_ibuf() { m_inside_ibuf= true; } + + /** Note that we have exited from the change buffer code. */ + void exit_ibuf() { m_inside_ibuf= false; } + + /** @return true if we are inside the change buffer code */ + bool is_inside_ibuf() const { return m_inside_ibuf; } + + /** Note that some pages have been freed */ + void set_trim_pages() { m_trim_pages= true; } + + /** Latch a buffer pool block. + @param block block to be latched + @param rw_latch RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH, RW_NO_LATCH */ + void page_lock(buf_block_t *block, ulint rw_latch); + + /** Acquire a latch on a buffer-fixed buffer pool block. + @param savepoint savepoint location of the buffer-fixed block + @param rw_latch latch to acquire */ + void upgrade_buffer_fix(ulint savepoint, rw_lock_type_t rw_latch); + + /** Register a change to the page latch state. */ + void lock_register(ulint savepoint, mtr_memo_type_t type) + { + mtr_memo_slot_t &slot= m_memo[savepoint]; + ut_ad(slot.type <= MTR_MEMO_BUF_FIX); + ut_ad(type < MTR_MEMO_S_LOCK); + slot.type= type; + } + + /** Upgrade U locks on a block to X */ + void page_lock_upgrade(const buf_block_t &block); + + /** Upgrade index U lock to X */ + ATTRIBUTE_COLD void index_lock_upgrade(); + + /** Check if we are holding tablespace latch + @param space tablespace to search for + @return whether space.latch is being held */ + bool memo_contains(const fil_space_t& space) const + MY_ATTRIBUTE((warn_unused_result)); +#ifdef UNIV_DEBUG + /** Check if we are holding an rw-latch in this mini-transaction + @param lock latch to search for + @param type held latch type + @return whether (lock,type) is contained */ + bool memo_contains(const index_lock &lock, mtr_memo_type_t type) const + MY_ATTRIBUTE((warn_unused_result)); + + /** Check if memo contains an index or buffer block latch. + @param object object to search + @param flags specify types of object latches + @return true if contains */ + bool memo_contains_flagged(const void *object, ulint flags) const + MY_ATTRIBUTE((warn_unused_result, nonnull)); + + /** Check if memo contains the given page. 
+  @param ptr    pointer to within page frame
+  @param flags  latch types to look for
+  @return the block
+  @retval nullptr if not found */
+  buf_block_t *memo_contains_page_flagged(const byte *ptr, ulint flags) const;
+
+  /** @return whether this mini-transaction modifies persistent data */
+  bool has_modifications() const { return m_modifications; }
+#endif /* UNIV_DEBUG */
+
+  /** Push a buffer page to the memo.
+  @param block  buffer block
+  @param type   object type: MTR_MEMO_S_LOCK, ... */
+  void memo_push(buf_block_t *block, mtr_memo_type_t type)
+    __attribute__((nonnull))
+  {
+    ut_ad(is_active());
+    ut_ad(type <= MTR_MEMO_PAGE_SX_MODIFY);
+    ut_ad(block->page.buf_fix_count());
+    ut_ad(block->page.in_file());
+#ifdef UNIV_DEBUG
+    switch (type) {
+    case MTR_MEMO_PAGE_S_FIX:
+      ut_ad(block->page.lock.have_s());
+      break;
+    case MTR_MEMO_PAGE_X_FIX: case MTR_MEMO_PAGE_X_MODIFY:
+      ut_ad(block->page.lock.have_x());
+      break;
+    case MTR_MEMO_PAGE_SX_FIX: case MTR_MEMO_PAGE_SX_MODIFY:
+      ut_ad(block->page.lock.have_u_or_x());
+      break;
+    case MTR_MEMO_BUF_FIX:
+      break;
+    case MTR_MEMO_MODIFY:
+    case MTR_MEMO_S_LOCK: case MTR_MEMO_X_LOCK: case MTR_MEMO_SX_LOCK:
+    case MTR_MEMO_SPACE_X_LOCK:
+      ut_ad("invalid type" == 0);
+    }
+#endif
+    if (!(type & MTR_MEMO_MODIFY));
+    else if (block->page.id().space() >= SRV_TMP_SPACE_ID)
+    {
+      block->page.set_temp_modified();
+      type= mtr_memo_type_t(type & ~MTR_MEMO_MODIFY);
+    }
+    else
+    {
+      m_modifications= true;
+      if (!m_made_dirty)
+        /* If we are going to modify a previously clean persistent page,
+        we must set m_made_dirty, so that commit() will acquire
+        log_sys.flush_order_mutex and insert the block into
+        buf_pool.flush_list. */
+        m_made_dirty= block->page.oldest_modification() <= 1;
+    }
+    m_memo.emplace_back(mtr_memo_slot_t{block, type});
+  }
+
+  /** Push an index lock or tablespace latch to the memo.
+  @param object  index lock or tablespace latch
+  @param type    object type: MTR_MEMO_S_LOCK, ... */
+  void memo_push(void *object, mtr_memo_type_t type) __attribute__((nonnull))
+  {
+    ut_ad(is_active());
+    ut_ad(type >= MTR_MEMO_S_LOCK);
+    m_memo.emplace_back(mtr_memo_slot_t{object, type});
+  }
+
+  /** @return the size of the mini-transaction log, in bytes */
+  size_t get_log_size() const { return m_log.size(); }
+  /** @return whether the log and memo are empty */
+  bool is_empty() const { return !get_savepoint() && !get_log_size(); }
+
+  /** Write an OPT_PAGE_CHECKSUM record. */
+  inline void page_checksum(const buf_page_t &bpage);
+
+  /** Write request types */
+  enum write_type
+  {
+    /** the page is guaranteed to always change */
+    NORMAL= 0,
+    /** optional: the page contents might not change */
+    MAYBE_NOP,
+    /** force a write, even if the page contents is not changing */
+    FORCED
+  };
+
+  /** Write 1, 2, 4, or 8 bytes to a file page.
+  @param[in]      block   file page
+  @param[in,out]  ptr     pointer in file page
+  @param[in]      val     value to write
+  @tparam l       number of bytes to write
+  @tparam w       write request type
+  @tparam V       type of val
+  @return whether any log was written */
+  template<unsigned l,write_type w= NORMAL,typename V>
+  inline bool write(const buf_block_t &block, void *ptr, V val)
+    MY_ATTRIBUTE((nonnull));
+
+  /** Log a write of a byte string to a page.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write */
+  inline void memcpy(const buf_block_t &b, ulint ofs, ulint len);
+
+  /** Write a byte string to a page.
+  @param[in,out]  b       buffer page
+  @param[in]      dest    destination within b.frame
+  @param[in]      str     the data to write
+  @param[in]      len     length of the data to write
+  @tparam w       write request type */
+  template<write_type w= NORMAL>
+  inline void memcpy(const buf_block_t &b, void *dest, const void *str,
+                     ulint len);
+
+  /** Log a write of a byte string to a ROW_FORMAT=COMPRESSED page.
+  @param[in]      b       ROW_FORMAT=COMPRESSED index page
+  @param[in]      offset  byte offset from b.zip.data
+  @param[in]      len     length of the data to write */
+  inline void zmemcpy(const buf_block_t &b, ulint offset, ulint len);
+
+  /** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+  @param[in]      b       ROW_FORMAT=COMPRESSED index page
+  @param[in]      dest    destination within b.zip.data
+  @param[in]      str     the data to write
+  @param[in]      len     length of the data to write
+  @tparam w       write request type */
+  template<write_type w= NORMAL>
+  inline void zmemcpy(const buf_block_t &b, void *dest, const void *str,
+                      ulint len);
+
+  /** Log an initialization of a string of bytes.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write
+  @param[in]      val     the data byte to write */
+  inline void memset(const buf_block_t &b, ulint ofs, ulint len, byte val);
+
+  /** Initialize a string of bytes.
+  @param[in,out]  b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write
+  @param[in]      val     the data byte to write */
+  inline void memset(const buf_block_t *b, ulint ofs, ulint len, byte val);
+
+  /** Log an initialization of a repeating string of bytes.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write, in bytes
+  @param[in]      str     the string to write
+  @param[in]      size    size of str, in bytes */
+  inline void memset(const buf_block_t &b, ulint ofs, size_t len,
+                     const void *str, size_t size);
+
+  /** Initialize a repeating string of bytes.
+  @param[in,out]  b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write, in bytes
+  @param[in]      str     the string to write
+  @param[in]      size    size of str, in bytes */
+  inline void memset(const buf_block_t *b, ulint ofs, size_t len,
+                     const void *str, size_t size);
+
+  /** Log that a string of bytes was copied from the same page.
+  @param[in]      b       buffer page
+  @param[in]      d       destination offset within the page
+  @param[in]      s       source offset within the page
+  @param[in]      len     length of the data to copy */
+  inline void memmove(const buf_block_t &b, ulint d, ulint s, ulint len);
+
+  /** Initialize an entire page.
+  @param[in,out]  b       buffer page */
+  void init(buf_block_t *b);
+  /** Free a page.
+  @param space   tablespace
+  @param offset  offset of the page to be freed */
+  void free(const fil_space_t &space, uint32_t offset);
+  /** Write log for partly initializing a B-tree or R-tree page.
+  @param block  B-tree or R-tree page
+  @param comp   false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+  inline void page_create(const buf_block_t &block, bool comp);
+
+  /** Write log for inserting a B-tree or R-tree record in
+  ROW_FORMAT=REDUNDANT.
+ @param block B-tree or R-tree page + @param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE + @param prev_rec byte offset of the predecessor of the record to insert, + starting from PAGE_OLD_INFIMUM + @param info_bits info_bits of the record + @param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag() + @param hdr_c number of common record header bytes with prev_rec + @param data_c number of common data bytes with prev_rec + @param hdr record header bytes to copy to the log + @param hdr_l number of copied record header bytes + @param data record payload bytes to copy to the log + @param data_l number of copied record data bytes */ + inline void page_insert(const buf_block_t &block, bool reuse, + ulint prev_rec, byte info_bits, + ulint n_fields_s, size_t hdr_c, size_t data_c, + const byte *hdr, size_t hdr_l, + const byte *data, size_t data_l); + /** Write log for inserting a B-tree or R-tree record in + ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC. + @param block B-tree or R-tree page + @param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE + @param prev_rec byte offset of the predecessor of the record to insert, + starting from PAGE_NEW_INFIMUM + @param info_status rec_get_info_and_status_bits() + @param shift unless !reuse: number of bytes the PAGE_FREE is moving + @param hdr_c number of common record header bytes with prev_rec + @param data_c number of common data bytes with prev_rec + @param hdr record header bytes to copy to the log + @param hdr_l number of copied record header bytes + @param data record payload bytes to copy to the log + @param data_l number of copied record data bytes */ + inline void page_insert(const buf_block_t &block, bool reuse, + ulint prev_rec, byte info_status, + ssize_t shift, size_t hdr_c, size_t data_c, + const byte *hdr, size_t hdr_l, + const byte *data, size_t data_l); + /** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT. + @param block B-tree or R-tree page + @param prev_rec byte offset of the predecessor of the record to delete, + starting from PAGE_OLD_INFIMUM */ + inline void page_delete(const buf_block_t &block, ulint prev_rec); + /** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record. + @param block B-tree or R-tree page + @param prev_rec byte offset of the predecessor of the record to delete, + starting from PAGE_NEW_INFIMUM + @param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES + @param data_size data payload size, in bytes */ + inline void page_delete(const buf_block_t &block, ulint prev_rec, + size_t hdr_size, size_t data_size); + + /** Write log for initializing an undo log page. + @param block undo page */ + inline void undo_create(const buf_block_t &block); + /** Write log for appending an undo log record. + @param block undo page + @param data record within the undo page + @param len length of the undo record, in bytes */ + inline void undo_append(const buf_block_t &block, + const void *data, size_t len); + /** Trim the end of a tablespace. + @param id first page identifier that will not be in the file */ + inline void trim_pages(const page_id_t id); + + /** Write a log record about a file operation. 
+  @param type      file operation
+  @param space_id  tablespace identifier
+  @param path      file path
+  @param new_path  new file path for type=FILE_RENAME */
+  inline void log_file_op(mfile_type_t type, uint32_t space_id,
+                          const char *path,
+                          const char *new_path= nullptr);
+
+  /** Add freed page numbers to freed_pages */
+  void add_freed_offset(fil_space_t *space, uint32_t page)
+  {
+    ut_ad(is_named_space(space));
+    if (!m_freed_pages)
+    {
+      m_freed_pages= new range_set();
+      ut_ad(!m_freed_space);
+      m_freed_space= space;
+    }
+    else
+      ut_ad(m_freed_space == space);
+    m_freed_pages->add_value(page);
+  }
+
+  /** Determine the added buffer fix count of a block.
+  @param block  block to be checked
+  @return number of buffer fixes added by this mtr */
+  uint32_t get_fix_count(const buf_block_t *block) const;
+
+  /** Note that log_sys.latch is no longer being held exclusively. */
+  void flag_wr_unlock() noexcept { ut_ad(m_latch_ex); m_latch_ex= false; }
+
+  /** type of page flushing that is needed during commit() */
+  enum page_flush_ahead
+  {
+    /** no need to trigger page cleaner */
+    PAGE_FLUSH_NO= 0,
+    /** asynchronous flushing is needed */
+    PAGE_FLUSH_ASYNC,
+    /** furious flushing is needed */
+    PAGE_FLUSH_SYNC
+  };
+
+private:
+  /** Handle any pages that were freed during the mini-transaction. */
+  void process_freed_pages();
+  /** Release modified pages when no log was written. */
+  void release_unlogged();
+
+  /** Log a write of a byte string to a page.
+  @param block   buffer page
+  @param offset  byte offset within page
+  @param data    data to be written
+  @param len     length of the data, in bytes */
+  inline void memcpy_low(const buf_block_t &block, uint16_t offset,
+                         const void *data, size_t len);
+  /**
+  Write a log record.
+  @tparam type   redo log record type
+  @param id      persistent page identifier
+  @param bpage   buffer pool page, or nullptr
+  @param len     number of additional bytes to write
+  @param alloc   whether to allocate the additional bytes
+  @param offset  byte offset, or 0 if the record type does not allow one
+  @return end of mini-transaction log, minus len */
+  template<byte type>
+  inline byte *log_write(const page_id_t id, const buf_page_t *bpage,
+                         size_t len= 0, bool alloc= false, size_t offset= 0);
+
+  /** Write an EXTENDED log record.
+  @param block  buffer pool page
+  @param type   extended record subtype; @see mrec_ext_t */
+  inline void log_write_extended(const buf_block_t &block, byte type);
+
+  /** Write a FILE_MODIFY record when a non-predefined persistent
+  tablespace was modified for the first time since fil_names_clear(). */
+  ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void name_write();
+
+  /** Encrypt the log */
+  ATTRIBUTE_NOINLINE void encrypt();
+
+  /** Append the redo log records to the redo log buffer.
+  @return {start_lsn,flush_ahead} */
+  std::pair<lsn_t,page_flush_ahead> do_write();
+
+  /** Append the redo log records to the redo log buffer.
+  @param len  number of bytes to write
+  @return {start_lsn,flush_ahead} */
+  std::pair<lsn_t,page_flush_ahead> finish_write(size_t len);
+
+  /** Release all latches. */
+  void release();
+  /** Release the resources */
+  inline void release_resources();
+
+#ifdef UNIV_DEBUG
+public:
+  /** @return whether the mini-transaction is active */
+  bool is_active() const
+  { ut_ad(!m_commit || m_start); return m_start && !m_commit; }
+  /** @return whether the mini-transaction has been committed */
+  bool has_committed() const { ut_ad(!m_commit || m_start); return m_commit; }
+  /** @return whether the mini-transaction is freeing an index tree */
+  bool is_freeing_tree() const { return m_freeing_tree; }
+  /** Notify that the mini-transaction is freeing an index tree */
+  void freeing_tree() { m_freeing_tree= true; }
+private:
+  /** whether start() has been called */
+  bool m_start= false;
+  /** whether commit() has been called */
+  bool m_commit= false;
+  /** whether freeing_tree() has been called */
+  bool m_freeing_tree= false;
+#endif
+private:
+  /** The page of the most recent m_log record written, or NULL */
+  const buf_page_t* m_last;
+  /** The current byte offset in m_last, or 0 */
+  uint16_t m_last_offset;
+
+  /** specifies which operations should be logged; default MTR_LOG_ALL */
+  uint16_t m_log_mode:2;
+
+  /** whether at least one persistent page was written to */
+  uint16_t m_modifications:1;
+
+  /** whether at least one previously clean buffer pool page was written to */
+  uint16_t m_made_dirty:1;
+
+  /** whether log_sys.latch is locked exclusively */
+  uint16_t m_latch_ex:1;
+
+  /** whether the change buffer is latched; only needed in non-debug builds
+  to suppress some read-ahead operations, @see ibuf_inside() */
+  uint16_t m_inside_ibuf:1;
+
+  /** whether the pages have been trimmed */
+  uint16_t m_trim_pages:1;
+
+  /** CRC-32C of m_log */
+  uint32_t m_crc;
+
+#ifdef UNIV_DEBUG
+  /** Persistent user tablespace associated with the
+  mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */
+  uint32_t m_user_space_id;
+#endif /* UNIV_DEBUG */
+
+  /** acquired dict_index_t::lock, fil_space_t::latch, buf_block_t */
+  small_vector<mtr_memo_slot_t, 16> m_memo;
+
+  /** mini-transaction log */
+  mtr_buf_t m_log;
+
+  /** user tablespace that is being modified by the mini-transaction */
+  fil_space_t* m_user_space;
+
+  /** LSN at commit time */
+  lsn_t m_commit_lsn;
+
+  /** tablespace where pages have been freed */
+  fil_space_t *m_freed_space= nullptr;
+  /** set of freed page ids */
+  range_set *m_freed_pages= nullptr;
+};
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
new file mode 100644
index 00000000..19db13a1
--- /dev/null
+++ b/storage/innobase/include/mtr0types.h
@@ -0,0 +1,347 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0types.h +Mini-transaction buffer global types + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "buf0types.h" + +#include "ut0byte.h" + +struct mtr_t; + +/** Logging modes for a mini-transaction */ +enum mtr_log_t { + /** Default mode: log all operations modifying disk-based data */ + MTR_LOG_ALL = 0, + + /** Log no operations and dirty pages are not added to the flush list. + Set for attempting modification of a ROW_FORMAT=COMPRESSED page. */ + MTR_LOG_NONE, + + /** Log all operations, but do not write any OPT_PAGE_CHECKSUM + records because some of the modified pages were also modified + by another mini-transaction that did not write its log yet. */ + MTR_LOG_SUB, + + /** Don't generate REDO log but add dirty pages to flush list */ + MTR_LOG_NO_REDO +}; + +/* +A mini-transaction is a stream of records that is always terminated by +a byte 0x00 or 0x01. The first byte of a mini-transaction record is +never one of these bytes, but these bytes can occur within mini-transaction +records. + +The first byte of the record would contain a record type, flags, and a +part of length. The optional second byte of the record will contain +more length. (Not needed for short records.) + +For example, because the length of an INIT_PAGE record is 3 to 11 bytes, +the first byte will be 0x02 to 0x0a, indicating the number of subsequent bytes. + +Bit 7 of the first byte of a redo log record is the same_page flag. +If same_page=1, the record is referring to the same page as the +previous record. Records that do not refer to data pages but to file +operations are identified by setting the same_page=1 in the very first +record(s) of the mini-transaction. A mini-transaction record that +carries same_page=0 must only be followed by page-oriented records. + +Bits 6..4 of the first byte of a redo log record identify the redo log +type. The following record types refer to data pages: + + FREE_PAGE (0): corresponds to MLOG_INIT_FREE_PAGE + INIT_PAGE (1): corresponds to MLOG_INIT_FILE_PAGE2 + EXTENDED (2): extended record; followed by subtype code @see mrec_ext_t + WRITE (3): replaces MLOG_nBYTES, MLOG_WRITE_STRING, MLOG_ZIP_* + MEMSET (4): extends the 10.4 MLOG_MEMSET record + MEMMOVE (5): copy data within the page (avoids logging redundant data) + RESERVED (6): reserved for future use; a subtype code + (encoded immediately after the length) would be written + to reserve code space for further extensions + OPTION (7): optional record that may be ignored; a subtype @see mrec_opt + (encoded immediately after the length) would distinguish actual usage + +Bits 3..0 indicate the redo log record length, excluding the first +byte, but including additional length bytes and any other bytes, +such as the optional tablespace identifier and page number. +Values 1..15 represent lengths of 1 to 15 bytes. The special value 0 +indicates that 1 to 3 length bytes will follow to encode the remaining +length that exceeds 16 bytes. 
+ +Additional length bytes if length>16: 0 to 3 bytes +0xxxxxxx for 0 to 127 (total: 16 to 143 bytes) +10xxxxxx xxxxxxxx for 128 to 16511 (total: 144 to 16527) +110xxxxx xxxxxxxx xxxxxxxx for 16512 to 2113663 (total: 16528 to 2113679) +111xxxxx reserved (corrupted record, and file!) + +If same_page=0, the tablespace identifier and page number will use +similar 1-to-5-byte variable-length encoding: +0xxxxxxx for 0 to 127 +10xxxxxx xxxxxxxx for 128 to 16,511 +110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663 +1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx for 2,113,664 to 270,549,119 +11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx for 270,549,120 to 34,630,287,487 +11111xxx reserved (corrupted record) +Note: Some 5-byte values are reserved, because the tablespace identifier +and page number can only be up to 4,294,967,295. + +If same_page=1 is set in a record that follows a same_page=0 record +in a mini-transaction, the tablespace identifier and page number +fields will be omitted. + +For FILE_ records (if same_page=1 for the first record +of a mini-transaction), we will write a tablespace identifier and +a page number (always 0) using the same 1-to-5-byte encoding. + +For FREE_PAGE or INIT_PAGE, if same_page=1, the record will be treated +as corrupted (or reserved for future extension). The type code must +be followed by 1+1 to 5+5 bytes (to encode the tablespace identifier +and page number). If the record length does not match the encoded +lengths of the tablespace identifier and page number, the record will +be treated as corrupted. This allows future expansion of the format. + +If there is a FREE_PAGE record in a mini-transaction, it must be the +only record for that page in the mini-transaction. If there is an +INIT_PAGE record for a page in a mini-transaction, it must be the +first record for that page in the mini-transaction. + +An EXTENDED record must be followed by 1+1 to 5+5 bytes for the page +identifier (unless the same_page flag is set) and a subtype; @see mrec_ext_t + +For WRITE, MEMSET, MEMMOVE, the next 1 to 3 bytes are the byte offset +on the page, relative from the previous offset. If same_page=0, the +"previous offset" is 0. If same_page=1, the "previous offset" is where +the previous operation ended (FIL_PAGE_TYPE for INIT_PAGE). +0xxxxxxx for 0 to 127 +10xxxxxx xxxxxxxx for 128 to 16,511 +110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663 +111xxxxx reserved (corrupted record) +If the sum of the "previous offset" and the current offset exceeds the +page size, the record is treated as corrupted. Negative relative offsets +cannot be written. Instead, a record with same_page=0 can be written. + +For MEMSET and MEMMOVE, the target length will follow, encoded in 1 to +3 bytes. If the length+offset exceeds the page size, the record will +be treated as corrupted. + +For MEMMOVE, the source offset will follow, encoded in 1 to 3 bytes, +relative to the current offset. The offset 0 is not possible, and +the sign bit is the least significant bit. That is, ++x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...) and +-x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...). +The source offset must be within the page size, or else the record +will be treated as corrupted. + +For MEMSET or WRITE, the byte(s) to be written will follow. For +MEMSET, it usually is a single byte, but it could also be a multi-byte +string, which would be copied over and over until the target length is +reached. The length of the remaining bytes is implied by the length +bytes at the start of the record. 
+
+For MEMMOVE, if any bytes follow, the record is treated as corrupted
+(future expansion).
+
+As mentioned at the start of this comment, the type byte 0 would be
+special, marking the end of a mini-transaction. We could use the
+corresponding value 0x80 (with same_page=1) for something special,
+such as a future extension when more type codes are needed, or for
+encoding rarely needed redo log records.
+
+Examples:
+
+INIT could be logged as 0x12 0x34 0x56, meaning "type code 1 (INIT), 2
+bytes to follow" and "tablespace ID 0x34", "page number 0x56".
+The first byte must be between 0x12 and 0x1a, and the total length of
+the record must match the lengths of the encoded tablespace ID and
+page number.
+
+WRITE could be logged as 0x36 0x40 0x57 0x60 0x12 0x34 0x56, meaning
+"type code 3 (WRITE), 6 bytes to follow" and "tablespace ID 0x40",
+"page number 0x57", "byte offset 0x60", data 0x12,0x34,0x56.
+
+A subsequent WRITE to the same page could be logged 0xb5 0x7f 0x23
+0x34 0x56 0x78, meaning "same page, type code 3 (WRITE), 5 bytes to
+follow", "byte offset 0x7f"+0x60+2, bytes 0x23,0x34,0x56,0x78.
+
+The end of the mini-transaction would be indicated by the end byte
+0x00 or 0x01; @see log_sys.get_sequence_bit().
+If log_sys.is_encrypted(), that is followed by 8 bytes of nonce
+(part of initialization vector). That will be followed by 4 bytes
+of CRC-32C of the entire mini-transaction, excluding the end byte. */
+
+/** Redo log record types. These bit patterns (3 bits) will be written
+to the redo log file, so the existing codes or their interpretation on
+crash recovery must not be changed. */
+enum mrec_type_t
+{
+  /** Free a page. On recovery, it is unnecessary to read the page.
+  The next record for the page (if any) must be INIT_PAGE.
+  After this record has been written, the page may be
+  overwritten with zeros, or discarded or trimmed. */
+  FREE_PAGE= 0,
+  /** Zero-initialize a page. The current byte offset (for subsequent
+  records) will be reset to FIL_PAGE_TYPE. */
+  INIT_PAGE= 0x10,
+  /** Extended record; @see mrec_ext_t */
+  EXTENDED= 0x20,
+  /** Write a string of bytes. Followed by the byte offset (unsigned,
+  relative to the current byte offset, encoded in 1 to 3 bytes) and
+  the bytes to write (at least one). The current byte offset will be
+  set after the last byte written. */
+  WRITE= 0x30,
+  /** Like WRITE, but before the bytes to write, the data_length-1
+  (encoded in 1 to 3 bytes) will be encoded, and it must be more
+  than the length of the following data bytes to write.
+  The data byte(s) will be repeatedly copied to the output until
+  the data_length is reached. */
+  MEMSET= 0x40,
+  /** Like MEMSET, but instead of the bytes to write, a source byte
+  offset (signed, nonzero, relative to the target byte offset, encoded
+  in 1 to 3 bytes, with the sign bit in the least significant bit)
+  will be written.
+  That is, +x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...)
+  and -x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+  The source offset and data_length must be within the page size, or
+  else the record will be treated as corrupted. The data will be
+  copied from the page as it was at the start of the
+  mini-transaction. */
+  MEMMOVE= 0x50,
+  /** Reserved for future use. */
+  RESERVED= 0x60,
+  /** Optional record that may be ignored in crash recovery.
+  A subtype (@see mrec_opt) will be encoded after the page identifier. */
+  OPTION= 0x70
+};
+
+
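
The WRITE example above can be verified mechanically. A minimal standalone decoder for exactly that record shape (single-byte varints only, no error handling), independent of the InnoDB headers:

    #include <cassert>
    #include <cstdint>

    int main()
    {
      const uint8_t rec[]= {0x36, 0x40, 0x57, 0x60, 0x12, 0x34, 0x56};
      assert(!(rec[0] & 0x80));        /* same_page=0: page id follows */
      assert((rec[0] >> 4) == 3);      /* bits 6..4: type code 3, WRITE */
      const unsigned len= rec[0] & 15; /* low nibble: 6 bytes follow */
      assert(len == 6);
      /* all fields below are < 0x80, i.e. single-byte varints */
      assert(rec[1] == 0x40);          /* tablespace ID */
      assert(rec[2] == 0x57);          /* page number */
      assert(rec[3] == 0x60);          /* byte offset, relative to 0 */
      /* the remaining 6 - 3 = 3 bytes are the payload */
      assert(rec[4] == 0x12 && rec[5] == 0x34 && rec[6] == 0x56);
    }
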
+/** Supported EXTENDED record subtypes. */
+enum mrec_ext_t
+{
+  /** Partly initialize a ROW_FORMAT=REDUNDANT B-tree or R-tree index page,
+  including writing the "infimum" and "supremum" pseudo-records.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INIT_ROW_FORMAT_REDUNDANT= 0,
+  /** Partly initialize a ROW_FORMAT=COMPACT or DYNAMIC index page,
+  including writing the "infimum" and "supremum" pseudo-records.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INIT_ROW_FORMAT_DYNAMIC= 1,
+  /** Initialize an undo log page.
+  This is roughly (not exactly) equivalent to the old MLOG_UNDO_INIT record.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  UNDO_INIT= 2,
+  /** Append a record to an undo log page.
+  This is equivalent to the old MLOG_UNDO_INSERT record.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  UNDO_APPEND= 3,
+  /** Insert a ROW_FORMAT=REDUNDANT record, extending PAGE_HEAP_TOP.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_HEAP_REDUNDANT= 4,
+  /** Insert a ROW_FORMAT=REDUNDANT record, reusing PAGE_FREE.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_REUSE_REDUNDANT= 5,
+  /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, extending PAGE_HEAP_TOP.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_HEAP_DYNAMIC= 6,
+  /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, reusing PAGE_FREE.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_REUSE_DYNAMIC= 7,
+  /** Delete a record on a ROW_FORMAT=REDUNDANT page.
+  We point to the predecessor of the record to be deleted.
+  The current byte offset will be reset to FIL_PAGE_TYPE.
+  This is similar to the old MLOG_REC_DELETE record. */
+  DELETE_ROW_FORMAT_REDUNDANT= 8,
+  /** Delete a record on a ROW_FORMAT=COMPACT or DYNAMIC page.
+  We point to the predecessor of the record to be deleted
+  and include the total size of the record being deleted.
+  The current byte offset will be reset to FIL_PAGE_TYPE.
+  This is similar to the old MLOG_COMP_REC_DELETE record. */
+  DELETE_ROW_FORMAT_DYNAMIC= 9,
+  /** Truncate a data file. */
+  TRIM_PAGES= 10
+};
+
+
+/** Recognized OPTION record subtypes. */
+enum mrec_opt
+{
+  /** page checksum at the end of the mini-transaction */
+  OPT_PAGE_CHECKSUM= 0
+  /* Other possible subtypes: a binlog record, or an SQL statement. */
+};
+
+
+/** Redo log record types for file-level operations. These bit
+patterns will be written to redo log files, so the existing codes or
+their interpretation on crash recovery must not be changed. */
+enum mfile_type_t
+{
+  /** Create a file. Followed by tablespace ID and the file name. */
+  FILE_CREATE = 0x80,
+  /** Delete a file. Followed by tablespace ID and the file name. */
+  FILE_DELETE = 0x90,
+  /** Rename a file. Followed by tablespace ID and the old file name,
+  NUL, and the new file name. */
+  FILE_RENAME = 0xa0,
+  /** Modify a file. Followed by tablespace ID and the file name. */
+  FILE_MODIFY = 0xb0,
+  /** End-of-checkpoint marker, at the end of a mini-transaction.
+  Followed by 2 NUL bytes of page identifier and 8 bytes of LSN;
+  @see SIZE_OF_FILE_CHECKPOINT.
+  When all bytes are NUL, this is a dummy padding record. */
+  FILE_CHECKPOINT = 0xf0
+};
+
+/** Size of a FILE_CHECKPOINT record, including the trailing byte to
+terminate the mini-transaction and the CRC-32C. */
+*/
+constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1 + 4;
+
+#ifndef UNIV_INNOCHECKSUM
+/** Types for the mlock objects to store in the mtr_t::m_memo */
+enum mtr_memo_type_t {
+	MTR_MEMO_PAGE_S_FIX = RW_S_LATCH,
+
+	MTR_MEMO_PAGE_X_FIX = RW_X_LATCH,
+
+	MTR_MEMO_PAGE_SX_FIX = RW_SX_LATCH,
+
+	MTR_MEMO_BUF_FIX = RW_NO_LATCH,
+
+	MTR_MEMO_MODIFY = 16,
+
+	MTR_MEMO_PAGE_X_MODIFY = MTR_MEMO_PAGE_X_FIX | MTR_MEMO_MODIFY,
+	MTR_MEMO_PAGE_SX_MODIFY = MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_MODIFY,
+
+	MTR_MEMO_S_LOCK = RW_S_LATCH << 5,
+
+	MTR_MEMO_X_LOCK = RW_X_LATCH << 5,
+
+	MTR_MEMO_SX_LOCK = RW_SX_LATCH << 5,
+
+	/** wr_lock() on fil_space_t::latch */
+	MTR_MEMO_SPACE_X_LOCK = MTR_MEMO_SX_LOCK << 1
+};
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
new file mode 100644
index 00000000..c9db6a1f
--- /dev/null
+++ b/storage/innobase/include/os0file.h
@@ -0,0 +1,1188 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/os0file.h
+The interface to the operating system file io
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0file_h
+#define os0file_h
+
+#include "fsp0types.h"
+#include "tpool.h"
+#include "my_counter.h"
+
+#ifndef _WIN32
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#endif /* !_WIN32 */
+
+extern bool	os_has_said_disk_full;
+
+/** File offset in bytes */
+typedef ib_uint64_t os_offset_t;
+
+class buf_tmp_buffer_t;
+
+#ifdef _WIN32
+
+/** We always define WIN_ASYNC_IO, and check at run-time whether
+the OS actually supports it: Win 95 does not, NT does. */
+# define WIN_ASYNC_IO
+
+/** Use unbuffered I/O */
+# define UNIV_NON_BUFFERED_IO
+
+/** File handle */
+typedef native_file_handle os_file_t;
+
+
+#else /* _WIN32 */
+
+/** File handle */
+typedef int	os_file_t;
+
+#endif /* _WIN32 */
+
+static const os_file_t OS_FILE_CLOSED = IF_WIN(os_file_t(INVALID_HANDLE_VALUE),-1);
+
+/** File descriptor with optional PERFORMANCE_SCHEMA instrumentation */
+struct pfs_os_file_t
+{
+	/** Default constructor */
+	pfs_os_file_t(os_file_t file = OS_FILE_CLOSED) : m_file(file)
+#ifdef UNIV_PFS_IO
+	, m_psi(NULL)
+#endif
+	{}
+
+	/** The wrapped file handle */
+	os_file_t	m_file;
+#ifdef UNIV_PFS_IO
+	/** PERFORMANCE_SCHEMA descriptor */
+	struct PSI_file	*m_psi;
+#endif
+	/** Implicit type conversion.
+	@return the wrapped file handle */
+	operator os_file_t() const { return m_file; }
+	/** Assignment operator.
+	@param[in]	file	file handle to be assigned */
+	void operator=(os_file_t file) { m_file = file; }
+	bool operator==(os_file_t file) const { return m_file == file; }
+	bool operator!=(os_file_t file) const { return !(*this == file); }
+#ifndef DBUG_OFF
+	friend std::ostream& operator<<(std::ostream& os, pfs_os_file_t f){
+		os << os_file_t(f);
+		return os;
+	}
+#endif
+};
+
+/** Options for os_file_create_func @{ */
+enum os_file_create_t {
+	OS_FILE_OPEN = 51,		/*!< to open an existing file (if
+					doesn't exist, error) */
+	OS_FILE_CREATE,			/*!< to create new file (if
+					exists, error) */
+	OS_FILE_OVERWRITE,		/*!< to create a new file; if it
+					exists, overwrite the old file */
+	OS_FILE_OPEN_RAW,		/*!< to open a raw device or disk
+					partition */
+	OS_FILE_CREATE_PATH,		/*!< to create the directories */
+	OS_FILE_OPEN_RETRY,		/*!< open with retry */
+
+	/** Flags that can be combined with the above values. Please ensure
+	that the above values stay below 128.
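+	For example, a caller that wants to create a file but handle an
+	"already exists" condition itself could pass create_mode
+	OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT;
+	keeping the base values below 128 is what makes such bitwise
+	combination unambiguous.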
+	*/
+
+	OS_FILE_ON_ERROR_NO_EXIT = 128,	/*!< do not exit on unknown errors */
+	OS_FILE_ON_ERROR_SILENT = 256	/*!< don't print diagnostic messages to
+					the log unless it is a fatal error,
+					this flag is only used if
+					ON_ERROR_NO_EXIT is set */
+};
+
+static const ulint OS_FILE_READ_ONLY = 333;
+static const ulint OS_FILE_READ_WRITE = 444;
+
+/** Used by MySQLBackup */
+static const ulint OS_FILE_READ_ALLOW_DELETE = 555;
+
+/* Options for file_create */
+static const ulint OS_FILE_AIO = 61;
+static const ulint OS_FILE_NORMAL = 62;
+/* @} */
+
+/** Types for file create @{ */
+static const ulint OS_DATA_FILE = 100;
+static const ulint OS_LOG_FILE = 101;
+static const ulint OS_DATA_FILE_NO_O_DIRECT = 103;
+/* @} */
+
+/** Error codes from os_file_get_last_error @{ */
+static const ulint OS_FILE_NAME_TOO_LONG = 36;
+static const ulint OS_FILE_NOT_FOUND = 71;
+static const ulint OS_FILE_DISK_FULL = 72;
+static const ulint OS_FILE_ALREADY_EXISTS = 73;
+static const ulint OS_FILE_PATH_ERROR = 74;
+
+/** wait for OS aio resources to become available again */
+static const ulint OS_FILE_AIO_RESOURCES_RESERVED = 75;
+
+static const ulint OS_FILE_SHARING_VIOLATION = 76;
+static const ulint OS_FILE_ERROR_NOT_SPECIFIED = 77;
+static const ulint OS_FILE_INSUFFICIENT_RESOURCE = 78;
+static const ulint OS_FILE_AIO_INTERRUPTED = 79;
+static const ulint OS_FILE_OPERATION_ABORTED = 80;
+static const ulint OS_FILE_ACCESS_VIOLATION = 81;
+static const ulint OS_FILE_OPERATION_NOT_SUPPORTED = 125;
+static const ulint OS_FILE_ERROR_MAX = 200;
+/* @} */
+
+/**
+The I/O context that is passed down to the low level IO code */
+class IORequest
+{
+public:
+  enum Type
+  {
+    /** Synchronous read */
+    READ_SYNC= 2,
+    /** Asynchronous read; some errors will be ignored */
+    READ_ASYNC= READ_SYNC | 1,
+    /** Possibly partial read; only used with
+    os_file_read_no_error_handling() */
+    READ_MAYBE_PARTIAL= READ_SYNC | 4,
+    /** Read for doublewrite buffer recovery */
+    DBLWR_RECOVER= READ_SYNC | 8,
+    /** Synchronous write */
+    WRITE_SYNC= 16,
+    /** Asynchronous write */
+    WRITE_ASYNC= WRITE_SYNC | 1,
+    /** A doublewrite batch */
+    DBLWR_BATCH= WRITE_ASYNC | 8,
+    /** Write data; evict the block on write completion */
+    WRITE_LRU= WRITE_ASYNC | 32,
+    /** Write data and punch hole for the rest */
+    PUNCH= WRITE_ASYNC | 64,
+    /** Write data and punch hole; evict the block on write completion */
+    PUNCH_LRU= PUNCH | WRITE_LRU,
+    /** Zero out a range of bytes in fil_space_t::io() */
+    PUNCH_RANGE= WRITE_SYNC | 128,
+  };
+
+  constexpr IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot,
+                      fil_node_t *node, Type type) :
+    bpage(bpage), slot(slot), node(node), type(type) {}
+
+  constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr,
+                      buf_tmp_buffer_t *slot= nullptr) :
+    bpage(bpage), slot(slot), type(type) {}
+
+  bool is_read() const { return (type & READ_SYNC) != 0; }
+  bool is_write() const { return (type & WRITE_SYNC) != 0; }
+  bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; }
+  bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; }
+
+  void write_complete(int io_error) const;
+  void read_complete(int io_error) const;
+  void fake_read_complete(os_offset_t offset) const;
+
+  /** If requested, free storage space associated with a section of the file.
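+  (As an aside on the Type bit layout above: PUNCH ^ WRITE_ASYNC
+  isolates the punch-hole bit, 64, which is set only in PUNCH and
+  PUNCH_LRU, so the test below fires exactly for those requests.)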
+
+  @param off  byte offset from the start (SEEK_SET)
+  @param len  size of the hole in bytes
+  @return DB_SUCCESS or error code */
+  dberr_t maybe_punch_hole(os_offset_t off, ulint len)
+  {
+    return off && len && node && (type & (PUNCH ^ WRITE_ASYNC))
+      ? punch_hole(off, len)
+      : DB_SUCCESS;
+  }
+
+private:
+  /** Free storage space associated with a section of the file.
+  @param off  byte offset from the start (SEEK_SET)
+  @param len  size of the hole in bytes
+  @return DB_SUCCESS or error code */
+  dberr_t punch_hole(os_offset_t off, ulint len) const;
+
+public:
+  /** Page to be written on write operation */
+  buf_page_t *const bpage= nullptr;
+
+  /** Memory to be used for encrypted or page_compressed pages */
+  buf_tmp_buffer_t *const slot= nullptr;
+
+  /** File descriptor */
+  fil_node_t *const node= nullptr;
+
+  /** Request type bit flags */
+  const Type type;
+};
+
+constexpr IORequest IORequestRead(IORequest::READ_SYNC);
+constexpr IORequest IORequestReadPartial(IORequest::READ_MAYBE_PARTIAL);
+constexpr IORequest IORequestWrite(IORequest::WRITE_SYNC);
+
+/** Sparse file size information. */
+struct os_file_size_t {
+	/** Total size of file in bytes */
+	os_offset_t	m_total_size;
+
+	/** If it is a sparse file then this is the number of bytes
+	actually allocated for the file. */
+	os_offset_t	m_alloc_size;
+};
+
+constexpr ulint OS_AIO_N_PENDING_IOS_PER_THREAD= 256;
+
+extern Atomic_counter<ulint> os_n_file_reads;
+extern Atomic_counter<ulint> os_n_file_writes;
+extern Atomic_counter<ulint> os_n_fsyncs;
+
+/* File types for directory entry data type */
+
+enum os_file_type_t {
+	OS_FILE_TYPE_UNKNOWN = 0,
+	OS_FILE_TYPE_FILE,			/* regular file */
+	OS_FILE_TYPE_DIR,			/* directory */
+	OS_FILE_TYPE_LINK,			/* symbolic link */
+	OS_FILE_TYPE_BLOCK			/* block device */
+};
+
+/* Maximum path string length in bytes when referring to tables in the
+'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers
+of this size from the thread stack; that is why this should not be made much
+bigger than 4000 bytes. The maximum path length used by any storage engine
+in the server must be at least this big. */
+
+/* MySQL 5.7 my_global.h */
+#ifndef FN_REFLEN_SE
+#define FN_REFLEN_SE 4000
+#endif
+
+#define OS_FILE_MAX_PATH	4000
+#if (FN_REFLEN_SE < OS_FILE_MAX_PATH)
+# error "(FN_REFLEN_SE < OS_FILE_MAX_PATH)"
+#endif
+
+/** Struct used in fetching information of a file in a directory */
+struct os_file_stat_t {
+	char		name[OS_FILE_MAX_PATH];	/*!< path to a file */
+	os_file_type_t	type;			/*!< file type */
+	os_offset_t	size;			/*!< file size in bytes */
+	os_offset_t	alloc_size;		/*!< Allocated size for
+						sparse files in bytes */
+	size_t		block_size;		/*!< Block size to use for IO
+						in bytes*/
+	time_t		ctime;			/*!< creation time */
+	time_t		mtime;			/*!< modification time */
+	time_t		atime;			/*!< access time */
+	bool		rw_perm;		/*!< true if can be opened
+						in read-write mode. Only valid
+						if type == OS_FILE_TYPE_FILE */
+};
+
+/** Create a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the directory specified by the MySQL
+server configuration parameter (--tmpdir).
+@return temporary file handle, or NULL on error */
+FILE*
+os_file_create_tmpfile();
+
+/**
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix, the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is true.
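+(A behavioural sketch, assuming POSIX: this amounts to
+mkdir(pathname, 0770) with an EEXIST result reported as success
+unless fail_if_exists is set; the exact logic lives in os0file.cc.)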
+
+@param[in]	pathname	directory name as null-terminated string
+@param[in]	fail_if_exists	if true, pre-existing directory is treated
+				as an error.
+@return true if call succeeds, false on error */
+bool
+os_file_create_directory(
+	const char*	pathname,
+	bool		fail_if_exists);
+
+/** NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded, false on error
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_func(
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success);
+
+/** NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A simple function to open or create a file.
+@param[in]	name		name of the file or path as a null-terminated string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+				OS_FILE_READ_ALLOW_DELETE; the last option
+				is used by a backup program reading the file
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_no_error_handling_func(
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success)
+	MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef _WIN32
+#define os_file_set_nocache(fd, file_name, operation_name) do{}while(0)
+#else
+/** Tries to disable OS caching on an opened file descriptor.
+@param[in]	fd		file descriptor to alter
+@param[in]	file_name	file name, used in the diagnostic message
+@param[in]	operation_name	"open" or "create"; used in the diagnostic
+				message */
+void
+os_file_set_nocache(
+/*================*/
+	int		fd,	/*!< in: file descriptor to alter */
+	const char*	file_name,
+	const char*	operation_name);
+#endif
+
+#ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */
+/** Obtain an exclusive lock on a file.
+@param fd      file descriptor
+@param name    file name
+@return 0 on success */
+int os_file_lock(int fd, const char *name);
+#endif
+
+/** NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new one.
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
+				is desired, OS_FILE_NORMAL, if any normal file;
+				NOTE that it also depends on type, os_aio_..
+				and srv_.. variables whether we really use
+				async I/O or unbuffered I/O: look in the
+				function source code for the exact rules
+@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_func(
+	const char*	name,
+	ulint		create_mode,
+	ulint		purpose,
+	ulint		type,
+	bool		read_only,
+	bool*		success)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Deletes a file. The file has to be closed before calling this.
+@param[in]	name		file path as a null-terminated string
+@return true if success */
+bool
+os_file_delete_func(const char* name);
+
+/** Deletes a file if it exists. The file has to be closed before calling this.
+@param[in]	name		file path as a null-terminated string
+@param[out]	exist		indicate if file pre-exist
+@return true if success */
+bool
+os_file_delete_if_exists_func(const char* name, bool* exist);
+
+/** NOTE! Use the corresponding macro os_file_rename(), not directly
+this function!
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function.
+@param[in]	oldpath		old file path as a null-terminated string
+@param[in]	newpath		new file path
+@return true if success */
+bool
+os_file_rename_func(const char* oldpath, const char* newpath);
+
+/** NOTE! Use the corresponding macro os_file_close(), not directly this
+function!
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@param[in]	file		own: handle to a file
+@return true if success */
+bool os_file_close_func(os_file_t file);
+
+#ifdef UNIV_PFS_IO
+
+/* Keys to register InnoDB I/O with performance schema */
+extern mysql_pfs_key_t	innodb_data_file_key;
+extern mysql_pfs_key_t	innodb_temp_file_key;
+
+/* The following four pairs of macros are instrumentation points that
+register various file I/O operations with performance schema.
+1) register_pfs_file_open_begin() and register_pfs_file_open_end() are
+used to register file creation and opening.
+2) register_pfs_file_rename_begin() and register_pfs_file_rename_end()
+are used to register file renaming.
+3) register_pfs_file_io_begin() and register_pfs_file_io_end() are
+used to register actual file read, write and flush.
+4) register_pfs_file_close_begin() and register_pfs_file_close_end()
+are used to register file deletion operations. */
+# define register_pfs_file_open_begin(state, locker, key, op, name,	\
+				      src_file, src_line)		\
+do {									\
+	locker = PSI_FILE_CALL(get_thread_file_name_locker)(		\
+		state, key, op, name, &locker);				\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(start_file_open_wait)(			\
+			locker, src_file, src_line);			\
+	}								\
+} while (0)
+
+# define register_pfs_file_open_end(locker, file, result)		\
+do {									\
+	if (locker != NULL) {						\
+		file.m_psi = PSI_FILE_CALL(end_file_open_wait)(		\
+			locker, result);				\
+	}								\
+} while (0)
+
+# define register_pfs_file_rename_begin(state, locker, key, op, name,	\
+					src_file, src_line)		\
+	register_pfs_file_open_begin(state, locker, key, op, name,	\
+				     src_file, src_line)		\
+
+# define register_pfs_file_rename_end(locker, from, to, result)	\
+do {									\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(						\
+			end_file_rename_wait)(				\
+			locker, from, to, result);			\
+	}								\
+} while (0)
+
+# define register_pfs_file_close_begin(state, locker, key, op, name,	\
+				       src_file, src_line)		\
+do {									\
+	locker = PSI_FILE_CALL(get_thread_file_name_locker)(		\
+		state, key, op, name, &locker);				\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(start_file_close_wait)(			\
+			locker, src_file, src_line);			\
+	}								\
+} while (0)
+
+# define register_pfs_file_close_end(locker, result)			\
+do {									\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(end_file_close_wait)(			\
+			locker, result);				\
+	}								\
+} while (0)
+
+# define register_pfs_file_io_begin(state, locker, file, count, op,	\
+				    src_file, src_line)			\
+do {									\
+	locker = PSI_FILE_CALL(get_thread_file_stream_locker)(		\
+		state, file.m_psi, op);					\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(start_file_wait)(				\
+			locker, count, src_file, src_line);		\
+	}								\
+} while (0)
+
+# define register_pfs_file_io_end(locker, count)			\
+do {									\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(end_file_wait)(locker, count);		\
+	}								\
+} while (0)
+
+/* Following macros/functions are file I/O APIs that would be performance
+schema instrumented if "UNIV_PFS_IO" is defined. They would point to
+wrapper functions with performance schema instrumentation in such case.
+
+os_file_create
+os_file_create_simple
+os_file_create_simple_no_error_handling
+os_file_close
+os_file_rename
+os_aio
+os_file_read
+os_file_read_no_error_handling
+os_file_write
+
+The wrapper functions have the prefix of "pfs_".
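+
+For example, with UNIV_PFS_IO defined, os_file_close(f) expands to
+pfs_os_file_close_func(f, __FILE__, __LINE__), which brackets the
+plain os_file_close_func(f) between PSI begin/end events; without
+UNIV_PFS_IO it expands directly to os_file_close_func(f).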
+*/
+
+# define os_file_create(key, name, create, purpose, type, read_only,	\
+			success)					\
+	pfs_os_file_create_func(key, name, create, purpose, type,	\
+				read_only, success, __FILE__, __LINE__)
+
+# define os_file_create_simple(key, name, create, access,		\
+			       read_only, success)			\
+	pfs_os_file_create_simple_func(key, name, create, access,	\
+				       read_only, success, __FILE__, __LINE__)
+
+# define os_file_create_simple_no_error_handling(			\
+		key, name, create_mode, access, read_only, success)	\
+	pfs_os_file_create_simple_no_error_handling_func(		\
+		key, name, create_mode, access,				\
+		read_only, success, __FILE__, __LINE__)
+
+# define os_file_close(file)						\
+	pfs_os_file_close_func(file, __FILE__, __LINE__)
+
+# define os_file_read(type, file, buf, offset, n, o)			\
+	pfs_os_file_read_func(type, file, buf, offset, n, o, __FILE__, __LINE__)
+
+# define os_file_write(type, name, file, buf, offset, n)		\
+	pfs_os_file_write_func(type, name, file, buf, offset,		\
+			       n, __FILE__, __LINE__)
+
+# define os_file_flush(file)						\
+	pfs_os_file_flush_func(file, __FILE__, __LINE__)
+
+# define os_file_rename(key, oldpath, newpath)				\
+	pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
+
+# define os_file_delete(key, name)					\
+	pfs_os_file_delete_func(key, name, __FILE__, __LINE__)
+
+# define os_file_delete_if_exists(key, name, exist)			\
+	pfs_os_file_delete_if_exists_func(key, name, exist, __FILE__, __LINE__)
+
+/** NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+				OS_FILE_READ_ALLOW_DELETE; the last option is
+				used by a backup program reading the file
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro os_file_create(), not directly
+this function!
+A performance schema wrapper function for os_file_create().
+Add instrumentation to monitor file creation/open.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
+				is desired, OS_FILE_NORMAL, if any normal file;
+				NOTE that it also depends on type, os_aio_..
+				and srv_.. variables whether we really use
+				async I/O or unbuffered I/O: look in the
+				function source code for the exact rules
+@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		purpose,
+	ulint		type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro os_file_close(), not directly
+this function!
+A performance schema instrumented wrapper function for os_file_close().
+@param[in]	file		handle to a file
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_close_func(
+	pfs_os_file_t	file,
+	const char*	src_file,
+	uint		src_line);
+
+/** NOTE! Please use the corresponding macro os_file_read(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_read() which requests a synchronous read operation.
+@param[in]	type		IO request context
+@param[in]	file		Open file handle
+@param[out]	buf		buffer where to read
+@param[in]	offset		file offset where to read
+@param[in]	n		number of bytes to read
+@param[out]	o		number of bytes actually read
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_read_func(
+	const IORequest&	type,
+	pfs_os_file_t		file,
+	void*			buf,
+	os_offset_t		offset,
+	ulint			n,
+	ulint*			o,
+	const char*		src_file,
+	uint			src_line);
+
+/** NOTE! Please use the corresponding macro os_file_write(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_write() which requests a synchronous write operation.
+@param[in]	type		IO request context
+@param[in]	name		Name of the file or path as NUL terminated
+				string
+@param[in]	file		Open file handle
+@param[in]	buf		buffer from which to write
+@param[in]	offset		file offset where to write
+@param[in]	n		number of bytes to write
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_write_func(
+	const IORequest&	type,
+	const char*		name,
+	pfs_os_file_t		file,
+	const void*		buf,
+	os_offset_t		offset,
+	ulint			n,
+	const char*		src_file,
+	uint			src_line);
+
+/** NOTE! Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush() which flushes the write buffers of a given file to the disk.
+@param[in]	file		Open file handle
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_flush_func(
+	pfs_os_file_t	file,
+	const char*	src_file,
+	uint		src_line);
+
+
+/** NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename().
+@param[in]	key		Performance Schema Key
+@param[in]	oldpath		old file path as a null-terminated string
+@param[in]	newpath		new file path
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_rename_func(
+	mysql_pfs_key_t	key,
+	const char*	oldpath,
+	const char*	newpath,
+	const char*	src_file,
+	uint		src_line);
+
+/**
+NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete().
+@param[in]	key		Performance Schema Key
+@param[in]	name		old file path as a null-terminated string
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+	mysql_pfs_key_t	key,
+	const char*	name,
+	const char*	src_file,
+	uint		src_line);
+
+/**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists().
+@param[in]	key		Performance Schema Key
+@param[in]	name		old file path as a null-terminated string
+@param[in]	exist		indicate if file pre-exist
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+	mysql_pfs_key_t	key,
+	const char*	name,
+	bool*		exist,
+	const char*	src_file,
+	uint		src_line);
+
+#else /* UNIV_PFS_IO */
+
+/* If UNIV_PFS_IO is not defined, these I/O APIs point
+to original un-instrumented file I/O APIs */
+# define os_file_create(key, name, create, purpose, type, read_only,	\
+			success)					\
+	os_file_create_func(name, create, purpose, type, read_only,	\
+			    success)
+
+# define os_file_create_simple(key, name, create_mode, access,		\
+			       read_only, success)			\
+	os_file_create_simple_func(name, create_mode, access,		\
+				   read_only, success)
+
+# define os_file_create_simple_no_error_handling(			\
+		key, name, create_mode, access, read_only, success)	\
+	os_file_create_simple_no_error_handling_func(			\
+		name, create_mode, access, read_only, success)
+
+# define os_file_close(file)	os_file_close_func(file)
+
+# define os_file_read(type, file, buf, offset, n, o)			\
+	os_file_read_func(type, file, buf, offset, n, o)
+
+# define os_file_write(type, name, file, buf, offset, n)		\
+	os_file_write_func(type, name, file, buf, offset, n)
+
+# define os_file_flush(file)	os_file_flush_func(file)
+
+# define os_file_rename(key, oldpath, newpath)				\
+	os_file_rename_func(oldpath, newpath)
+
+# define os_file_delete(key, name)	os_file_delete_func(name)
+
+# define os_file_delete_if_exists(key, name, exist)			\
+	os_file_delete_if_exists_func(name, exist)
+
+#endif	/* UNIV_PFS_IO */
+
+/** Gets a file size.
+@param[in]	filename	path to the file
+@return file size if OK, else set m_total_size to ~0 and m_alloc_size
+	to errno */
+os_file_size_t
+os_file_get_size(
+	const char*	filename)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Gets a file size.
+@param[in]	file		handle to a file
+@return file size, or (os_offset_t) -1 on failure */
+os_offset_t
+os_file_get_size(
+	os_file_t	file)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Extend a file.
+
+On Windows, extending a file allocates blocks for the file,
+unless the file is sparse.
+
+On Unix, we will extend the file with ftruncate(), if
+file needs to be sparse. Otherwise posix_fallocate() is used
+when available, and if not, binary zeroes are added to the end
+of file.
+
+@param[in]	name		file name
+@param[in]	file		file handle
+@param[in]	size		desired file size
+@param[in]	is_sparse	whether to create a sparse file (no preallocating)
+@return whether the operation succeeded */
+bool
+os_file_set_size(
+	const char*	name,
+	os_file_t	file,
+	os_offset_t	size,
+	bool		is_sparse = false)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Truncates a file at its current position.
+@param[in,out]	file	file to be truncated
+@return true if success */
+bool
+os_file_set_eof(
+	FILE*		file);	/*!< in: file to be truncated */
+
+/** Truncate a file to a specified size in bytes.
+@param[in]	pathname	file path
+@param[in]	file		file to be truncated
+@param[in]	size		size preserved in bytes
+@param[in]	allow_shrink	whether to allow the file to become smaller
+@return true if success */
+bool
+os_file_truncate(
+	const char*	pathname,
+	os_file_t	file,
+	os_offset_t	size,
+	bool		allow_shrink = false);
+
+/** NOTE! Use the corresponding macro os_file_flush(), not directly this
+function!
+Flushes the write buffers of a given file to the disk.
+@param[in]	file		handle to a file
+@return true if success */
+bool
+os_file_flush_func(
+	os_file_t	file);
+
+/** Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + OS_FILE_ERROR_MAX is returned.
+@param[in]	report_all_errors	true if we want an error message
+					printed of all errors
+@param[in]	on_error_silent		if true, then do not print any
+					diagnostic to the log
+@return error number, or OS error number + OS_FILE_ERROR_MAX */
+ulint os_file_get_last_error(bool report_all_errors,
+                             bool on_error_silent= false);
+
+/** NOTE! Use the corresponding macro os_file_read(), not directly this
+function!
+Requests a synchronous read operation.
+@param[in]	type		IO request context
+@param[in]	file		Open file handle
+@param[out]	buf		buffer where to read
+@param[in]	offset		file offset where to read
+@param[in]	n		number of bytes to read
+@param[out]	o		number of bytes actually read
+@return DB_SUCCESS if request was successful */
+dberr_t
+os_file_read_func(
+	const IORequest&	type,
+	os_file_t		file,
+	void*			buf,
+	os_offset_t		offset,
+	ulint			n,
+	ulint*			o)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files.
+@param[in,out]	file		file to read from
+@param[in,out]	str		buffer where to read
+@param[in]	size		size of buffer */
+void
+os_file_read_string(
+	FILE*		file,
+	char*		str,
+	ulint		size);
+
+/** NOTE! Use the corresponding macro os_file_write(), not directly this
+function!
+Requests a synchronous write operation.
+@param[in]	type		IO request context
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	file		Open file handle
+@param[in]	buf		buffer from which to write
+@param[in]	offset		file offset where to write
+@param[in]	n		number of bytes to write
+@return DB_SUCCESS if request was successful */
+dberr_t
+os_file_write_func(
+	const IORequest&	type,
+	const char*		name,
+	os_file_t		file,
+	const void*		buf,
+	os_offset_t		offset,
+	ulint			n)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Check the existence and type of the given file.
+@param[in]	path		pathname of the file
+@param[out]	exists		true if file exists
+@param[out]	type		type of the file (if it exists)
+@return true if call succeeded */
+bool
+os_file_status(
+	const char*	path,
+	bool*		exists,
+	os_file_type_t* type);
+
+/** This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return. The result is used
+to inform a SHOW CREATE TABLE command.
+@param[in,out]	data_dir_path	Full path/data_dir_path */
+void
+os_file_make_data_dir_path(
+	char*	data_dir_path);
+
+/** Create all missing subdirectories along the given path.
+@return DB_SUCCESS if OK, otherwise error code. */
+dberr_t
+os_file_create_subdirs_if_needed(
+	const char*	path);
+
+#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+/* Test the function os_file_get_parent_dir. */
+void
+unit_test_os_file_get_parent_dir();
+#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
+
+/**
+Initializes the asynchronous io system. */
+int os_aio_init();
+
+/**
+Frees the asynchronous io system. */
+void os_aio_free();
+
+/** Submit a fake read request during crash recovery.
+@param type  fake read request
+@param offset  additional context */
+void os_fake_read(const IORequest &type, os_offset_t offset);
+
+/** Request a read or write.
+@param type		I/O request
+@param buf		buffer
+@param offset		file offset
+@param n		number of bytes
+@retval DB_SUCCESS if request was queued successfully
+@retval DB_IO_ERROR on I/O error */
+dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n);
+
+/** @return number of pending reads */
+size_t os_aio_pending_reads();
+/** @return approximate number of pending reads */
+size_t os_aio_pending_reads_approx();
+/** @return number of pending writes */
+size_t os_aio_pending_writes();
+
+/** Wait until there are no pending asynchronous writes.
+@param declare  whether the wait will be declared in tpool */
+void os_aio_wait_until_no_pending_writes(bool declare);
+
+/** Wait until all pending asynchronous reads have completed.
+@param declare  whether the wait will be declared in tpool */
+void os_aio_wait_until_no_pending_reads(bool declare);
+
+/** Prints info of the aio arrays.
+@param[in,out]	file	file where to print */
+void
+os_aio_print(FILE* file);
+
+/** Refreshes the statistics used to print per-second averages. */
+void
+os_aio_refresh_stats();
+
+/** Checks that all slots in the system have been freed, that is, there are
+no pending io operations. */
+bool
+os_aio_all_slots_free();
+
+
+/** This function returns information about the specified file
+@param[in]	path		pathname of the file
+@param[out]	stat_info	information of a file in a directory
+@param[in]	check_rw_perm	for testing whether the file can be opened
+				in RW mode
+@param[in]	read_only	if true read only mode checks are enforced
+@return DB_SUCCESS if all OK */
+dberr_t
+os_file_get_status(
+	const char*	path,
+	os_file_stat_t* stat_info,
+	bool		check_rw_perm,
+	bool		read_only);
+
+/** Set the file create umask
+@param[in]	umask		The umask to use for file creation. */
+void
+os_file_set_umask(ulint umask);
+
+#ifdef _WIN32
+
+/**
+Make file sparse, on Windows.
+
+@param[in]	file		file handle
+@param[in]	is_sparse	if true, make file sparse,
+				otherwise "unsparse" the file
+@return true on success, false on error */
+bool os_file_set_sparse_win32(os_file_t file, bool is_sparse = true);
+
+/**
+Changes file size on Windows.
+
+If the file is extended, the bytes between the old and new EOF
+are zeroed.
+
+If file is sparse, "virtual" block is added at the end of
+allocated area.
+
+If file is normal, file system allocates storage.
+
+@param[in]	pathname	file path
+@param[in]	file		file handle
+@param[in]	size		size to preserve in bytes
+@return true if success */
+bool
+os_file_change_size_win32(
+	const char*	pathname,
+	os_file_t	file,
+	os_offset_t	size);
+
+#endif /*_WIN32 */
+
+/** Free storage space associated with a section of the file.
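+(On Linux this is typically fallocate() with
+FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, and on Windows
+FSCTL_SET_ZERO_DATA on a sparse file; an assumption about the
+implementation, whose authoritative logic lives in os0file.cc.)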
+@param[in] fh Open file handle +@param[in] off Starting offset (SEEK_SET) +@param[in] len Size of the hole +@return DB_SUCCESS or error code */ +dberr_t +os_file_punch_hole( + os_file_t fh, + os_offset_t off, + os_offset_t len) + MY_ATTRIBUTE((warn_unused_result)); + +/* Determine if a path is an absolute path or not. +@param[in] OS directory or file path to evaluate +@retval true if an absolute path +@retval false if a relative path */ +inline bool is_absolute_path(const char *path) +{ + switch (path[0]) { +#ifdef _WIN32 + case '\0': + return false; + case '\\': +#endif + case '/': + return true; + } + +#ifdef _WIN32 + if (path[1] == ':') + { + switch (path[2]) { + case '/': + case '\\': + return true; + } + } +#endif /* _WIN32 */ + + return false; +} + +#include "os0file.inl" + +#endif /* os0file_h */ diff --git a/storage/innobase/include/os0file.inl b/storage/innobase/include/os0file.inl new file mode 100644 index 00000000..7de31505 --- /dev/null +++ b/storage/innobase/include/os0file.inl @@ -0,0 +1,412 @@ +/***************************************************************************** + +Copyright (c) 2010, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0file.ic +The interface to the operating system file io + +Created 2/20/2010 Jimmy Yang +*******************************************************/ + +#ifdef UNIV_PFS_IO +/** NOTE! Please use the corresponding macro os_file_create_simple(), +not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple() which opens or creates a file. +@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_simple_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin( + &state, locker, key, + (create_mode == OS_FILE_CREATE) + ? 
PSI_FILE_CREATE : PSI_FILE_OPEN, + name, src_file, src_line); + + pfs_os_file_t file = os_file_create_simple_func( + name, create_mode, access_type, read_only, success); + + /* Register psi value for the file */ + register_pfs_file_open_end(locker, file, + (*success == TRUE ? success : 0)); + + return(file); +} + +/** NOTE! Please use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple_no_error_handling(). Add instrumentation to +monitor file creation/open. +@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_simple_no_error_handling_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin( + &state, locker, key, + create_mode == OS_FILE_CREATE + ? PSI_FILE_CREATE : PSI_FILE_OPEN, + name, src_file, src_line); + + pfs_os_file_t file = os_file_create_simple_no_error_handling_func( + name, create_mode, access_type, read_only, success); + + register_pfs_file_open_end(locker, file, + (*success == TRUE ? success : 0)); + + return(file); +} + +/** NOTE! Please use the corresponding macro os_file_create(), not directly +this function! +A performance schema wrapper function for os_file_create(). +Add instrumentation to monitor file creation/open. +@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O + is desired, OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really us + async I/O or unbuffered I/O: look in the + function source code for the exact rules +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint purpose, + ulint type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin( + &state, locker, key, + create_mode == OS_FILE_CREATE + ? 
PSI_FILE_CREATE : PSI_FILE_OPEN, + name, src_file, src_line); + + pfs_os_file_t file = os_file_create_func( + name, create_mode, purpose, type, read_only, success); + + register_pfs_file_open_end(locker, file, + (*success == TRUE ? success : 0)); + + return(file); +} +/** +NOTE! Please use the corresponding macro os_file_close(), not directly +this function! +A performance schema instrumented wrapper function for os_file_close(). +@param[in] file handle to a file +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_close_func( + pfs_os_file_t file, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register the file close */ + register_pfs_file_io_begin( + &state, locker, file, 0, PSI_FILE_CLOSE, src_file, src_line); + + bool result = os_file_close_func(file); + + register_pfs_file_io_end(locker, 0); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_read(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_read() which requests a synchronous read operation. +@param[in] type IO request context +@param[in] file Open file handle +@param[out] buf buffer where to read +@param[in] offset file offset where to read +@param[in] n number of bytes to read +@param[out] o number of bytes actually read +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return DB_SUCCESS if request was successful */ +UNIV_INLINE +dberr_t +pfs_os_file_read_func( + const IORequest& type, + pfs_os_file_t file, + void* buf, + os_offset_t offset, + ulint n, + ulint* o, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_io_begin( + &state, locker, file, n, PSI_FILE_READ, src_file, src_line); + + dberr_t result; + + result = os_file_read_func(type, file, buf, offset, n, o); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_write(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_write() which requests a synchronous write operation. +@param[in] type IO request context +@param[in] name Name of the file or path as NUL terminated + string +@param[in] file Open file handle +@param[out] buf buffer where to read +@param[in] offset file offset where to read +@param[in] n number of bytes to read +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return error code +@retval DB_SUCCESS if the request was successfully fulfilled */ +UNIV_INLINE +dberr_t +pfs_os_file_write_func( + const IORequest& type, + const char* name, + pfs_os_file_t file, + const void* buf, + os_offset_t offset, + ulint n, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_io_begin( + &state, locker, file, n, PSI_FILE_WRITE, src_file, src_line); + + dberr_t result; + + result = os_file_write_func(type, name, file, buf, offset, n); + + register_pfs_file_io_end(locker, n); + + return(result); +} + + +/** NOTE! Please use the corresponding macro os_file_flush(), not directly +this function! 
+This is the performance schema instrumented wrapper function for +os_file_flush() which flushes the write buffers of a given file to the disk. +Flushes the write buffers of a given file to the disk. +@param[in] file Open file handle +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return TRUE if success */ +UNIV_INLINE +bool +pfs_os_file_flush_func( + pfs_os_file_t file, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_io_begin( + &state, locker, file, 0, PSI_FILE_SYNC, src_file, src_line); + + bool result = os_file_flush_func(file); + + register_pfs_file_io_end(locker, 0); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_rename(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_rename() +@param[in] key Performance Schema Key +@param[in] oldpath old file path as a null-terminated string +@param[in] newpath new file path +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_rename_func( + mysql_pfs_key_t key, + const char* oldpath, + const char* newpath, + const char* src_file, + uint src_line) + +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_rename_begin( + &state, locker, key, PSI_FILE_RENAME, newpath, + src_file, src_line); + + bool result = os_file_rename_func(oldpath, newpath); + + register_pfs_file_rename_end(locker, oldpath, newpath, !result); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_delete(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_delete() +@param[in] key Performance Schema Key +@param[in] name old file path as a null-terminated string +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_delete_func( + mysql_pfs_key_t key, + const char* name, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_close_begin( + &state, locker, key, PSI_FILE_DELETE, name, src_file, src_line); + + bool result = os_file_delete_func(name); + + register_pfs_file_close_end(locker, 0); + + return(result); +} + +/** +NOTE! Please use the corresponding macro os_file_delete_if_exists(), not +directly this function! 
+This is the performance schema instrumented wrapper function for +os_file_delete_if_exists() +@param[in] key Performance Schema Key +@param[in] name old file path as a null-terminated string +@param[in] exist indicate if file pre-exist +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_delete_if_exists_func( + mysql_pfs_key_t key, + const char* name, + bool* exist, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_close_begin( + &state, locker, key, PSI_FILE_DELETE, name, src_file, src_line); + + bool result = os_file_delete_if_exists_func(name, exist); + + register_pfs_file_close_end(locker, 0); + + return(result); +} +#endif /* UNIV_PFS_IO */ diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h new file mode 100644 index 00000000..28aa3056 --- /dev/null +++ b/storage/innobase/include/page0cur.h @@ -0,0 +1,303 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/page0cur.h +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef page0cur_h +#define page0cur_h + +#include "page0page.h" + +#ifdef UNIV_DEBUG +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_t* +page_cur_get_page( +/*==============*/ + page_cur_t* cur); /*!< in: page cursor */ +/*********************************************************//** +Gets pointer to the buffer block where the cursor is positioned. +@return page */ +UNIV_INLINE +buf_block_t* +page_cur_get_block( +/*===============*/ + page_cur_t* cur); /*!< in: page cursor */ +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_zip_des_t* +page_cur_get_page_zip( +/*==================*/ + page_cur_t* cur); /*!< in: page cursor */ +/* Gets the record where the cursor is positioned. 
+@param cur page cursor +@return record */ +UNIV_INLINE +rec_t *page_cur_get_rec(const page_cur_t *cur); +#else /* UNIV_DEBUG */ +# define page_cur_get_page(cur) page_align((cur)->rec) +# define page_cur_get_block(cur) (cur)->block +# define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block) +# define page_cur_get_rec(cur) (cur)->rec +#endif /* UNIV_DEBUG */ +# define is_page_cur_get_page_zip(cur) is_buf_block_get_page_zip((cur)->block) +/*********************************************************//** +Sets the cursor object to point before the first user record +on the page. */ +UNIV_INLINE +void +page_cur_set_before_first( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Sets the cursor object to point after the last user record on +the page. */ +UNIV_INLINE +void +page_cur_set_after_last( +/*====================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Returns TRUE if the cursor is before first user record on page. +@return TRUE if at start */ +UNIV_INLINE +ibool +page_cur_is_before_first( +/*=====================*/ + const page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Returns TRUE if the cursor is after last user record. +@return TRUE if at end */ +UNIV_INLINE +ibool +page_cur_is_after_last( +/*===================*/ + const page_cur_t* cur); /*!< in: cursor */ +/**********************************************************//** +Positions the cursor on the given record. */ +UNIV_INLINE +void +page_cur_position( +/*==============*/ + const rec_t* rec, /*!< in: record on a page */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + page_cur_t* cur); /*!< out: page cursor */ + +/***********************************************************//** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to record if succeed, NULL otherwise */ +UNIV_INLINE +rec_t* +page_cur_tuple_insert( +/*==================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dtuple_t* tuple, /*!< in: pointer to a data tuple */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************//** +Inserts a record next to page cursor on an uncompressed page. 
+@return pointer to record
+@retval nullptr if not enough space was available */
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+	const page_cur_t*cur,	/*!< in: page cursor */
+	const rec_t*	rec,	/*!< in: record to insert after cur */
+	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/***********************************************************//**
+Inserts a record next to page cursor on a compressed page, keeping the
+compressed and the uncompressed copy in sync.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to inserted record
+@retval nullptr on failure */
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+	page_cur_t*	cursor,	/*!< in/out: page cursor,
+				logical position unchanged */
+	const rec_t*	rec,	/*!< in: pointer to a physical record */
+	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the
+next record after the deleted one. */
+void
+page_cur_delete_rec(
+/*================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(
+				cursor->rec, index) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/** Apply an INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=REDUNDANT page.
+@param block	B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param reuse	false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev	byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@param enc_hdr	encoded fixed-size header bits
+@param hdr_c	number of common record header bytes with prev
+@param data_c	number of common data bytes with prev
+@param data	literal header and data bytes
+@param data_len	length of the literal data, in bytes
+@return whether the operation failed (an inconsistency was noticed) */
+bool page_apply_insert_redundant(const buf_block_t &block, bool reuse,
+                                 ulint prev, ulint enc_hdr,
+                                 size_t hdr_c, size_t data_c,
+                                 const void *data, size_t data_len);
+
+/** Apply an INSERT_HEAP_DYNAMIC or INSERT_REUSE_DYNAMIC record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=COMPACT or DYNAMIC page.
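+(These page_apply_* functions are the redo-log replay counterparts of
+page_cur_insert_rec_low() and page_cur_delete_rec(): during recovery the
+log parser decodes the physical record payload and calls them to
+reconstruct the page, so they validate every offset and report failure
+instead of asserting.)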
+@param block	B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param reuse	false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev	byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param shift	if reuse: number of bytes the PAGE_FREE is moving
+@param enc_hdr_l	number of copied record header bytes, plus record type bits
+@param hdr_c	number of common record header bytes with prev
+@param data_c	number of common data bytes with prev
+@param data	literal header and data bytes
+@param data_len	length of the literal data, in bytes
+@return whether the operation failed (an inconsistency was noticed) */
+bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse,
+                               ulint prev, ulint shift, ulint enc_hdr_l,
+                               size_t hdr_c, size_t data_c,
+                               const void *data, size_t data_len);
+
+/** Apply a DELETE_ROW_FORMAT_REDUNDANT record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page.
+@param block	B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param prev	byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@return whether the operation failed (an inconsistency was noticed) */
+bool page_apply_delete_redundant(const buf_block_t &block, ulint prev);
+
+/** Apply a DELETE_ROW_FORMAT_DYNAMIC record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block	B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param prev	byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param hdr_size	record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size	data payload size, in bytes
+@return whether the operation failed (an inconsistency was noticed) */
+bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
                               size_t hdr_size, size_t data_size);
+
+MY_ATTRIBUTE((warn_unused_result))
+/****************************************************************//**
+Searches the right position for a page cursor. */
+bool
+page_cur_search_with_match(
+/*=======================*/
+	const dtuple_t*	tuple,	/*!< in: data tuple */
+	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_L,
+				PAGE_CUR_LE, PAGE_CUR_G, or
+				PAGE_CUR_GE */
+	ulint*		iup_matched_fields,
+				/*!< in/out: already matched
+				fields in upper limit record */
+	ulint*		ilow_matched_fields,
+				/*!< in/out: already matched
+				fields in lower limit record */
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	rtr_info_t*	rtr_info);/*!< in/out: rtree search stack */
+#ifdef BTR_CUR_HASH_ADAPT
+MY_ATTRIBUTE((warn_unused_result))
+/** Search the right position for a page cursor.
+@param[in]	tuple			key to be searched for
+@param[in]	mode			search mode
+@param[in,out]	iup_matched_fields	already matched fields in the
+upper limit record
+@param[in,out]	iup_matched_bytes	already matched bytes in the
+first partially matched field in the upper limit record
+@param[in,out]	ilow_matched_fields	already matched fields in the
+lower limit record
+@param[in,out]	ilow_matched_bytes	already matched bytes in the
+first partially matched field in the lower limit record
+@param[in,out]	cursor			page cursor */
+bool
+page_cur_search_with_match_bytes(
+	const dtuple_t*	tuple,
+	page_cur_mode_t	mode,
+	ulint*		iup_matched_fields,
+	ulint*		iup_matched_bytes,
+	ulint*		ilow_matched_fields,
+	ulint*		ilow_matched_bytes,
+	page_cur_t*	cursor);
+#endif /* BTR_CUR_HASH_ADAPT */
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record.
 */
+void page_cur_open_on_rnd_user_rec(page_cur_t *cursor);
+
+/** Index page cursor */
+
+struct page_cur_t{
+	dict_index_t*	index;
+	rec_t*		rec;	/*!< pointer to a record on page */
+	rec_offs*	offsets;
+	buf_block_t*	block;	/*!< pointer to the block containing rec */
+};
+
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_next(page_cur_t *cur)
+{
+  return cur->rec= page_rec_get_next(cur->rec);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_prev(page_cur_t *cur)
+{
+  return cur->rec= page_rec_get_prev(cur->rec);
+}
+
+#include "page0cur.inl"
+
+#endif
diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl
new file mode 100644
index 00000000..7c4eafa2
--- /dev/null
+++ b/storage/innobase/include/page0cur.inl
@@ -0,0 +1,203 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.ic
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+	page_cur_t*	cur)	/*!< in: page cursor */
+{
+	return page_align(page_cur_get_rec(cur));
+}
+
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+	page_cur_t*	cur)	/*!< in: page cursor */
+{
+	ut_ad(cur);
+	ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+	return cur->block;
+}
+
+/*********************************************************//**
+Gets pointer to the compressed page descriptor where the cursor
+is positioned.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+	page_cur_t*	cur)	/*!< in: page cursor */
+{
+	return(buf_block_get_page_zip(page_cur_get_block(cur)));
+}
+
+/* Gets the record where the cursor is positioned.
+@param cur page cursor
+@return record */
+UNIV_INLINE
+rec_t *page_cur_get_rec(const page_cur_t *cur)
+{
+	ut_ad(cur);
+	ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+	return cur->rec;
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page.
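+
+A typical left-to-right scan of all user records, sketched here under the
+assumption that the caller holds a latch on block for the duration:
+
+  page_cur_t cur;
+  page_cur_set_before_first(block, &cur);
+  while (rec_t *rec= page_cur_move_to_next(&cur))
+  {
+    if (page_rec_is_supremum(rec))
+      break;
+    ... process the user record rec ...
+  }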
 */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur)	/*!< in: cursor */
+{
+	cur->block = const_cast<buf_block_t*>(block);
+	cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur)	/*!< in: cursor */
+{
+	cur->block = const_cast<buf_block_t*>(block);
+	cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+	const page_cur_t*	cur)	/*!< in: cursor */
+{
+	ut_ad(cur);
+	ut_ad(page_align(cur->rec) == cur->block->page.frame);
+	return(page_rec_is_infimum(cur->rec));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+	const page_cur_t*	cur)	/*!< in: cursor */
+{
+	ut_ad(cur);
+	ut_ad(page_align(cur->rec) == cur->block->page.frame);
+	return(page_rec_is_supremum(cur->rec));
+}
+
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+	const rec_t*		rec,	/*!< in: record on a page */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	page_cur_t*		cur)	/*!< out: page cursor */
+{
+	ut_ad(rec && block && cur);
+	ut_ad(page_align(rec) == block->page.frame);
+
+	cur->rec = (rec_t*) rec;
+	cur->block = (buf_block_t*) block;
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns a pointer to the inserted
+record if it succeeds, i.e., if enough space is available; NULL otherwise.
+The cursor stays at the same logical position, but the physical position
+may change if it is pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if successful, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	const dtuple_t*	tuple,	/*!< in: pointer to a data tuple */
+	rec_offs**	offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ulint	size = rec_get_converted_size(cursor->index, tuple, n_ext);
+
+	if (!*heap) {
+		*heap = mem_heap_create(size
+					+ (4 + REC_OFFS_HEADER_SIZE
+					   + dtuple_get_n_fields(tuple))
+					* sizeof **offsets);
+	}
+
+	rec_t*	rec = rec_convert_dtuple_to_rec(
+		static_cast<byte*>(mem_heap_alloc(*heap, size)),
+		cursor->index, tuple, n_ext);
+
+	*offsets = rec_get_offsets(rec, cursor->index, *offsets,
+				   page_is_leaf(cursor->block->page.frame)
+				   ?
cursor->index->n_core_fields : 0, + ULINT_UNDEFINED, heap); + ut_ad(size == rec_offs_size(*offsets)); + + if (is_buf_block_get_page_zip(cursor->block)) { + rec = page_cur_insert_rec_zip(cursor, rec, *offsets, mtr); + } else { + rec = page_cur_insert_rec_low(cursor, rec, *offsets, mtr); + } + + ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, cursor->index, *offsets)); + return(rec); +} + diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h new file mode 100644 index 00000000..2978656b --- /dev/null +++ b/storage/innobase/include/page0page.h @@ -0,0 +1,1101 @@ +/***************************************************************************** +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0page.h +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef page0page_h +#define page0page_h + +#include "page0types.h" +#include "fsp0fsp.h" +#include "fil0fil.h" +#include "buf0buf.h" +#include "rem0rec.h" +#include "mach0data.h" +#ifndef UNIV_INNOCHECKSUM +#include "dict0dict.h" +#include "data0data.h" +#include "mtr0mtr.h" + +/* PAGE HEADER + =========== + +Index page header starts at the first offset left free by the FIL-module */ + +typedef byte page_header_t; +#endif /* !UNIV_INNOCHECKSUM */ + +#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this + offset */ +/*-----------------------------*/ +#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */ +#define PAGE_HEAP_TOP 2 /* pointer to record heap top */ +#define PAGE_N_HEAP 4 /* number of records in the heap, + bit 15=flag: new-style compact page format */ +#define PAGE_FREE 6 /* pointer to start of page free record list */ +#define PAGE_GARBAGE 8 /* number of bytes in deleted records */ +#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or + 0 if this info has been reset by a delete, + for example */ + +/** This 10-bit field is usually 0. In B-tree index pages of +ROW_FORMAT=REDUNDANT tables, this byte can contain garbage if the .ibd +file was created in MySQL 4.1.0 or if the table resides in the system +tablespace and was created before MySQL 4.1.1 or MySQL 4.0.14. +In this case, the FIL_PAGE_TYPE would be FIL_PAGE_INDEX. + +In ROW_FORMAT=COMPRESSED tables, this field is always 0, because +instant ADD COLUMN is not supported. + +In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables, this field is +always 0, except in the root page of the clustered index after instant +ADD COLUMN. 
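+
+(Decoding sketch, following the bit layout described below: the value is
+recovered by reading two bytes at PAGE_INSTANT and discarding the low
+3 bits, which belong to PAGE_DIRECTION_B, e.g.
+
+  uint16_t n_core = page_header_get_field(page, PAGE_INSTANT) >> 3;
+
+where a nonzero value is the original dict_index_t::n_core_fields; see
+also page_get_instant().)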
+ +Instant ADD COLUMN will change FIL_PAGE_TYPE to FIL_PAGE_TYPE_INSTANT +and initialize the PAGE_INSTANT field to the original number of +fields in the clustered index (dict_index_t::n_core_fields). The most +significant bits are in the first byte, and the least significant 5 +bits are stored in the most significant 5 bits of PAGE_DIRECTION_B. + +These FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be assigned even if +instant ADD COLUMN was not committed. Changes to these page header fields +are not undo-logged, but changes to the hidden metadata record are. +If the server is killed and restarted, the page header fields could +remain set even though no metadata record is present. + +When the table becomes empty, the PAGE_INSTANT field and the +FIL_PAGE_TYPE can be reset and any metadata record be removed. */ +#define PAGE_INSTANT 12 + +/** last insert direction: PAGE_LEFT, .... +In ROW_FORMAT=REDUNDANT tables created before MySQL 4.1.1 or MySQL 4.0.14, +this byte can be garbage. */ +#define PAGE_DIRECTION_B 13 +#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same + direction */ +#define PAGE_N_RECS 16 /* number of user records on the page */ +/** The largest DB_TRX_ID that may have modified a record on the page; +Defined only in secondary index leaf pages and in change buffer leaf pages. +Otherwise written as 0. @see PAGE_ROOT_AUTO_INC */ +#define PAGE_MAX_TRX_ID 18 +/** The AUTO_INCREMENT value (on persistent clustered index root pages). */ +#define PAGE_ROOT_AUTO_INC PAGE_MAX_TRX_ID +#define PAGE_HEADER_PRIV_END 26 /* end of private data structure of the page + header which are set in a page create */ +/*----*/ +#define PAGE_LEVEL 26 /* level of the node in an index tree; the + leaf level is the level 0. This field should + not be written to after page creation. */ +#define PAGE_INDEX_ID 28 /* index id where the page belongs. + This field should not be written to after + page creation. 
 */
+
+#define PAGE_BTR_SEG_LEAF 36	/* file segment header for the leaf pages in
+				a B-tree: defined only on the root page of a
+				B-tree, but not in the root of an ibuf tree */
+#define PAGE_BTR_IBUF_FREE_LIST	PAGE_BTR_SEG_LEAF
+#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF
+				/* in the place of PAGE_BTR_SEG_LEAF and _TOP
+				there is a free list base node if the page is
+				the root page of an ibuf tree, and at the same
+				place is the free list node if the page is in
+				a free list */
+#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE)
+				/* file segment header for the non-leaf pages
+				in a B-tree: defined only on the root page of
+				a B-tree, but not in the root of an ibuf
+				tree */
+/*----*/
+#define PAGE_DATA	(PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE)
+				/* start of data on the page */
+
+#define PAGE_OLD_INFIMUM	(PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES)
+				/* offset of the page infimum record on an
+				old-style page */
+#define PAGE_OLD_SUPREMUM	(PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8)
+				/* offset of the page supremum record on an
+				old-style page */
+#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9)
+				/* offset of the page supremum record end on
+				an old-style page */
+#define PAGE_NEW_INFIMUM	(PAGE_DATA + REC_N_NEW_EXTRA_BYTES)
+				/* offset of the page infimum record on a
+				new-style compact page */
+#define PAGE_NEW_SUPREMUM	(PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8)
+				/* offset of the page supremum record on a
+				new-style compact page */
+#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8)
+				/* offset of the page supremum record end on
+				a new-style compact page */
+/*-----------------------------*/
+
+/* Heap numbers */
+#define PAGE_HEAP_NO_INFIMUM	0U	/* page infimum */
+#define PAGE_HEAP_NO_SUPREMUM	1U	/* page supremum */
+#define PAGE_HEAP_NO_USER_LOW	2U	/* first user record in
+					creation (insertion) order,
+					not necessarily collation order;
+					this record may have been deleted */
+
+/* Directions of cursor movement (stored in PAGE_DIRECTION field) */
+constexpr uint16_t PAGE_LEFT= 1;
+constexpr uint16_t PAGE_RIGHT= 2;
+constexpr uint16_t PAGE_SAME_REC= 3;
+constexpr uint16_t PAGE_SAME_PAGE= 4;
+constexpr uint16_t PAGE_NO_DIRECTION= 5;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*			PAGE DIRECTORY
+			==============
+*/
+
+typedef	byte	page_dir_slot_t;
+
+/* Offset of the directory start down from the page end. We call the
+slot with the highest file address directory start, as it points to
+the first record in the list of records. */
+#define	PAGE_DIR	FIL_PAGE_DATA_END
+
+/* We define a slot in the page directory as two bytes */
+constexpr uint16_t PAGE_DIR_SLOT_SIZE= 2;
+
+/* The offset of the physically lower end of the directory, counted from
+page end, when the page is empty */
+#define PAGE_EMPTY_DIR_START	(PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE)
+
+/* The maximum and minimum number of records owned by a directory slot. The
+number may drop below the minimum in the first and the last slot in the
+directory. */
+#define PAGE_DIR_SLOT_MAX_N_OWNED	8
+#define	PAGE_DIR_SLOT_MIN_N_OWNED	4
+
+extern my_bool srv_immediate_scrub_data_uncompressed;
+#endif /* UNIV_INNOCHECKSUM */
+
+/** Get the start of a page frame.
+@param[in]	ptr	pointer within a page frame
+@return start of the page frame */
+MY_ATTRIBUTE((const))
+inline page_t* page_align(void *ptr)
+{
+  return my_assume_aligned<UNIV_ZIP_SIZE_MIN>
+    (reinterpret_cast<page_t*>(ut_align_down(ptr, srv_page_size)));
+}
+inline const page_t *page_align(const void *ptr)
+{
+  return page_align(const_cast<void*>(ptr));
+}
+
+/** Gets the byte offset within a page frame.
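+For any pointer p inside a page frame, page_align() is the complementary
+operation: page_align(p) + page_offset(p) == p.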
+@param[in]	ptr	pointer within a page frame
+@return offset from the start of the page */
+MY_ATTRIBUTE((const))
+inline uint16_t page_offset(const void* ptr)
+{
+  return static_cast<uint16_t>(ut_align_offset(ptr, srv_page_size));
+}
+
+/** Determine whether an index page is not in ROW_FORMAT=REDUNDANT.
+@param[in]	page	index page
+@return nonzero if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@retval 0 if ROW_FORMAT=REDUNDANT */
+inline
+byte
+page_is_comp(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	return(page[PAGE_HEADER + PAGE_N_HEAP] & 0x80);
+}
+
+/** Determine whether an index page is empty.
+@param[in]	page	index page
+@return whether the page is empty (PAGE_N_RECS = 0) */
+inline
+bool
+page_is_empty(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	return !*reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_N_RECS
+						   + page);
+}
+
+/** Determine whether an index page contains garbage.
+@param[in]	page	index page
+@return whether the page contains garbage (PAGE_GARBAGE is not 0) */
+inline
+bool
+page_has_garbage(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	return *reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_GARBAGE
+						  + page);
+}
+
+/** Determine whether a B-tree or R-tree index page is a leaf page.
+@param[in]	page	index page
+@return true if the page is a leaf (PAGE_LEVEL = 0) */
+inline
+bool
+page_is_leaf(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	return !*reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_LEVEL
+						   + page);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/** Determine whether an index page record is not in ROW_FORMAT=REDUNDANT.
+@param[in]	rec	record in an index page frame (not a copy)
+@return nonzero if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@retval 0 if ROW_FORMAT=REDUNDANT */
+inline
+byte
+page_rec_is_comp(const byte* rec)
+{
+	return(page_is_comp(page_align(rec)));
+}
+
+# ifdef UNIV_DEBUG
+/** Determine if the record is the metadata pseudo-record
+in the clustered index.
+@param[in]	rec	leaf page record on an index page
+@return whether the record is the metadata pseudo-record */
+inline bool page_rec_is_metadata(const rec_t* rec)
+{
+	return rec_get_info_bits(rec, page_rec_is_comp(rec))
+		& REC_INFO_MIN_REC_FLAG;
+}
+# endif /* UNIV_DEBUG */
+
+/** Determine the offset of the infimum record on the page.
+@param[in]	page	index page
+@return offset of the infimum record in record list, relative from page */
+inline
+unsigned
+page_get_infimum_offset(const page_t* page)
+{
+	ut_ad(!page_offset(page));
+	return page_is_comp(page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM;
+}
+
+/** Determine the offset of the supremum record on the page.
+@param[in]	page	index page
+@return offset of the supremum record in record list, relative from page */
+inline
+unsigned
+page_get_supremum_offset(const page_t* page)
+{
+	ut_ad(!page_offset(page));
+	return page_is_comp(page) ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM;
+}
+
+/** Determine whether an index page record is a user record.
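+(Concretely, PAGE_DATA = 38 + 36 + 2 * 10 = 94, so the pseudo-record
+offsets rejected below are fixed: PAGE_NEW_INFIMUM = 99,
+PAGE_NEW_SUPREMUM = 112, PAGE_OLD_INFIMUM = 101 and
+PAGE_OLD_SUPREMUM = 116, independent of the page size.)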
+@param[in]	offset	record offset in the page
+@retval true	if a user record
+@retval false	if the infimum or supremum pseudo-record */
+inline
+bool
+page_rec_is_user_rec_low(ulint offset)
+{
+	compile_time_assert(PAGE_OLD_INFIMUM >= PAGE_NEW_INFIMUM);
+	compile_time_assert(PAGE_OLD_SUPREMUM >= PAGE_NEW_SUPREMUM);
+	compile_time_assert(PAGE_NEW_INFIMUM < PAGE_OLD_SUPREMUM);
+	compile_time_assert(PAGE_OLD_INFIMUM < PAGE_NEW_SUPREMUM);
+	compile_time_assert(PAGE_NEW_SUPREMUM < PAGE_OLD_SUPREMUM_END);
+	compile_time_assert(PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM_END);
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+
+	return(offset != PAGE_NEW_SUPREMUM
+	       && offset != PAGE_NEW_INFIMUM
+	       && offset != PAGE_OLD_INFIMUM
+	       && offset != PAGE_OLD_SUPREMUM);
+}
+
+/** Determine if a record is the supremum record on an index page.
+@param[in]	offset	record offset in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum_low(ulint offset)
+{
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+	return(offset == PAGE_NEW_SUPREMUM || offset == PAGE_OLD_SUPREMUM);
+}
+
+/** Determine if a record is the infimum record on an index page.
+@param[in]	offset	record offset in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum_low(ulint offset)
+{
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+	return(offset == PAGE_NEW_INFIMUM || offset == PAGE_OLD_INFIMUM);
+}
+
+/** Determine whether a B-tree or R-tree index record is in a leaf page.
+@param[in]	rec	index record in an index page
+@return true if the record is in a leaf page */
+inline
+bool
+page_rec_is_leaf(const page_t* rec)
+{
+	const page_t* page = page_align(rec);
+	ut_ad(ulint(rec - page) >= page_get_infimum_offset(page));
+	bool leaf = page_is_leaf(page);
+	ut_ad(!page_rec_is_comp(rec)
+	      || !page_rec_is_user_rec_low(ulint(rec - page))
+	      || leaf == !rec_get_node_ptr_flag(rec));
+	return leaf;
+}
+
+/** Determine whether an index page record is a user record.
+@param[in]	rec	record in an index page
+@return true if a user record */
+inline
+bool
+page_rec_is_user_rec(const rec_t* rec);
+
+/** Determine whether an index page record is the supremum record.
+@param[in]	rec	record in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum(const rec_t* rec);
+
+/** Determine whether an index page record is the infimum record.
+@param[in]	rec	record in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum(const rec_t* rec);
+
+/** Read PAGE_MAX_TRX_ID.
+@param[in]	page	index page
+@return the value of PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline trx_id_t page_get_max_trx_id(const page_t *page)
+{
+  ut_ad(fil_page_index_page_check(page));
+  static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
+  const auto *p= my_assume_aligned<8>(page + PAGE_HEADER + PAGE_MAX_TRX_ID);
+  return mach_read_from_8(p);
+}
+
+/**
+Set the number of owned records.
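+(Context: each sparse page directory slot owns between
+PAGE_DIR_SLOT_MIN_N_OWNED = 4 and PAGE_DIR_SLOT_MAX_N_OWNED = 8 records;
+the count is kept in the header of the slot's owning record and is
+adjusted here whenever records are inserted or deleted, or slots are
+rebalanced.)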
+@tparam compressed whether to update any ROW_FORMAT=COMPRESSED page as well
+@param[in,out]	block	index page
+@param[in,out]	rec	record in block.frame
+@param[in]	n_owned	number of records skipped in the sparse page directory
+@param[in]	comp	whether ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@param[in,out]	mtr	mini-transaction */
+template<bool compressed= false>
+inline void page_rec_set_n_owned(buf_block_t *block, rec_t *rec, ulint n_owned,
+                                 bool comp, mtr_t *mtr)
+{
+  ut_ad(block->page.frame == page_align(rec));
+  ut_ad(comp == (page_is_comp(block->page.frame) != 0));
+
+  if (page_zip_des_t *page_zip= compressed
+      ? buf_block_get_page_zip(block) : nullptr)
+  {
+    ut_ad(comp);
+    rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+                        REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+    if (rec_get_status(rec) != REC_STATUS_SUPREMUM)
+      page_zip_rec_set_owned(block, rec, n_owned, mtr);
+  }
+  else
+  {
+    rec-= comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED;
+    mtr->write<1,mtr_t::MAYBE_NOP>(*block, rec, (*rec & ~REC_N_OWNED_MASK) |
+                                   (n_owned << REC_N_OWNED_SHIFT));
+  }
+}
+
+/*************************************************************//**
+Sets the max trx id field value. */
+void
+page_set_max_trx_id(
+/*================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction, or NULL */
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+
+/** Persist the AUTO_INCREMENT value on a clustered index root page.
+@param[in,out]	block	clustered index root page
+@param[in]	autoinc	next available AUTO_INCREMENT value
+@param[in,out]	mtr	mini-transaction
+@param[in]	reset	whether to reset the AUTO_INCREMENT
+			to a possibly smaller value than currently
+			exists in the page */
+void
+page_set_autoinc(
+	buf_block_t*	block,
+	ib_uint64_t	autoinc,
+	mtr_t*		mtr,
+	bool		reset)
+	MY_ATTRIBUTE((nonnull));
+
+/*************************************************************//**
+Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM).
+@return SPLIT SEQUENCE NUMBER */
+UNIV_INLINE
+node_seq_t
+page_get_ssn_id(
+/*============*/
+	const page_t*	page);	/*!< in: page */
+/*************************************************************//**
+Sets the RTREE SPLIT SEQUENCE NUMBER field value */
+UNIV_INLINE
+void
+page_set_ssn_id(
+/*============*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	node_seq_t	ssn_id,	/*!< in: split sequence id */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+
+#endif /* !UNIV_INNOCHECKSUM */
+/** Read a page header field. */
+inline uint16_t page_header_get_field(const page_t *page, ulint field)
+{
+  ut_ad(field <= PAGE_INDEX_ID);
+  ut_ad(!(field & 1));
+  return mach_read_from_2(my_assume_aligned<2>(PAGE_HEADER + field + page));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Returns the offset stored in the given header field.
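+For example, page_header_get_offs(page, PAGE_FREE) returns the byte offset
+of the head of the free (deleted) record list, or 0 when that list is
+empty; the page_header_get_ptr() macro below wraps this as a pointer.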
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+uint16_t
+page_header_get_offs(
+/*=================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		field)	/*!< in: PAGE_FREE, ... */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*************************************************************//**
+Returns the pointer stored in the given header field, or NULL. */
+#define page_header_get_ptr(page, field)		\
+	(page_header_get_offs(page, field)		\
+	 ? page + page_header_get_offs(page, field) : NULL)
+
+/**
+Reset PAGE_LAST_INSERT.
+@param[in,out]  block    file page
+@param[in,out]  mtr      mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page))
+#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page))
+
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record
+@retval nullptr on corrupted page */
+const rec_t*
+page_rec_get_nth_const(
+/*===================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		nth)	/*!< in: nth record */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record
+@retval nullptr on corrupted page */
+inline rec_t *page_rec_get_nth(page_t* page, ulint nth)
+{
+  return const_cast<rec_t*>(page_rec_get_nth_const(page, nth));
+}
+
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+	page_t*	page)	/*!< in: page */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+uint32_t
+page_get_page_no(
+/*=============*/
+	const page_t*	page);	/*!< in: page */
+
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+uint32_t
+page_get_space_id(
+/*==============*/
+	const page_t*	page);	/*!< in: page */
+
+/*************************************************************//**
+Gets the number of user records on page (the infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_get_n_recs(
+/*============*/
+	const page_t*	page);	/*!< in: index page */
+
+/** Return the number of preceding records in an index page.
+@param rec index record
+@return number of preceding records, including the infimum pseudo-record
+@retval ULINT_UNDEFINED on corrupted page */
+ulint page_rec_get_n_recs_before(const rec_t *rec);
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_heap(
+/*================*/
+	const page_t*	page);	/*!< in: index page */
+/*************************************************************//**
+Gets the number of dir slots in directory.
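+(A freshly created page has exactly two slots, one owning the infimum and
+one the supremum pseudo-record; see PAGE_EMPTY_DIR_START above.)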
+@return number of slots */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_slots(
+/*=================*/
+	const page_t*	page);	/*!< in: index page */
+/** Gets the pointer to a directory slot.
+@param n  sparse directory slot number
+@return pointer to the sparse directory slot */
+inline page_dir_slot_t *page_dir_get_nth_slot(page_t *page, ulint n)
+{
+  ut_ad(page_dir_get_n_slots(page) > n);
+  static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+  return my_assume_aligned<2>(page + srv_page_size - (PAGE_DIR + 2) - n * 2);
+}
+inline const page_dir_slot_t *page_dir_get_nth_slot(const page_t *page,ulint n)
+{
+  return page_dir_get_nth_slot(const_cast<page_t*>(page), n);
+}
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE on success */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+	const rec_t*	rec);	/*!< in: record */
+/** Get the record pointed to by a directory slot.
+@param[in] slot   directory slot
+@return pointer to record */
+inline rec_t *page_dir_slot_get_rec(page_dir_slot_t *slot)
+{
+  return page_align(slot) + mach_read_from_2(my_assume_aligned<2>(slot));
+}
+inline const rec_t *page_dir_slot_get_rec(const page_dir_slot_t *slot)
+{
+  return page_dir_slot_get_rec(const_cast<page_dir_slot_t*>(slot));
+}
+
+inline rec_t *page_dir_slot_get_rec_validate(page_dir_slot_t *slot)
+{
+  const size_t s= mach_read_from_2(my_assume_aligned<2>(slot));
+  page_t *page= page_align(slot);
+
+  return UNIV_LIKELY(s >= PAGE_NEW_INFIMUM &&
+                     s <= page_header_get_field(page, PAGE_HEAP_TOP))
+    ? page + s
+    : nullptr;
+}
+inline const rec_t *page_dir_slot_get_rec_validate(const page_dir_slot_t *slot)
+{
+  return page_dir_slot_get_rec_validate(const_cast<page_dir_slot_t*>(slot));
+}
+
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+	const page_dir_slot_t*	slot);	/*!< in: page directory slot */
+/************************************************************//**
+Calculates the space reserved for directory slots of a given
+number of records. The exact value is a fraction number
+n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED, and it is
+rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+	ulint	n_recs);	/*!< in: number of records */
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number
+@retval ULINT_UNDEFINED on corruption */
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+	const rec_t*	rec);	/*!< in: the physical record */
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+	const rec_t*	rec);	/*!< in: the physical record */
+/** Determine whether a page has any siblings.
+@param[in]	page	page frame
+@return true if the page has any siblings */
+inline bool page_has_siblings(const page_t* page)
+{
+	compile_time_assert(!(FIL_PAGE_PREV % 8));
+	compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+	compile_time_assert(FIL_NULL == 0xffffffff);
+	return *reinterpret_cast<const uint64_t*>(page + FIL_PAGE_PREV)
+		!= ~uint64_t(0);
+}
+
+/** Determine whether a page has a predecessor.
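+(page_has_siblings() above folds the two 4-byte checks into a single
+aligned 64-bit load: a page without neighbours stores FIL_NULL =
+0xffffffff in both FIL_PAGE_PREV and FIL_PAGE_NEXT, i.e. all 64 bits set.)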
+@param[in]	page	page frame
+@return true if the page has a predecessor */
+inline bool page_has_prev(const page_t* page)
+{
+	return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_PREV)
+		!= FIL_NULL;
+}
+
+/** Determine whether a page has a successor.
+@param[in]	page	page frame
+@return true if the page has a successor */
+inline bool page_has_next(const page_t* page)
+{
+	return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_NEXT)
+		!= FIL_NULL;
+}
+
+/** Read the AUTO_INCREMENT value from a clustered index root page.
+@param[in]	page	clustered index root page
+@return	the persisted AUTO_INCREMENT value */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline uint64_t page_get_autoinc(const page_t *page)
+{
+  ut_d(uint16_t page_type= fil_page_get_type(page));
+  ut_ad(page_type == FIL_PAGE_INDEX || page_type == FIL_PAGE_TYPE_INSTANT);
+  ut_ad(!page_has_siblings(page));
+  const auto *p= my_assume_aligned<8>(page + PAGE_HEADER + PAGE_ROOT_AUTO_INC);
+  return mach_read_from_8(p);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	ulint		comp);	/*!< in: nonzero=compact page layout */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+	rec_t*	rec);	/*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+	const rec_t*	rec);	/*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record
+@retval nullptr on error */
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+	const rec_t*	rec);	/*!< in: pointer to record, must not be page
+				infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@param rec record (not page infimum)
+@return pointer to previous record
+@retval nullptr on error */
+inline rec_t *page_rec_get_prev(rec_t *rec)
+{
+  return const_cast<rec_t*>(page_rec_get_prev_const(rec));
+}
+
+/************************************************************//**
+true if the record is the first user record on a page.
+@return true if the first user record */
+UNIV_INLINE
+bool
+page_rec_is_first(
+/*==============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+true if the record is the last user record on a page.
+@return true if the last user record */
+UNIV_INLINE
+bool
+page_rec_is_last(
+/*=============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap.
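+(Roughly: this counts only the space above PAGE_HEAP_TOP, while the
+_after_reorganize variant below also counts space that a reorganization
+would reclaim from deleted records on the PAGE_FREE list.)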
+@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size( +/*=====================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs);/*!< in: number of records */ +/************************************************************//** +Returns the maximum combined size of records which can be inserted on top +of record heap if page is first reorganized. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size_after_reorganize( +/*======================================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs);/*!< in: number of records */ +/*************************************************************//** +Calculates free space if a page is emptied. +@return free space */ +UNIV_INLINE +ulint +page_get_free_space_of_empty( +/*=========================*/ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((const)); +/************************************************************//** +Returns the sum of the sizes of the records in the record list +excluding the infimum and supremum records. +@return data in bytes */ +UNIV_INLINE +uint16_t +page_get_data_size( +/*===============*/ + const page_t* page); /*!< in: index page */ +/** Read the PAGE_DIRECTION field from a byte. +@param[in] ptr pointer to PAGE_DIRECTION_B +@return the value of the PAGE_DIRECTION field */ +inline +byte +page_ptr_get_direction(const byte* ptr); + +/** Read the PAGE_DIRECTION field. +@param[in] page index page +@return the value of the PAGE_DIRECTION field */ +inline +byte +page_get_direction(const page_t* page) +{ + return page_ptr_get_direction(PAGE_HEADER + PAGE_DIRECTION_B + page); +} + +/** Read the PAGE_INSTANT field. +@param[in] page index page +@return the value of the PAGE_INSTANT field */ +inline +uint16_t +page_get_instant(const page_t* page); + +/** Create an uncompressed index page. +@param[in,out] block buffer block +@param[in,out] mtr mini-transaction +@param[in] comp set unless ROW_FORMAT=REDUNDANT */ +void page_create(buf_block_t *block, mtr_t *mtr, bool comp); +/**********************************************************//** +Create a compressed B-tree index page. */ +void +page_create_zip( +/*============*/ + buf_block_t* block, /*!< in/out: a buffer frame + where the page is created */ + dict_index_t* index, /*!< in: the index of the + page */ + ulint level, /*!< in: the B-tree level of + the page */ + trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ + mtr_t* mtr); /*!< in/out: mini-transaction + handle */ +/**********************************************************//** +Empty a previously created B-tree index page. */ +void +page_create_empty( +/*==============*/ + buf_block_t* block, /*!< in/out: B-tree block */ + dict_index_t* index, /*!< in: the index of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull(1,2))); + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/*************************************************************//** +Differs from page_copy_rec_list_end, because this function does not +touch the lock table and max trx id on page or compress the page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_t::commit(). 
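+
+(Caller-side sketch of the IBUF_BITMAP_FREE rule above, assuming new_block
+is a leaf page of a ROW_FORMAT=COMPRESSED secondary index and mtr is the
+same mini-transaction:
+
+  if (page_copy_rec_list_end_no_locks(new_block, block, rec, index, &mtr)
+      == DB_SUCCESS)
+    ibuf_reset_free_bits(new_block);  ... before mtr.commit() ...
+)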
+ +@return error code */ +dberr_t +page_copy_rec_list_end_no_locks( +/*============================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr */ +/*************************************************************//** +Copies records from page to new_page, from the given record onward, +including that record. Infimum and supremum records are not copied. +The records are copied to the start of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_t::commit(). + +@return pointer to the original successor of the infimum record on new_block +@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */ +rec_t* +page_copy_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + dberr_t* err) /*!< out: error code */ + MY_ATTRIBUTE((nonnull(1,2,3,4,5), warn_unused_result)); +/*************************************************************//** +Copies records from page to new_page, up to the given record, NOT +including that record. Infimum and supremum records are not copied. +The records are copied to the end of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original predecessor of the supremum record on new_block +@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */ +rec_t* +page_copy_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + dberr_t* err) /*!< out: error code */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Deletes records from a page from a given record onward, including that record. +The infimum and supremum records are not deleted. */ +dberr_t +page_delete_rec_list_end( +/*=====================*/ + rec_t* rec, /*!< in: pointer to record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + ulint n_recs, /*!< in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /*!< in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. 
*/ +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /*!< in: record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull)); +/** Create an index page. +@param[in,out] block buffer block +@param[in] comp nonzero=compact page format */ +void page_create_low(const buf_block_t* block, bool comp); + +/************************************************************//** +Prints record contents including the data relevant only in +the index page context. */ +void +page_rec_print( +/*===========*/ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets);/*!< in: record descriptor */ +# ifdef UNIV_BTR_PRINT +/***************************************************************//** +This is used to print the contents of the directory for +debugging purposes. */ +void +page_dir_print( +/*===========*/ + page_t* page, /*!< in: index page */ + ulint pr_n); /*!< in: print n first and n last entries */ +/***************************************************************//** +This is used to print the contents of the page record list for +debugging purposes. */ +void +page_print_list( +/*============*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint pr_n); /*!< in: print n first and n last entries */ +/***************************************************************//** +Prints the info in a page header. */ +void +page_header_print( +/*==============*/ + const page_t* page); /*!< in: index page */ +/***************************************************************//** +This is used to print the contents of the page for +debugging purposes. */ +void +page_print( +/*=======*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint dn, /*!< in: print dn first and last entries + in directory */ + ulint rn); /*!< in: print rn first and last records + in directory */ +# endif /* UNIV_BTR_PRINT */ +/***************************************************************//** +The following is used to validate a record on a page. This function +differs from rec_validate as it can also check the n_owned field and +the heap_no field. +@return TRUE if ok */ +ibool +page_rec_validate( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */ +#ifdef UNIV_DEBUG +/***************************************************************//** +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. */ +void +page_check_dir( +/*===========*/ + const page_t* page); /*!< in: index page */ +#endif /* UNIV_DEBUG */ +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +ibool +page_simple_validate_old( +/*=====================*/ + const page_t* page); /*!< in: index page in ROW_FORMAT=REDUNDANT */ +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. 
This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +ibool +page_simple_validate_new( +/*=====================*/ + const page_t* page); /*!< in: index page in ROW_FORMAT!=REDUNDANT */ +/** Check the consistency of an index page. +@param[in] page index page +@param[in] index B-tree or R-tree index +@return whether the page is valid */ +bool page_validate(const page_t* page, const dict_index_t* index) + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Looks in the page record list for a record with the given heap number. +@return record, NULL if not found */ +const rec_t* +page_find_rec_with_heap_no( +/*=======================*/ + const page_t* page, /*!< in: index page */ + ulint heap_no);/*!< in: heap number */ +/** Get the last non-delete-marked record on a page. +@param[in] page index tree leaf page +@return the last record, not delete-marked +@retval infimum record if all records are delete-marked */ +const rec_t *page_find_rec_last_not_deleted(const page_t *page); + +#endif /* !UNIV_INNOCHECKSUM */ + +#include "page0page.inl" + +#endif diff --git a/storage/innobase/include/page0page.inl b/storage/innobase/include/page0page.inl new file mode 100644 index 00000000..6c0167ed --- /dev/null +++ b/storage/innobase/include/page0page.inl @@ -0,0 +1,550 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0page.ic +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_INNOCHECKSUM +#include "rem0cmp.h" +#include "mtr0log.h" +#include "page0zip.h" + +/*************************************************************//** +Sets the max trx id field value if trx_id is bigger than the previous +value. */ +UNIV_INLINE +void +page_update_max_trx_id( +/*===================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(block); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(trx_id); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + + if (page_get_max_trx_id(buf_block_get_frame(block)) < trx_id) { + + page_set_max_trx_id(block, page_zip, trx_id, mtr); + } +} + +/*************************************************************//** +Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM). 
+@return SPLIT SEQUENCE NUMBER */
+UNIV_INLINE
+node_seq_t
+page_get_ssn_id(
+/*============*/
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page);
+
+	return(static_cast<node_seq_t>(
+		mach_read_from_8(page + FIL_RTREE_SPLIT_SEQ_NUM)));
+}
+
+/*************************************************************//**
+Sets the RTREE SPLIT SEQUENCE NUMBER field value */
+UNIV_INLINE
+void
+page_set_ssn_id(
+/*============*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	node_seq_t	ssn_id,	/*!< in: split sequence id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX |
+					 MTR_MEMO_PAGE_X_FIX));
+	ut_ad(!page_zip || page_zip == &block->page.zip);
+	constexpr uint16_t field= FIL_RTREE_SPLIT_SEQ_NUM;
+	byte *b= my_assume_aligned<2>(&block->page.frame[field]);
+	if (mtr->write<8,mtr_t::MAYBE_NOP>(*block, b, ssn_id) &&
+	    UNIV_LIKELY_NULL(page_zip))
+		memcpy_aligned<2>(&page_zip->data[field], b, 8);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+uint16_t
+page_header_get_offs(
+/*=================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		field)	/*!< in: PAGE_FREE, ... */
+{
+	ut_ad((field == PAGE_FREE)
+	      || (field == PAGE_LAST_INSERT)
+	      || (field == PAGE_HEAP_TOP));
+
+	uint16_t offs = page_header_get_field(page, field);
+
+	ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+	return(offs);
+}
+
+
+/**
+Reset PAGE_LAST_INSERT.
+@param[in,out]  block    file page
+@param[in,out]  mtr      mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
+{
+  constexpr uint16_t field= PAGE_HEADER + PAGE_LAST_INSERT;
+  byte *b= my_assume_aligned<2>(&block->page.frame[field]);
+  if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, 0U) &&
+      UNIV_LIKELY_NULL(block->page.zip.data))
+    memset_aligned<2>(&block->page.zip.data[field], 0, 2);
+}
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+	const rec_t*	rec)	/*!< in: the physical record */
+{
+	if (page_rec_is_comp(rec)) {
+		return(rec_get_heap_no_new(rec));
+	} else {
+		return(rec_get_heap_no_old(rec));
+	}
+}
+
+/** Determine whether an index page record is a user record.
+@param[in]	rec	record in an index page
+@return true if a user record */
+inline
+bool
+page_rec_is_user_rec(const rec_t* rec)
+{
+	ut_ad(page_rec_check(rec));
+	return(page_rec_is_user_rec_low(page_offset(rec)));
+}
+
+/** Determine whether an index page record is the supremum record.
+@param[in]	rec	record in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum(const rec_t* rec)
+{
+	ut_ad(page_rec_check(rec));
+	return(page_rec_is_supremum_low(page_offset(rec)));
+}
+
+/** Determine whether an index page record is the infimum record.
+@param[in]	rec	record in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum(const rec_t* rec)
+{
+	ut_ad(page_rec_check(rec));
+	return(page_rec_is_infimum_low(page_offset(rec)));
+}
+
+/************************************************************//**
+true if the record is the first user record on a page.
+
+/************************************************************//**
+true if the record is the first user record on a page.
+@return true if the first user record */
+UNIV_INLINE
+bool
+page_rec_is_first(
+/*==============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page_get_n_recs(page) > 0);
+
+	return(page_rec_get_next_const(page_get_infimum_rec(page)) == rec);
+}
+
+/************************************************************//**
+true if the record is the last user record on a page.
+@return true if the last user record */
+UNIV_INLINE
+bool
+page_rec_is_last(
+/*=============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page_get_n_recs(page) > 0);
+
+	return(page_rec_get_next_const(rec) == page_get_supremum_rec(page));
+}
+
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+	page_t*	page)	/*!< in: page */
+{
+	ulint	middle = (ulint(page_get_n_recs(page))
+			  + PAGE_HEAP_NO_USER_LOW) / 2;
+
+	return(page_rec_get_nth(page, middle));
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+uint32_t
+page_get_page_no(
+/*=============*/
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page == page_align((page_t*) page));
+	return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_OFFSET));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+uint32_t
+page_get_space_id(
+/*==============*/
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page == page_align((page_t*) page));
+	return mach_read_from_4(my_assume_aligned<2>
+				(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Gets the number of user records on page (infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_get_n_recs(
+/*============*/
+	const page_t*	page)	/*!< in: index page */
+{
+	return(page_header_get_field(page, PAGE_N_RECS));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return number of slots */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_slots(
+/*=================*/
+	const page_t*	page)	/*!< in: index page */
+{
+	return(page_header_get_field(page, PAGE_N_DIR_SLOTS));
+}
+
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of records in the record heap, including the infimum
+and supremum records and any deleted (free-list) records */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_heap(
+/*================*/
+	const page_t*	page)	/*!< in: index page */
+{
+	return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff);
+}
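A small consistency sketch (editorial; it holds only under the stated assumption): the record heap counts the infimum and supremum (PAGE_HEAP_NO_USER_LOW == 2) as well as any free-list records, so a page whose free list is empty satisfies:

    /* assuming PAGE_FREE == 0, i.e. no deleted records in the heap */
    ut_ad(ulint(page_dir_get_n_heap(page))
          == ulint(page_get_n_recs(page)) + PAGE_HEAP_NO_USER_LOW);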
+
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if the check succeeds */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+	const rec_t*	rec)	/*!< in: record */
+{
+	const page_t*	page = page_align(rec);
+
+	ut_a(rec);
+
+	ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP));
+	ut_a(page_offset(rec) >= PAGE_DATA);
+
+	return(TRUE);
+}
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+	const page_dir_slot_t*	slot)	/*!< in: page directory slot */
+{
+	const rec_t*	rec = page_dir_slot_get_rec(slot);
+	if (page_rec_is_comp(slot)) {
+		return(rec_get_n_owned_new(rec));
+	} else {
+		return(rec_get_n_owned_old(rec));
+	}
+}
+
+/************************************************************//**
+Calculates the space reserved for directory slots of a given number of
+records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED, and it is rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+	ulint	n_recs)	/*!< in: number of records */
+{
+	return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1)
+	       / PAGE_DIR_SLOT_MIN_N_OWNED);
+}
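A worked example of the rounding (editorial; PAGE_DIR_SLOT_SIZE == 2 and PAGE_DIR_SLOT_MIN_N_OWNED == 4 in current sources):

    /* n_recs = 100: (2 * 100 + 3) / 4 = 50 bytes reserved
       n_recs = 101: (2 * 101 + 3) / 4 = 51 bytes; the
       '+ PAGE_DIR_SLOT_MIN_N_OWNED - 1' term implements the round-up. */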
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	ulint		comp)	/*!< in: nonzero=compact page layout */
+{
+  const page_t *page= page_align(rec);
+  ut_ad(page_rec_check(rec));
+  ulint offs= rec_get_next_offs(rec, comp);
+  if (!offs)
+    return nullptr;
+  if (UNIV_UNLIKELY(offs < (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)))
+    return nullptr;
+  if (UNIV_UNLIKELY(offs > page_header_get_field(page, PAGE_HEAP_TOP)))
+    return nullptr;
+  ut_ad(page_rec_is_infimum(rec) ||
+        (!page_is_leaf(page) && !page_has_prev(page)) ||
+        !(rec_get_info_bits(page + offs, comp) & REC_INFO_MIN_REC_FLAG));
+  return page + offs;
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+	rec_t*	rec)	/*!< in: pointer to record */
+{
+	return((rec_t*) page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+	const rec_t*	rec)	/*!< in: pointer to record */
+{
+	return(page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list, excluding
+the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+uint16_t
+page_get_data_size(
+/*===============*/
+	const page_t*	page)	/*!< in: index page */
+{
+	unsigned ret = page_header_get_field(page, PAGE_HEAP_TOP)
+		- (page_is_comp(page)
+		   ? PAGE_NEW_SUPREMUM_END
+		   : PAGE_OLD_SUPREMUM_END)
+		- page_header_get_field(page, PAGE_GARBAGE);
+	ut_ad(ret < srv_page_size);
+	return static_cast<uint16_t>(ret);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+	ulint	comp)	/*!< in: nonzero=compact page layout */
+{
+	if (comp) {
+		return((ulint)(srv_page_size
+			       - PAGE_NEW_SUPREMUM_END
+			       - PAGE_DIR
+			       - 2 * PAGE_DIR_SLOT_SIZE));
+	}
+
+	return((ulint)(srv_page_size
+		       - PAGE_OLD_SUPREMUM_END
+		       - PAGE_DIR
+		       - 2 * PAGE_DIR_SLOT_SIZE));
+}
+
+/************************************************************//**
+Each user record on a page, and also each deleted user record in the heap,
+takes its size plus the fraction of the dir cell size /
+PAGE_DIR_SLOT_MIN_N_OWNED bytes for it. If the sum of these exceeds the
+value of page_get_free_space_of_empty, the insert is impossible, otherwise
+it is allowed. This function returns the maximum combined size of records
+which can be inserted on top of the record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs)	/*!< in: number of records */
+{
+	ulint	occupied;
+	ulint	free_space;
+
+	if (page_is_comp(page)) {
+		occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+			- PAGE_NEW_SUPREMUM_END
+			+ page_dir_calc_reserved_space(
+				n_recs + page_dir_get_n_heap(page) - 2);
+
+		free_space = page_get_free_space_of_empty(TRUE);
+	} else {
+		occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+			- PAGE_OLD_SUPREMUM_END
+			+ page_dir_calc_reserved_space(
+				n_recs + page_dir_get_n_heap(page) - 2);
+
+		free_space = page_get_free_space_of_empty(FALSE);
+	}
+
+	/* Above, the 'n_recs +' part reserves directory space for the
+	newly inserted records; the '- 2' excludes the page infimum and
+	supremum records */
+
+	if (occupied > free_space) {
+
+		return(0);
+	}
+
+	return(free_space - occupied);
+}
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of the record heap if a page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs)	/*!< in: number of records */
+{
+	ulint	occupied;
+	ulint	free_space;
+
+	occupied = page_get_data_size(page)
+		+ page_dir_calc_reserved_space(n_recs + page_get_n_recs(page));
+
+	free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+	if (occupied > free_space) {
+
+		return(0);
+	}
+
+	return(free_space - occupied);
+}
+
+/** Read the PAGE_DIRECTION field from a byte.
+@param[in]	ptr	pointer to PAGE_DIRECTION_B
+@return the value of the PAGE_DIRECTION field */
+inline
+byte
+page_ptr_get_direction(const byte* ptr)
+{
+	ut_ad(page_offset(ptr) == PAGE_HEADER + PAGE_DIRECTION_B);
+	return *ptr & ((1U << 3) - 1);
+}
+
+/** Read the PAGE_INSTANT field.
+@param[in]	page	index page
+@return the value of the PAGE_INSTANT field */
+inline
+uint16_t
+page_get_instant(const page_t* page)
+{
+	uint16_t i = page_header_get_field(page, PAGE_INSTANT);
+#ifdef UNIV_DEBUG
+	switch (fil_page_get_type(page)) {
+	case FIL_PAGE_TYPE_INSTANT:
+		ut_ad(page_get_direction(page) <= PAGE_NO_DIRECTION);
+		ut_ad(i >> 3);
+		break;
+	case FIL_PAGE_INDEX:
+		ut_ad(i <= PAGE_NO_DIRECTION || !page_is_comp(page));
+		break;
+	case FIL_PAGE_RTREE:
+		ut_ad(i <= PAGE_NO_DIRECTION);
+		break;
+	default:
+		ut_ad("invalid page type" == 0);
+		break;
+	}
+#endif /* UNIV_DEBUG */
+	return static_cast<uint16_t>(i >> 3); /* i / 8 */
+}
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h
new file mode 100644
index 00000000..83fc45cd
--- /dev/null
+++ b/storage/innobase/include/page0types.h
@@ -0,0 +1,188 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0types.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0types_h
+#define page0types_h
+
+#include "dict0types.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+#include "ut0new.h"
+
+#include <map>
+
+/** Eliminates a name collision on HP-UX */
+#define page_t	ib_page_t
+/** Type of the index page */
+typedef byte	page_t;
+#ifndef UNIV_INNOCHECKSUM
+/** Index page cursor */
+struct page_cur_t;
+/** Buffer pool block */
+struct buf_block_t;
+
+/** Compressed index page */
+typedef byte	page_zip_t;
+
+/* The following definitions would better belong to page0zip.h,
+but we cannot include page0zip.h from rem0rec.ic, because
+page0*.h includes rem0rec.h and may include rem0rec.ic. */
+
+/** Number of bits needed for representing different compressed page sizes */
+#define PAGE_ZIP_SSIZE_BITS 3
+
+/** Maximum compressed page shift size */
+#define PAGE_ZIP_SSIZE_MAX	\
+	(UNIV_ZIP_SIZE_SHIFT_MAX - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
+
+/* Make sure there are enough bits available to store the maximum zip
+ssize, which is the number of shifts from 512. */
+#if PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)
+# error "PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)"
+#endif
+
+/* Page cursor search modes; the values must be in this order! */
+enum page_cur_mode_t {
+	PAGE_CUR_UNSUPP	= 0,
+	PAGE_CUR_G	= 1,
+	PAGE_CUR_GE	= 2,
+	PAGE_CUR_L	= 3,
+	PAGE_CUR_LE	= 4,
+
+/*	PAGE_CUR_LE_OR_EXTENDS = 5,*/ /* This is a search mode used in
+				"column LIKE 'abc%' ORDER BY column DESC";
+				we have to find strings which are <= 'abc' or
+				which extend it */
+
+/* These search modes are for searching R-tree indexes. */
+	PAGE_CUR_CONTAIN		= 7,
+	PAGE_CUR_INTERSECT		= 8,
+	PAGE_CUR_WITHIN			= 9,
+	PAGE_CUR_DISJOINT		= 10,
+	PAGE_CUR_MBR_EQUAL		= 11,
+	PAGE_CUR_RTREE_INSERT		= 12,
+	PAGE_CUR_RTREE_LOCATE		= 13,
+	PAGE_CUR_RTREE_GET_FATHER	= 14
+};
+
+class buf_pool_t;
+class buf_page_t;
+
+/** Compressed page descriptor */
+struct page_zip_des_t
+{
+	page_zip_t*	data;		/*!< compressed page data */
+
+	uint32_t	m_end:16;	/*!< end offset of modification log */
+	uint32_t	m_nonempty:1;	/*!< TRUE if the modification log
+					is not empty */
+	uint32_t	n_blobs:12;	/*!< number of externally stored
+					columns on the page; the maximum
+					is 744 on a 16 KiB page */
+	uint32_t	ssize:PAGE_ZIP_SSIZE_BITS;
+					/*!< 0 or compressed page shift size;
+					the size in bytes is
+					(UNIV_ZIP_SIZE_MIN >> 1) << ssize. */
+#ifdef UNIV_DEBUG
+	uint16_t	m_start;	/*!< start offset of modification log */
+	bool		m_external;	/*!< Allocated externally, not from the
+					buffer pool */
+#endif /* UNIV_DEBUG */
+
+	void clear() {
+		/* Clear everything except the member "fix". */
+		memset((void*) this, 0,
+		       reinterpret_cast<char*>(&fix)
+		       - reinterpret_cast<char*>(this));
+	}
+
+	page_zip_des_t() = default;
+	page_zip_des_t(const page_zip_des_t&) = default;
+
+	/* Initialize everything except the member "fix". */
+	page_zip_des_t(const page_zip_des_t& old, bool) {
+		memcpy((void*) this, (void*) &old,
+		       reinterpret_cast<char*>(&fix)
+		       - reinterpret_cast<char*>(this));
+	}
+
+private:
+	friend buf_pool_t;
+	friend buf_page_t;
+	/** fix count and state used in buf_page_t */
+	Atomic_relaxed<uint32_t> fix;
+};
+
+/** Compression statistics for a given page size */
+struct page_zip_stat_t {
+	/** Number of page compressions */
+	ulint		compressed;
+	/** Number of successful page compressions */
+	ulint		compressed_ok;
+	/** Number of page decompressions */
+	ulint		decompressed;
+	/** Duration of page compressions in microseconds */
+	ib_uint64_t	compressed_usec;
+	/** Duration of page decompressions in microseconds */
+	ib_uint64_t	decompressed_usec;
+	page_zip_stat_t() :
+		/* Initialize members to 0 so that when we do
+		stlmap[key].compressed++ and element with "key" does not
+		exist it gets inserted with zeroed members. */
+		compressed(0),
+		compressed_ok(0),
+		decompressed(0),
+		compressed_usec(0),
+		decompressed_usec(0)
+	{ }
+};
+
+/** Compression statistics types */
+typedef std::map<
+	index_id_t,
+	page_zip_stat_t,
+	std::less<index_id_t>,
+	ut_allocator<std::pair<const index_id_t, page_zip_stat_t> > >
+	page_zip_stat_per_index_t;
+
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+extern page_zip_stat_t			page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by dict_index_t::id */
+extern page_zip_stat_per_index_t	page_zip_stat_per_index;
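A worked example of the ssize encoding in page_zip_des_t above (editorial; UNIV_ZIP_SIZE_MIN is 1024 in current sources):

    /* size in bytes = (UNIV_ZIP_SIZE_MIN >> 1) << ssize
       ssize == 0 -> not a compressed page
       ssize == 1 -> (1024 >> 1) << 1 =  1024 bytes (1 KiB)
       ssize == 2 ->                     2048 bytes (2 KiB)
       ssize == 5 ->                    16384 bytes (16 KiB) */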
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+void
+page_zip_rec_set_owned(
+/*===================*/
+	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
+	const byte*	rec,	/*!< in: record on the uncompressed page */
+	ulint		flag,	/*!< in: the owned flag (nonzero=TRUE) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+#endif /* !UNIV_INNOCHECKSUM */
+#endif
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
new file mode 100644
index 00000000..43329906
--- /dev/null
+++ b/storage/innobase/include/page0zip.h
@@ -0,0 +1,383 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.h
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifndef page0zip_h
+#define page0zip_h
+
+#include "buf0types.h"
+
+#ifndef UNIV_INNOCHECKSUM
+#include "mtr0types.h"
+#include "page0types.h"
+#include "dict0types.h"
+#include "srv0srv.h"
+#include "trx0types.h"
+#include "mem0mem.h"
+
+/* Compression level to be used by zlib. Settable by user. */
+extern uint	page_zip_level;
+
+/* Default compression level. */
+#define DEFAULT_COMPRESSION_LEVEL	6
+/** Start offset of the area that will be compressed */
+#define PAGE_ZIP_START		PAGE_NEW_SUPREMUM_END
+/** Size of a compressed page directory entry */
+#define PAGE_ZIP_DIR_SLOT_SIZE	2
+/** Predefine the sum of DIR_SLOT, TRX_ID & ROLL_PTR */
+#define PAGE_ZIP_CLUST_LEAF_SLOT_SIZE	\
+	(PAGE_ZIP_DIR_SLOT_SIZE		\
+	 + DATA_TRX_ID_LEN		\
+	 + DATA_ROLL_PTR_LEN)
+/** Mask of record offsets */
+#define PAGE_ZIP_DIR_SLOT_MASK	0x3fffU
+/** 'owned' flag */
+#define PAGE_ZIP_DIR_SLOT_OWNED	0x4000U
+/** 'deleted' flag */
+#define PAGE_ZIP_DIR_SLOT_DEL	0x8000U
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	ulint		size);		/*!< in: size in bytes */
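An editorial sketch of how one 2-byte entry of the dense page directory decodes under the masks defined above ('slot' is an illustrative pointer into the page trailer):

    const uint16_t entry   = mach_read_from_2(slot);
    const uint16_t offs    = entry & PAGE_ZIP_DIR_SLOT_MASK;  /* record offset */
    const bool     owned   = (entry & PAGE_ZIP_DIR_SLOT_OWNED) != 0;
    const bool     deleted = (entry & PAGE_ZIP_DIR_SLOT_DEL) != 0;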
+
+/** Determine if a record is so big that it needs to be stored externally.
+@param[in]	rec_size	length of the record in bytes
+@param[in]	comp		nonzero=compact format
+@param[in]	n_fields	number of fields in the record; ignored if
+tablespace is not compressed
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return false if the entire record can be stored locally on the page */
+inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields,
+				    ulint zip_size)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+ulint
+page_zip_empty_size(
+/*================*/
+	ulint	n_fields,	/*!< in: number of columns in the index */
+	ulint	zip_size)	/*!< in: compressed page size in bytes */
+	MY_ATTRIBUTE((const));
+
+/** Check whether a tuple is too big for compressed table
+@param[in]	index	dict index object
+@param[in]	entry	entry for the index
+@return true if it's too big, otherwise false */
+bool
+page_zip_is_too_big(
+	const dict_index_t*	index,
+	const dtuple_t*		entry);
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+#define page_zip_des_init(page_zip) (page_zip)->clear()
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+void
+page_zip_set_alloc(
+/*===============*/
+	void*		stream,		/*!< in/out: zlib stream */
+	mem_heap_t*	heap);		/*!< in: memory heap to use */
+
+/** Attempt to compress a ROW_FORMAT=COMPRESSED page.
+@retval true on success
+@retval false on failure; block->page.zip will be left intact. */
+bool
+page_zip_compress(
+	buf_block_t*	block,	/*!< in/out: buffer block */
+	dict_index_t*	index,	/*!< in: index of the B-tree node */
+	ulint		level,	/*!< in: compression level */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write the index information for the compressed page.
+@return used size of buf */
+ulint
+page_zip_fields_encode(
+/*===================*/
+	ulint			n,	/*!< in: number of fields
+					to compress */
+	const dict_index_t*	index,	/*!< in: index comprising
+					at least n fields */
+	ulint			trx_id_pos,
+					/*!< in: position of the trx_id column
+					in the index, or ULINT_UNDEFINED if
+					this is a non-leaf page */
+	byte*			buf);	/*!< out: buffer of (n + 1) * 2 bytes */
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+ibool
+page_zip_decompress(
+/*================*/
+	page_zip_des_t*	page_zip,/*!< in: data, ssize;
+				out: m_start, m_end, m_nonempty, n_blobs */
+	page_t*		page,	/*!< out: uncompressed page, may be trashed */
+	ibool		all)	/*!< in: TRUE=decompress the whole page;
+				FALSE=verify but do not copy some
+				page header fields that should not change
+				after page creation */
+	MY_ATTRIBUTE((nonnull(1,2)));
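An editorial sketch of the failure contract of page_zip_compress() above (cf. the B-tree insert path; variable names illustrative): because block->page.zip survives a failed compression, the caller can fall back without undoing anything:

    if (!page_zip_compress(block, index, page_zip_level, mtr)) {
        /* out of space in compressed format: try page_zip_reorganize()
        or split the page */
    }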
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+	const page_zip_des_t*	page_zip);	/*!< in: compressed page
+						descriptor */
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+ibool
+page_zip_validate_low(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page,	/*!< in: uncompressed page */
+	const dict_index_t*	index,	/*!< in: index of the page, if known */
+	ibool			sloppy)	/*!< in: FALSE=strict,
+					TRUE=ignore the MIN_REC_FLAG */
+	MY_ATTRIBUTE((nonnull(1,2)));
+/**********************************************************************//**
+Check that the compressed and decompressed pages match. */
+ibool
+page_zip_validate(
+/*==============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page,	/*!< in: uncompressed page */
+	const dict_index_t*	index)	/*!< in: index of the page, if known */
+	MY_ATTRIBUTE((nonnull(1,2)));
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust)/*!< in: TRUE if clustered index */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if page_zip_write_rec() will succeed */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust,/*!< in: TRUE if clustered index */
+	ulint			length,	/*!< in: combined size of the record */
+	ulint			create)	/*!< in: nonzero=add the record to
+					the heap */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Write an entire record to the ROW_FORMAT=COMPRESSED page.
+The data must already have been written to the uncompressed page.
+@param[in,out]	block	ROW_FORMAT=COMPRESSED page
+@param[in]	rec	record in the uncompressed page
+@param[in]	index	the index that the page belongs to
+@param[in]	offsets	rec_get_offsets(rec, index)
+@param[in]	create	nonzero=insert, zero=update
+@param[in,out]	mtr	mini-transaction */
+void page_zip_write_rec(buf_block_t *block, const byte *rec,
+			const dict_index_t *index, const rec_offs *offsets,
+			ulint create, mtr_t *mtr)
+	MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+void
+page_zip_write_blob_ptr(
+/*====================*/
+	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
+	const byte*	rec,	/*!< in/out: record whose data is being
+				written */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint		n,	/*!< in: column index */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+void
+page_zip_write_node_ptr(
+/*====================*/
+	buf_block_t*	block,	/*!< in/out: compressed page */
+	byte*		rec,	/*!< in/out: record */
+	ulint		size,	/*!< in: data size of rec */
+	ulint		ptr,	/*!< in: node pointer */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
+@param[in,out]	block		ROW_FORMAT=COMPRESSED page
+@param[in,out]	rec		record
+@param[in]	offsets		rec_get_offsets(rec, index)
+@param[in]	trx_id_col	field number of DB_TRX_ID (number of PK fields)
+@param[in]	trx_id		DB_TRX_ID value (transaction identifier)
+@param[in]	roll_ptr	DB_ROLL_PTR value (undo log pointer)
+@param[in,out]	mtr		mini-transaction */
+void
+page_zip_write_trx_id_and_roll_ptr(
+	buf_block_t*	block,
+	byte*		rec,
+	const rec_offs*	offsets,
+	ulint		trx_id_col,
+	trx_id_t	trx_id,
+	roll_ptr_t	roll_ptr,
+	mtr_t*		mtr)
+	MY_ATTRIBUTE((nonnull));
+
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out]	block	buffer block
+@param[in,out]	rec	record on a physical index page
+@param[in]	flag	the value of the delete-mark flag
+@param[in,out]	mtr	mini-transaction */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+			      mtr_t *mtr)
+	MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+void
+page_zip_dir_insert(
+/*================*/
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	uint16_t	free_rec,/*!< in: record from which rec was
+				allocated, or 0 */
+	byte*		rec,	/*!< in: record to insert */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull(1,3,4)));
+
+/** Shift the dense page directory and the array of BLOB pointers
+when a record is deleted.
+@param[in,out]	block	index page
+@param[in,out]	rec	record being deleted
+@param[in]	index	the index that the page belongs to
+@param[in]	offsets	rec_get_offsets(rec, index)
+@param[in]	free	previous start of the free list
+@param[in,out]	mtr	mini-transaction */
+void page_zip_dir_delete(buf_block_t *block, byte *rec,
+			 const dict_index_t *index, const rec_offs *offsets,
+			 const byte *free, mtr_t *mtr)
+	MY_ATTRIBUTE((nonnull(1,2,3,4,6)));
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, redo log will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@return error code +@retval DB_FAIL on overflow; the block_zip will be left intact */ +dberr_t +page_zip_reorganize( + buf_block_t* block, /*!< in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /*!< in: index of the B-tree node */ + ulint z_level,/*!< in: compression level */ + mtr_t* mtr, /*!< in: mini-transaction */ + bool restore = false)/*!< whether to restore on failure */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/**********************************************************************//** +Copy the records of a page byte for byte. Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. */ +void +page_zip_copy_recs( + buf_block_t* block, /*!< in/out: buffer block */ + const page_zip_des_t* src_zip, /*!< in: compressed page */ + const page_t* src, /*!< in: page */ + dict_index_t* index, /*!< in: index of the B-tree */ + mtr_t* mtr); /*!< in: mini-transaction */ +#endif /* !UNIV_INNOCHECKSUM */ + +/** Calculate the compressed page checksum. +@param data compressed page +@param size size of compressed page +@param use_adler whether to use Adler32 instead of a XOR of 3 CRC-32C +@return page checksum */ +uint32_t page_zip_calc_checksum(const void *data, size_t size, bool use_adler); + +/** Validate the checksum on a ROW_FORMAT=COMPRESSED page. +@param data ROW_FORMAT=COMPRESSED page +@param size size of the page, in bytes +@return whether the stored checksum matches innodb_checksum_algorithm */ +bool page_zip_verify_checksum(const byte *data, size_t size); + +#ifndef UNIV_INNOCHECKSUM +/**********************************************************************//** +Reset the counters used for filling +INFORMATION_SCHEMA.innodb_cmp_per_index. */ +UNIV_INLINE +void +page_zip_reset_stat_per_index(); +/*===========================*/ + +#include "page0zip.inl" +#endif /* !UNIV_INNOCHECKSUM */ + +#endif /* page0zip_h */ diff --git a/storage/innobase/include/page0zip.inl b/storage/innobase/include/page0zip.inl new file mode 100644 index 00000000..afc877c3 --- /dev/null +++ b/storage/innobase/include/page0zip.inl @@ -0,0 +1,317 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.inl
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#include "page0page.h"
+
+/* The format of compressed pages is as follows.
+
+The header and trailer of the uncompressed pages, excluding the page
+directory in the trailer, are copied as is to the header and trailer
+of the compressed page.
+
+At the end of the compressed page, there is a dense page directory
+pointing to every user record contained on the page, including deleted
+records on the free list. The dense directory is indexed in the
+collation order, i.e., in the order in which the record list is
+linked on the uncompressed page. The infimum and supremum records are
+excluded. The two most significant bits of the entries are allocated
+for the delete-mark and an n_owned flag indicating the last record in
+a chain of records pointed to from the sparse page directory on the
+uncompressed page.
+
+The data between PAGE_ZIP_START and the last page directory entry will
+be written in compressed format, starting at offset PAGE_DATA.
+Infimum and supremum records are not stored. We exclude the
+REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered
+from the dense page directory stored at the end of the compressed
+page.
+
+The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and
+roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of
+externally stored columns are stored separately, in ascending order of
+heap_no and column index, starting backwards from the dense page
+directory.
+
+The compressed data stream may be followed by a modification log
+covering the compressed portion of the page, as follows.
+
+MODIFICATION LOG ENTRY FORMAT
+- write record:
+  - (heap_no - 1) << 1 (1..2 bytes)
+  - extra bytes backwards
+  - data bytes
+- clear record:
+  - (heap_no - 1) << 1 | 1 (1..2 bytes)
+
+The integer values are stored in a variable-length format:
+- 0xxxxxxx: 0..127
+- 1xxxxxxx xxxxxxxx: 0..32767
+
+The end of the modification log is marked by a 0 byte.
+
+In summary, the compressed page looks like this:
+
+(1) Uncompressed page header (PAGE_DATA bytes)
+(2) Compressed index information
+(3) Compressed page data
+(4) Page modification log (page_zip->m_start..page_zip->m_end)
+(5) Empty zero-filled space
+(6) BLOB pointers (on leaf pages)
+  - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
+  - in descending collation order
+(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
+  - indexed by heap_no
+  - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
+  - REC_NODE_PTR_SIZE for non-leaf pages
+  - 0 otherwise
+(8) dense page directory, stored backwards
+  - n_dense = n_heap - 2
+  - existing records in ascending collation order
+  - deleted records (free list) in link order
+*/
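An editorial sketch of the variable-length integer format described above (an illustration of the format only, not the exact encoder in page0zip.cc):

    static inline byte* zip_varint_write(byte* p, uint16_t v)
    {
        ut_ad(v <= 32767);
        if (v < 128) {
            *p++ = static_cast<byte>(v);               /* 0xxxxxxx */
        } else {
            *p++ = static_cast<byte>(0x80 | (v >> 8)); /* 1xxxxxxx */
            *p++ = static_cast<byte>(v & 0xff);        /* xxxxxxxx */
        }
        return p;
    }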
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
+{
+	ulint	size;
+
+	if (!page_zip->ssize) {
+		return(0);
+	}
+
+	size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize;
+
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+	ut_ad(size <= srv_page_size);
+
+	return(size);
+}
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	ulint		size)		/*!< in: size in bytes */
+{
+	if (size) {
+		unsigned	ssize;
+
+		ut_ad(ut_is_2pow(size));
+
+		for (ssize = 1; size > (512U << ssize); ssize++) {
+		}
+
+		page_zip->ssize = ssize & ((1U << PAGE_ZIP_SSIZE_BITS) - 1);
+	} else {
+		page_zip->ssize = 0;
+	}
+
+	ut_ad(page_zip_get_size(page_zip) == size);
+}
+
+/** Determine if a record is so big that it needs to be stored externally.
+@param[in]	rec_size	length of the record in bytes
+@param[in]	comp		nonzero=compact format
+@param[in]	n_fields	number of fields in the record; ignored if
+tablespace is not compressed
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return false if the entire record can be stored locally on the page */
+inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields,
+				    ulint zip_size)
+{
+	/* FIXME: the row size check in this function seems to be the most
+	correct. Put it in a separate function and use it in more places of
+	InnoDB */
+
+	ut_ad(rec_size
+	      > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES));
+	ut_ad(comp || !zip_size);
+
+#if UNIV_PAGE_SIZE_MAX > COMPRESSED_REC_MAX_DATA_SIZE
+	if (comp ? rec_size >= COMPRESSED_REC_MAX_DATA_SIZE :
+	    rec_size >= REDUNDANT_REC_MAX_DATA_SIZE) {
+		return true;
+	}
+#endif
+
+	if (zip_size) {
+		ut_ad(comp);
+		/* On a compressed page, there is a two-byte entry in
+		the dense page directory for every record. But there
+		is no record header. There should be enough room for
+		one record on an empty leaf page. Subtract 1 byte for
+		the encoded heap number. Check also the available space
+		on the uncompressed page. */
+		return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1)
+		       >= page_zip_empty_size(n_fields, zip_size)
+		       || rec_size >= page_get_free_space_of_empty(TRUE) / 2);
+	}
+
+	return(rec_size >= page_get_free_space_of_empty(comp) / 2);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+	const page_zip_des_t*	page_zip)/*!< in: compressed page descriptor */
+{
+	ut_ad(page_zip);
+	ut_ad(page_zip->data);
+	ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX);
+	ut_ad(page_zip_get_size(page_zip)
+	      > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
+	ut_ad(page_zip->m_start <= page_zip->m_end);
+	ut_ad(page_zip->m_end < page_zip_get_size(page_zip));
+	ut_ad(page_zip->n_blobs
+	      < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE);
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
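A worked example for page_zip_get_trailer_len() below (editorial): on a clustered index leaf page, each heap record beyond the infimum and supremum costs PAGE_ZIP_DIR_SLOT_SIZE (2) + DATA_TRX_ID_LEN (6) + DATA_ROLL_PTR_LEN (7) = 15 bytes, so with n_heap == 10 and n_blobs == 3 the trailer occupies (10 - 2) * 15 + 3 * BTR_EXTERN_FIELD_REF_SIZE (20) = 120 + 60 = 180 bytes.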
+
+/**********************************************************************//**
+Determine the length of the page trailer.
+@return length of the page trailer, in bytes, not including the
+terminating zero byte of the modification log */
+UNIV_INLINE
+ulint
+page_zip_get_trailer_len(
+/*=====================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust)/*!< in: TRUE if clustered index */
+{
+	ulint	uncompressed_size;
+
+	ut_ad(page_zip_simple_validate(page_zip));
+	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+	if (!page_is_leaf(page_zip->data)) {
+		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+			+ REC_NODE_PTR_SIZE;
+		ut_ad(!page_zip->n_blobs);
+	} else if (is_clust) {
+		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+			+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+	} else {
+		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
+		ut_ad(!page_zip->n_blobs);
+	}
+
+	return (ulint(page_dir_get_n_heap(page_zip->data)) - 2)
+		* uncompressed_size
+		+ ulint(page_zip->n_blobs) * BTR_EXTERN_FIELD_REF_SIZE;
+}
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust)/*!< in: TRUE if clustered index */
+{
+	ulint	trailer_len;
+
+	trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
+
+	/* When a record is created, a pointer may be added to
+	the dense directory.
+	Likewise, space for the columns that will not be
+	compressed will be allocated from the page trailer.
+	Also the BLOB pointers will be allocated from there, but
+	we may as well count them in the length of the record. */
+
+	trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
+
+	return(lint(page_zip_get_size(page_zip)
+		    - trailer_len - page_zip->m_end
+		    - (REC_N_NEW_EXTRA_BYTES - 2)));
+}
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if enough space is available */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust,/*!< in: TRUE if clustered index */
+	ulint			length,	/*!< in: combined size of the record */
+	ulint			create)	/*!< in: nonzero=add the record to
+					the heap */
+{
+	ulint	trailer_len;
+
+	ut_ad(length > REC_N_NEW_EXTRA_BYTES);
+
+	trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
+
+	/* Subtract the fixed extra bytes and add the maximum
+	space needed for identifying the record (encoded heap_no). */
+	length -= REC_N_NEW_EXTRA_BYTES - 2;
+
+	if (create > 0) {
+		/* When a record is created, a pointer may be added to
+		the dense directory.
+		Likewise, space for the columns that will not be
+		compressed will be allocated from the page trailer.
+		Also the BLOB pointers will be allocated from there, but
+		we may as well count them in the length of the record. */
+
+		trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
+	}
+
+	return(length + trailer_len + page_zip->m_end
+	       < page_zip_get_size(page_zip));
+}
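An editorial sketch of how an insert path might consult the predicate above before writing (cf. page_cur_insert_rec_zip() in page0cur.cc; variable names illustrative):

    if (page_zip_available(page_zip, dict_index_is_clust(index),
                           rec_size, 1 /* create */)) {
        /* page_zip_write_rec() is guaranteed to fit in the
        modification log; no recompression is needed */
    } else {
        /* recompress via page_zip_reorganize(), or split the page */
    }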
+
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index()
+/*===========================*/
+{
+	mysql_mutex_lock(&page_zip_stat_per_index_mutex);
+	page_zip_stat_per_index.clear();
+	mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
+}
diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h
new file mode 100644
index 00000000..e7112d99
--- /dev/null
+++ b/storage/innobase/include/pars0grm.h
@@ -0,0 +1,151 @@
+/* A Bison parser, made by GNU Bison 3.7.6. */
+
+/* Bison interface for Yacc-like parsers in C
+
+   Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation,
+   Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual,
+   especially those whose name start with YY_ or yy_.  They are
+   private implementation details that can be changed or removed.  */
+
+#ifndef YY_YY_PARS0GRM_TAB_H_INCLUDED
+# define YY_YY_PARS0GRM_TAB_H_INCLUDED
+/* Debug traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+#if YYDEBUG
+extern int yydebug;
+#endif
+
+/* Token kinds.
*/ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + enum yytokentype + { + YYEMPTY = -2, + YYEOF = 0, /* "end of file" */ + YYerror = 256, /* error */ + YYUNDEF = 257, /* "invalid token" */ + PARS_INT_LIT = 258, /* PARS_INT_LIT */ + PARS_FLOAT_LIT = 259, /* PARS_FLOAT_LIT */ + PARS_STR_LIT = 260, /* PARS_STR_LIT */ + PARS_NULL_LIT = 261, /* PARS_NULL_LIT */ + PARS_ID_TOKEN = 262, /* PARS_ID_TOKEN */ + PARS_AND_TOKEN = 263, /* PARS_AND_TOKEN */ + PARS_OR_TOKEN = 264, /* PARS_OR_TOKEN */ + PARS_NOT_TOKEN = 265, /* PARS_NOT_TOKEN */ + PARS_GE_TOKEN = 266, /* PARS_GE_TOKEN */ + PARS_LE_TOKEN = 267, /* PARS_LE_TOKEN */ + PARS_NE_TOKEN = 268, /* PARS_NE_TOKEN */ + PARS_PROCEDURE_TOKEN = 269, /* PARS_PROCEDURE_TOKEN */ + PARS_IN_TOKEN = 270, /* PARS_IN_TOKEN */ + PARS_INT_TOKEN = 271, /* PARS_INT_TOKEN */ + PARS_CHAR_TOKEN = 272, /* PARS_CHAR_TOKEN */ + PARS_IS_TOKEN = 273, /* PARS_IS_TOKEN */ + PARS_BEGIN_TOKEN = 274, /* PARS_BEGIN_TOKEN */ + PARS_END_TOKEN = 275, /* PARS_END_TOKEN */ + PARS_IF_TOKEN = 276, /* PARS_IF_TOKEN */ + PARS_THEN_TOKEN = 277, /* PARS_THEN_TOKEN */ + PARS_ELSE_TOKEN = 278, /* PARS_ELSE_TOKEN */ + PARS_ELSIF_TOKEN = 279, /* PARS_ELSIF_TOKEN */ + PARS_LOOP_TOKEN = 280, /* PARS_LOOP_TOKEN */ + PARS_WHILE_TOKEN = 281, /* PARS_WHILE_TOKEN */ + PARS_RETURN_TOKEN = 282, /* PARS_RETURN_TOKEN */ + PARS_SELECT_TOKEN = 283, /* PARS_SELECT_TOKEN */ + PARS_COUNT_TOKEN = 284, /* PARS_COUNT_TOKEN */ + PARS_FROM_TOKEN = 285, /* PARS_FROM_TOKEN */ + PARS_WHERE_TOKEN = 286, /* PARS_WHERE_TOKEN */ + PARS_FOR_TOKEN = 287, /* PARS_FOR_TOKEN */ + PARS_DDOT_TOKEN = 288, /* PARS_DDOT_TOKEN */ + PARS_ORDER_TOKEN = 289, /* PARS_ORDER_TOKEN */ + PARS_BY_TOKEN = 290, /* PARS_BY_TOKEN */ + PARS_ASC_TOKEN = 291, /* PARS_ASC_TOKEN */ + PARS_DESC_TOKEN = 292, /* PARS_DESC_TOKEN */ + PARS_INSERT_TOKEN = 293, /* PARS_INSERT_TOKEN */ + PARS_INTO_TOKEN = 294, /* PARS_INTO_TOKEN */ + PARS_VALUES_TOKEN = 295, /* PARS_VALUES_TOKEN */ + PARS_UPDATE_TOKEN = 296, /* PARS_UPDATE_TOKEN */ + PARS_SET_TOKEN = 297, /* PARS_SET_TOKEN */ + PARS_DELETE_TOKEN = 298, /* PARS_DELETE_TOKEN */ + PARS_CURRENT_TOKEN = 299, /* PARS_CURRENT_TOKEN */ + PARS_OF_TOKEN = 300, /* PARS_OF_TOKEN */ + PARS_CREATE_TOKEN = 301, /* PARS_CREATE_TOKEN */ + PARS_TABLE_TOKEN = 302, /* PARS_TABLE_TOKEN */ + PARS_INDEX_TOKEN = 303, /* PARS_INDEX_TOKEN */ + PARS_UNIQUE_TOKEN = 304, /* PARS_UNIQUE_TOKEN */ + PARS_CLUSTERED_TOKEN = 305, /* PARS_CLUSTERED_TOKEN */ + PARS_ON_TOKEN = 306, /* PARS_ON_TOKEN */ + PARS_ASSIGN_TOKEN = 307, /* PARS_ASSIGN_TOKEN */ + PARS_DECLARE_TOKEN = 308, /* PARS_DECLARE_TOKEN */ + PARS_CURSOR_TOKEN = 309, /* PARS_CURSOR_TOKEN */ + PARS_SQL_TOKEN = 310, /* PARS_SQL_TOKEN */ + PARS_OPEN_TOKEN = 311, /* PARS_OPEN_TOKEN */ + PARS_FETCH_TOKEN = 312, /* PARS_FETCH_TOKEN */ + PARS_CLOSE_TOKEN = 313, /* PARS_CLOSE_TOKEN */ + PARS_NOTFOUND_TOKEN = 314, /* PARS_NOTFOUND_TOKEN */ + PARS_TO_BINARY_TOKEN = 315, /* PARS_TO_BINARY_TOKEN */ + PARS_SUBSTR_TOKEN = 316, /* PARS_SUBSTR_TOKEN */ + PARS_CONCAT_TOKEN = 317, /* PARS_CONCAT_TOKEN */ + PARS_INSTR_TOKEN = 318, /* PARS_INSTR_TOKEN */ + PARS_LENGTH_TOKEN = 319, /* PARS_LENGTH_TOKEN */ + PARS_COMMIT_TOKEN = 320, /* PARS_COMMIT_TOKEN */ + PARS_ROLLBACK_TOKEN = 321, /* PARS_ROLLBACK_TOKEN */ + PARS_WORK_TOKEN = 322, /* PARS_WORK_TOKEN */ + PARS_EXIT_TOKEN = 323, /* PARS_EXIT_TOKEN */ + PARS_FUNCTION_TOKEN = 324, /* PARS_FUNCTION_TOKEN */ + PARS_LOCK_TOKEN = 325, /* PARS_LOCK_TOKEN */ + PARS_SHARE_TOKEN = 326, /* PARS_SHARE_TOKEN */ + PARS_MODE_TOKEN = 327, /* PARS_MODE_TOKEN */ + 
PARS_LIKE_TOKEN = 328,             /* PARS_LIKE_TOKEN */
+    PARS_LIKE_TOKEN_EXACT = 329,   /* PARS_LIKE_TOKEN_EXACT */
+    PARS_LIKE_TOKEN_PREFIX = 330,  /* PARS_LIKE_TOKEN_PREFIX */
+    PARS_LIKE_TOKEN_SUFFIX = 331,  /* PARS_LIKE_TOKEN_SUFFIX */
+    PARS_LIKE_TOKEN_SUBSTR = 332,  /* PARS_LIKE_TOKEN_SUBSTR */
+    PARS_TABLE_NAME_TOKEN = 333,   /* PARS_TABLE_NAME_TOKEN */
+    PARS_BIGINT_TOKEN = 334,       /* PARS_BIGINT_TOKEN */
+    NEG = 335                      /* NEG */
+  };
+  typedef enum yytokentype yytoken_kind_t;
+#endif
+
+/* Value type.  */
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef int YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+extern YYSTYPE yylval;
+
+int yyparse (void);
+
+#endif /* !YY_YY_PARS0GRM_TAB_H_INCLUDED */
diff --git a/storage/innobase/include/pars0opt.h b/storage/innobase/include/pars0opt.h
new file mode 100644
index 00000000..07a726ea
--- /dev/null
+++ b/storage/innobase/include/pars0opt.h
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.h
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0opt_h
+#define pars0opt_h
+
+#include "que0types.h"
+#include "pars0sym.h"
+#include "row0sel.h"
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes to use for the tables. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement. */
+void
+opt_search_plan(
+/*============*/
+	sel_node_t*	sel_node);	/*!< in: parsed select node */
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, puts a value
+indirection to point to the occurrence in the column list, except if the
+column occurrence we are looking at is in the column list, in which case
+nothing is done. */
+void
+opt_find_all_cols(
+/*==============*/
+	ibool		copy_val,	/*!< in: if TRUE, new found columns are
+					added as columns to copy */
+	dict_index_t*	index,		/*!< in: index to use */
+	sym_node_list_t* col_list,	/*!< in: base node of a list where
+					to add new found columns */
+	plan_t*		plan,		/*!< in: plan or NULL */
+	que_node_t*	exp);		/*!< in: expression or condition */
+#ifdef UNIV_SQL_DEBUG
+/********************************************************************//**
+Prints info of a query plan. */
+void
+opt_print_query_plan(
+/*=================*/
+	sel_node_t*	sel_node);	/*!< in: select node */
+#endif /* UNIV_SQL_DEBUG */
+
+#endif
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
new file mode 100644
index 00000000..16823ce1
--- /dev/null
+++ b/storage/innobase/include/pars0pars.h
@@ -0,0 +1,695 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.h
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0pars_h
+#define pars0pars_h
+
+#include "que0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "row0mysql.h"
+
+/** Type of the user functions. The first argument is always InnoDB-supplied
+and varies in type, while 'user_arg' is a user-supplied argument. The
+meaning of the return type also varies. See the individual use cases, e.g.
+the FETCH statement, for details on them. */
+typedef ibool	(*pars_user_func_cb_t)(void* arg, void* user_arg);
+
+/** If the following is set TRUE, the parser will emit debugging
+information */
+extern int	yydebug;
+
+/* Global variable used while parsing a single procedure or query: the code is
+NOT re-entrant */
+extern sym_tab_t*	pars_sym_tab_global;
+
+extern pars_res_word_t	pars_to_binary_token;
+extern pars_res_word_t	pars_substr_token;
+extern pars_res_word_t	pars_concat_token;
+extern pars_res_word_t	pars_length_token;
+extern pars_res_word_t	pars_instr_token;
+extern pars_res_word_t	pars_count_token;
+extern pars_res_word_t	pars_int_token;
+extern pars_res_word_t	pars_bigint_token;
+extern pars_res_word_t	pars_char_token;
+extern pars_res_word_t	pars_update_token;
+extern pars_res_word_t	pars_asc_token;
+extern pars_res_word_t	pars_desc_token;
+extern pars_res_word_t	pars_open_token;
+extern pars_res_word_t	pars_close_token;
+extern pars_res_word_t	pars_share_token;
+extern pars_res_word_t	pars_unique_token;
+extern pars_res_word_t	pars_clustered_token;
+
+extern ulint	pars_star_denoter;
+
+/* Procedure parameter types */
+#define PARS_INPUT	0
+#define PARS_OUTPUT	1
+#define PARS_NOT_PARAM	2
+
+int
+yyparse(void);
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return own: the query graph */
+que_t*
+pars_sql(
+/*=====*/
+	pars_info_t*	info,	/*!< in: extra information, or NULL */
+	const char*	str);	/*!< in: SQL string */
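An editorial sketch of how InnoDB internals typically drive this parser (cf. dict0crea.cc and fts0sql.cc); the table name SYS_FOO is purely illustrative, and the que_eval_sql() helper from que0que.h is assumed to parse, execute and free the graph:

    pars_info_t* info = pars_info_create();
    pars_info_add_ull_literal(info, "id", 42);      /* binds :id */
    dberr_t err = que_eval_sql(info,
        "PROCEDURE P () IS\n"
        "BEGIN\n"
        "DELETE FROM SYS_FOO WHERE ID = :id;\n"
        "END;\n", trx);                             /* frees info */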
+/*************************************************************//**
+Retrieves characters for the lexical analyzer.
+@return number of characters copied or 0 on EOF */
+int
+pars_get_lex_chars(
+/*===============*/
+	char*	buf,		/*!< in/out: buffer where to copy */
+	size_t	max_size);	/*!< in: maximum number of characters which fit
+				in the buffer */
+/*************************************************************//**
+Called by yyparse on error. */
+void
+yyerror(
+/*====*/
+	const char*	s);	/*!< in: error message string */
+/*********************************************************************//**
+Parses a variable declaration.
+@return own: symbol table node of type SYM_VAR */
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+	sym_node_t*		node,	/*!< in: symbol table node allocated for the
+					id of the variable */
+	pars_res_word_t*	type);	/*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses a function expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_func(
+/*======*/
+	que_node_t*	res_word,/*!< in: function name reserved word */
+	que_node_t*	arg);	/*!< in: first argument in the argument list */
+/*************************************************************************
+Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string.
+@return own: function node in a query tree */
+int
+pars_like_rebind(
+/*=============*/
+	sym_node_t*	node,	/* in: The search string node.*/
+	const byte*	ptr,	/* in: literal to (re) bind */
+	ulint		len);	/* in: length of literal to (re) bind*/
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_op(
+/*====*/
+	int		func,	/*!< in: operator token code */
+	que_node_t*	arg1,	/*!< in: first argument */
+	que_node_t*	arg2);	/*!< in: second argument or NULL for a unary
+				operator */
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+order_node_t*
+pars_order_by(
+/*==========*/
+	sym_node_t*		column,	/*!< in: column name */
+	pars_res_word_t*	asc);	/*!< in: &pars_asc_token or
+					&pars_desc_token */
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+sel_node_t*
+pars_select_list(
+/*=============*/
+	que_node_t*	select_list,	/*!< in: select list */
+	sym_node_t*	into_list);	/*!< in: variables list or NULL */
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+	sym_node_t*	sym_node,	/*!< in: cursor id node in the symbol
+					table */
+	sel_node_t*	select_node);	/*!< in: select node */
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+que_node_t*
+pars_function_declaration(
+/*======================*/
+	sym_node_t*	sym_node);	/*!< in: function id node in the symbol
+					table */
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */ +sel_node_t* +pars_select_statement( +/*==================*/ + sel_node_t* select_node, /*!< in: select node already containing + the select list */ + sym_node_t* table_list, /*!< in: table list */ + que_node_t* search_cond, /*!< in: search condition or NULL */ + pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */ + pars_res_word_t* consistent_read,/*!< in: NULL or + &pars_consistent_token */ + order_node_t* order_by); /*!< in: NULL or an order-by node */ +/*********************************************************************//** +Parses a column assignment in an update. +@return column assignment node */ +col_assign_node_t* +pars_column_assignment( +/*===================*/ + sym_node_t* column, /*!< in: column to assign */ + que_node_t* exp); /*!< in: value to assign */ +/*********************************************************************//** +Parses a delete or update statement start. +@return own: update node in a query tree */ +upd_node_t* +pars_update_statement_start( +/*========================*/ + ibool is_delete, /*!< in: TRUE if delete */ + sym_node_t* table_sym, /*!< in: table name node */ + col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL + if delete */ +/*********************************************************************//** +Parses an update or delete statement. +@return own: update node in a query tree */ +upd_node_t* +pars_update_statement( +/*==================*/ + upd_node_t* node, /*!< in: update node */ + sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in + the symbol table or NULL */ + que_node_t* search_cond); /*!< in: search condition or NULL */ +/*********************************************************************//** +Parses an insert statement. +@return own: insert node in a query tree */ +ins_node_t* +pars_insert_statement( +/*==================*/ + sym_node_t* table_sym, /*!< in: table name node */ + que_node_t* values_list, /*!< in: value expression list or NULL */ + sel_node_t* select); /*!< in: select condition or NULL */ +/*********************************************************************//** +Parses an elsif element. +@return elsif node */ +elsif_node_t* +pars_elsif_element( +/*===============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses an if-statement. +@return if-statement node */ +if_node_t* +pars_if_statement( +/*==============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list, /*!< in: statement list */ + que_node_t* else_part); /*!< in: else-part statement list */ +/*********************************************************************//** +Parses a for-loop-statement. +@return for-statement node */ +for_node_t* +pars_for_statement( +/*===============*/ + sym_node_t* loop_var, /*!< in: loop variable */ + que_node_t* loop_start_limit,/*!< in: loop start expression */ + que_node_t* loop_end_limit, /*!< in: loop end expression */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses a while-statement. +@return while-statement node */ +while_node_t* +pars_while_statement( +/*=================*/ + que_node_t* cond, /*!< in: while-condition */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses an exit statement.
+@return exit statement node */ +exit_node_t* +pars_exit_statement(void); +/*=====================*/ +/*********************************************************************//** +Parses a return-statement. +@return return-statement node */ +return_node_t* +pars_return_statement(void); +/*=======================*/ +/*********************************************************************//** +Parses a procedure call. +@return function node */ +func_node_t* +pars_procedure_call( +/*================*/ + que_node_t* res_word,/*!< in: procedure name reserved word */ + que_node_t* args); /*!< in: argument list */ +/*********************************************************************//** +Parses an assignment statement. +@return assignment statement node */ +assign_node_t* +pars_assignment_statement( +/*======================*/ + sym_node_t* var, /*!< in: variable to assign */ + que_node_t* val); /*!< in: value to assign */ +/*********************************************************************//** +Parses a fetch statement. into_list or user_func (but not both) must be +non-NULL. +@return fetch statement node */ +fetch_node_t* +pars_fetch_statement( +/*=================*/ + sym_node_t* cursor, /*!< in: cursor node */ + sym_node_t* into_list, /*!< in: variables to set, or NULL */ + sym_node_t* user_func); /*!< in: user function name, or NULL */ +/*********************************************************************//** +Parses an open or close cursor statement. +@return open or close statement node */ +open_node_t* +pars_open_statement( +/*================*/ + ulint type, /*!< in: ROW_SEL_OPEN_CURSOR + or ROW_SEL_CLOSE_CURSOR */ + sym_node_t* cursor); /*!< in: cursor node */ +/*********************************************************************//** +Parses a row_printf-statement. +@return row_printf-statement node */ +row_printf_node_t* +pars_row_printf_statement( +/*======================*/ + sel_node_t* sel_node); /*!< in: select node */ +/*********************************************************************//** +Parses a commit statement. +@return own: commit node struct */ +commit_node_t* +pars_commit_statement(void); +/*=======================*/ +/*********************************************************************//** +Parses a rollback statement. +@return own: rollback node struct */ +roll_node_t* +pars_rollback_statement(void); +/*=========================*/ +/*********************************************************************//** +Parses a column definition at a table creation. +@return column sym table node */ +sym_node_t* +pars_column_def( +/*============*/ + sym_node_t* sym_node, /*!< in: column node in the + symbol table */ + pars_res_word_t* type, /*!< in: data type */ + sym_node_t* len, /*!< in: length of column, or + NULL */ + void* is_not_null); /*!< in: if not NULL, column + is of type NOT NULL. */ +/*********************************************************************//** +Parses a table creation operation. +@return table create subgraph */ +tab_node_t* +pars_create_table( +/*==============*/ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_defs); /*!< in: list of column definitions */ +/*********************************************************************//** +Parses an index creation operation.
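+(Editorial example, not part of the upstream comment: this builds the
+subgraph for internal SQL such as
+CREATE UNIQUE CLUSTERED INDEX ID_IND ON SYS_FOREIGN (ID);
+with the UNIQUE and CLUSTERED keywords arriving as unique_def and
+clustered_def below.)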
+@return index create subgraph */ +ind_node_t* +pars_create_index( +/*==============*/ + pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */ + pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */ + sym_node_t* index_sym, /*!< in: index name node in the symbol + table */ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_list); /*!< in: list of column names */ +/*********************************************************************//** +Parses a procedure definition. +@return query fork node */ +que_fork_t* +pars_procedure_definition( +/*======================*/ + sym_node_t* sym_node, /*!< in: procedure id node in the symbol + table */ + que_node_t* stat_list); /*!< in: statement list */ + +/** Completes a query graph by adding query thread and fork nodes +above it and prepares the graph for running. +@param[in] node root node for an incomplete query + graph, or NULL for dummy graph +@param[in] trx transaction handle +@param[in] heap memory heap from which allocated +@param[in] prebuilt row prebuilt structure +@return query thread node to run */ +que_thr_t* +pars_complete_graph_for_exec( + que_node_t* node, + trx_t* trx, + mem_heap_t* heap, + row_prebuilt_t* prebuilt) + MY_ATTRIBUTE((nonnull(2,3), warn_unused_result)); + +/****************************************************************//** +Create parser info struct. +@return own: info struct */ +pars_info_t* +pars_info_create(void); +/*==================*/ + +/****************************************************************//** +Add bound literal. */ +void +pars_info_add_literal( +/*==================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const void* address, /*!< in: address */ + ulint length, /*!< in: length of data */ + ulint type, /*!< in: type, e.g. DATA_FIXBINARY */ + ulint prtype); /*!< in: precise type, e.g. + DATA_UNSIGNED */ + +/****************************************************************//** +Equivalent to pars_info_add_literal(info, name, str, strlen(str), +DATA_VARCHAR, DATA_ENGLISH). */ +void +pars_info_add_str_literal( +/*======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const char* str); /*!< in: string */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +void +pars_info_bind_literal( +/*===================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const void* address, /* in: address */ + ulint length, /* in: length of data */ + ulint type, /* in: type, e.g. DATA_FIXBINARY */ + ulint prtype); /* in: precise type, e.g. DATA_UNSIGNED */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +void +pars_info_bind_varchar_literal( +/*===========================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const byte* str, /*!< in: string */ + ulint str_len); /*!< in: string length */ +/****************************************************************//** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap.
*/ +void +pars_info_bind_int4_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint32_t* val); /*!< in: value */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +void +pars_info_bind_int8_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val); /*!< in: value */ +/****************************************************************//** +Add user function. */ +void +pars_info_bind_function( +/*===================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: function name */ + pars_user_func_cb_t func, /*!< in: function address */ + void* arg); /*!< in: user-supplied argument */ +/****************************************************************//** +Add bound id. */ +void +pars_info_bind_id( +/*=============*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const char* id); /*!< in: id */ +/****************************************************************//** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +void +pars_info_add_int4_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + ulint val); /*!< in: value */ + +/****************************************************************//** +Equivalent to: + +char buf[8]; +mach_write_to_8(buf, val); +pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +void +pars_info_add_ull_literal( +/*======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + ib_uint64_t val); /*!< in: value */ + +/****************************************************************//** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +void +pars_info_bind_ull_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val) /*!< in: value */ + MY_ATTRIBUTE((nonnull)); + +/****************************************************************//** +Get bound literal with the given name. +@return bound literal, or NULL if not found */ +pars_bound_lit_t* +pars_info_get_bound_lit( +/*====================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name); /*!< in: bound literal name to find */ + +/****************************************************************//** +Get bound id with the given name. +@return bound id, or NULL if not found */ +pars_bound_id_t* +pars_info_get_bound_id( +/*===================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name); /*!< in: bound id name to find */ + +/******************************************************************//** +Release any resources used by the lexer. */ +void +pars_lexer_close(void); +/*==================*/ + +/** Extra information supplied for pars_sql(). 
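+
+An editorial usage sketch, not part of the upstream header (error
+handling and graph ownership omitted): callers create the struct, bind
+values by name, and execute internal SQL that refers to them as :name
+placeholders, e.g. with que_eval_sql() from que0que.h inside an active
+transaction trx:
+
+	pars_info_t*	info = pars_info_create();
+	pars_info_add_str_literal(info, "name", "test/t1");
+	dberr_t	err = que_eval_sql(info,
+		"PROCEDURE P () IS\n"
+		"BEGIN\n"
+		"DELETE FROM SYS_TABLES WHERE NAME = :name;\n"
+		"END;\n", trx);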
*/ +struct pars_info_t { + mem_heap_t* heap; /*!< our own memory heap */ + + ib_vector_t* funcs; /*!< user functions, or NULL + (pars_user_func_t*) */ + ib_vector_t* bound_lits; /*!< bound literals, or NULL + (pars_bound_lit_t*) */ + ib_vector_t* bound_ids; /*!< bound ids, or NULL + (pars_bound_id_t*) */ +}; + +inline void pars_info_free(pars_info_t *info) { mem_heap_free(info->heap); } + +/** User-supplied function and argument. */ +struct pars_user_func_t { + const char* name; /*!< function name */ + pars_user_func_cb_t func; /*!< function address */ + void* arg; /*!< user-supplied argument */ +}; + +/** Bound literal. */ +struct pars_bound_lit_t { + const char* name; /*!< name */ + const void* address; /*!< address */ + ulint length; /*!< length of data */ + ulint type; /*!< type, e.g. DATA_FIXBINARY */ + ulint prtype; /*!< precise type, e.g. DATA_UNSIGNED */ + sym_node_t* node; /*!< symbol node */ +}; + +/** Bound identifier. */ +struct pars_bound_id_t { + const char* name; /*!< name */ + const char* id; /*!< identifier */ +}; + +/** Struct used to denote a reserved word in a parsing tree */ +struct pars_res_word_t{ + int code; /*!< the token code for the reserved word from + pars0grm.h */ +}; + +/** A predefined function or operator node in a parsing tree; this construct +is also used for some non-functions like the assignment ':=' */ +struct func_node_t{ + que_common_t common; /*!< type: QUE_NODE_FUNC */ + int func; /*!< token code of the function name */ + ulint fclass; /*!< class of the function */ + que_node_t* args; /*!< argument(s) of the function */ + UT_LIST_NODE_T(func_node_t) cond_list; + /*!< list of comparison conditions; defined + only for comparison operator nodes except, + presently, for OPT_SCROLL_TYPE ones */ + UT_LIST_NODE_T(func_node_t) func_node_list; + /*!< list of function nodes in a parsed + query graph */ +}; + +/** An order-by node in a select */ +struct order_node_t{ + que_common_t common; /*!< type: QUE_NODE_ORDER */ + sym_node_t* column; /*!< order-by column */ + ibool asc; /*!< TRUE if ascending, FALSE if descending */ +}; + +/** Procedure definition node */ +struct proc_node_t{ + que_common_t common; /*!< type: QUE_NODE_PROC */ + sym_node_t* proc_id; /*!< procedure name symbol in the symbol + table of this same procedure */ + que_node_t* stat_list; /*!< statement list */ + sym_tab_t* sym_tab; /*!< symbol table of this procedure */ +}; + +/** elsif-element node */ +struct elsif_node_t{ + que_common_t common; /*!< type: QUE_NODE_ELSIF */ + que_node_t* cond; /*!< if condition */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** if-statement node */ +struct if_node_t{ + que_common_t common; /*!< type: QUE_NODE_IF */ + que_node_t* cond; /*!< if condition */ + que_node_t* stat_list; /*!< statement list */ + que_node_t* else_part; /*!< else-part statement list */ + elsif_node_t* elsif_list; /*!< elsif element list */ +}; + +/** while-statement node */ +struct while_node_t{ + que_common_t common; /*!< type: QUE_NODE_WHILE */ + que_node_t* cond; /*!< while condition */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** for-loop-statement node */ +struct for_node_t{ + que_common_t common; /*!< type: QUE_NODE_FOR */ + sym_node_t* loop_var; /*!< loop variable: this is the + dereferenced symbol from the + variable declarations, not the + symbol occurrence in the for loop + definition */ + que_node_t* loop_start_limit;/*!< initial value of loop variable */ + que_node_t* loop_end_limit; /*!< end value of loop variable */ + lint loop_end_value; /*!<
evaluated value for the end value: + it is calculated only when the loop + is entered, and will not change within + the loop */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** exit statement node */ +struct exit_node_t{ + que_common_t common; /*!< type: QUE_NODE_EXIT */ +}; + +/** return-statement node */ +struct return_node_t{ + que_common_t common; /*!< type: QUE_NODE_RETURN */ +}; + +/** Assignment statement node */ +struct assign_node_t{ + que_common_t common; /*!< type: QUE_NODE_ASSIGNMENT */ + sym_node_t* var; /*!< variable to set */ + que_node_t* val; /*!< value to assign */ +}; + +/** Column assignment node */ +struct col_assign_node_t{ + que_common_t common; /*!< type: QUE_NODE_COL_ASSIGN */ + sym_node_t* col; /*!< column to set */ + que_node_t* val; /*!< value to assign */ +}; + +/** Classes of functions */ +/* @{ */ +#define PARS_FUNC_ARITH 1 /*!< +, -, *, / */ +#define PARS_FUNC_LOGICAL 2 /*!< AND, OR, NOT */ +#define PARS_FUNC_CMP 3 /*!< comparison operators */ +#define PARS_FUNC_PREDEFINED 4 /*!< TO_NUMBER, SUBSTR, ... */ +#define PARS_FUNC_AGGREGATE 5 /*!< COUNT */ +#define PARS_FUNC_OTHER 6 /*!< these are not real functions, + e.g., := */ +/* @} */ + +#endif diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h new file mode 100644 index 00000000..59f6cc31 --- /dev/null +++ b/storage/innobase/include/pars0sym.h @@ -0,0 +1,243 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0sym.h +SQL parser symbol table + +Created 12/15/1997 Heikki Tuuri +*******************************************************/ + +#ifndef pars0sym_h +#define pars0sym_h + +#include "que0types.h" +#include "pars0types.h" +#include "row0types.h" + +/******************************************************************//** +Creates a symbol table for a single stored procedure or query. +@return own: symbol table */ +sym_tab_t* +sym_tab_create( +/*===========*/ + mem_heap_t* heap); /*!< in: memory heap where to create */ +/******************************************************************//** +Frees the memory allocated dynamically AFTER parsing phase for variables +etc. in the symbol table. Does not free the mem heap where the table was +originally created. Frees also SQL explicit cursor definitions. */ +void +sym_tab_free_private( +/*=================*/ + sym_tab_t* sym_tab); /*!< in, own: symbol table */ +/******************************************************************//** +Adds an integer literal to a symbol table. 
+@return symbol table node */ +sym_node_t* +sym_tab_add_int_lit( +/*================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + ulint val); /*!< in: integer value */ +/******************************************************************//** +Adds a string literal to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_str_lit( +/*================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const byte* str, /*!< in: string with no quotes around + it */ + ulint len); /*!< in: string length */ +/******************************************************************//** +Add a bound literal to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_bound_lit( +/*==================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const char* name, /*!< in: name of bound literal */ + ulint* lit_type); /*!< out: type of literal (PARS_*_LIT) */ +/********************************************************************** +Rebind literal to a node in the symbol table. */ +sym_node_t* +sym_tab_rebind_lit( +/*===============*/ + /* out: symbol table node */ + sym_node_t* node, /* in: node that is bound to literal*/ + const void* address, /* in: pointer to data */ + ulint length); /* in: length of data */ +/******************************************************************//** +Adds an SQL null literal to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_null_lit( +/*=================*/ + sym_tab_t* sym_tab); /*!< in: symbol table */ +/******************************************************************//** +Adds an identifier to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_id( +/*===========*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + byte* name, /*!< in: identifier name */ + ulint len); /*!< in: identifier length */ + +/******************************************************************//** +Add a bound identifier to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_bound_id( +/*===========*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const char* name); /*!< in: name of bound id */ + +/** Index of sym_node_t::field_nos corresponding to the clustered index */ +#define SYM_CLUST_FIELD_NO 0 +/** Index of sym_node_t::field_nos corresponding to a secondary index */ +#define SYM_SEC_FIELD_NO 1 + +/** Types of a symbol table node */ +enum sym_tab_entry { + SYM_UNSET, /*!< Unset entry. */ + SYM_VAR = 91, /*!< declared parameter or local + variable of a procedure */ + SYM_IMPLICIT_VAR, /*!< storage for an intermediate result + of a calculation */ + SYM_LIT, /*!< literal */ + SYM_TABLE_REF_COUNTED, /*!< database table name, ref counted. Must + be closed explicitly. */ + SYM_TABLE, /*!< database table name */ + SYM_COLUMN, /*!< database table column */ + SYM_CURSOR, /*!< named cursor */ + SYM_PROCEDURE_NAME, /*!< stored procedure name */ + SYM_INDEX, /*!< database index name */ + SYM_FUNCTION /*!< user function name */ +}; + +/** Symbol table node */ +struct sym_node_t{ + que_common_t common; /*!< node type: + QUE_NODE_SYMBOL */ + /* NOTE: if the data field in 'common.val' is not NULL and the symbol + table node is not for a temporary column, the memory for the value has + been allocated from dynamic memory and it should be freed when the + symbol table is discarded */ + + /* 'alias' and 'indirection' are almost the same, but not quite.
+ 'alias' always points to the primary instance of the variable, while + 'indirection' does the same only if we should use the primary + instance's values for the node's data. This is usually the case, but + when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM + t WHERE id = x;"), we copy the values from the primary instance to + the cursor's instance so that they are fixed for the duration of the + cursor, and set 'indirection' to NULL. If we did not, the value of + 'x' could change between fetches and things would break horribly. + + TODO: It would be cleaner to make 'indirection' a boolean field and + always use 'alias' to refer to the primary node. */ + + sym_node_t* indirection; /*!< pointer to + another symbol table + node which contains + the value for this + node, NULL otherwise */ + sym_node_t* alias; /*!< pointer to + another symbol table + node for which this + node is an alias, + NULL otherwise */ + UT_LIST_NODE_T(sym_node_t) col_var_list; /*!< list of table + columns or a list of + input variables for an + explicit cursor */ + ibool copy_val; /*!< TRUE if a column + and its value should + be copied to dynamic + memory when fetched */ + ulint field_nos[2]; /*!< if a column, in + the position + SYM_CLUST_FIELD_NO is + the field number in the + clustered index; in + the position + SYM_SEC_FIELD_NO + the field number in the + non-clustered index to + use first; if not found + from the index, then + ULINT_UNDEFINED */ + ibool resolved; /*!< TRUE if the + meaning of a variable + or a column has been + resolved; for literals + this is always TRUE */ + enum sym_tab_entry token_type; /*!< type of the + parsed token */ + const char* name; /*!< name of an id */ + ulint name_len; /*!< id name length */ + dict_table_t* table; /*!< table definition + if a table id or a + column id */ + ulint col_no; /*!< column number if a + column */ + sel_buf_t* prefetch_buf; /*!< NULL, or a buffer + for cached column + values for prefetched + rows */ + sel_node_t* cursor_def; /*!< cursor definition + select node if a + named cursor */ + ulint param_type; /*!< PARS_INPUT, + PARS_OUTPUT, or + PARS_NOT_PARAM if not a + procedure parameter */ + sym_tab_t* sym_table; /*!< back pointer to + the symbol table */ + UT_LIST_NODE_T(sym_node_t) sym_list; /*!< list of symbol + nodes */ + sym_node_t* like_node; /* LIKE operator node*/ +}; + +/** Symbol table */ +struct sym_tab_t{ + que_t* query_graph; + /*!< query graph generated by the + parser */ + const char* sql_string; + /*!< SQL string to parse */ + size_t string_len; + /*!< SQL string length */ + size_t next_char_pos; + /*!< position of the next character in + sql_string to give to the lexical + analyzer */ + pars_info_t* info; /*!< extra information, or NULL */ + sym_node_list_t sym_list; + /*!< list of symbol nodes in the symbol + table */ + UT_LIST_BASE_NODE_T(func_node_t) + func_node_list; + /*!< list of function nodes in the + parsed query graph */ + mem_heap_t* heap; /*!< memory heap from which we can + allocate space */ +}; + +#endif diff --git a/storage/innobase/include/pars0types.h b/storage/innobase/include/pars0types.h new file mode 100644 index 00000000..f5b69522 --- /dev/null +++ b/storage/innobase/include/pars0types.h @@ -0,0 +1,50 @@ +/***************************************************************************** + +Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0types.h +SQL parser global types + +Created 1/11/1998 Heikki Tuuri +*******************************************************/ + +#ifndef pars0types_h +#define pars0types_h + +struct pars_info_t; +struct pars_user_func_t; +struct pars_bound_lit_t; +struct pars_bound_id_t; +struct sym_node_t; +struct sym_tab_t; +struct pars_res_word_t; +struct func_node_t; +struct order_node_t; +struct proc_node_t; +struct elsif_node_t; +struct if_node_t; +struct while_node_t; +struct for_node_t; +struct exit_node_t; +struct return_node_t; +struct assign_node_t; +struct col_assign_node_t; + +typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t; + +#endif diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h new file mode 100644 index 00000000..c60f390a --- /dev/null +++ b/storage/innobase/include/que0que.h @@ -0,0 +1,314 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0que.h +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0que_h +#define que0que_h + +#include "data0data.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "srv0srv.h" +#include "que0types.h" +#include "row0types.h" +#include "pars0types.h" + +/***********************************************************************//** +Creates a query graph fork node. +@return own: fork node */ +que_fork_t *que_fork_create(mem_heap_t* heap); +/***********************************************************************//** +Gets the first thr in a fork. */ +UNIV_INLINE +que_thr_t* +que_fork_get_first_thr( +/*===================*/ + que_fork_t* fork); /*!< in: query fork */ +/***********************************************************************//** +Gets the child node of the first thr in a fork. 
*/ +UNIV_INLINE +que_node_t* +que_fork_get_child( +/*===============*/ + que_fork_t* fork); /*!< in: query fork */ +/***********************************************************************//** +Sets the parent of a graph node. */ +UNIV_INLINE +void +que_node_set_parent( +/*================*/ + que_node_t* node, /*!< in: graph node */ + que_node_t* parent);/*!< in: parent */ +/** Creates a query graph thread node. +@param[in] parent parent node, i.e., a fork node +@param[in] heap memory heap where created +@param[in] prebuilt row prebuilt structure +@return own: query thread node */ +que_thr_t* +que_thr_create( + que_fork_t* parent, + mem_heap_t* heap, + row_prebuilt_t* prebuilt); +/**********************************************************************//** +Frees a query graph, but not the heap where it was created. Does not free +explicit cursor declarations, they are freed in que_graph_free. */ +void +que_graph_free_recursive( +/*=====================*/ + que_node_t* node); /*!< in: query graph node */ +/**********************************************************************//** +Frees a query graph. */ +void +que_graph_free( +/*===========*/ + que_t* graph); /*!< in: query graph; we assume that the memory + heap where this graph was created is private + to this graph: if not, then use + que_graph_free_recursive and free the heap + afterwards! */ + +/**********************************************************************//** +Run a query thread. Handles lock waits. */ +void +que_run_threads( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Starts execution of a command in a query fork. Picks a query thread which +is not in the QUE_THR_RUNNING state and moves it to that state. If none +can be chosen, a situation which may arise in parallelized fetches, NULL +is returned. +@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +que_thr_t* +que_fork_start_command( +/*===================*/ + que_fork_t* fork); /*!< in: a query fork */ +/***********************************************************************//** +Gets the trx of a query thread. */ +UNIV_INLINE +trx_t* +thr_get_trx( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ +/***********************************************************************//** +Gets the type of a graph node. */ +UNIV_INLINE +ulint +que_node_get_type( +/*==============*/ + const que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets pointer to the value data type field of a graph node. */ +UNIV_INLINE +dtype_t* +que_node_get_data_type( +/*===================*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets pointer to the value dfield of a graph node. */ +UNIV_INLINE +dfield_t* +que_node_get_val( +/*=============*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets the value buffer size of a graph node. +@return val buffer size, not defined if val.data == NULL in node */ +UNIV_INLINE +ulint +que_node_get_val_buf_size( +/*======================*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Sets the value buffer size of a graph node. 
*/ +UNIV_INLINE +void +que_node_set_val_buf_size( +/*======================*/ + que_node_t* node, /*!< in: graph node */ + ulint size); /*!< in: size */ +/*********************************************************************//** +Gets the next list node in a list of query graph nodes. */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + que_node_t* node); /*!< in: node in a list */ +/*********************************************************************//** +Gets the parent node of a query graph node. +@return parent node or NULL */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + que_node_t* node); /*!< in: node */ +/****************************************************************//** +Get the first containing loop node (e.g. while_node_t or for_node_t) for the +given node, or NULL if the node is not within a loop. +@return containing loop node, or NULL. */ +que_node_t* +que_node_get_containing_loop_node( +/*==============================*/ + que_node_t* node); /*!< in: node */ +/*********************************************************************//** +Catenates a query graph node to a list of them, possibly empty list. +@return one-way list of nodes */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + que_node_t* node_list, /*!< in: node list, or NULL */ + que_node_t* node); /*!< in: node */ +/************************************************************************* +Get the last node from the list.*/ +UNIV_INLINE +que_node_t* +que_node_list_get_last( +/*===================*/ + /* out: last node from list.*/ + que_node_t* node_list); /* in: node list, or NULL */ +/*********************************************************************//** +Gets a query graph node list length. +@return length, for NULL list 0 */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + que_node_t* node_list); /*!< in: node list, or NULL */ +/*********************************************************************//** +Evaluate the given SQL. +@return error code or DB_SUCCESS */ +dberr_t +que_eval_sql( +/*=========*/ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql, /*!< in: SQL string */ + trx_t* trx); /*!< in: trx */ + +/**********************************************************************//** +Round robin scheduler.
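+(Editorial note, not part of the upstream comment: the caller passes the
+previously returned thread in thr, or NULL on the first call; threads
+are handed out in the order of the fork's thread list.)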
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +que_thr_t* +que_fork_scheduler_round_robin( +/*===========================*/ + que_fork_t* fork, /*!< in: a query fork */ + que_thr_t* thr); /*!< in: current pos */ + +/** Query thread states */ +enum que_thr_state_t { + /** in selects this means that the thread is at the end of its + result set (or start, in case of a scroll cursor); in other + statements, this means the thread has done its task */ + QUE_THR_COMPLETED, + QUE_THR_RUNNING +}; + +/** Query thread lock states */ +enum que_thr_lock_t { + QUE_THR_LOCK_NOLOCK, + QUE_THR_LOCK_ROW, + QUE_THR_LOCK_TABLE +}; + +/* Query graph query thread node: the fields are protected by the +trx_t::mutex with the exceptions named below */ + +struct que_thr_t{ + que_common_t common; /*!< type: QUE_NODE_THR */ + que_node_t* child; /*!< graph child node */ + que_t* graph; /*!< graph where this node belongs */ + que_thr_state_t state; /*!< state of the query thread */ + /*------------------------------*/ + /* The following fields are private to the OS thread executing the + query thread, and are not protected by any mutex: */ + + que_node_t* run_node; /*!< pointer to the node where the + subgraph down from this node is + currently executed */ + que_node_t* prev_node; /*!< pointer to the node from which + the control came */ + ulint resource; /*!< resource usage of the query thread + thus far */ + ulint lock_state; /*!< lock state of thread (table or + row) */ + /*------------------------------*/ + /* The following fields are links for the various lists that + this type can be on. */ + UT_LIST_NODE_T(que_thr_t) + thrs; /*!< list of thread nodes of the fork + node */ + UT_LIST_NODE_T(que_thr_t) + queue; /*!< list of runnable thread nodes in + the server task queue */ + ulint fk_cascade_depth; /*!< maximum cascading call depth + supported for foreign key constraint + related delete/updates */ + row_prebuilt_t* prebuilt; /*!< prebuilt structure processed by + the query thread */ +}; + +/* Query graph fork node: its fields are protected by the query thread mutex */ +struct que_fork_t{ + que_common_t common; /*!< type: QUE_NODE_FORK */ + que_t* graph; /*!< query graph of this node */ + trx_t* trx; /*!< transaction: this is set only in + the root node */ + ulint state; /*!< state of the fork node */ + que_thr_t* caller; /*!< pointer to a possible calling query + thread */ + UT_LIST_BASE_NODE_T(que_thr_t) + thrs; /*!< list of query threads */ + /*------------------------------*/ + /* The fields in this section are defined only in the root node */ + sym_tab_t* sym_tab; /*!< symbol table of the query, + generated by the parser, or NULL + if the graph was created 'by hand' */ + pars_info_t* info; /*!< info struct, or NULL */ + + sel_node_t* last_sel_node; /*!< last executed select node, or NULL + if none */ + UT_LIST_NODE_T(que_fork_t) + graphs; /*!< list of query graphs of a session + or a stored procedure */ + /*------------------------------*/ + mem_heap_t* heap; /*!< memory heap where the fork was + created */ + +}; + +/* Query fork (or graph) states */ +#define QUE_FORK_ACTIVE 1 +#define QUE_FORK_COMMAND_WAIT 2 + +/* Flag which is ORed to control structure statement node types */ +#define QUE_NODE_CONTROL_STAT 1024 + +#include "que0que.inl" + +#endif diff --git a/storage/innobase/include/que0que.inl b/storage/innobase/include/que0que.inl new file mode 100644 index 00000000..e21cbad3 --- /dev/null +++ 
b/storage/innobase/include/que0que.inl @@ -0,0 +1,245 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0que.inl +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +/***********************************************************************//** +Gets the trx of a query thread. */ +UNIV_INLINE +trx_t* +thr_get_trx( +/*========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad(thr); + + return(thr->graph->trx); +} + +/***********************************************************************//** +Gets the first thr in a fork. */ +UNIV_INLINE +que_thr_t* +que_fork_get_first_thr( +/*===================*/ + que_fork_t* fork) /*!< in: query fork */ +{ + return(UT_LIST_GET_FIRST(fork->thrs)); +} + +/***********************************************************************//** +Gets the child node of the first thr in a fork. */ +UNIV_INLINE +que_node_t* +que_fork_get_child( +/*===============*/ + que_fork_t* fork) /*!< in: query fork */ +{ + que_thr_t* thr; + + thr = UT_LIST_GET_FIRST(fork->thrs); + + return(thr->child); +} + +/***********************************************************************//** +Gets the type of a graph node. */ +UNIV_INLINE +ulint +que_node_get_type( +/*==============*/ + const que_node_t* node) /*!< in: graph node */ +{ + return(reinterpret_cast<const que_common_t*>(node)->type); +} + +/***********************************************************************//** +Gets pointer to the value dfield of a graph node. */ +UNIV_INLINE +dfield_t* +que_node_get_val( +/*=============*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(&(((que_common_t*) node)->val)); +} + +/***********************************************************************//** +Gets the value buffer size of a graph node. +@return val buffer size, not defined if val.data == NULL in node */ +UNIV_INLINE +ulint +que_node_get_val_buf_size( +/*======================*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(((que_common_t*) node)->val_buf_size); +} + +/***********************************************************************//** +Sets the value buffer size of a graph node. */ +UNIV_INLINE +void +que_node_set_val_buf_size( +/*======================*/ + que_node_t* node, /*!< in: graph node */ + ulint size) /*!< in: size */ +{ + ut_ad(node); + + ((que_common_t*) node)->val_buf_size = size; +} + +/***********************************************************************//** +Sets the parent of a graph node.
*/ +UNIV_INLINE +void +que_node_set_parent( +/*================*/ + que_node_t* node, /*!< in: graph node */ + que_node_t* parent) /*!< in: parent */ +{ + ut_ad(node); + + ((que_common_t*) node)->parent = parent; +} + +/***********************************************************************//** +Gets pointer to the value data type field of a graph node. */ +UNIV_INLINE +dtype_t* +que_node_get_data_type( +/*===================*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(dfield_get_type(&((que_common_t*) node)->val)); +} + +/*********************************************************************//** +Catenates a query graph node to a list of them, possibly empty list. +@return one-way list of nodes */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + que_node_t* node_list, /*!< in: node list, or NULL */ + que_node_t* node) /*!< in: node */ +{ + que_common_t* cnode; + que_common_t* cnode2; + + cnode = (que_common_t*) node; + + cnode->brother = NULL; + + if (node_list == NULL) { + + return(node); + } + + cnode2 = (que_common_t*) node_list; + + while (cnode2->brother != NULL) { + cnode2 = (que_common_t*) cnode2->brother; + } + + cnode2->brother = node; + + return(node_list); +} + +/************************************************************************* +Gets the last node from the list.*/ +UNIV_INLINE +que_node_t* +que_node_list_get_last( +/*===================*/ + /* out: last node in list.*/ + que_node_t* node_list) /* in: node list */ +{ + que_common_t* node; + + ut_a(node_list != NULL); + + node = (que_common_t*) node_list; + + /* We need the last element */ + while (node->brother != NULL) { + node = (que_common_t*) node->brother; + } + + return(node); +} +/*********************************************************************//** +Gets the next list node in a list of query graph nodes. +@return next node in a list of nodes */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + que_node_t* node) /*!< in: node in a list */ +{ + return(((que_common_t*) node)->brother); +} + +/*********************************************************************//** +Gets a query graph node list length. +@return length, for NULL list 0 */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + que_node_t* node_list) /*!< in: node list, or NULL */ +{ + const que_common_t* cnode; + ulint len; + + cnode = (const que_common_t*) node_list; + len = 0; + + while (cnode != NULL) { + len++; + cnode = (const que_common_t*) cnode->brother; + } + + return(len); +} + +/*********************************************************************//** +Gets the parent node of a query graph node. +@return parent node or NULL */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + que_node_t* node) /*!< in: node */ +{ + return(((que_common_t*) node)->parent); +} diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h new file mode 100644 index 00000000..38f6e380 --- /dev/null +++ b/storage/innobase/include/que0types.h @@ -0,0 +1,97 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License.
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0types.h +Query graph global types + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0types_h +#define que0types_h + +#include "data0data.h" + +/* Pseudotype for all graph nodes */ +typedef void que_node_t; + +/* Query graph root is a fork node */ +typedef struct que_fork_t que_t; + +struct row_prebuilt_t; +struct que_thr_t; + +/* Query graph node types */ +#define QUE_NODE_LOCK 1 +#define QUE_NODE_INSERT 2 +#define QUE_NODE_UPDATE 4 +#define QUE_NODE_CURSOR 5 +#define QUE_NODE_SELECT 6 +#define QUE_NODE_AGGREGATE 7 +#define QUE_NODE_FORK 8 +#define QUE_NODE_THR 9 +#define QUE_NODE_UNDO 10 +#define QUE_NODE_COMMIT 11 +#define QUE_NODE_ROLLBACK 12 +#define QUE_NODE_PURGE 13 +#define QUE_NODE_CREATE_TABLE 14 +#define QUE_NODE_CREATE_INDEX 15 +#define QUE_NODE_SYMBOL 16 +#define QUE_NODE_RES_WORD 17 +#define QUE_NODE_FUNC 18 +#define QUE_NODE_ORDER 19 +#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_ASSIGNMENT 23 +#define QUE_NODE_FETCH 24 +#define QUE_NODE_OPEN 25 +#define QUE_NODE_COL_ASSIGNMENT 26 +#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_RETURN 28 +#define QUE_NODE_ROW_PRINTF 29 +#define QUE_NODE_ELSIF 30 +#define QUE_NODE_CALL 31 +#define QUE_NODE_EXIT 32 + +/* Common struct at the beginning of each query graph node; the name of this +substruct must be 'common' */ + +struct que_common_t{ + ulint type; /*!< query node type */ + que_node_t* parent; /*!< back pointer to parent node, or NULL */ + que_node_t* brother;/* pointer to a possible brother node */ + dfield_t val; /*!< evaluated value for an expression */ + ulint val_buf_size; + /* buffer size for the evaluated value data, + if the buffer has been allocated dynamically: + if this field is != 0, and the node is a + symbol node or a function node, then we + have to free the data field in val + explicitly */ + + /** Constructor */ + que_common_t(ulint type, que_node_t* parent) : + type(type), parent(parent), brother(NULL), + val(), val_buf_size(0) + {} +}; + +#endif diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h new file mode 100644 index 00000000..e002f1b7 --- /dev/null +++ b/storage/innobase/include/read0types.h @@ -0,0 +1,275 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/read0types.h +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "dict0mem.h" +#include "trx0types.h" +#include "srw_lock.h" +#include <algorithm> + +/** + Read view lists the trx ids of those transactions for which a consistent read + should not see the modifications to the database. +*/ +class ReadViewBase +{ + /** + The read should not see any transaction with trx id >= this value. + In other words, this is the "high water mark". + */ + trx_id_t m_low_limit_id= 0; + + /** + The read should see all trx ids which are strictly + smaller (<) than this value. In other words, this is the + "low water mark". + */ + trx_id_t m_up_limit_id; + + /** Set of RW transactions that were active when this snapshot was taken */ + trx_ids_t m_ids; + + /** + The view does not need to see the undo logs for transactions whose + transaction number is strictly smaller (<) than this value: they can be + removed in purge if not needed by other views. + */ + trx_id_t m_low_limit_no; + +protected: + bool empty() { return m_ids.empty(); } + + /** @return the up limit id */ + trx_id_t up_limit_id() const { return m_up_limit_id; } + +public: + /** + Append state from another view. + + This method is used to find min(m_low_limit_no), min(m_low_limit_id) and + all transaction ids below min(m_low_limit_id). These values effectively + form the oldest view. + + @param other view to copy from + */ + void append(const ReadViewBase &other) + { + ut_ad(&other != this); + if (m_low_limit_no > other.m_low_limit_no) + m_low_limit_no= other.m_low_limit_no; + if (m_low_limit_id > other.m_low_limit_id) + m_low_limit_id= other.m_low_limit_id; + + trx_ids_t::iterator dst= m_ids.begin(); + for (const trx_id_t id : other.m_ids) + { + if (id >= m_low_limit_id) + break; +loop: + if (dst == m_ids.end()) + { + m_ids.push_back(id); + dst= m_ids.end(); + continue; + } + if (*dst < id) + { + dst++; + goto loop; + } + else if (*dst > id) + dst= m_ids.insert(dst, id) + 1; + } + m_ids.erase(std::lower_bound(dst, m_ids.end(), m_low_limit_id), + m_ids.end()); + + m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front(); + ut_ad(m_up_limit_id <= m_low_limit_id); + } + + + /** + Creates a snapshot where exactly the transactions serialized before this + point in time are seen in the view. + + @param[in,out] trx transaction + */ + inline void snapshot(trx_t *trx); + + + /** + Check whether the changes by id are visible. + @param[in] id transaction id to check against the view + @return whether the view sees the modifications of id.
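+
+ (Editorial worked example, not part of the upstream comment: with
+ m_up_limit_id=90, m_low_limit_id=100 and m_ids={90,95}, id 85 is
+ visible since it committed before the snapshot; id 93 is visible
+ since it is below the high water mark and absent from m_ids; ids 90
+ and 95 are not visible since they were active at snapshot time; id
+ 100 is not visible since it began at or after the snapshot.)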
+ */ + bool changes_visible(trx_id_t id) const + MY_ATTRIBUTE((warn_unused_result)) + { + if (id >= m_low_limit_id) + return false; + return id < m_up_limit_id || + m_ids.empty() || + !std::binary_search(m_ids.begin(), m_ids.end(), id); + } + + /** + @param id transaction to check + @return true if view sees transaction id + */ + bool sees(trx_id_t id) const { return id < m_up_limit_id; } + + /** @return the low limit no */ + trx_id_t low_limit_no() const { return m_low_limit_no; } + + /** @return the low limit id */ + trx_id_t low_limit_id() const { return m_low_limit_id; } + + /** Clamp the low limit id for purge_sys.end_view */ + void clamp_low_limit_id(trx_id_t limit) + { + if (m_low_limit_id > limit) + m_low_limit_id= limit; + } +}; + + +/** A ReadView with extra members required for trx_t::read_view. */ +class ReadView: public ReadViewBase +{ + /** + View state. + + Implemented as atomic to allow mutex-free view close and re-use. + Non-owner thread is allowed to call is_open() alone without mutex + protection as well. E.g. trx_sys.view_count() does this. + + If non-owner thread intends to access other members as well, both + is_open() and other members accesses must be protected by m_mutex. + E.g. copy_to(). + */ + std::atomic<bool> m_open; + + /** For synchronisation with purge coordinator. */ + mutable srw_mutex m_mutex; + + /** + trx id of creating transaction. + Used exclusively by the read view owner thread. + */ + trx_id_t m_creator_trx_id; + +public: + ReadView() + { + memset(reinterpret_cast<void*>(this), 0, sizeof *this); + m_mutex.init(); + } + ~ReadView() { m_mutex.destroy(); } + + + /** + Opens a read view where exactly the transactions serialized before this + point in time are seen in the view. + + View becomes visible to purge thread. Intended to be called by the ReadView + owner thread. + + @param[in,out] trx transaction + */ + void open(trx_t *trx); + + + /** + Closes the view. + + View becomes not visible to purge thread. Intended to be called by the + ReadView owner thread. + */ + void close() { m_open.store(false, std::memory_order_relaxed); } + + + /** Returns true if view is open. */ + bool is_open() const { return m_open.load(std::memory_order_relaxed); } + + + /** + Sets the creator transaction id. + + This should be set only for views created by RW transactions. + Intended to be called by the ReadView owner thread. + */ + void set_creator_trx_id(trx_id_t id) + { + ut_ad(m_creator_trx_id == 0); + m_creator_trx_id= id; + } + + + /** + Writes the limits to the file. + @param file file to write to + */ + void print_limits(FILE *file) const + { + m_mutex.wr_lock(); + if (is_open()) + fprintf(file, "Trx read view will not see trx with" + " id >= " TRX_ID_FMT ", sees < " TRX_ID_FMT "\n", + low_limit_id(), up_limit_id()); + m_mutex.wr_unlock(); + } + + + /** + A wrapper around ReadViewBase::changes_visible(). + Intended to be called by the ReadView owner thread. + */ + bool changes_visible(trx_id_t id) const + { return id == m_creator_trx_id || ReadViewBase::changes_visible(id); } + + /** + A wrapper around ReadViewBase::append(). + Intended to be called by the purge coordinator task. + */ + void append_to(ReadViewBase *to) const + { + m_mutex.wr_lock(); + if (is_open()) + to->append(*this); + m_mutex.wr_unlock(); + } + + /** + Declare the object mostly inaccessible.
diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h
new file mode 100644
index 00000000..3a30f5a9
--- /dev/null
+++ b/storage/innobase/include/rem0cmp.h
@@ -0,0 +1,286 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.h
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#pragma once
+
+#include "data0data.h"
+#include "data0type.h"
+#include "rem0types.h"
+#include "page0types.h"
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return TRUE if the columns are considered equal in comparisons */
+ibool
+cmp_cols_are_equal(
+/*===============*/
+	const dict_col_t*	col1,	/*!< in: column 1 */
+	const dict_col_t*	col2,	/*!< in: column 2 */
+	ibool			check_charsets);
+				/*!< in: whether to check charsets */
+/** Compare two data fields.
+@param mtype       main type
+@param prtype      precise type
+@param descending  whether to use descending order
+@param data1       data field
+@param len1        length of data1 in bytes, or UNIV_SQL_NULL
+@param data2       data field
+@param len2        length of data2 in bytes, or UNIV_SQL_NULL
+@return the comparison result of data1 and data2
+@retval 0 if data1 is equal to data2
+@retval negative if data1 is less than data2
+@retval positive if data1 is greater than data2 */
+int cmp_data(ulint mtype, ulint prtype, bool descending,
+             const byte *data1, size_t len1, const byte *data2, size_t len2)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Compare two data fields.
+@param dfield1     data field; must have type field set
+@param dfield2     data field
+@param descending  whether to use descending order
+@return the comparison result of dfield1 and dfield2
+@retval 0 if dfield1 is equal to dfield2
+@retval negative if dfield1 is less than dfield2
+@retval positive if dfield1 is greater than dfield2 */
+inline int cmp_dfield_dfield(const dfield_t *dfield1, const dfield_t *dfield2,
+                             bool descending= false)
+{
+  ut_ad(dfield_check_typed(dfield1));
+  const dtype_t *type= dfield_get_type(dfield1);
+  return cmp_data(type->mtype, type->prtype, descending,
+                  static_cast<const byte*>(dfield_get_data(dfield1)),
+                  dfield_get_len(dfield1),
+                  static_cast<const byte*>(dfield_get_data(dfield2)),
+                  dfield_get_len(dfield2));
+}
+
+#ifdef UNIV_DEBUG
+/** Compare a GIS data tuple to a physical record.
+@param[in] dtuple  data tuple
+@param[in] rec     R-tree record
+@param[in] mode    compare mode
+@retval negative if dtuple is less than rec */
+int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec,
+                            page_cur_mode_t mode)
+  MY_ATTRIBUTE((nonnull));
+#endif
+
+/** Compare two minimum bounding rectangles.
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+inline int cmp_geometry_field(const void *a, const void *b)
+{
+  const byte *mbr1= static_cast<const byte*>(a);
+  const byte *mbr2= static_cast<const byte*>(b);
+
+  static_assert(SPDIMS == 2, "compatibility");
+  static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double), "compatibility");
+
+  /* Try to compare mbr left lower corner (xmin, ymin) */
+  double x1= mach_double_read(mbr1);
+  double x2= mach_double_read(mbr2);
+  if (x1 > x2)
+    return 1;
+  if (x1 < x2)
+    return -1;
+
+  x1= mach_double_read(mbr1 + sizeof(double) * SPDIMS);
+  x2= mach_double_read(mbr2 + sizeof(double) * SPDIMS);
+
+  if (x1 > x2)
+    return 1;
+  if (x1 < x2)
+    return -1;
+
+  /* left lower corner (xmin, ymin) overlaps, now right upper corner */
+  x1= mach_double_read(mbr1 + sizeof(double));
+  x2= mach_double_read(mbr2 + sizeof(double));
+
+  if (x1 > x2)
+    return 1;
+  if (x1 < x2)
+    return -1;
+
+  x1= mach_double_read(mbr1 + sizeof(double) * 2 + sizeof(double));
+  x2= mach_double_read(mbr2 + sizeof(double) * 2 + sizeof(double));
+
+  if (x1 > x2)
+    return 1;
+  if (x1 < x2)
+    return -1;
+
+  return 0;
+}
+
+/** Compare a data tuple to a physical record.
+@param dtuple          data tuple
+@param rec             B-tree index record
+@param index           B-tree index
+@param offsets         rec_get_offsets(rec,index)
+@param n_cmp           number of fields to compare
+@param matched_fields  number of completely matched fields
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int cmp_dtuple_rec_with_match_low(const dtuple_t *dtuple, const rec_t *rec,
+                                  const dict_index_t *index,
+                                  const rec_offs *offsets,
+                                  ulint n_cmp, ulint *matched_fields)
+  MY_ATTRIBUTE((nonnull));
+#define cmp_dtuple_rec_with_match(tuple,rec,index,offsets,fields) \
+	cmp_dtuple_rec_with_match_low( \
+		tuple,rec,index,offsets,dtuple_get_n_fields_cmp(tuple),fields)
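+/* To make the MBR layout concrete: cmp_geometry_field() above reads the
+four doubles as [xmin][xmax][ymin][ymax] and compares them in the order
+xmin, ymin, xmax, ymax. A hedged sketch of the matching writer follows;
+the helper name is hypothetical, and mach_double_write() is assumed to
+be the counterpart of mach_double_read():
+
+	static void example_pack_mbr(byte *mbr, double xmin, double xmax,
+				     double ymin, double ymax)
+	{
+		mach_double_write(mbr, xmin);
+		mach_double_write(mbr + sizeof(double), xmax);
+		mach_double_write(mbr + 2 * sizeof(double), ymin);
+		mach_double_write(mbr + 3 * sizeof(double), ymax);
+	}
+*/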
+/** Compare a data tuple to a physical record.
+@param[in] dtuple   data tuple
+@param[in] rec      B-tree or R-tree index record
+@param[in] index    index tree
+@param[in] offsets  rec_get_offsets(rec)
+@param[in,out] matched_fields  number of completely matched fields
+@param[in,out] matched_bytes   number of matched bytes in the first
+field that is not matched
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int
+cmp_dtuple_rec_with_match_bytes(
+	const dtuple_t*		dtuple,
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	const rec_offs*		offsets,
+	ulint*			matched_fields,
+	ulint*			matched_bytes)
+	MY_ATTRIBUTE((warn_unused_result));
+/** Compare a data tuple to a physical record.
+@see cmp_dtuple_rec_with_match
+@param dtuple   data tuple
+@param rec      index record
+@param index    index
+@param offsets  rec_get_offsets(rec, index)
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+inline int cmp_dtuple_rec(const dtuple_t *dtuple, const rec_t *rec,
+                          const dict_index_t *index, const rec_offs *offsets)
+{
+  ulint matched= 0;
+  return cmp_dtuple_rec_with_match(dtuple, rec, index, offsets, &matched);
+}
+
+/** Check if a dtuple is a prefix of a record.
+@param dtuple   data tuple
+@param rec      index record
+@param index    index
+@param offsets  rec_get_offsets(rec)
+@return whether dtuple is a prefix of rec */
+bool cmp_dtuple_is_prefix_of_rec(const dtuple_t *dtuple, const rec_t *rec,
+                                 const dict_index_t *index,
+                                 const rec_offs *offsets)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@retval positive if rec1 (including non-ordering columns) is greater than rec2
+@retval negative if rec1 (including non-ordering columns) is less than rec2
+@retval 0 if rec1 is a duplicate of rec2 */
+int
+cmp_rec_rec_simple(
+/*===============*/
+	const rec_t*		rec1,	/*!< in: physical record */
+	const rec_t*		rec2,	/*!< in: physical record */
+	const rec_offs*		offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+	const rec_offs*		offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+	const dict_index_t*	index,	/*!< in: data dictionary index */
+	struct TABLE*		table)	/*!< in: MySQL table, for reporting
+					duplicate key value if applicable,
+					or NULL */
+	MY_ATTRIBUTE((nonnull(1,2,3,4), warn_unused_result));
+
+/** Compare two B-tree or R-tree records.
+Only the common first fields are compared, and externally stored fields
+are treated as equal.
+@param[in] rec1           record (possibly not on an index page)
+@param[in] rec2           B-tree or R-tree record in an index page
+@param[in] offsets1       rec_get_offsets(rec1, index)
+@param[in] offsets2       rec_get_offsets(rec2, index)
+@param[in] index          B-tree or R-tree index
+@param[in] nulls_unequal  true if this is for index cardinality
+                          statistics estimation with
+                          innodb_stats_method=nulls_unequal
+                          or innodb_stats_method=nulls_ignored
+@param[out] matched_fields number of completely matched fields
+                          within the first field not completely matched
+@retval 0 if rec1 is equal to rec2
+@retval negative if rec1 is less than rec2
+@retval positive if rec1 is greater than rec2 */
+int
+cmp_rec_rec(
+	const rec_t*		rec1,
+	const rec_t*		rec2,
+	const rec_offs*		offsets1,
+	const rec_offs*		offsets2,
+	const dict_index_t*	index,
+	bool			nulls_unequal = false,
+	ulint*			matched_fields = NULL)
+	MY_ATTRIBUTE((nonnull(1,2,3,4,5)));
+
+/** Compare two data fields.
+@param dfield1  data field
+@param dfield2  data field
+@return the comparison result of dfield1 and dfield2
+@retval true if dfield1 is equal to dfield2, or dfield2 is a prefix of dfield1
+@retval false otherwise */
+inline bool cmp_dfield_dfield_eq_prefix(const dfield_t *dfield1,
+                                        const dfield_t *dfield2)
+{
+  ut_ad(dfield_check_typed(dfield1));
+  ut_ad(dfield_check_typed(dfield2));
+  const dtype_t *type= dfield_get_type(dfield1);
+
+#ifdef UNIV_DEBUG
+  switch (type->prtype & DATA_MYSQL_TYPE_MASK) {
+  case MYSQL_TYPE_BIT:
+  case MYSQL_TYPE_STRING:
+  case MYSQL_TYPE_VAR_STRING:
+  case MYSQL_TYPE_TINY_BLOB:
+  case MYSQL_TYPE_MEDIUM_BLOB:
+  case MYSQL_TYPE_BLOB:
+  case MYSQL_TYPE_LONG_BLOB:
+  case MYSQL_TYPE_VARCHAR:
+    break;
+  default:
+    ut_error;
+  }
+#endif /* UNIV_DEBUG */
+
+  uint cs_num= dtype_get_charset_coll(type->prtype);
+  CHARSET_INFO *cs= get_charset(cs_num, MYF(MY_WME));
+  ut_a(cs);
+  return !cs->strnncoll(static_cast<const uchar*>(dfield_get_data(dfield1)),
+                        dfield_get_len(dfield1),
+                        static_cast<const uchar*>(dfield_get_data(dfield2)),
+                        dfield_get_len(dfield2), 1);
+}
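As a usage sketch for the comparison entry point above: cmp_data() orders two raw column values given their main and precise type. The fragment below is illustrative only; the literals, lengths, and the DATA_VARCHAR/DATA_ENGLISH type combination are example assumptions, not part of the patch.

/* Sketch: "apple" sorts before "apricot", so the result is negative. */
int example_cmp_varchar()
{
  const char *a= "apple";
  const char *b= "apricot";
  return cmp_data(DATA_VARCHAR, DATA_ENGLISH, false /* ascending */,
                  reinterpret_cast<const byte*>(a), 5,
                  reinterpret_cast<const byte*>(b), 7);
}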
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
new file mode 100644
index 00000000..2f038ab3
--- /dev/null
+++ b/storage/innobase/include/rem0rec.h
@@ -0,0 +1,1276 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.h
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0rec_h
+#define rem0rec_h
+
+#ifndef UNIV_INNOCHECKSUM
+#include "data0data.h"
+#include "rem0types.h"
+#include "mtr0types.h"
+#include "page0types.h"
+#include "dict0dict.h"
+#include "trx0types.h"
+#endif /* !UNIV_INNOCHECKSUM */
+#include <ostream>
+#include <sstream>
+
+/* Number of extra bytes in an old-style record,
+in addition to the data and the offsets */
+#define REC_N_OLD_EXTRA_BYTES	6
+/* Number of extra bytes in a new-style record,
+in addition to the data and the offsets */
+#define REC_N_NEW_EXTRA_BYTES	5
+
+#define REC_NEW_STATUS		3	/* This is a single byte bit-field */
+#define REC_NEW_STATUS_MASK	0x7UL
+#define REC_NEW_STATUS_SHIFT	0
+
+/* The following four constants are needed in page0zip.cc in order to
+efficiently compress and decompress pages. */
+
+/* The offset of heap_no in a compact record */
+#define REC_NEW_HEAP_NO		4
+/* The shift of heap_no in a compact record.
+The status is stored in the low-order bits. */
+#define REC_HEAP_NO_SHIFT	3
+
+/* Length of a B-tree node pointer, in bytes */
+#define REC_NODE_PTR_SIZE	4
+
+#ifndef UNIV_INNOCHECKSUM
+/** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */
+constexpr rec_offs REC_1BYTE_SQL_NULL_MASK= 0x80;
+/** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */
+constexpr rec_offs REC_2BYTE_SQL_NULL_MASK= 0x8000;
+
+/** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most
+significant bit denotes that the tail of a field is stored off-page. */
+constexpr rec_offs REC_2BYTE_EXTERN_MASK= 0x4000;
+
+constexpr size_t RECORD_OFFSET= 2;
+constexpr size_t INDEX_OFFSET=
+    RECORD_OFFSET + sizeof(rec_t *) / sizeof(rec_offs);
+#endif /* UNIV_INNOCHECKSUM */
+
+/* Length of the rec_get_offsets() header */
+constexpr size_t REC_OFFS_HEADER_SIZE=
+#ifdef UNIV_DEBUG
+#ifndef UNIV_INNOCHECKSUM
+    sizeof(rec_t *) / sizeof(rec_offs) +
+    sizeof(dict_index_t *) / sizeof(rec_offs) +
+#endif /* UNIV_INNOCHECKSUM */
+#endif /* UNIV_DEBUG */
+    2;
+
+/* Number of elements that should be initially allocated for the
+offsets[] array, first passed to rec_get_offsets() */
+constexpr size_t REC_OFFS_NORMAL_SIZE= 300;
+constexpr size_t REC_OFFS_SMALL_SIZE= 18;
+constexpr size_t REC_OFFS_SEC_INDEX_SIZE=
+    /* PK max key parts */ 16 + /* sec idx max key parts */ 16 +
+    /* child page number for non-leaf pages */ 1;
+
+/** Get the base address of offsets. The extra_size is stored at
+this position, and following positions hold the end offsets of
+the fields. */
+#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE)
+
+#ifndef UNIV_INNOCHECKSUM
+/* An offset consists of two parts: the 2 upper bits are the type,
+all other bits are the value. */
+
+/** Only 4 different values are possible! */
+enum field_type_t
+{
+  /** normal field */
+  STORED_IN_RECORD= 0 << 14,
+  /** this field is stored off-page */
+  STORED_OFFPAGE= 1 << 14,
+  /** just an SQL NULL */
+  SQL_NULL= 2 << 14,
+  /** instantly added field */
+  DEFAULT= 3 << 14,
+};
+
+/** without 2 upper bits */
+static constexpr rec_offs DATA_MASK= 0x3fff;
+/** 2 upper bits */
+static constexpr rec_offs TYPE_MASK= ~DATA_MASK;
+inline field_type_t get_type(rec_offs n)
+{
+  return static_cast<field_type_t>(n & TYPE_MASK);
+}
+inline void set_type(rec_offs &n, field_type_t type)
+{
+  n= static_cast<rec_offs>((n & DATA_MASK) | type);
+}
+inline rec_offs get_value(rec_offs n) { return n & DATA_MASK; }
+inline rec_offs combine(rec_offs value, field_type_t type)
+{
+  return static_cast<rec_offs>(get_value(value) | type);
+}
+
+/** Compact flag ORed to the extra size returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_COMPACT= rec_offs(~(rec_offs(~0) >> 1));
+/** External flag in offsets returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_EXTERNAL= REC_OFFS_COMPACT >> 1;
+/** Default value flag in offsets returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_DEFAULT= REC_OFFS_COMPACT >> 2;
+constexpr rec_offs REC_OFFS_MASK= REC_OFFS_DEFAULT - 1;
+
+/******************************************************//**
+The following function is used to get the offset of the
+next chained record on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+	rec_t*	rec,	/*!< in: old-style physical record */
+	ulint	next)	/*!< in: offset of the next record */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+	rec_t*	rec,	/*!< in/out: new-style physical record */
+	ulint	next)	/*!< in: offset of the next record */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+	const rec_t*	rec)	/*!< in: physical record */
+	MY_ATTRIBUTE((warn_unused_result));
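+/* A short round trip through the encoding helpers above illustrates the
+14-bit value plus 2-bit type split that rec_get_offsets() stores in each
+element. The function name is hypothetical; only combine(), get_type(),
+get_value() and set_type() from this header are used:
+
+	static void example_offset_encoding()
+	{
+		rec_offs n= combine(120, STORED_OFFPAGE);
+		ut_ad(get_type(n) == STORED_OFFPAGE);
+		ut_ad(get_value(n) == 120);
+		set_type(n, SQL_NULL);
+		ut_ad(get_value(n) == 120);
+	}
+*/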
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+	const rec_t*		rec,	/*!< in: physical record */
+	const dict_index_t*	index)	/*!< in: record descriptor */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Confirm that the n_fields of the entry is sane by comparing it with
+another record on the same page.
+@param[in] index  index
+@param[in] rec    record on the same page
+@param[in] entry  index entry
+@return true if n_fields is sane */
+UNIV_INLINE
+bool
+rec_n_fields_is_sane(
+	dict_index_t*	index,
+	const rec_t*	rec,
+	const dtuple_t*	entry)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+	const rec_t*	rec)	/*!< in: old-style physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+	const rec_t*	rec)	/*!< in: new-style physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to retrieve the info bits of
+a record.
+@return info bits */
+UNIV_INLINE
+byte
+rec_get_info_bits(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Determine the status bits of a non-REDUNDANT record.
+@param[in] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
+@return status bits */
+inline
+rec_comp_status_t
+rec_get_status(const rec_t* rec)
+{
+	byte bits = rec[-REC_NEW_STATUS] & REC_NEW_STATUS_MASK;
+	ut_ad(bits <= REC_STATUS_INSTANT);
+	return static_cast<rec_comp_status_t>(bits);
+}
+
+/** Set the status bits of a non-REDUNDANT record.
+@param[in,out] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
+@param[in] bits status bits */
+inline void rec_set_status(rec_t *rec, byte bits)
+{
+  ut_ad(bits <= REC_STATUS_INSTANT);
+  rec[-REC_NEW_STATUS]= static_cast<byte>((rec[-REC_NEW_STATUS] &
+                                           ~REC_NEW_STATUS_MASK) | bits);
+}
+
+/** Get the length of the added field count in a REC_STATUS_INSTANT record.
+@param[in] n_add_field number of added fields, minus one
+@return storage size of the field count, in bytes */
+inline unsigned rec_get_n_add_field_len(ulint n_add_field)
+{
+  ut_ad(n_add_field < REC_MAX_N_FIELDS);
+  return n_add_field < 0x80 ? 1 : 2;
+}
+
+/** Get the added field count in a REC_STATUS_INSTANT record.
+@param[in,out] header variable header of a REC_STATUS_INSTANT record
+@return number of added fields */
+inline unsigned rec_get_n_add_field(const byte*& header)
+{
+	unsigned n_fields_add = *--header;
+	if (n_fields_add < 0x80) {
+		ut_ad(rec_get_n_add_field_len(n_fields_add) == 1);
+		return n_fields_add;
+	}
+
+	n_fields_add &= 0x7f;
+	n_fields_add |= unsigned(*--header) << 7;
+	ut_ad(n_fields_add < REC_MAX_N_FIELDS);
+	ut_ad(rec_get_n_add_field_len(n_fields_add) == 2);
+	return n_fields_add;
+}
+
+/** Set the added field count in a REC_STATUS_INSTANT record.
+@param[in,out] header variable header of a REC_STATUS_INSTANT record +@param[in] n_add number of added fields, minus 1 +@return record header before the number of added fields */ +inline void rec_set_n_add_field(byte*& header, ulint n_add) +{ + ut_ad(n_add < REC_MAX_N_FIELDS); + + if (n_add < 0x80) { + *header-- = byte(n_add); + } else { + *header-- = byte(byte(n_add) | 0x80); + *header-- = byte(n_add >> 7); + } +} + +/******************************************************//** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) +@return info and status bits */ +UNIV_INLINE +byte +rec_get_info_and_status_bits( +/*=========================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /*!< in/out: compact physical record */ + ulint bits) /*!< in: info bits */ + MY_ATTRIBUTE((nonnull)); + +/******************************************************//** +The following function tells if record is delete marked. +@return nonzero if delete marked */ +UNIV_INLINE +ulint +rec_get_deleted_flag( +/*=================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function tells if a new-style record is a node pointer. +@return TRUE if node pointer */ +UNIV_INLINE +bool +rec_get_node_ptr_flag( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to get the order number +of an old-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to get the order number +of a new-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to test whether the data offsets +in the record are stored in one-byte or two-byte format. +@return TRUE if 1-byte form */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +The following function is used to set the 1-byte offsets flag. */ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /*!< in: physical record */ + ibool flag) /*!< in: TRUE if 1byte form */ + MY_ATTRIBUTE((nonnull)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. 
+@return offset of the start of the field, SQL null flag ORed */ +UNIV_INLINE +uint8_t +rec_1_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag and extern +storage flag ORed */ +UNIV_INLINE +uint16_t +rec_2_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Returns nonzero if the field is stored off-page. +@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Determine how many of the first n columns in a compact +physical record are stored externally. +@return number of externally stored columns */ +ulint +rec_get_n_extern_new( +/*=================*/ + const rec_t* rec, /*!< in: compact physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n) /*!< in: number of columns to scan */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Determine the offsets to each field in an index record. +@param[in] rec physical record +@param[in] index the index that the record belongs to +@param[in,out] offsets array comprising offsets[0] allocated elements, + or an array from rec_get_offsets(), or NULL +@param[in] n_core 0, or index->n_core_fields for leaf page +@param[in] n_fields maximum number of offsets to compute + (ULINT_UNDEFINED to compute all offsets) +@param[in,out] heap memory heap +@return the new offsets */ +rec_offs* +rec_get_offsets_func( + const rec_t* rec, + const dict_index_t* index, + rec_offs* offsets, + ulint n_core, + ulint n_fields, +#ifdef UNIV_DEBUG + const char* file, /*!< in: file name where called */ + unsigned line, /*!< in: line number where called */ +#endif /* UNIV_DEBUG */ + mem_heap_t** heap) /*!< in/out: memory heap */ +#ifdef UNIV_DEBUG + MY_ATTRIBUTE((nonnull(1,2,6,8),warn_unused_result)); +#else /* UNIV_DEBUG */ + MY_ATTRIBUTE((nonnull(1,2,6),warn_unused_result)); +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +# define rec_get_offsets(rec, index, offsets, leaf, n, heap) \ + rec_get_offsets_func(rec,index,offsets,leaf,n,__FILE__,__LINE__,heap) +#else /* UNIV_DEBUG */ +# define rec_get_offsets(rec, index, offsets, leaf, n, heap) \ + rec_get_offsets_func(rec, index, offsets, leaf, n, heap) +#endif /* UNIV_DEBUG */ + +/******************************************************//** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. 
*/ +void +rec_get_offsets_reverse( +/*====================*/ + const byte* extra, /*!< in: the extra bytes of a + compact record in reverse order, + excluding the fixed-size + REC_N_NEW_EXTRA_BYTES */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint node_ptr,/*!< in: nonzero=node pointer, + 0=leaf node */ + rec_offs* offsets)/*!< in/out: array consisting of + offsets[0] allocated elements */ + MY_ATTRIBUTE((nonnull)); +#ifdef UNIV_DEBUG +/** Validate offsets returned by rec_get_offsets(). +@param[in] rec record, or NULL +@param[in] index the index that the record belongs in, or NULL +@param[in,out] offsets the offsets of the record +@return true */ +bool +rec_offs_validate( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets) + MY_ATTRIBUTE((nonnull(3), warn_unused_result)); +/** Update debug data in offsets, in order to tame rec_offs_validate(). +@param[in] rec record +@param[in] index the index that the record belongs in +@param[in] leaf whether the record resides in a leaf page +@param[in,out] offsets offsets from rec_get_offsets() to adjust */ +void +rec_offs_make_valid( + const rec_t* rec, + const dict_index_t* index, + bool leaf, + rec_offs* offsets) + MY_ATTRIBUTE((nonnull)); +#else +# define rec_offs_make_valid(rec, index, leaf, offsets) +#endif /* UNIV_DEBUG */ + +/************************************************************//** +The following function is used to get the offset to the nth +data field in an old-style record. +@return offset to the field */ +ulint +rec_get_nth_field_offs_old( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ + MY_ATTRIBUTE((nonnull)); +#define rec_get_nth_field_old(rec, n, len) \ +((rec) + rec_get_nth_field_offs_old(rec, n, len)) +/************************************************************//** +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. +@return field size in bytes */ +UNIV_INLINE +ulint +rec_get_nth_field_size( +/*===================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: index of the field */ + MY_ATTRIBUTE((warn_unused_result)); +/************************************************************//** +The following function is used to get an offset to the nth +data field in a record. +@return offset from the origin of rec */ +UNIV_INLINE +rec_offs +rec_get_nth_field_offs( +/*===================*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ + MY_ATTRIBUTE((nonnull)); +#define rec_get_nth_field(rec, offsets, n, len) \ +((rec) + rec_get_nth_field_offs(offsets, n, len)) +/******************************************************//** +Determine if the offsets are for a record containing null BLOB pointers. +@return first field containing a null BLOB pointer, or NULL if none found */ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + const rec_offs* offsets) /*!< in: rec_get_offsets(rec) */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Mark the nth field as externally stored. 
+@param[in] offsets array returned by rec_get_offsets() +@param[in] n nth field */ +void +rec_offs_make_nth_extern( + rec_offs* offsets, + const ulint n); + +MY_ATTRIBUTE((nonnull)) +/** Determine the number of allocated elements for an array of offsets. +@param[in] offsets offsets after rec_offs_set_n_alloc() +@return number of elements */ +inline ulint rec_offs_get_n_alloc(const rec_offs *offsets) +{ + ut_ad(offsets); + ulint n_alloc= offsets[0]; + ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); + MEM_CHECK_ADDRESSABLE(offsets, n_alloc * sizeof *offsets); + return n_alloc; +} + +/** Determine the number of fields for which offsets have been initialized. +@param[in] offsets rec_get_offsets() +@return number of fields */ +inline +ulint +rec_offs_n_fields(const rec_offs* offsets) +{ + ulint n_fields; + ut_ad(offsets); + n_fields = offsets[1]; + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + return(n_fields); +} + +/** Get a flag of a record field. +@param[in] offsets rec_get_offsets() +@param[in] n nth field +@param[in] flag flag to extract +@return type of the record field */ +inline field_type_t rec_offs_nth_type(const rec_offs *offsets, ulint n) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return get_type(rec_offs_base(offsets)[1 + n]); +} + +/** Determine if a record field is missing +(should be replaced by dict_index_t::instant_field_value()). +@param[in] offsets rec_get_offsets() +@param[in] n nth field +@return nonzero if default bit is set */ +inline ulint rec_offs_nth_default(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == DEFAULT; +} + +/** Determine if a record field is SQL NULL +(should be replaced by dict_index_t::instant_field_value()). +@param[in] offsets rec_get_offsets() +@param[in] n nth field +@return nonzero if SQL NULL set */ +inline ulint rec_offs_nth_sql_null(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == SQL_NULL; +} + +/** Determine if a record field is stored off-page. +@param[in] offsets rec_get_offsets() +@param[in] n nth field +Returns nonzero if the extern bit is set in nth field of rec. +@return nonzero if externally stored */ +inline ulint rec_offs_nth_extern(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == STORED_OFFPAGE; +} + +/** Get a global flag of a record. +@param[in] offsets rec_get_offsets() +@param[in] flag flag to extract +@return the flag of the record field */ +inline ulint rec_offs_any_flag(const rec_offs *offsets, ulint flag) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return *rec_offs_base(offsets) & flag; +} + +/** Determine if the offsets are for a record containing off-page columns. +@param[in] offsets rec_get_offsets() +@return nonzero if any off-page columns exist */ +inline bool rec_offs_any_extern(const rec_offs *offsets) +{ + return rec_offs_any_flag(offsets, REC_OFFS_EXTERNAL); +} + +/** Determine if the offsets are for a record that is missing fields. +@param[in] offsets rec_get_offsets() +@return nonzero if any fields need to be replaced with + dict_index_t::instant_field_value() */ +inline ulint rec_offs_any_default(const rec_offs *offsets) +{ + return rec_offs_any_flag(offsets, REC_OFFS_DEFAULT); +} + +/** Determine if the offsets are for other than ROW_FORMAT=REDUNDANT. 
+@param[in] offsets rec_get_offsets() +@return nonzero if ROW_FORMAT is COMPACT,DYNAMIC or COMPRESSED +@retval 0 if ROW_FORMAT=REDUNDANT */ +inline ulint rec_offs_comp(const rec_offs *offsets) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return (*rec_offs_base(offsets) & REC_OFFS_COMPACT); +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN or ALTER TABLE. +@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_metadata(const rec_t* rec, ulint comp) +{ + bool is = !!(rec_get_info_bits(rec, comp) & REC_INFO_MIN_REC_FLAG); + ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN or ALTER TABLE. +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_metadata(const rec_t *rec, const dict_index_t &index) +{ + return rec_is_metadata(rec, index.table->not_redundant()); +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN (not other ALTER TABLE). +@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_add_metadata(const rec_t* rec, ulint comp) +{ + bool is = rec_get_info_bits(rec, comp) == REC_INFO_MIN_REC_FLAG; + ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN (not other ALTER TABLE). +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_add_metadata(const rec_t* rec, const dict_index_t& index) +{ + bool is = rec_is_add_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_instant()); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ALTER TABLE (not plain ADD COLUMN). +@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the ALTER TABLE metadata pseudo-record */ +inline bool rec_is_alter_metadata(const rec_t* rec, ulint comp) +{ + bool is = !(~rec_get_info_bits(rec, comp) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)); + ut_ad(!is || rec_is_metadata(rec, comp)); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ALTER TABLE (not plain ADD COLUMN). +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the ALTER TABLE metadata pseudo-record */ +inline bool rec_is_alter_metadata(const rec_t* rec, const dict_index_t& index) +{ + bool is = rec_is_alter_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_dummy || index.is_instant()); + return is; +} + +/** Determine if a record is delete-marked (not a metadata pseudo-record). 
+@param[in] rec record +@param[in] comp nonzero if ROW_FORMAT!=REDUNDANT +@return whether the record is a delete-marked user record */ +inline bool rec_is_delete_marked(const rec_t* rec, ulint comp) +{ + return (rec_get_info_bits(rec, comp) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)) + == REC_INFO_DELETED_FLAG; +} + +/** Get the nth field from an index. +@param[in] rec index record +@param[in] index index +@param[in] offsets rec_get_offsets(rec, index) +@param[in] n field number +@param[out] len length of the field in bytes, or UNIV_SQL_NULL +@return a read-only copy of the index field */ +inline +const byte* +rec_get_nth_cfield( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + ulint n, + ulint* len) +{ + /* Because this function may be invoked by innobase_rec_to_mysql() + for reporting a duplicate key during ALTER TABLE or + CREATE UNIQUE INDEX, and in that case the rec omit the fixed-size + header of 5 or 6 bytes, the check + rec_offs_validate(rec, index, offsets) must be avoided here. */ + if (!rec_offs_nth_default(offsets, n)) { + return rec_get_nth_field(rec, offsets, n, len); + } + return index->instant_field_value(n, len); +} + +/******************************************************//** +Gets the physical size of a field. +@return length of field */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Returns the number of extern bits set in a record. +@return number of externally stored fields */ +UNIV_INLINE +ulint +rec_offs_n_extern( +/*==============*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +The following function returns the data size of an old-style physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_get_data_size_old( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + rec_offs*offsets, /*!< out: array for rec_get_offsets(), + must be allocated */ + ulint n_alloc) /*!< in: number of elements */ + MY_ATTRIBUTE((nonnull)); +#define rec_offs_init(offsets) \ + rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets) +/**********************************************************//** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +Returns the total size of record minus data size of record. 
+The value returned by the function is the distance from the record
+start to the record origin in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+#else /* UNIV_DEBUG */
+# define rec_get_start(rec, offsets) ((rec) - rec_offs_extra_size(offsets))
+# define rec_get_end(rec, offsets) ((rec) + rec_offs_data_size(offsets))
+#endif /* UNIV_DEBUG */
+
+/** Copy a physical record to a buffer.
+@param[in] buf     buffer
+@param[in] rec     physical record
+@param[in] offsets array returned by rec_get_offsets()
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+	void*		buf,
+	const rec_t*	rec,
+	const rec_offs*	offsets);
+
+/** Determine the size of a data tuple prefix in a temporary file.
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[in]  index    clustered or secondary index
+@param[in]  fields   data fields
+@param[in]  n_fields number of data fields
+@param[out] extra    record header size
+@param[in]  status   REC_STATUS_ORDINARY or REC_STATUS_INSTANT
+@return total size, in bytes */
+template<bool redundant_temp>
+ulint
+rec_get_converted_size_temp(
+	const dict_index_t*	index,
+	const dfield_t*		fields,
+	ulint			n_fields,
+	ulint*			extra,
+	rec_comp_status_t	status = REC_STATUS_ORDINARY)
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Determine the offset to each field in a temporary file.
+@param[in]     rec     temporary file record
+@param[in]     index   index that the record belongs to
+@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets)
+@param[in]     n_core  number of core fields (index->n_core_fields)
+@param[in]     def_val default values for non-core fields
+@param[in]     status  REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+void
+rec_init_offsets_temp(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	rec_offs*		offsets,
+	ulint			n_core,
+	const dict_col_t::def_t*def_val,
+	rec_comp_status_t	status = REC_STATUS_ORDINARY)
+	MY_ATTRIBUTE((nonnull(1,2,3)));
+/** Determine the offset to each field in a temporary file.
+@param[in]     rec     temporary file record
+@param[in]     index   index that the record belongs to
+@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets)
+*/
+void
+rec_init_offsets_temp(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	rec_offs*		offsets)
+	MY_ATTRIBUTE((nonnull));
+
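+/* rec_get_converted_size_temp() above and rec_convert_dtuple_to_temp()
+below are normally used as a pair: size the record first, then write it
+with the origin placed "extra" bytes into the buffer, so the variable
+header goes before the origin and the data after it. A hedged sketch;
+the helper name and the mem_heap_alloc() buffer handling are example
+assumptions:
+
+	static void example_write_temp_rec(const dict_index_t *index,
+					   const dfield_t *fields,
+					   ulint n_fields, mem_heap_t *heap)
+	{
+		ulint	extra;
+		ulint	size = rec_get_converted_size_temp<false>(
+			index, fields, n_fields, &extra);
+		byte*	buf = static_cast<byte*>(
+			mem_heap_alloc(heap, size));
+
+		rec_convert_dtuple_to_temp<false>(buf + extra, index,
+						  fields, n_fields);
+	}
+*/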
+/** Convert a data tuple prefix to the temporary file format.
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[out] rec      record in temporary file format
+@param[in]  index    clustered or secondary index
+@param[in]  fields   data fields
+@param[in]  n_fields number of data fields
+@param[in]  status   REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+template<bool redundant_temp>
+void
+rec_convert_dtuple_to_temp(
+	rec_t*			rec,
+	const dict_index_t*	index,
+	const dfield_t*		fields,
+	ulint			n_fields,
+	rec_comp_status_t	status = REC_STATUS_ORDINARY)
+	MY_ATTRIBUTE((nonnull));
+
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return own: copied record */
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+	const rec_t*		rec,		/*!< in: physical record */
+	const dict_index_t*	index,		/*!< in: record descriptor */
+	ulint			n_fields,	/*!< in: number of fields
+						to copy */
+	byte**			buf,		/*!< in/out: memory buffer
+						for the copied prefix,
+						or NULL */
+	ulint*			buf_size)	/*!< in/out: buffer size */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it into the given buffer.
+@return pointer to the origin of physical record */
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+	byte*			buf,	/*!< in: start address of the
+					physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		dtuple,	/*!< in: data tuple */
+	ulint			n_ext)	/*!< in: number of
+					externally stored columns */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+	ulint	data_size,	/*!< in: data size */
+	ulint	n_fields,	/*!< in: number of fields */
+	ulint	n_ext)		/*!< in: number of externally stored columns */
+	MY_ATTRIBUTE((const));
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dfield_t*		fields,	/*!< in: array of data fields */
+	ulint			n_fields,/*!< in: number of data fields */
+	ulint*			extra)	/*!< out: extra size */
+	MY_ATTRIBUTE((warn_unused_result, nonnull(1,2)));
+
+/** Determine the size of a record in ROW_FORMAT=COMPACT.
+@param[in]  index record descriptor. dict_table_is_comp()
+                  is assumed to hold, even if it doesn't
+@param[in]  tuple logical record
+@param[out] extra extra size
+@return total size */
+ulint
+rec_get_converted_size_comp(
+	const dict_index_t*	index,
+	const dtuple_t*		tuple,
+	ulint*			extra)
+	MY_ATTRIBUTE((nonnull(1,2)));
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+/** Copy the first n fields of a (copy of a) physical record to a data tuple.
+The fields are copied into the memory heap.
+@param[out] tuple data tuple +@param[in] rec index record, or a copy thereof +@param[in] index index of rec +@param[in] n_core index->n_core_fields at the time rec was + copied, or 0 if non-leaf page record +@param[in] n_fields number of fields to copy +@param[in,out] heap memory heap */ +void +rec_copy_prefix_to_dtuple( + dtuple_t* tuple, + const rec_t* rec, + const dict_index_t* index, + ulint n_core, + ulint n_fields, + mem_heap_t* heap) + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Validates the consistency of a physical record. +@return TRUE if ok */ +ibool +rec_validate( +/*=========*/ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints an old-style physical record. */ +void +rec_print_old( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints a spatial index record. */ +void +rec_print_mbr_rec( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints a physical record. */ +void +rec_print_new( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints a physical record. */ +void +rec_print( +/*======*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ + MY_ATTRIBUTE((nonnull)); + +/** Pretty-print a record. +@param[in,out] o output stream +@param[in] rec physical record +@param[in] info rec_get_info_bits(rec) +@param[in] offsets rec_get_offsets(rec) */ +void +rec_print( + std::ostream& o, + const rec_t* rec, + ulint info, + const rec_offs* offsets); + +/** Wrapper for pretty-printing a record */ +struct rec_index_print +{ + /** Constructor */ + rec_index_print(const rec_t* rec, const dict_index_t* index) : + m_rec(rec), m_index(index) + {} + + /** Record */ + const rec_t* m_rec; + /** Index */ + const dict_index_t* m_index; +}; + +/** Display a record. +@param[in,out] o output stream +@param[in] r record to display +@return the output stream */ +std::ostream& +operator<<(std::ostream& o, const rec_index_print& r); + +/** Wrapper for pretty-printing a record */ +struct rec_offsets_print +{ + /** Constructor */ + rec_offsets_print(const rec_t* rec, const rec_offs* offsets) : + m_rec(rec), m_offsets(offsets) + {} + + /** Record */ + const rec_t* m_rec; + /** Offsets to each field */ + const rec_offs* m_offsets; +}; + +/** Display a record. +@param[in,out] o output stream +@param[in] r record to display +@return the output stream */ +ATTRIBUTE_COLD +std::ostream& +operator<<(std::ostream& o, const rec_offsets_print& r); + +/** Pretty-printer of records and tuples */ +class rec_printer : public std::ostringstream { +public: + /** Construct a pretty-printed record. + @param rec record with header + @param offsets rec_get_offsets(rec, ...) 
*/ + ATTRIBUTE_COLD + rec_printer(const rec_t* rec, const rec_offs* offsets) + : + std::ostringstream () + { + rec_print(*this, rec, + rec_get_info_bits(rec, rec_offs_comp(offsets)), + offsets); + } + + /** Construct a pretty-printed record. + @param rec record, possibly lacking header + @param info rec_get_info_bits(rec) + @param offsets rec_get_offsets(rec, ...) */ + ATTRIBUTE_COLD + rec_printer(const rec_t* rec, ulint info, const rec_offs* offsets) + : + std::ostringstream () + { + rec_print(*this, rec, info, offsets); + } + + /** Construct a pretty-printed tuple. + @param tuple data tuple */ + ATTRIBUTE_COLD + rec_printer(const dtuple_t* tuple) + : + std::ostringstream () + { + dtuple_print(*this, tuple); + } + + /** Construct a pretty-printed tuple. + @param field array of data tuple fields + @param n number of fields */ + ATTRIBUTE_COLD + rec_printer(const dfield_t* field, ulint n) + : + std::ostringstream () + { + dfield_print(*this, field, n); + } + + /** Destructor */ + ~rec_printer() override = default; + +private: + /** Copy constructor */ + rec_printer(const rec_printer& other); + /** Assignment operator */ + rec_printer& operator=(const rec_printer& other); +}; + + +# ifdef UNIV_DEBUG +/** Read the DB_TRX_ID of a clustered index record. +@param[in] rec clustered index record +@param[in] index clustered index +@return the value of DB_TRX_ID */ +trx_id_t +rec_get_trx_id( + const rec_t* rec, + const dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +# endif /* UNIV_DEBUG */ + +/* Maximum lengths for the data in a physical record if the offsets +are given in one byte (resp. two byte) format. */ +#define REC_1BYTE_OFFS_LIMIT 0x7FUL +#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL + +/* The data size of record must not be larger than this on +REDUNDANT row format because we reserve two upmost bits in a +two byte offset for special purposes */ +#define REDUNDANT_REC_MAX_DATA_SIZE (16383) + +/* The data size of record must be smaller than this on +COMPRESSED row format because we reserve two upmost bits in a +two byte offset for special purposes */ +#define COMPRESSED_REC_MAX_DATA_SIZE (16384) + +#ifdef WITH_WSREP +int wsrep_rec_get_foreign_key( + byte *buf, /* out: extracted key */ + ulint *buf_len, /* in/out: length of buf */ + const rec_t* rec, /* in: physical record */ + dict_index_t* index_for, /* in: index for foreign table */ + dict_index_t* index_ref, /* in: index for referenced table */ + ibool new_protocol); /* in: protocol > 1 */ +#endif /* WITH_WSREP */ + +#include "rem0rec.inl" + +#endif /* !UNIV_INNOCHECKSUM */ +#endif /* rem0rec_h */ diff --git a/storage/innobase/include/rem0rec.inl b/storage/innobase/include/rem0rec.inl new file mode 100644 index 00000000..46c209cb --- /dev/null +++ b/storage/innobase/include/rem0rec.inl @@ -0,0 +1,1134 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0rec.ic +Record manager + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "mach0data.h" +#include "ut0byte.h" +#include "dict0boot.h" +#include "btr0types.h" + +/* Offsets of the bit-fields in an old-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. + + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits pointer to next record + 2 8 bits pointer to next record + 3 1 bit short flag + 7 bits number of fields + 4 3 bits number of fields + 5 bits heap number + 5 8 bits heap number + 6 4 bits n_owned + 4 bits info bits +*/ + +/* Offsets of the bit-fields in a new-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. + + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits relative offset of next record + 2 8 bits relative offset of next record + the relative offset is an unsigned 16-bit + integer: + (offset_of_next_record + - offset_of_this_record) mod 64Ki, + where mod is the modulo as a non-negative + number; + we can calculate the offset of the next + record with the formula: + relative_offset + offset_of_this_record + mod srv_page_size + 3 3 bits status: + 000=REC_STATUS_ORDINARY + 001=REC_STATUS_NODE_PTR + 010=REC_STATUS_INFIMUM + 011=REC_STATUS_SUPREMUM + 100=REC_STATUS_INSTANT + 1xx=reserved + 5 bits heap number + 4 8 bits heap number + 5 4 bits n_owned + 4 bits info bits +*/ + +/* We list the byte offsets from the origin of the record, the mask, +and the shift needed to obtain each bit-field of the record. 
*/
+
+#define REC_NEXT		2
+#define REC_NEXT_MASK		0xFFFFUL
+#define REC_NEXT_SHIFT		0
+
+#define REC_OLD_SHORT		3	/* This is single byte bit-field */
+#define REC_OLD_SHORT_MASK	0x1UL
+#define REC_OLD_SHORT_SHIFT	0
+
+#define REC_OLD_N_FIELDS	4
+#define REC_OLD_N_FIELDS_MASK	0x7FEUL
+#define REC_OLD_N_FIELDS_SHIFT	1
+
+#define REC_OLD_HEAP_NO		5
+#define REC_HEAP_NO_MASK	0xFFF8UL
+#if 0 /* defined in rem0rec.h for use of page0zip.cc */
+#define REC_NEW_HEAP_NO		4
+#define REC_HEAP_NO_SHIFT	3
+#endif
+
+#define REC_OLD_N_OWNED		6	/* This is single byte bit-field */
+#define REC_NEW_N_OWNED		5	/* This is single byte bit-field */
+#define REC_N_OWNED_MASK	0xFUL
+#define REC_N_OWNED_SHIFT	0
+
+#define REC_OLD_INFO_BITS	6	/* This is single byte bit-field */
+#define REC_NEW_INFO_BITS	5	/* This is single byte bit-field */
+#define REC_INFO_BITS_MASK	0xF0UL
+#define REC_INFO_BITS_SHIFT	0
+
+#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \
+		^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \
+		^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \
+		^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \
+		^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \
+		^ 0xFFFFFFFFUL
+# error "sum of old-style masks != 0xFFFFFFFFUL"
+#endif
+#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \
+		^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \
+		^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \
+		^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \
+		^ 0xFFFFFFUL
+# error "sum of new-style masks != 0xFFFFFFUL"
+#endif
+
+/******************************************************//**
+Gets a bit field from within 1 byte. */
+UNIV_INLINE
+byte
+rec_get_bit_field_1(
+/*================*/
+	const rec_t*	rec,	/*!< in: pointer to record origin */
+	ulint		offs,	/*!< in: offset from the origin down */
+	ulint		mask,	/*!< in: mask used to filter bits */
+	ulint		shift)	/*!< in: shift right applied after masking */
+{
+	return static_cast<byte>((*(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 1 byte. */
+UNIV_INLINE
+void
+rec_set_bit_field_1(
+/*================*/
+	rec_t*	rec,	/*!< in: pointer to record origin */
+	ulint	val,	/*!< in: value to set */
+	ulint	offs,	/*!< in: offset from the origin down */
+	ulint	mask,	/*!< in: mask used to filter bits */
+	ulint	shift)	/*!< in: shift right applied after masking */
+{
+	ut_ad(rec);
+	ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+	ut_ad(mask);
+	ut_ad(mask <= 0xFFUL);
+	ut_ad(((mask >> shift) << shift) == mask);
+	ut_ad(((val << shift) & mask) == (val << shift));
+
+	mach_write_to_1(rec - offs,
+			(mach_read_from_1(rec - offs) & ~mask)
+			| (val << shift));
+}
+
+/******************************************************//**
+Gets a bit field from within 2 bytes. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_2(
+/*================*/
+	const rec_t*	rec,	/*!< in: pointer to record origin */
+	ulint		offs,	/*!< in: offset from the origin down */
+	ulint		mask,	/*!< in: mask used to filter bits */
+	ulint		shift)	/*!< in: shift right applied after masking */
+{
+	ut_ad(rec);
+
+	return((mach_read_from_2(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 2 bytes.
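+The setter is the inverse of rec_get_bit_field_2(); for example (an
+illustrative round trip, not part of this header):
+@code
+rec_set_bit_field_2(rec, 10, REC_OLD_N_FIELDS,
+		    REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT);
+ut_ad(rec_get_bit_field_2(rec, REC_OLD_N_FIELDS,
+			  REC_OLD_N_FIELDS_MASK,
+			  REC_OLD_N_FIELDS_SHIFT) == 10);
+@endcode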
*/ +UNIV_INLINE +void +rec_set_bit_field_2( +/*================*/ + rec_t* rec, /*!< in: pointer to record origin */ + ulint val, /*!< in: value to set */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); + ut_ad(mask > 0xFFUL); + ut_ad(mask <= 0xFFFFUL); + ut_ad((mask >> shift) & 1); + ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1))); + ut_ad(((mask >> shift) << shift) == mask); + ut_ad(((val << shift) & mask) == (val << shift)); + + mach_write_to_2(rec - offs, + (mach_read_from_2(rec - offs) & ~mask) + | (val << shift)); +} + +/******************************************************//** +The following function is used to get the offset of the next chained record +on the same page. +@return the page offset of the next chained record, or 0 if none */ +UNIV_INLINE +ulint +rec_get_next_offs( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + ulint field_value; + compile_time_assert(REC_NEXT_MASK == 0xFFFFUL); + compile_time_assert(REC_NEXT_SHIFT == 0); + + field_value = mach_read_from_2(rec - REC_NEXT); + + if (comp) { +#if UNIV_PAGE_SIZE_MAX <= 32768 + /* Note that for 64 KiB pages, field_value can 'wrap around' + and the debug assertion is not valid */ + + /* In the following assertion, field_value is interpreted + as signed 16-bit integer in 2's complement arithmetics. + If all platforms defined int16_t in the standard headers, + the expression could be written simpler as + (int16_t) field_value + ut_align_offset(...) < srv_page_size + */ + ut_ad((field_value >= 32768 + ? field_value - 65536 + : field_value) + + ut_align_offset(rec, srv_page_size) + < srv_page_size); +#endif + if (field_value == 0) { + + return(0); + } + + /* There must be at least REC_N_NEW_EXTRA_BYTES + 1 + between each record. */ + ut_ad((field_value > REC_N_NEW_EXTRA_BYTES + && field_value < 32768) + || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES); + + return(ut_align_offset(rec + field_value, srv_page_size)); + } else { + ut_ad(field_value < srv_page_size); + + return(field_value); + } +} + +/******************************************************//** +The following function is used to set the next record offset field +of an old-style record. */ +UNIV_INLINE +void +rec_set_next_offs_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint next) /*!< in: offset of the next record */ +{ + ut_ad(srv_page_size > next); + compile_time_assert(REC_NEXT_MASK == 0xFFFFUL); + compile_time_assert(REC_NEXT_SHIFT == 0); + mach_write_to_2(rec - REC_NEXT, next); +} + +/******************************************************//** +The following function is used to set the next record offset field +of a new-style record. 
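+For example (illustrative only; assumes a 16KiB page and a record origin at
+page offset 8000), linking to a record at page offset 120 stores
+(120 - 8000) mod 65536 = 57656 in the two bytes below the origin, and
+rec_get_next_offs() folds that value back to 120 modulo the page size.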
*/ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint next) /*!< in: offset of the next record */ +{ + ulint field_value; + + ut_ad(srv_page_size > next); + + if (!next) { + field_value = 0; + } else { + /* The following two statements calculate + next - offset_of_rec mod 64Ki, where mod is the modulo + as a non-negative number */ + + field_value = (ulint) + ((lint) next + - (lint) ut_align_offset(rec, srv_page_size)); + field_value &= REC_NEXT_MASK; + } + + mach_write_to_2(rec - REC_NEXT, field_value); +} + +/******************************************************//** +The following function is used to get the number of fields +in an old-style record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields_old( +/*=================*/ + const rec_t* rec) /*!< in: physical record */ +{ + ulint ret; + + ut_ad(rec); + + ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, + REC_OLD_N_FIELDS_SHIFT); + ut_ad(ret <= REC_MAX_N_FIELDS); + ut_ad(ret > 0); + + return(ret); +} + +/******************************************************//** +The following function is used to set the number of fields +in an old-style record. */ +UNIV_INLINE +void +rec_set_n_fields_old( +/*=================*/ + rec_t* rec, /*!< in: physical record */ + ulint n_fields) /*!< in: the number of fields */ +{ + ut_ad(rec); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields > 0); + + rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); +} + +/******************************************************//** +The following function is used to get the number of fields +in a record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ +{ + ut_ad(rec); + ut_ad(index); + + if (!dict_table_is_comp(index->table)) { + return(rec_get_n_fields_old(rec)); + } + + switch (rec_get_status(rec)) { + case REC_STATUS_INSTANT: + case REC_STATUS_ORDINARY: + return(dict_index_get_n_fields(index)); + case REC_STATUS_NODE_PTR: + return(dict_index_get_n_unique_in_tree(index) + 1); + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + return(1); + } + + ut_error; + return(ULINT_UNDEFINED); +} + +/** Confirms the n_fields of the entry is sane with comparing the other +record in the same page specified +@param[in] index index +@param[in] rec record of the same page +@param[in] entry index entry +@return true if n_fields is sane */ +UNIV_INLINE +bool +rec_n_fields_is_sane( + dict_index_t* index, + const rec_t* rec, + const dtuple_t* entry) +{ + const ulint n_fields = rec_get_n_fields(rec, index); + + return(n_fields == dtuple_get_n_fields(entry) + || (index->is_instant() + && n_fields >= index->n_core_fields) + /* a record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. */ + || (index->table->id == DICT_INDEXES_ID + && n_fields == dtuple_get_n_fields(entry) - 1)); +} + +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. 
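+Only a record that owns a page directory slot has a nonzero count; the
+4-bit field limits it to REC_MAX_N_OWNED (15). An illustrative check:
+@code
+ut_ad(rec_get_n_owned_old(rec) <= REC_MAX_N_OWNED);
+@endcode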
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+	const rec_t*	rec)	/*!< in: old-style physical record */
+{
+	return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED,
+				   REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+	const rec_t*	rec)	/*!< in: new-style physical record */
+{
+	return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED,
+				   REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to retrieve the info bits of a record.
+@return info bits */
+UNIV_INLINE
+byte
+rec_get_info_bits(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	return rec_get_bit_field_1(
+		rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+		REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record.  (Only compact records have status bits.)
+@return info and status bits */
+UNIV_INLINE
+byte
+rec_get_info_and_status_bits(
+/*=========================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
+			      & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+	if (comp)
+		return static_cast<byte>(rec_get_info_bits(rec, TRUE) |
+					 rec_get_status(rec));
+	else
+		return rec_get_info_bits(rec, FALSE);
+}
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record.  (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+	rec_t*	rec,	/*!< in/out: physical record */
+	ulint	bits)	/*!< in: info bits */
+{
+	compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
+			      & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+	rec_set_status(rec, bits & REC_NEW_STATUS_MASK);
+	rec_set_bit_field_1(rec, bits & ~REC_NEW_STATUS_MASK,
+			    REC_NEW_INFO_BITS,
+			    REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	if (comp) {
+		return(rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
+					   REC_INFO_DELETED_FLAG,
+					   REC_INFO_BITS_SHIFT));
+	} else {
+		return(rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
+					   REC_INFO_DELETED_FLAG,
+					   REC_INFO_BITS_SHIFT));
+	}
+}
+
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+bool
+rec_get_node_ptr_flag(
+/*==================*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	return(REC_STATUS_NODE_PTR == rec_get_status(rec));
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
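+The infimum and supremum records have heap numbers 0 and 1; user records
+are numbered upwards from 2 (PAGE_HEAP_NO_USER_LOW in page0page.h).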
+@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/******************************************************//** +The following function is used to get the order number +of a new-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/******************************************************//** +The following function is used to test whether the data offsets in the record +are stored in one-byte or two-byte format. +@return TRUE if 1-byte form */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT)); +} + +/******************************************************//** +The following function is used to set the 1-byte offsets flag. */ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /*!< in: physical record */ + ibool flag) /*!< in: TRUE if 1byte form */ +{ + ut_ad(flag <= 1); + + rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT); +} + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag ORed */ +UNIV_INLINE +uint8_t +rec_1_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1))); +} + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag and extern +storage flag ORed */ +UNIV_INLINE +uint16_t +rec_2_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2))); +} + +/******************************************************//** +Returns nonzero if the field is stored off-page. +@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK); +} + +/**********************************************************//** +The following function sets the number of allocated elements +for an array of offsets. 
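+Callers normally reach this through the rec_offs_init() macro of rem0rec.h;
+an illustrative stack-based pattern:
+@code
+rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+rec_offs_init(offsets_);	/* records the allocated length */
+@endcode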
*/
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+	rec_offs*offsets,	/*!< out: array for rec_get_offsets(),
+				must be allocated */
+	ulint	n_alloc)	/*!< in: number of elements */
+{
+	ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+	MEM_UNDEFINED(offsets, n_alloc * sizeof *offsets);
+	offsets[0] = static_cast<rec_offs>(n_alloc);
+}
+
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+rec_offs
+rec_get_nth_field_offs(
+/*===================*/
+	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n,	/*!< in: index of the field */
+	ulint*		len)	/*!< out: length of the field; UNIV_SQL_NULL
+				if SQL null; UNIV_SQL_DEFAULT is default value */
+{
+	ut_ad(n < rec_offs_n_fields(offsets));
+
+	rec_offs offs = n == 0 ? 0 : get_value(rec_offs_base(offsets)[n]);
+	rec_offs next_offs = rec_offs_base(offsets)[1 + n];
+
+	if (get_type(next_offs) == SQL_NULL) {
+		*len = UNIV_SQL_NULL;
+	} else if (get_type(next_offs) == DEFAULT) {
+		*len = UNIV_SQL_DEFAULT;
+	} else {
+		*len = get_value(next_offs) - offs;
+	}
+
+	return(offs);
+}
+
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+	const rec_t*	rec,		/*!< in: record */
+	const rec_offs*	offsets)	/*!< in: rec_get_offsets(rec) */
+{
+	ulint	i;
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	if (!rec_offs_any_extern(offsets)) {
+		return(NULL);
+	}
+
+	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+		if (rec_offs_nth_extern(offsets, i)) {
+			ulint		len;
+			const byte*	field
+				= rec_get_nth_field(rec, offsets, i, &len);
+
+			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+			if (!memcmp(field + len
+				    - BTR_EXTERN_FIELD_REF_SIZE,
+				    field_ref_zero,
+				    BTR_EXTERN_FIELD_REF_SIZE)) {
+				return(field);
+			}
+		}
+	}
+
+	return(NULL);
+}
+
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n)	/*!< in: nth field */
+{
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	ut_ad(n < rec_offs_n_fields(offsets));
+	if (!n) {
+		return get_value(rec_offs_base(offsets)[1 + n]);
+	}
+	return get_value((rec_offs_base(offsets)[1 + n]))
+	       - get_value(rec_offs_base(offsets)[n]);
+}
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	n = 0;
+
+	if (rec_offs_any_extern(offsets)) {
+		ulint	i;
+
+		for (i = rec_offs_n_fields(offsets); i--; ) {
+			if (rec_offs_nth_extern(offsets, i)) {
+				n++;
+			}
+		}
+	}
+
+	return(n);
+}
+
+/******************************************************//**
+Returns the offset of n - 1th field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
This function and the 2-byte counterpart are defined here because the +C-compiler was not able to sum negative and positive constant offsets, and +warned of constant arithmetic overflow within the compiler. +@return offset of the start of the PREVIOUS field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_1_get_prev_field_end_info( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n))); +} + +/******************************************************//** +Returns the offset of n - 1th field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the PREVIOUS field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_2_get_prev_field_end_info( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n))); +} + +/******************************************************//** +Sets the field end info for the nth field if the record is stored in the +1-byte format. */ +UNIV_INLINE +void +rec_1_set_field_end_info( +/*=====================*/ + rec_t* rec, /*!< in: record */ + ulint n, /*!< in: field index */ + ulint info) /*!< in: value to set */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info); +} + +/******************************************************//** +Sets the field end info for the nth field if the record is stored in the +2-byte format. */ +UNIV_INLINE +void +rec_2_set_field_end_info( +/*=====================*/ + rec_t* rec, /*!< in: record */ + ulint n, /*!< in: field index */ + ulint info) /*!< in: value to set */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info); +} + +/******************************************************//** +Returns the offset of nth field start if the record is stored in the 1-byte +offsets form. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_1_get_field_start_offs( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_1_get_prev_field_end_info(rec, n) + & ~REC_1BYTE_SQL_NULL_MASK); +} + +/******************************************************//** +Returns the offset of nth field start if the record is stored in the 2-byte +offsets form. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_2_get_field_start_offs( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_2_get_prev_field_end_info(rec, n) + & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK)); +} + +/******************************************************//** +The following function is used to read the offset of the start of a data field +in the record. 
The start of an SQL null field is the end offset of the
+previous non-null field, or 0, if none exists. If n is the number of the last
+field + 1, then the end offset of the last field is returned.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_get_field_start_offs(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+{
+	ut_ad(rec);
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	if (n == 0) {
+
+		return(0);
+	}
+
+	if (rec_get_1byte_offs_flag(rec)) {
+
+		return(rec_1_get_field_start_offs(rec, n));
+	}
+
+	return(rec_2_get_field_start_offs(rec, n));
+}
+
+/************************************************************//**
+Gets the physical size of an old-style field.
+Also an SQL null may have a field of size > 0,
+if the data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: index of the field */
+{
+	ulint	os;
+	ulint	next_os;
+
+	os = rec_get_field_start_offs(rec, n);
+	next_os = rec_get_field_start_offs(rec, n + 1);
+
+	ut_ad(next_os - os < srv_page_size);
+
+	return(next_os - os);
+}
+
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	ut_ad(rec);
+
+	return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec)));
+}
+
+/**********************************************************//**
+The following function sets the number of fields in offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_fields(
+/*==================*/
+	rec_offs*	offsets,	/*!< in/out: array returned by
+					rec_get_offsets() */
+	ulint		n_fields)	/*!< in: number of fields */
+{
+	ut_ad(offsets);
+	ut_ad(n_fields > 0);
+	ut_ad(n_fields <= REC_MAX_N_FIELDS);
+	ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+	      <= rec_offs_get_n_alloc(offsets));
+	offsets[1] = static_cast<rec_offs>(n_fields);
+}
+
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	size;
+
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	size = get_value(rec_offs_base(offsets)[rec_offs_n_fields(offsets)]);
+	ut_ad(size < srv_page_size);
+	return(size);
+}
+
+/**********************************************************//**
+Returns the total size of record minus data size of record. The value
+returned by the function is the distance from record start to record origin
+in bytes.
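+Together with rec_offs_data_size() this gives the full footprint of the
+record on the page; rec_offs_size() below returns exactly that sum:
+@code
+ulint total = rec_offs_extra_size(offsets) + rec_offs_data_size(offsets);
+ut_ad(total == rec_offs_size(offsets));
+@endcode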
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	size;
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	size = *rec_offs_base(offsets) & REC_OFFS_MASK;
+	ut_ad(size < srv_page_size);
+	return(size);
+}
+
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets));
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	return(const_cast<rec_t*>(rec + rec_offs_data_size(offsets)));
+}
+
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	return(const_cast<rec_t*>(rec - rec_offs_extra_size(offsets)));
+}
+#endif /* UNIV_DEBUG */
+
+/** Copy a physical record to a buffer.
+@param[in]	buf	buffer
+@param[in]	rec	physical record
+@param[in]	offsets	array returned by rec_get_offsets()
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+	void*		buf,
+	const rec_t*	rec,
+	const rec_offs*	offsets)
+{
+	ulint	extra_len;
+	ulint	data_len;
+
+	ut_ad(rec != NULL);
+	ut_ad(buf != NULL);
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(rec_validate(rec, offsets));
+
+	extra_len = rec_offs_extra_size(offsets);
+	data_len = rec_offs_data_size(offsets);
+
+	memcpy(buf, rec - extra_len, extra_len + data_len);
+
+	return((byte*) buf + extra_len);
+}
+
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+	ulint	data_size,	/*!< in: data size */
+	ulint	n_fields,	/*!< in: number of fields */
+	ulint	n_ext)		/*!< in: number of externally stored columns */
+{
+	if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+		return(REC_N_OLD_EXTRA_BYTES + n_fields);
+	}
+
+	return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields);
+}
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
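+For example, a caller could size a buffer before converting the tuple
+(illustrative only; assumes no externally stored columns, hence n_ext = 0):
+@code
+ulint	size = rec_get_converted_size(index, entry, 0);
+byte*	buf = static_cast<byte*>(mem_heap_alloc(heap, size));
+@endcode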
+@return size */ +UNIV_INLINE +ulint +rec_get_converted_size( +/*===================*/ + dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + ulint data_size; + ulint extra_size; + + ut_ad(dtuple_check_typed(dtuple)); +#ifdef UNIV_DEBUG + if (dict_index_is_ibuf(index)) { + ut_ad(dtuple->n_fields > 1); + } else if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) + == REC_STATUS_NODE_PTR) { + ut_ad(dtuple->n_fields - 1 + == dict_index_get_n_unique_in_tree_nonleaf(index)); + } else if (index->table->id == DICT_INDEXES_ID) { + /* The column SYS_INDEXES.MERGE_THRESHOLD was + instantly added in MariaDB 10.2.2 (MySQL 5.7). */ + ut_ad(!index->table->is_temporary()); + ut_ad(index->n_fields == DICT_NUM_FIELDS__SYS_INDEXES); + ut_ad(dtuple->n_fields == DICT_NUM_FIELDS__SYS_INDEXES + || dtuple->n_fields + == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD); + } else { + ut_ad(dtuple->n_fields >= index->n_core_fields); + ut_ad(dtuple->n_fields <= index->n_fields + || dtuple->is_alter_metadata()); + } +#endif + + if (dict_table_is_comp(index->table)) { + return rec_get_converted_size_comp(index, dtuple, NULL); + } + + data_size = dtuple_get_data_size(dtuple, 0); + + /* If primary key is being updated then the new record inherits + externally stored fields from the delete-marked old record. + In that case, n_ext may be less value than + dtuple_get_n_ext(tuple). */ + ut_ad(n_ext <= dtuple_get_n_ext(dtuple)); + extra_size = rec_get_converted_extra_size( + data_size, dtuple_get_n_fields(dtuple), n_ext); + + return(data_size + extra_size); +} diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h new file mode 100644 index 00000000..0e4075a9 --- /dev/null +++ b/storage/innobase/include/rem0types.h @@ -0,0 +1,78 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0types.h +Record manager global types + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef rem0types_h +#define rem0types_h + +/* We define the physical record simply as an array of bytes */ +typedef byte rec_t; + +/** This type represents a field offset in a rec_t* */ +typedef unsigned short int rec_offs; + +/* Maximum values for various fields (for non-blob tuples) */ +#define REC_MAX_N_FIELDS (1024 - 1) +#define REC_MAX_HEAP_NO (2 * 8192 - 1) +#define REC_MAX_N_OWNED (16 - 1) + +/* Maximum number of user defined fields/columns. 
The reserved columns +are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR. +Before MariaDB Server 10.5, we needed "* 2" because mlog_parse_index() +created a dummy table object possibly, with some of the system columns +in it, and then adds the 3 system columns (again) using +dict_table_add_system_columns(). +For now, we will keep this limitation to maintain file format compatibility +with older versions. */ +#define REC_MAX_N_USER_FIELDS (REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2) + +/* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum +indexed field length (or indexed prefix length) for indexes on tables of +ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT format. +Before we support UTF-8 encodings with mbmaxlen = 4, a UTF-8 character +may take at most 3 bytes. So the limit was set to 3*256, so that one +can create a column prefix index on 256 characters of a TEXT or VARCHAR +column also in the UTF-8 charset. +This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data +files would be at risk! */ +#define REC_ANTELOPE_MAX_INDEX_COL_LEN 768 + +/** Maximum indexed field length for tables that have atomic BLOBs. +This (3072) is the maximum index row length allowed, so we cannot create index +prefix column longer than that. */ +#define REC_VERSION_56_MAX_INDEX_COL_LEN 3072 + +/** Innodb row types are a subset of the MySQL global enum row_type. +They are made into their own enum so that switch statements can account +for each of them. */ +enum rec_format_enum { + REC_FORMAT_REDUNDANT = 0, /*!< REDUNDANT row format */ + REC_FORMAT_COMPACT = 1, /*!< COMPACT row format */ + REC_FORMAT_COMPRESSED = 2, /*!< COMPRESSED row format */ + REC_FORMAT_DYNAMIC = 3 /*!< DYNAMIC row format */ +}; +typedef enum rec_format_enum rec_format_t; + +#endif diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h new file mode 100644 index 00000000..78886332 --- /dev/null +++ b/storage/innobase/include/row0ext.h @@ -0,0 +1,101 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ext.h +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#ifndef row0ext_h +#define row0ext_h + +#include "data0types.h" +#include "mem0mem.h" +#include "dict0types.h" +#include "fsp0types.h" +#include "row0types.h" + +/********************************************************************//** +Creates a cache of column prefixes of externally stored columns. 
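+An illustrative call (n_ext, ext_cols, row and heap stand for values the
+caller has prepared as documented alongside the parameters below):
+@code
+row_ext_t*	ext = row_ext_create(n_ext, ext_cols, *table, row, heap);
+@endcode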
+@return own: column prefix cache */ +row_ext_t* +row_ext_create( +/*===========*/ + ulint n_ext, /*!< in: number of externally stored columns */ + const ulint* ext, /*!< in: col_no's of externally stored columns + in the InnoDB table object, as reported by + dict_col_get_no(); NOT relative to the records + in the clustered index */ + const dict_table_t& table, /*!< in: table */ + const dtuple_t* tuple, /*!< in: data tuple containing the field + references of the externally stored + columns; must be indexed by col_no; + the clustered index record must be + covered by a lock or a page latch + to prevent deletion (rollback or purge). */ + mem_heap_t* heap); /*!< in: heap where created */ + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + const row_ext_t* ext, /*!< in/out: column prefix cache */ + ulint i, /*!< in: index of ext->ext[] */ + ulint* len); /*!< out: length of prefix, in bytes, + at most the length determined by + DICT_MAX_FIELD_LEN_BY_FORMAT() */ +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + const row_ext_t* ext, /*!< in: column prefix cache */ + ulint col, /*!< in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len); /*!< out: length of prefix, in bytes, + at most the length determined by + DICT_MAX_FIELD_LEN_BY_FORMAT() */ + +/** Prefixes of externally stored columns */ +struct row_ext_t{ + ulint n_ext; /*!< number of externally stored columns */ + const ulint* ext; /*!< col_no's of externally stored columns */ + byte* buf; /*!< backing store of the column prefix cache */ + ulint max_len;/*!< maximum prefix length, it could be + REC_ANTELOPE_MAX_INDEX_COL_LEN or + REC_VERSION_56_MAX_INDEX_COL_LEN depending + on row format */ + ulint zip_size;/*!< ROW_FORMAT=COMPRESSED page size, or 0 */ + ulint len[1]; /*!< prefix lengths; 0 if not cached */ +}; + +#include "row0ext.inl" + +#endif diff --git a/storage/innobase/include/row0ext.inl b/storage/innobase/include/row0ext.inl new file mode 100644 index 00000000..913b51b3 --- /dev/null +++ b/storage/innobase/include/row0ext.inl @@ -0,0 +1,87 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ext.ic +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#include "rem0types.h" +#include "btr0types.h" + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + const row_ext_t* ext, /*!< in/out: column prefix cache */ + ulint i, /*!< in: index of ext->ext[] */ + ulint* len) /*!< out: length of prefix, in bytes, + at most ext->max_len */ +{ + ut_ad(ext); + ut_ad(len); + ut_ad(i < ext->n_ext); + + *len = ext->len[i]; + + ut_ad(*len <= ext->max_len); + ut_ad(ext->max_len > 0); + + if (*len == 0) { + /* The BLOB could not be fetched to the cache. */ + return(field_ref_zero); + } else { + return(ext->buf + i * ext->max_len); + } +} + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + const row_ext_t* ext, /*!< in: column prefix cache */ + ulint col, /*!< in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len) /*!< out: length of prefix, in bytes, + at most ext->max_len */ +{ + ulint i; + + ut_ad(ext); + ut_ad(len); + + for (i = 0; i < ext->n_ext; i++) { + if (col == ext->ext[i]) { + return(row_ext_lookup_ith(ext, i, len)); + } + } + + return(NULL); +} diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h new file mode 100644 index 00000000..3ffa8243 --- /dev/null +++ b/storage/innobase/include/row0ftsort.h @@ -0,0 +1,268 @@ +/***************************************************************************** + +Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ftsort.h +Create Full Text Index with (parallel) merge sort + +Created 10/13/2010 Jimmy Yang +*******************************************************/ + +#ifndef row0ftsort_h +#define row0ftsort_h + +#include "data0data.h" +#include "fts0fts.h" +#include "fts0priv.h" +#include "rem0types.h" +#include "row0merge.h" +#include "btr0bulk.h" +#include "srv0srv.h" + +/** This structure defineds information the scan thread will fetch +and put to the linked list for parallel tokenization/sort threads +to process */ +typedef struct fts_doc_item fts_doc_item_t; + +/** Information about temporary files used in merge sort */ +struct fts_doc_item { + dfield_t* field; /*!< field contains document string */ + doc_id_t doc_id; /*!< document ID */ + UT_LIST_NODE_T(fts_doc_item_t) doc_list; + /*!< list of doc items */ +}; + +/** This defines the list type that scan thread would feed the parallel +tokenization threads and sort threads. */ +typedef UT_LIST_BASE_NODE_T(fts_doc_item_t) fts_doc_list_t; + +#define FTS_PLL_MERGE 1 + +/** Sort information passed to each individual parallel sort thread */ +struct fts_psort_t; + +/** Common info passed to each parallel sort thread */ +struct fts_psort_common_t { + row_merge_dup_t* dup; /*!< descriptor of FTS index */ + dict_table_t* new_table; /*!< source table */ + /** Old table page size */ + ulint old_zip_size; + trx_t* trx; /*!< transaction */ + fts_psort_t* all_info; /*!< all parallel sort info */ + pthread_cond_t sort_cond; /*!< sort completion */ + ibool opt_doc_id_size;/*!< whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort, if + Doc ID will not be big enough + to use 8 bytes value */ +}; + +struct fts_psort_t { + ulint psort_id; /*!< Parallel sort ID */ + row_merge_buf_t* merge_buf[FTS_NUM_AUX_INDEX]; + /*!< sort buffer */ + merge_file_t* merge_file[FTS_NUM_AUX_INDEX]; + /*!< sort file */ + row_merge_block_t* merge_block[FTS_NUM_AUX_INDEX]; + /*!< buffer to write to file */ + row_merge_block_t* crypt_block[FTS_NUM_AUX_INDEX]; + /*!< buffer to crypt data */ + ulint child_status; /*!< child task status */ + ulint state; /*!< parent state */ + fts_doc_list_t fts_doc_list; /*!< doc list to process */ + fts_psort_common_t* psort_common; /*!< ptr to all psort info */ + tpool::waitable_task* task; /*!< threadpool task */ + dberr_t error; /*!< db error during psort */ + ulint memory_used; /*!< memory used by fts_doc_list */ + mysql_mutex_t mutex; /*!< mutex for fts_doc_list */ +}; + +/** Row fts token for plugin parser */ +struct row_fts_token_t { + fts_string_t* text; /*!< token */ + UT_LIST_NODE_T(row_fts_token_t) + token_list; /*!< next token link */ +}; + +typedef UT_LIST_BASE_NODE_T(row_fts_token_t) fts_token_list_t; + +/** Structure stores information from string tokenization operation */ +struct fts_tokenize_ctx { + /** the processed string length in bytes + (when using the built-in tokenizer), + or the number of row_merge_fts_doc_tokenize_by_parser() calls */ + ulint processed_len; + ulint init_pos; /*!< doc start position */ + ulint buf_used; /*!< the sort buffer (ID) when + tokenization stops, which + could due to sort buffer full */ + ulint 
rows_added[FTS_NUM_AUX_INDEX]; + /*!< number of rows added for + each FTS index partition */ + ib_rbt_t* cached_stopword;/*!< in: stopword list */ + dfield_t sort_field[FTS_NUM_FIELDS_SORT]; + /*!< in: sort field */ + /** parsed tokens (when using an external parser) */ + fts_token_list_t fts_token_list; + + fts_tokenize_ctx() : + processed_len(0), init_pos(0), buf_used(0), + rows_added(), cached_stopword(NULL), sort_field(), + fts_token_list() + { + memset(rows_added, 0, sizeof rows_added); + memset(sort_field, 0, sizeof sort_field); + UT_LIST_INIT(fts_token_list, &row_fts_token_t::token_list); + } +}; + +typedef struct fts_tokenize_ctx fts_tokenize_ctx_t; + +/** Structure stores information needed for the insertion phase of FTS +parallel sort. */ +struct fts_psort_insert { + CHARSET_INFO* charset; /*!< charset info */ + mem_heap_t* heap; /*!< heap */ + ibool opt_doc_id_size;/*!< Whether to use smaller (4 bytes) + integer for Doc ID */ + BtrBulk* btr_bulk; /*!< Bulk load instance */ + dtuple_t* tuple; /*!< Tuple to insert */ + +#ifdef UNIV_DEBUG + ulint aux_index_id; /*!< Auxiliary index id */ +#endif +}; + +typedef struct fts_psort_insert fts_psort_insert_t; + + +/** status bit used for communication between parent and child thread */ +#define FTS_PARENT_COMPLETE 1 +#define FTS_PARENT_EXITING 2 +#define FTS_CHILD_COMPLETE 1 + +/** Print some debug information */ +#define FTSORT_PRINT + +#ifdef FTSORT_PRINT +#define DEBUG_FTS_SORT_PRINT(str) \ + do { \ + ut_print_timestamp(stderr); \ + fprintf(stderr, str); \ + } while (0) +#else +#define DEBUG_FTS_SORT_PRINT(str) +#endif /* FTSORT_PRINT */ + +/*************************************************************//** +Create a temporary "fts sort index" used to merge sort the +tokenized doc string. The index has three "fields": + +1) Tokenized word, +2) Doc ID +3) Word's position in original 'doc'. + +@return dict_index_t structure for the fts sort index */ +dict_index_t* +row_merge_create_fts_sort_index( +/*============================*/ + dict_index_t* index, /*!< in: Original FTS index + based on which this sort index + is created */ + dict_table_t* table, /*!< in,out: table that FTS index + is being created on */ + ibool* opt_doc_id_size); + /*!< out: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort */ + +/** Initialize FTS parallel sort structures. 
+@param[in] trx transaction +@param[in,out] dup descriptor of FTS index being created +@param[in] new_table table where indexes are created +@param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes + integer to store Doc ID during sort +@param[in] old_zip_size page size of the old table during alter +@param[out] psort parallel sort info to be instantiated +@param[out] merge parallel merge info to be instantiated +@return true if all successful */ +bool +row_fts_psort_info_init( + trx_t* trx, + row_merge_dup_t*dup, + dict_table_t* new_table, + bool opt_doc_id_size, + ulint old_zip_size, + fts_psort_t** psort, + fts_psort_t** merge) + MY_ATTRIBUTE((nonnull)); + +/********************************************************************//** +Clean up and deallocate FTS parallel sort structures, and close +temparary merge sort files */ +void +row_fts_psort_info_destroy( +/*=======================*/ + fts_psort_t* psort_info, /*!< parallel sort info */ + fts_psort_t* merge_info); /*!< parallel merge info */ +/********************************************************************//** +Free up merge buffers when merge sort is done */ +void +row_fts_free_pll_merge_buf( +/*=======================*/ + fts_psort_t* psort_info); /*!< in: parallel sort info */ + +/*********************************************************************//** +Start the parallel tokenization and parallel merge sort */ +void +row_fts_start_psort( +/*================*/ + fts_psort_t* psort_info); /*!< in: parallel sort info */ +/*********************************************************************//** +Kick off the parallel merge and insert thread */ +void +row_fts_start_parallel_merge( +/*=========================*/ + fts_psort_t* merge_info); /*!< in: parallel sort info */ +/********************************************************************//** +Propagate a newly added record up one level in the selection tree +@return parent where this value propagated to */ +int +row_merge_fts_sel_propagate( +/*========================*/ + int propogated, /* + +/***************************************************************//** +Checks if foreign key constraint fails for an index entry. Sets shared locks +which lock either the success or the failure of the constraint. NOTE that +the caller must have a shared latch on dict_foreign_key_check_lock. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or +DB_ROW_IS_REFERENCED */ +dberr_t +row_ins_check_foreign_constraint( +/*=============================*/ + ibool check_ref,/*!< in: TRUE If we want to check that + the referenced table is ok, FALSE if we + want to check the foreign key table */ + dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the + tables mentioned in it must be in the + dictionary cache if they exist at all */ + dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign + table, else the referenced table */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. 
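+(INS_DIRECT nodes are used internally, for example by dict0crea, when the
+caller builds the row itself instead of evaluating a VALUES list.)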
*/ +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /*!< in: insert node */ + dtuple_t* row); /*!< in: new row (or first row) for the node */ +/***************************************************************//** +Tries to insert an entry into a clustered index, ignoring foreign key +constraints. If a record with the same unique key is found, the other +record is necessarily marked deleted by a committed transaction, or a +unique key violation error occurs. The delete marked record is then +updated to an existing record, and we must write an undo log record on +the delete marked record. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +dberr_t +row_ins_clust_index_entry_low( +/*==========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: clustered index */ + ulint n_uniq, /*!< in: 0 or index->n_uniq */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr) /*!< in: query thread or NULL */ + MY_ATTRIBUTE((warn_unused_result)); + +/***************************************************************//** +Tries to insert an entry into a secondary index. If a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_INSERT_TREE is needed +@return error code */ +dberr_t +row_ins_sec_index_entry_low( +/*========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: secondary index */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during + row_log_table_apply(), or 0 */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); + +/***************************************************************//** +Inserts an entry into a clustered index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +dberr_t +row_ins_clust_index_entry( +/*======================*/ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + ulint n_ext) /*!< in: number of externally stored columns */ + MY_ATTRIBUTE((warn_unused_result)); +/***************************************************************//** +Inserts an entry into a secondary index. Tries first optimistic, +then pessimistic descent down the tree. 
If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_sec_index_entry(
+/*====================*/
+	dict_index_t*	index,	/*!< in: secondary index */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	que_thr_t*	thr,	/*!< in: query thread */
+	bool		check_foreign = true) /*!< in: true if check
+				foreign table is needed, false otherwise */
+	MY_ATTRIBUTE((warn_unused_result));
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in
+SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_ins_step(
+/*=========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/* Insert node types */
+#define	INS_SEARCHED	0	/* INSERT INTO ... SELECT ... */
+#define	INS_VALUES	1	/* INSERT INTO ... VALUES ... */
+#define	INS_DIRECT	2	/* this is for internal use in dict0crea:
+				insert the row directly */
+
+/* Node execution states */
+#define	INS_NODE_SET_IX_LOCK	1	/* we should set an IX lock on table */
+#define	INS_NODE_ALLOC_ROW_ID	2	/* row id should be allocated */
+#define	INS_NODE_INSERT_ENTRIES	3	/* index entries should be built and
+					inserted */
+
+struct row_prebuilt_t;
+
+/** Insert node structure */
+struct ins_node_t
+{
+	explicit ins_node_t(ulint ins_type, dict_table_t *table) :
+		common(QUE_NODE_INSERT, NULL),
+		ins_type(ins_type),
+		row(NULL), table(table), select(NULL), values_list(NULL),
+		state(INS_NODE_SET_IX_LOCK), index(NULL),
+		entry_list(), entry(entry_list.end()),
+		trx_id(0), entry_sys_heap(mem_heap_create(128))
+	{
+	}
+	~ins_node_t() { mem_heap_free(entry_sys_heap); }
+	que_common_t	common;	/*!< node type: QUE_NODE_INSERT */
+	ulint		ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */
+	dtuple_t*	row;	/*!< row to insert */
+	dict_table_t*	table;	/*!< table where to insert */
+	sel_node_t*	select;	/*!< select in searched insert */
+	que_node_t*	values_list;/* list of expressions to evaluate and
+				insert in an INS_VALUES insert */
+	ulint		state;	/*!< node execution state */
+	dict_index_t*	index;	/*!< NULL, or the next index where the index
+				entry should be inserted */
+	std::vector<dtuple_t*>
+			entry_list;/* list of entries, one for each index */
+	std::vector<dtuple_t*>::iterator
+			entry;	/*!< NULL, or entry to insert in the index;
+				after a successful insert of the entry,
+				this should be reset to NULL */
+	/** buffer for the system columns */
+	byte		sys_buf[DATA_ROW_ID_LEN
+				+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+	trx_id_t	trx_id;	/*!< trx id or the last trx which executed the
+				node */
+	byte		vers_start_buf[8]; /* Buffers for System Versioning */
+	byte		vers_end_buf[8];   /* system fields. */
+	mem_heap_t*	entry_sys_heap;
+				/* memory heap used as auxiliary storage;
+				entry_list and sys fields are stored here;
+				if this is NULL, entry list should be created
+				and buffers for sys fields in row allocated */
+	void vers_update_end(row_prebuilt_t *prebuilt, bool history_row);
+};
+
+/** Create an insert object.
+@param ins_type	INS_VALUES, ...
+@param table table where to insert +@param heap memory heap +@return the created object */ +inline ins_node_t *ins_node_create(ulint ins_type, dict_table_t *table, + mem_heap_t *heap) +{ + return new (mem_heap_alloc(heap, sizeof(ins_node_t))) + ins_node_t(ins_type, table); +} + +#endif diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h new file mode 100644 index 00000000..469f1f8a --- /dev/null +++ b/storage/innobase/include/row0log.h @@ -0,0 +1,239 @@ +/***************************************************************************** + +Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0log.h +Modification log for online index creation and online table rebuild + +Created 2011-05-26 Marko Makela +*******************************************************/ + +#pragma once + +#include "que0types.h" +#include "mtr0types.h" +#include "row0types.h" +#include "rem0types.h" +#include "dict0dict.h" +#include "trx0types.h" +#include "trx0undo.h" + +class ut_stage_alter_t; + +extern Atomic_counter onlineddl_rowlog_rows; +extern ulint onlineddl_rowlog_pct_used; +extern ulint onlineddl_pct_progress; + +/******************************************************//** +Allocate the row log for an index and flag the index +for online creation. +@retval true if success, false if not */ +bool +row_log_allocate( +/*=============*/ + const trx_t* trx, /*!< in: the ALTER TABLE transaction */ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table, /*!< in/out: new table being rebuilt, + or NULL when creating a secondary index */ + bool same_pk,/*!< in: whether the definition of the + PRIMARY KEY has remained the same */ + const dtuple_t* defaults, + /*!< in: default values of + added, changed columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL if !table */ + const char* path, /*!< in: where to create temporary file */ + const TABLE* old_table, /*!< in:table definition before alter */ + bool allow_not_null) /*!< in: allow null to non-null + conversion */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/******************************************************//** +Free the row log for an index that was being created online. */ +void +row_log_free( +/*=========*/ + row_log_t* log) /*!< in,own: row log */ + MY_ATTRIBUTE((nonnull)); + +/******************************************************//** +Free the row log for an index on which online creation was aborted. 
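+A sketch of the allocate/abort pairing for a secondary index build
+(hypothetical caller; assumes index->lock is held in X mode and
+arguments matching a plain secondary index, i.e. no table rebuild):
+@code
+  if (!row_log_allocate(trx, index, NULL, true, NULL, NULL,
+                        path, old_table, false)) {
+    error = DB_OUT_OF_MEMORY;   // the log could not be allocated
+  }
+  // ... on any later error, abandon the index build:
+  row_log_abort_sec(index);     // frees index->online_log
+@endcode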
 */
+inline void row_log_abort_sec(dict_index_t *index)
+{
+	ut_ad(index->lock.have_u_or_x());
+	ut_ad(!index->is_clust());
+	dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+	row_log_free(index->online_log);
+	index->online_log= nullptr;
+}
+
+/** Logs an operation to a secondary index that is (or was) being created.
+@param index	index, S or X latched
+@param tuple	index tuple
+@param trx_id	transaction ID for insert, or 0 for delete
+@retval false	if a row_log_apply() failure happened
+@retval true	otherwise */
+bool row_log_online_op(dict_index_t *index, const dtuple_t *tuple,
+                       trx_id_t trx_id) ATTRIBUTE_COLD;
+
+/******************************************************//**
+Gets the error status of the online index rebuild log.
+@return DB_SUCCESS or error code */
+dberr_t
+row_log_table_get_error(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: clustered index of a table
+					that is being rebuilt online */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check whether a virtual column is indexed in the new table being
+created during alter table
+@param[in]	index	cluster index
+@param[in]	v_no	virtual column number
+@return true if it is indexed, else false */
+bool
+row_log_col_is_indexed(
+	const dict_index_t*	index,
+	ulint			v_no);
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_delete(). */
+void
+row_log_table_delete(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	const byte*	sys)	/*!< in: DB_TRX_ID,DB_ROLL_PTR that should
+				be logged, or NULL to use those in rec */
+	ATTRIBUTE_COLD __attribute__((nonnull(1,2,3)));
+
+/******************************************************//**
+Logs an update operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_update(). */
+void
+row_log_table_update(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	const dtuple_t*	old_pk);/*!< in: row_log_table_get_pk()
+				before the update */
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change */
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec,index),
+				or NULL */
+	byte*		sys,	/*!< out: DB_TRX_ID,DB_ROLL_PTR for
+				row_log_table_delete(), or NULL */
+	mem_heap_t**	heap)	/*!< in/out: memory heap where allocated */
+	ATTRIBUTE_COLD __attribute__((nonnull(1,2,5), warn_unused_result));
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt.
+This will be merged in row_log_table_apply_insert().
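+A sketch of how this pairs with the writers above during a rebuild
+(hypothetical caller; rec is a clustered index leaf page record and
+offsets comes from rec_get_offsets()):
+@code
+  const dtuple_t* old_pk = row_log_table_get_pk(rec, index, offsets,
+                                                NULL, &heap);
+  // ... modify rec in place ...
+  row_log_table_update(rec, index, offsets, old_pk);
+  // a plain insert needs no old PRIMARY KEY:
+  row_log_table_insert(rec, index, offsets);
+@endcode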
*/ +void +row_log_table_insert( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets);/*!< in: rec_get_offsets(rec,index) */ + +/** Apply the row_log_table log to a table upon completing rebuild. +@param[in] thr query graph +@param[in] old_table old table +@param[in,out] table MySQL table (for reporting duplicates) +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_log_table() will be called initially and then +stage->inc() will be called for each block of log that is applied. +@param[in] new_table Altered table +@return DB_SUCCESS, or error code on failure */ +dberr_t +row_log_table_apply( + que_thr_t* thr, + dict_table_t* old_table, + struct TABLE* table, + ut_stage_alter_t* stage, + dict_table_t* new_table) + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Get the latest transaction ID that has invoked row_log_online_op() +during online creation. +@return latest transaction ID, or 0 if nothing was logged */ +trx_id_t +row_log_get_max_trx( +/*================*/ + dict_index_t* index) /*!< in: index, must be locked */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Apply the row log to the index upon completing index creation. +@param[in] trx transaction (for checking if the operation was +interrupted) +@param[in,out] index secondary index +@param[in,out] table MySQL table (for reporting duplicates) +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_log_index() will be called initially and then +stage->inc() will be called for each block of log that is applied. +@return DB_SUCCESS, or error code on failure */ +dberr_t +row_log_apply( + const trx_t* trx, + dict_index_t* index, + struct TABLE* table, + ut_stage_alter_t* stage) + MY_ATTRIBUTE((warn_unused_result)); + +/** Get the n_core_fields of online log for the index +@param index index whose n_core_fields of log to be accessed +@return number of n_core_fields */ +unsigned row_log_get_n_core_fields(const dict_index_t *index); + +/** Get the error code of online log for the index +@param index online index +@return error code present in online log */ +dberr_t row_log_get_error(const dict_index_t *index); + +#ifdef HAVE_PSI_STAGE_INTERFACE +/** Estimate how much work is to be done by the log apply phase +of an ALTER TABLE for this index. +@param[in] index index whose log to assess +@return work to be done by log-apply in abstract units +*/ +ulint +row_log_estimate_work( + const dict_index_t* index); +#endif /* HAVE_PSI_STAGE_INTERFACE */ diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h new file mode 100644 index 00000000..93ea650d --- /dev/null +++ b/storage/innobase/include/row0merge.h @@ -0,0 +1,496 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0merge.h
+Index build routines using a merge sort
+
+Created 13/06/2005 Jan Lindstrom
+*******************************************************/
+
+#pragma once
+
+#include "que0types.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "rem0rec.h"
+#include "btr0types.h"
+#include "row0mysql.h"
+#include "lock0types.h"
+#include "srv0srv.h"
+
+class ut_stage_alter_t;
+
+/* Reserve free space from every block for key_version */
+#define ROW_MERGE_RESERVE_SIZE 4
+
+/* Clustered index read task is mandatory */
+#define COST_READ_CLUSTERED_INDEX	1.0
+
+/* Basic fixed cost to build any type of index */
+#define COST_BUILD_INDEX_STATIC		0.5
+/* Dynamic cost to build any type of index; it is redistributed
+based on the page count ratio of each index */
+#define COST_BUILD_INDEX_DYNAMIC	0.5
+
+/* Sum of the following two must be 1.0 */
+#define PCT_COST_MERGESORT_INDEX	0.4
+#define PCT_COST_INSERT_INDEX		0.6
+
+// Forward declaration
+struct ib_sequence_t;
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is srv_page_size, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as srv_page_size / 2. */
+typedef byte	row_merge_block_t;
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t. Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte	mrec_buf_t[UNIV_PAGE_SIZE_MAX];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte	mrec_t;
+
+/** Merge record in row_merge_buf_t */
+struct mtuple_t {
+	dfield_t*	fields;		/*!< data fields */
+};
+
+/** Buffer for sorting in main memory.
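+The expected lifecycle, using the functions declared later in this
+file (sketch; duplicate reporting and error handling omitted, and in
+debug builds row_merge_buf_write() also takes the output file):
+@code
+  row_merge_buf_t* buf = row_merge_buf_create(index);
+  // ... fill buf with tuples until it is full ...
+  row_merge_buf_sort(buf, NULL);     // NULL: non-unique index
+  row_merge_buf_write(buf, block);   // flush into a merge block
+  buf = row_merge_buf_empty(buf);    // recycle for the next batch
+  row_merge_buf_free(buf);
+@endcode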
 */
+struct row_merge_buf_t {
+	mem_heap_t*	heap;		/*!< memory heap where allocated */
+	dict_index_t*	index;		/*!< the index the tuples belong to */
+	ulint		total_size;	/*!< total amount of data bytes */
+	ulint		n_tuples;	/*!< number of data tuples */
+	ulint		max_tuples;	/*!< maximum number of data tuples */
+	mtuple_t*	tuples;		/*!< array of data tuples */
+	mtuple_t*	tmp_tuples;	/*!< temporary copy of tuples,
+					for sorting */
+};
+
+/** Information about temporary files used in merge sort */
+struct merge_file_t {
+	pfs_os_file_t	fd;		/*!< file descriptor */
+	ulint		offset;		/*!< file offset (end of file) */
+	ib_uint64_t	n_rec;		/*!< number of records in the file */
+};
+
+/** Index field definition */
+struct index_field_t {
+	ulint		col_no;		/*!< column offset */
+	ulint		prefix_len;	/*!< column prefix length, or 0
+					if indexing the whole column */
+	bool		is_v_col;	/*!< whether this is a virtual column */
+	bool		descending;	/*!< whether to use DESC order */
+};
+
+/** Definition of an index being created */
+struct index_def_t {
+	const char*	name;		/*!< index name */
+	bool		rebuild;	/*!< whether the table is rebuilt */
+	ulint		ind_type;	/*!< 0, DICT_UNIQUE,
+					or DICT_CLUSTERED */
+	ulint		key_number;	/*!< MySQL key number,
+					or ULINT_UNDEFINED if none */
+	ulint		n_fields;	/*!< number of fields in index */
+	index_field_t*	fields;		/*!< field definitions */
+	st_mysql_ftparser*
+			parser;		/*!< fulltext parser plugin */
+};
+
+/** Structure for reporting duplicate records. */
+struct row_merge_dup_t {
+	dict_index_t*	index;	/*!< index being sorted */
+	struct TABLE*	table;	/*!< MySQL table object */
+	const ulint*	col_map;/*!< mapping of column numbers
+				in table to the rebuilt table
+				(index->table), or NULL if not
+				rebuilding table */
+	ulint		n_dup;	/*!< number of duplicates */
+};
+
+/*************************************************************//**
+Report a duplicate key. */
+void
+row_merge_dup_report(
+/*=================*/
+	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
+	const dfield_t*		entry)	/*!< in: duplicate index entry */
+	MY_ATTRIBUTE((nonnull));
+
+/** Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@param trx        dictionary transaction
+@param table      table containing the indexes
+@param locked     true if the table is locked;
+                  false if a lazy drop may be needed
+@param alter_trx  Alter table transaction */
+void
+row_merge_drop_indexes(
+	trx_t*		trx,
+	dict_table_t*	table,
+	bool		locked,
+	const trx_t*	alter_trx=NULL);
+
+/** During recovery, drop recovered index stubs that were created in
+prepare_inplace_alter_table_dict(). */
+void row_merge_drop_temp_indexes();
+
+/** Create temporary merge files in the given parameter path, and if
+UNIV_PFS_IO is defined, register the file descriptor with Performance Schema.
+@param[in]	path	location for creating temporary merge files, or NULL
+@return File descriptor */
+pfs_os_file_t
+row_merge_file_create_low(
+	const char*	path)
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Destroy a merge file, and de-register it from Performance Schema
+if UNIV_PFS_IO is defined. */
+void
+row_merge_file_destroy_low(
+/*=======================*/
+	const pfs_os_file_t&	fd);	/*!< in: merge file descriptor */
+
+/*********************************************************************//**
+Rename an index in the dictionary that was created.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	table_id_t	table_id,	/*!< in: table identifier */
+	index_id_t	index_id)	/*!< in: index identifier */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Create the index and load in to the dictionary.
+@param[in,out]	table		the index is on this table
+@param[in]	index_def	the index definition
+@param[in]	add_v		new virtual columns added along with add
+				index call
+@return index, or NULL on error */
+dict_index_t*
+row_merge_create_index(
+	dict_table_t*		table,
+	const index_def_t*	index_def,
+	const dict_add_v_col_t*	add_v)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Check if a transaction can use an index.
+@return whether the index can be used by the transaction */
+bool
+row_merge_is_index_usable(
+/*======================*/
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_index_t*	index)	/*!< in: index to check */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Map from column numbers to column definitions that include
+changes to the collation, when the encoding is compatible with
+the original column and no table rebuild is needed */
+typedef std::map<unsigned, CHARSET_INFO*> col_collations;
+
+/** Build indexes on a table by reading a clustered index, creating a temporary
+file containing index entries, merge sorting these index entries and inserting
+sorted index entries to indexes.
+@param[in]	trx		transaction
+@param[in]	old_table	table where rows are read from
+@param[in]	new_table	table where indexes are created; identical to
+old_table unless creating a PRIMARY KEY
+@param[in]	online		true if creating indexes online
+@param[in]	indexes		indexes to be created
+@param[in]	key_numbers	MySQL key numbers
+@param[in]	n_indexes	size of indexes[]
+@param[in,out]	table		MySQL table, for reporting erroneous key value
+if applicable
+@param[in]	defaults	default values of added, changed columns, or NULL
+@param[in]	col_map		mapping of old column numbers to new ones, or
+NULL if old_table == new_table
+@param[in]	add_autoinc	number of added AUTO_INCREMENT columns, or
+ULINT_UNDEFINED if none is added
+@param[in,out]	sequence	autoinc sequence
+@param[in]	skip_pk_sort	whether the new PRIMARY KEY will follow
+existing order
+@param[in,out]	stage		performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of
+this function and it will be passed to other functions for further accounting.
+@param[in]	add_v		new virtual columns added along with indexes
+@param[in]	eval_table	mysql table used to evaluate virtual column
+				value, see innobase_get_computed_value().
+@param[in]	allow_non_null	allow the conversion from null to not-null
+@param[in]	col_collate	columns whose collations changed, or nullptr
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_build_indexes(
+	trx_t*			trx,
+	dict_table_t*		old_table,
+	dict_table_t*		new_table,
+	bool			online,
+	dict_index_t**		indexes,
+	const ulint*		key_numbers,
+	ulint			n_indexes,
+	struct TABLE*		table,
+	const dtuple_t*		defaults,
+	const ulint*		col_map,
+	ulint			add_autoinc,
+	ib_sequence_t&		sequence,
+	bool			skip_pk_sort,
+	ut_stage_alter_t*	stage,
+	const dict_add_v_col_t*	add_v,
+	struct TABLE*		eval_table,
+	bool			allow_non_null,
+	const col_collations*	col_collate)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Write a buffer to a block.
+@param buf		sorted buffer
+@param block		buffer for writing to file
+@param blob_file	blob file handle for doing bulk insert operation */
+dberr_t row_merge_buf_write(const row_merge_buf_t *buf,
+#ifndef DBUG_OFF
+                            const merge_file_t *of, /*!< output file */
+#endif
+                            row_merge_block_t *block,
+                            merge_file_t *blob_file= nullptr);
+
+/********************************************************************//**
+Sort a buffer. */
+void
+row_merge_buf_sort(
+/*===============*/
+	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
+	row_merge_dup_t*	dup)	/*!< in/out: reporter of duplicates
+					(NULL if non-unique index) */
+	MY_ATTRIBUTE((nonnull(1)));
+
+/********************************************************************//**
+Write a merge block to the file system.
+@return whether the request was completed successfully
+@retval false on error
+@retval true on success */
+bool
+row_merge_write(
+	const pfs_os_file_t&	fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to write,
+					in number of row_merge_block_t
+					elements */
+	const void*		buf,	/*!< in: data */
+	void*			crypt_buf,	/*!< in: crypt buf or NULL */
+	ulint			space)		/*!< in: space id */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Create a merge file in the given location.
+@param[out]	merge_file	merge file structure
+@param[in]	path		location for creating temporary file, or NULL
+@return file descriptor, or -1 on failure */
+pfs_os_file_t
+row_merge_file_create(
+	merge_file_t*	merge_file,
+	const char*	path)
+	MY_ATTRIBUTE((warn_unused_result, nonnull(1)));
+
+/** Merge disk files.
+@param[in]	trx		transaction
+@param[in]	dup		descriptor of index being created
+@param[in,out]	file		file containing index entries
+@param[in,out]	block		3 buffers
+@param[in,out]	tmpfd		temporary file handle
+@param[in]	update_progress	true, if we should update progress status
+@param[in]	pct_progress	total progress percent until now
+@param[in]	pct_cost	current progress percent
+@param[in]	crypt_block	crypt buf or NULL
+@param[in]	space		space_id
+@param[in,out]	stage		performance schema accounting object, used by
+ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially
+and then stage->inc() will be called for each record processed.
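+A sketch of a typical invocation after the index entries have been
+written to the merge file (hypothetical progress figures; encryption
+and stage accounting omitted):
+@code
+  error = row_merge_sort(trx, &dup, &file, block, &tmpfd,
+                         true, 0.0, PCT_COST_MERGESORT_INDEX,
+                         NULL, space);
+@endcode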
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_sort(
+/*===========*/
+	trx_t*			trx,
+	const row_merge_dup_t*	dup,
+	merge_file_t*		file,
+	row_merge_block_t*	block,
+	pfs_os_file_t*		tmpfd,
+	const bool		update_progress,
+	const double		pct_progress,
+	const double		pct_cost,
+	row_merge_block_t*	crypt_block,
+	ulint			space,
+	ut_stage_alter_t*	stage = NULL)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+	dict_index_t*	index)	/*!< in: secondary index */
+	MY_ATTRIBUTE((warn_unused_result, nonnull, malloc));
+
+/*********************************************************************//**
+Deallocate a sort buffer. */
+void
+row_merge_buf_free(
+/*===============*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer to be freed */
+	MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Destroy a merge file. */
+void
+row_merge_file_destroy(
+/*===================*/
+	merge_file_t*	merge_file)	/*!< in/out: merge file structure */
+	MY_ATTRIBUTE((nonnull));
+
+/** Read a merge block from the file system.
+@return whether the request was completed successfully */
+bool
+row_merge_read(
+/*===========*/
+	const pfs_os_file_t&	fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to read
+					in number of row_merge_block_t
+					elements */
+	row_merge_block_t*	buf,	/*!< out: data */
+	row_merge_block_t*	crypt_buf,	/*!< in: crypt buf or NULL */
+	ulint			space)		/*!< in: space id */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+const byte*
+row_merge_read_rec(
+/*===============*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
+	const byte*		b,	/*!< in: pointer to record */
+	const dict_index_t*	index,	/*!< in: index of the record */
+	const pfs_os_file_t&	fd,	/*!< in: file descriptor */
+	ulint*			foffs,	/*!< in/out: file offset */
+	const mrec_t**		mrec,	/*!< out: pointer to merge record,
+					or NULL on end of list
+					(non-NULL on I/O error) */
+	rec_offs*		offsets,/*!< out: offsets of mrec */
+	row_merge_block_t*	crypt_block,	/*!< in: crypt buf or NULL */
+	ulint			space)		/*!< in: space id */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Buffer for bulk insert */
+class row_merge_bulk_t
+{
+  /** Buffer for each index in the table: main memory
+  buffer for sorting the index */
+  row_merge_buf_t *m_merge_buf;
+  /** Block for IO operation */
+  row_merge_block_t *m_block= nullptr;
+  /** File to store the buffer and used for merge sort */
+  merge_file_t *m_merge_files= nullptr;
+  /** Temporary file to be used for merge sort */
+  pfs_os_file_t m_tmpfd;
+  /** Allocate memory for merge file data structure */
+  ut_allocator<row_merge_block_t> m_alloc;
+  /** Storage for description for the m_alloc */
+  ut_new_pfx_t m_block_pfx;
+  /** Temporary file to store the blob */
+  merge_file_t m_blob_file;
+  /** Storage for description for the crypt_block */
+  ut_new_pfx_t m_crypt_pfx;
+  /** Block for encryption */
+  row_merge_block_t *m_crypt_block= nullptr;
+public:
+  /** Constructor.
+  Create all merge files and merge buffers for all table indexes,
+  except fts indexes.
+  Create a merge block which is used for write I/O operations.
+  @param table  table which undergoes the bulk insert operation */
+  row_merge_bulk_t(dict_table_t *table);
+
+  /** Destructor.
+  Remove all merge files and merge buffers for all table indexes. */
+  ~row_merge_bulk_t();
+
+  /** Remove all buffers for the table indexes */
+  void remove_all_bulk_buffer();
+
+  /** Clean the merge buffer for the given index number */
+  void clean_bulk_buffer(ulint index_no);
+
+  /** Create the temporary file for the given index number
+  @retval true if temporary file creation went well */
+  bool create_tmp_file(ulint index_no);
+
+  /** Write the merge buffer to the tmp file for the given
+  index number.
+  @param index_no  buffer to be written for the index */
+  dberr_t write_to_tmp_file(ulint index_no);
+
+  /** Add the tuple to the merge buffer for the given index.
+  If the buffer runs out of memory, write the buffer into
+  the temporary file and insert the tuple again.
+  @param row  tuple to be inserted
+  @param ind  index to be buffered
+  @param trx  bulk transaction */
+  dberr_t bulk_insert_buffered(const dtuple_t &row, const dict_index_t &ind,
+                               trx_t *trx);
+
+  /** Do the bulk insert operation into the index tree from
+  the buffer, or from the merge file if one exists
+  @param index_no  index to be inserted
+  @param trx       bulk transaction */
+  dberr_t write_to_index(ulint index_no, trx_t *trx);
+
+  /** Do the bulk insert of the buffered entries for the table.
+  @param table  table which undergoes the bulk insert operation
+  @param trx    bulk transaction */
+  dberr_t write_to_table(dict_table_t *table, trx_t *trx);
+
+  /** Allocate a block for writing the buffer to disk */
+  dberr_t alloc_block();
+
+  /** Init temporary files for each index */
+  void init_tmp_file();
+};
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
new file mode 100644
index 00000000..878d9c9f
--- /dev/null
+++ b/storage/innobase/include/row0mysql.h
@@ -0,0 +1,841 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.h
+Interface between Innobase row operations and MySQL.
+Also contains create table and other data dictionary operations.
+ +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#ifndef row0mysql_h +#define row0mysql_h + +#include "que0types.h" +#include "trx0types.h" +#include "row0types.h" +#include "btr0types.h" +#include "lock0types.h" +#include "fil0fil.h" +#include "fts0fts.h" +#include "gis0type.h" + +struct row_prebuilt_t; +class ha_innobase; +class ha_handler_stats; + +/*******************************************************************//** +Frees the blob heap in prebuilt when no longer needed. */ +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct of a + ha_innobase:: table handle */ +/*******************************************************************//** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +byte* +row_mysql_store_true_var_len( +/*=========================*/ + byte* dest, /*!< in: where to store */ + ulint len, /*!< in: length, must fit in two bytes */ + ulint lenlen);/*!< in: storage length of len: either 1 or 2 bytes */ +/*******************************************************************//** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +const byte* +row_mysql_read_true_varchar( +/*========================*/ + ulint* len, /*!< out: variable-length field length */ + const byte* field, /*!< in: field in the MySQL format */ + ulint lenlen);/*!< in: storage length of len: either 1 + or 2 bytes */ +/*******************************************************************//** +Stores a reference to a BLOB in the MySQL format. */ +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /*!< in: where to store */ + ulint col_len,/*!< in: dest buffer size: determines into + how many bytes the BLOB length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const void* data, /*!< in: BLOB data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint len); /*!< in: BLOB length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +/*******************************************************************//** +Reads a reference to a BLOB in the MySQL format. +@return pointer to BLOB data */ +const byte* +row_mysql_read_blob_ref( +/*====================*/ + ulint* len, /*!< out: BLOB length */ + const byte* ref, /*!< in: BLOB reference in the + MySQL format */ + ulint col_len); /*!< in: BLOB reference length + (not BLOB length) */ +/*******************************************************************//** +Converts InnoDB geometry data format to MySQL data format. */ +void +row_mysql_store_geometry( +/*=====================*/ + byte* dest, /*!< in/out: where to store */ + ulint dest_len, /*!< in: dest buffer size: determines into + how many bytes the geometry length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const byte* src, /*!< in: geometry data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint src_len); /*!< in: geometry length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! 
*/ +/**************************************************************//** +Pad a column with spaces. */ +void +row_mysql_pad_col( +/*==============*/ + ulint mbminlen, /*!< in: minimum size of a character, + in bytes */ + byte* pad, /*!< out: padded buffer */ + ulint len); /*!< in: number of bytes to pad */ + +/**************************************************************//** +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.cc. +@return up to which byte we used buf in the conversion */ +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + dfield_t* dfield, /*!< in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /*!< in/out: buffer for a converted + integer value; this must be at least + col_len long then! NOTE that dfield + may also get a pointer to 'buf', + therefore do not discard this as long + as dfield is used! */ + ibool row_format_col, /*!< TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + const byte* mysql_data, /*!< in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /*!< in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp); /*!< in: nonzero=compact format */ +/****************************************************************//** +Handles user errors and lock waits detected by the database engine. +@return true if it was a lock wait and we should continue running the +query thread */ +bool +row_mysql_handle_errors( +/*====================*/ + dberr_t* new_err,/*!< out: possible new error encountered in + rollback, or the old error which was + during the function entry */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread, or NULL */ + trx_savept_t* savept) /*!< in: savepoint, or NULL */ + MY_ATTRIBUTE((nonnull(1,2))); +/********************************************************************//** +Create a prebuilt struct for a MySQL table handle. +@return own: a prebuilt struct */ +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + dict_table_t* table, /*!< in: Innobase table handle */ + ulint mysql_row_len); /*!< in: length in bytes of a row in + the MySQL format */ +/** Free a prebuilt struct for a TABLE handle. */ +void row_prebuilt_free(row_prebuilt_t *prebuilt); +/*********************************************************************//** +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ +void +row_update_prebuilt_trx( +/*====================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + in MySQL handle */ + trx_t* trx); /*!< in: transaction handle */ + +/*********************************************************************//** +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. 
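+A sketch of the usual pairing with row_insert_for_mysql(), which is
+declared below (simplified; error handling and retry omitted):
+@code
+  dberr_t err = row_lock_table_autoinc_for_mysql(prebuilt);
+  if (err == DB_SUCCESS) {
+    err = row_insert_for_mysql(mysql_rec, prebuilt, ROW_INS_NORMAL);
+  }
+@endcode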
+@return error code or DB_SUCCESS */ +dberr_t +row_lock_table_autoinc_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL + table handle */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Lock a table. +@param[in,out] prebuilt table handle +@return error code or DB_SUCCESS */ +dberr_t +row_lock_table(row_prebuilt_t* prebuilt); + +/** System Versioning: row_insert_for_mysql() modes */ +enum ins_mode_t { + /* plain row (without versioning) */ + ROW_INS_NORMAL = 0, + /* row_start = TRX_ID, row_end = MAX */ + ROW_INS_VERSIONED, + /* row_end = TRX_ID */ + ROW_INS_HISTORICAL +}; + +/** Does an insert for MySQL. +@param[in] mysql_rec row in the MySQL format +@param[in,out] prebuilt prebuilt struct in MySQL handle +@param[in] ins_mode what row type we're inserting +@return error code or DB_SUCCESS*/ +dberr_t +row_insert_for_mysql( + const byte* mysql_rec, + row_prebuilt_t* prebuilt, + ins_mode_t ins_mode) + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Builds a dummy query graph used in selects. */ +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + handle */ +/*********************************************************************//** +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. +@return prebuilt update vector */ +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + handle */ +/** Does an update or delete of a row for MySQL. +@param[in,out] prebuilt prebuilt struct in MySQL handle +@return error code or DB_SUCCESS */ +dberr_t +row_update_for_mysql( + row_prebuilt_t* prebuilt) + MY_ATTRIBUTE((warn_unused_result)); + +/** This can only be used when the current transaction is at +READ COMMITTED or READ UNCOMMITTED isolation level. +Before calling this function row_search_mvcc() must have +initialized prebuilt->new_rec_locks to store the information which new +record locks really were set. This function removes a newly set +clustered index record lock under prebuilt->pcur or +prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that +releases the latest clustered index record lock we set. +@param[in,out] prebuilt prebuilt struct in MySQL handle +@param[in] has_latches_on_recs TRUE if called so that we have the + latches on the records under pcur + and clust_pcur, and we do not need + to reposition the cursors. */ +void +row_unlock_for_mysql( + row_prebuilt_t* prebuilt, + ibool has_latches_on_recs); + +/*********************************************************************//** +Creates an query graph node of 'update' type to be used in the MySQL +interface. +@return own: update node */ +upd_node_t* +row_create_update_node_for_mysql( +/*=============================*/ + dict_table_t* table, /*!< in: table to update */ + mem_heap_t* heap); /*!< in: mem heap from which allocated */ + +/**********************************************************************//** +Does a cascaded delete or set null in a foreign key operation. 
+@return error code or DB_SUCCESS */ +dberr_t +row_update_cascade_for_mysql( +/*=========================*/ + que_thr_t* thr, /*!< in: query thread */ + upd_node_t* node, /*!< in: update node used in the cascade + or set null operation */ + dict_table_t* table) /*!< in: table where we do the operation */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Lock the data dictionary cache exclusively. */ +#define row_mysql_lock_data_dictionary(trx) \ + do { \ + ut_ad(!trx->dict_operation_lock_mode); \ + dict_sys.lock(SRW_LOCK_CALL); \ + trx->dict_operation_lock_mode = true; \ + } while (0) + +/** Unlock the data dictionary. */ +#define row_mysql_unlock_data_dictionary(trx) \ + do { \ + ut_ad(!lock_trx_has_sys_table_locks(trx)); \ + ut_ad(trx->dict_operation_lock_mode); \ + trx->dict_operation_lock_mode = false; \ + dict_sys.unlock(); \ + } while (0) + +/*********************************************************************//** +Creates a table for MySQL. On failure the transaction will be rolled back +and the 'table' object will be freed. +@return error code or DB_SUCCESS */ +dberr_t +row_create_table_for_mysql( +/*=======================*/ + dict_table_t* table, /*!< in, own: table definition + (will be freed, or on DB_SUCCESS + added to the data dictionary cache) */ + trx_t* trx) /*!< in/out: transaction */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Create an index when creating a table. +On failure, the caller must drop the table! +@return error number or DB_SUCCESS */ +dberr_t +row_create_index_for_mysql( +/*=======================*/ + dict_index_t* index, /*!< in, own: index definition + (will be freed) */ + trx_t* trx, /*!< in: transaction handle */ + const ulint* field_lengths, /*!< in: if not NULL, must contain + dict_index_get_n_fields(index) + actual field lengths for the + index columns, which are + then checked for not being too + large. */ + fil_encryption_t mode, /*!< in: encryption mode */ + uint32_t key_id) /*!< in: encryption key_id */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Discards the tablespace of a table which stored in an .ibd file. Discarding +means that this function deletes the .ibd file and assigns a new table id for +the table. Also the file_unreadable flag is set. +@return error code or DB_SUCCESS */ +dberr_t row_discard_tablespace_for_mysql(dict_table_t *table, trx_t *trx) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +dberr_t +row_import_tablespace_for_mysql( +/*============================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Renames a table for MySQL. 
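+A caller is expected to bracket this kind of DDL with the dictionary
+latch macros defined above (sketch, simplified):
+@code
+  row_mysql_lock_data_dictionary(trx);
+  err = row_rename_table_for_mysql(old_name, new_name, trx, false);
+  row_mysql_unlock_data_dictionary(trx);
+@endcode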
+@return error code or DB_SUCCESS */ +dberr_t +row_rename_table_for_mysql( +/*=======================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + trx_t* trx, /*!< in/out: transaction */ + bool use_fk) /*!< in: whether to parse and enforce + FOREIGN KEY constraints */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/* A struct describing a place for an individual column in the MySQL +row format which is presented to the table handler in ha_innobase. +This template struct is used to speed up row transformations between +Innobase and MySQL. */ + +struct mysql_row_templ_t { + ulint col_no; /*!< column number of the column */ + ulint rec_field_no; /*!< field number of the column in an + Innobase record in the current index; + not defined if template_type is + ROW_MYSQL_WHOLE_ROW */ + ibool rec_field_is_prefix; /* is this field in a prefix index? */ + ulint rec_prefix_field_no; /* record field, even if just a + prefix; same as rec_field_no when not a + prefix, otherwise rec_field_no is + ULINT_UNDEFINED but this is the true + field number*/ + ulint clust_rec_field_no; /*!< field number of the column in an + Innobase record in the clustered index; + not defined if template_type is + ROW_MYSQL_WHOLE_ROW */ + ulint icp_rec_field_no; /*!< field number of the column in an + Innobase record in the current index; + not defined unless + index condition pushdown is used */ + ulint mysql_col_offset; /*!< offset of the column in the MySQL + row format */ + ulint mysql_col_len; /*!< length of the column in the MySQL + row format */ + ulint mysql_null_byte_offset; /*!< MySQL NULL bit byte offset in a + MySQL record */ + ulint mysql_null_bit_mask; /*!< bit mask to get the NULL bit, + zero if column cannot be NULL */ + ulint type; /*!< column type in Innobase mtype + numbers DATA_CHAR... */ + ulint mysql_type; /*!< MySQL type code; this is always + < 256 */ + ulint mysql_length_bytes; /*!< if mysql_type + == DATA_MYSQL_TRUE_VARCHAR, this tells + whether we should use 1 or 2 bytes to + store the MySQL true VARCHAR data + length at the start of row in the MySQL + format (NOTE that the MySQL key value + format always uses 2 bytes for the data + len) */ + ulint charset; /*!< MySQL charset-collation code + of the column, or zero */ + ulint mbminlen; /*!< minimum length of a char, in bytes, + or zero if not a char type */ + ulint mbmaxlen; /*!< maximum length of a char, in bytes, + or zero if not a char type */ + ulint is_unsigned; /*!< if a column type is an integer + type and this field is != 0, then + it is an unsigned integer type */ + ulint is_virtual; /*!< if a column is a virtual column */ +}; + +#define MYSQL_FETCH_CACHE_SIZE 8 +/* After fetching this many rows, we start caching them in fetch_cache */ +#define MYSQL_FETCH_CACHE_THRESHOLD 4 + +#define ROW_PREBUILT_ALLOCATED 78540783 +#define ROW_PREBUILT_FREED 26423527 + +/** A struct for (sometimes lazily) prebuilt structures in an Innobase table +handle used within MySQL; these are used to save CPU time. 
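+The lifecycle as seen from the handler (sketch; see
+row_create_prebuilt(), row_update_prebuilt_trx() and
+row_prebuilt_free() declared above):
+@code
+  row_prebuilt_t* prebuilt = row_create_prebuilt(table, mysql_row_len);
+  row_update_prebuilt_trx(prebuilt, trx);   // bind the transaction
+  // ... row_insert_for_mysql(), row_update_for_mysql(), ...
+  row_prebuilt_free(prebuilt);
+@endcode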
*/ + +struct row_prebuilt_t { + ulint magic_n; /*!< this magic number is set to + ROW_PREBUILT_ALLOCATED when created, + or ROW_PREBUILT_FREED when the + struct has been freed */ + dict_table_t* table; /*!< Innobase table handle */ + dict_index_t* index; /*!< current index for a search, if + any */ + trx_t* trx; /*!< current transaction handle */ + unsigned sql_stat_start:1;/*!< TRUE when we start processing of + an SQL statement: we may have to set + an intention lock on the table, + create a consistent read view etc. */ + unsigned clust_index_was_generated:1; + /*!< if the user did not define a + primary key in MySQL, then Innobase + automatically generated a clustered + index where the ordering column is + the row id: in this case this flag + is set to TRUE */ + unsigned index_usable:1; /*!< caches the value of + row_merge_is_index_usable(trx,index) */ + unsigned read_just_key:1;/*!< set to 1 when MySQL calls + ha_innobase::extra with the + argument HA_EXTRA_KEYREAD; it is enough + to read just columns defined in + the index (i.e., no read of the + clustered index record necessary) */ + unsigned used_in_HANDLER:1;/*!< TRUE if we have been using this + handle in a MySQL HANDLER low level + index cursor command: then we must + store the pcur position even in a + unique search from a clustered index, + because HANDLER allows NEXT and PREV + in such a situation */ + unsigned template_type:2;/*!< ROW_MYSQL_WHOLE_ROW, + ROW_MYSQL_REC_FIELDS, + ROW_MYSQL_DUMMY_TEMPLATE, or + ROW_MYSQL_NO_TEMPLATE */ + unsigned n_template:10; /*!< number of elements in the + template */ + unsigned null_bitmap_len:10;/*!< number of bytes in the SQL NULL + bitmap at the start of a row in the + MySQL format */ + unsigned need_to_access_clustered:1; /*!< if we are fetching + columns through a secondary index + and at least one column is not in + the secondary index, then this is + set to TRUE; note that sometimes this + is set but we later optimize out the + clustered index lookup */ + unsigned templ_contains_blob:1;/*!< TRUE if the template contains + a column with DATA_LARGE_MTYPE( + get_innobase_type_from_mysql_type()) + is TRUE; + not to be confused with InnoDB + externally stored columns + (VARCHAR can be off-page too) */ + unsigned versioned_write:1;/*!< whether this is + a versioned write */ + mysql_row_templ_t* mysql_template;/*!< template used to transform + rows fast between MySQL and Innobase + formats; memory for this template + is not allocated from 'heap' */ + mem_heap_t* heap; /*!< memory heap from which + these auxiliary structures are + allocated when needed */ + ins_node_t* ins_node; /*!< Innobase SQL insert node + used to perform inserts + to the table */ + byte* ins_upd_rec_buff;/*!< buffer for storing data converted + to the Innobase format from the MySQL + format */ + const byte* default_rec; /*!< the default values of all columns + (a "default row") in MySQL format */ + ulint hint_need_to_fetch_extra_cols; + /*!< normally this is set to 0; if this + is set to ROW_RETRIEVE_PRIMARY_KEY, + then we should at least retrieve all + columns in the primary key; if this + is set to ROW_RETRIEVE_ALL_COLS, then + we must retrieve all columns in the + key (if read_just_key == 1), or all + columns in the table */ + upd_node_t* upd_node; /*!< Innobase SQL update node used + to perform updates and deletes */ + trx_id_t trx_id; /*!< The table->def_trx_id when + ins_graph was built */ + que_fork_t* ins_graph; /*!< Innobase SQL query graph used + in inserts. Will be rebuilt on + trx_id or n_indexes mismatch. 
*/ + que_fork_t* upd_graph; /*!< Innobase SQL query graph used + in updates or deletes */ + btr_pcur_t* pcur; /*!< persistent cursor used in selects + and updates */ + btr_pcur_t* clust_pcur; /*!< persistent cursor used in + some selects and updates */ + que_fork_t* sel_graph; /*!< dummy query graph used in + selects */ + dtuple_t* search_tuple; /*!< prebuilt dtuple used in selects */ + byte row_id[DATA_ROW_ID_LEN]; + /*!< if the clustered index was + generated, the row id of the + last row fetched is stored + here */ + doc_id_t fts_doc_id; /* if the table has an FTS index on + it then we fetch the doc_id. + FTS-FIXME: Currently we fetch it always + but in the future we must only fetch + it when FTS columns are being + updated */ + dtuple_t* clust_ref; /*!< prebuilt dtuple used in + sel/upd/del */ + lock_mode select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */ + bool skip_locked; /*!< TL_{READ,WRITE}_SKIP_LOCKED */ + lock_mode stored_select_lock_type;/*!< this field is used to + remember the original select_lock_type + that was decided in ha_innodb.cc, + ::store_lock(), ::external_lock(), + etc. */ + ulint row_read_type; /*!< ROW_READ_WITH_LOCKS if row locks + should be the obtained for records + under an UPDATE or DELETE cursor. + At READ UNCOMMITTED or + READ COMMITTED isolation level, + this can be set to + ROW_READ_TRY_SEMI_CONSISTENT, so that + if the row under an UPDATE or DELETE + cursor was locked by another + transaction, InnoDB will resort + to reading the last committed value + ('semi-consistent read'). Then, + this field will be set to + ROW_READ_DID_SEMI_CONSISTENT to + indicate that. If the row does not + match the WHERE condition, MySQL will + invoke handler::unlock_row() to + clear the flag back to + ROW_READ_TRY_SEMI_CONSISTENT and + to simply skip the row. If + the row matches, the next call to + row_search_mvcc() will lock + the row. + This eliminates lock waits in some + cases; note that this breaks + serializability. 
 */
+	ulint		new_rec_locks;	/*!< normally 0; if
+					the session is using READ
+					COMMITTED or READ UNCOMMITTED
+					isolation level, set in
+					row_search_mvcc() if we set a new
+					record lock on the secondary
+					or clustered index; this is
+					used in row_unlock_for_mysql()
+					when releasing the lock under
+					the cursor if we determine
+					after retrieving the row that
+					it does not need to be locked
+					('mini-rollback') */
+	ulint		mysql_prefix_len;/*!< byte offset of the end of
+					the last requested column */
+	ulint		mysql_row_len;	/*!< length in bytes of a row in the
+					MySQL format */
+	ulint		n_rows_fetched;	/*!< number of rows fetched after
+					positioning the current cursor */
+	ulint		fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */
+	byte*		fetch_cache[MYSQL_FETCH_CACHE_SIZE];
+					/*!< a cache for fetched rows if we
+					fetch many rows from the same cursor:
+					it saves CPU time to fetch them in a
+					batch; we reserve mysql_row_len
+					bytes for each such row; these
+					pointers point 4 bytes past the
+					allocated mem buf start, because
+					there is a 4 byte magic number at the
+					start and at the end */
+	bool		keep_other_fields_on_keyread; /*!< when using fetch
+					cache with HA_EXTRA_KEYREAD, don't
+					overwrite other fields in the MySQL
+					row buffer.*/
+	ulint		fetch_cache_first;/*!< position of the first not yet
+					fetched row in fetch_cache */
+	ulint		n_fetch_cached;	/*!< number of not yet fetched rows
+					in fetch_cache */
+	mem_heap_t*	blob_heap;	/*!< in SELECTs, BLOB fields are copied
+					to this heap */
+	mem_heap_t*	old_vers_heap;	/*!< memory heap where a previous
+					version is built in consistent read */
+	bool		in_fts_query;	/*!< Whether we are in a FTS query */
+	bool		fts_doc_id_in_read_set; /*!< true if table has externally
+					defined FTS_DOC_ID column. */
+	/*----------------------*/
+	ulonglong	autoinc_last_value;
+					/*!< last value of AUTO-INC interval */
+	ulonglong	autoinc_increment;/*!< The increment step of the auto
+					increment column. Value must be
+					greater than or equal to 1. Required to
+					calculate the next value */
+	ulonglong	autoinc_offset;	/*!< The offset passed to
+					get_auto_increment() by MySQL. Required
+					to calculate the next value */
+	dberr_t		autoinc_error;	/*!< The actual error code encountered
+					while trying to init or read the
+					autoinc value from the table. We
+					store it here so that we can return
+					it to MySQL */
+	/*----------------------*/
+
+	/** Argument of handler_rowid_filter_check(),
+	or NULL if no PRIMARY KEY filter is pushed */
+	ha_innobase*	pk_filter;
+
+	/** Argument to handler_index_cond_check(),
+	or NULL if no index condition pushdown (ICP) is used. */
+	ha_innobase*	idx_cond;
+	ulint		idx_cond_n_cols;/*!< Number of fields in idx_cond_cols.
+					0 if and only if idx_cond == NULL.
*/ + /*----------------------*/ + + /*----------------------*/ + rtr_info_t* rtr_info; /*!< R-tree Search Info */ + /*----------------------*/ + + ulint magic_n2; /*!< this should be the same as + magic_n */ + + byte* srch_key_val1; /*!< buffer used in converting + search key values from MySQL format + to InnoDB format.*/ + byte* srch_key_val2; /*!< buffer used in converting + search key values from MySQL format + to InnoDB format.*/ + uint srch_key_val_len; /*!< Size of search key */ + /** The MySQL table object */ + TABLE* m_mysql_table; + + /** Get template by dict_table_t::cols[] number */ + const mysql_row_templ_t* get_template_by_col(ulint col) const + { + ut_ad(col < n_template); + ut_ad(mysql_template); + for (ulint i = col; i < n_template; ++i) { + const mysql_row_templ_t* templ = &mysql_template[i]; + if (!templ->is_virtual && templ->col_no == col) { + return templ; + } + } + return NULL; + } +}; + +/** Callback for row_mysql_sys_index_iterate() */ +struct SysIndexCallback { + virtual ~SysIndexCallback() = default; + + /** Callback method + @param mtr current mini transaction + @param pcur persistent cursor. */ + virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw() = 0; +}; + + +/** Storage for calculating virtual columns */ + +class String; +struct VCOL_STORAGE +{ + TABLE *maria_table; + byte *innobase_record; + byte *maria_record; + String *blob_value_storage; + VCOL_STORAGE(): maria_table(NULL), innobase_record(NULL), + maria_record(NULL), blob_value_storage(NULL) {} +}; + +/** + Allocate a heap and record for calculating virtual fields + Used mainly for virtual fields in indexes + +@param[in] thd MariaDB THD +@param[in] index Index in use +@param[out] heap Heap that holds temporary row +@param[in,out] mysql_table MariaDB table +@param[out] rec Pointer to allocated MariaDB record +@param[out] storage Internal storage for blobs etc + +@return FALSE ok +@return TRUE malloc failure +*/ + +bool innobase_allocate_row_for_vcol(THD *thd, + const dict_index_t* index, + mem_heap_t** heap, + TABLE** table, + VCOL_STORAGE* storage); + +/** Free memory allocated by innobase_allocate_row_for_vcol() */ +void innobase_free_row_for_vcol(VCOL_STORAGE *storage); + +class ib_vcol_row +{ + VCOL_STORAGE storage; +public: + mem_heap_t *heap; + + ib_vcol_row(mem_heap_t *heap) : heap(heap) {} + + byte *record(THD *thd, const dict_index_t *index, TABLE **table) + { + if (!storage.innobase_record && + !innobase_allocate_row_for_vcol(thd, index, &heap, table, &storage)) + return nullptr; + return storage.innobase_record; + } + + ~ib_vcol_row() + { + if (heap) + { + if (storage.innobase_record) + innobase_free_row_for_vcol(&storage); + mem_heap_free(heap); + } + } +}; + +/** Report virtual value computation failure in ib::error +@param[in] row the data row +*/ +ATTRIBUTE_COLD +void innobase_report_computed_value_failed(dtuple_t *row); + +/** Get the computed value by supplying the base column values. +@param[in,out] row the data row +@param[in] col virtual column +@param[in] index index on the virtual column +@param[in,out] local_heap heap memory for processing large data etc. +@param[in,out] heap memory heap that copies the actual index row +@param[in] ifield index field +@param[in] thd connection handle +@param[in,out] mysql_table MariaDB table handle +@param[in,out] mysql_rec MariaDB record buffer +@param[in] old_table during ALTER TABLE, this is the old table + or NULL. +@param[in] update update vector for the parent row +@param[in] ignore_warnings ignore warnings during calculation. 
Usually + means that a calculation is internal and + should have no side effects. +@return the field filled with computed value */ +dfield_t* +innobase_get_computed_value( + dtuple_t* row, + const dict_v_col_t* col, + const dict_index_t* index, + mem_heap_t** local_heap, + mem_heap_t* heap, + const dict_field_t* ifield, + THD* thd, + TABLE* mysql_table, + byte* mysql_rec, + const dict_table_t* old_table=NULL, + const upd_t* update=NULL, + bool ignore_warnings=false); + +/** Change dbname and table name in table->vc_templ. +@param[in,out] table the table whose virtual column template +dbname and tbname are to be renamed. */ +void +innobase_rename_vc_templ( + dict_table_t* table); + +#define ROW_PREBUILT_FETCH_MAGIC_N 465765687 + +#define ROW_MYSQL_WHOLE_ROW 0 +#define ROW_MYSQL_REC_FIELDS 1 +#define ROW_MYSQL_NO_TEMPLATE 2 +#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in + row_check_index() */ + +/* Values for hint_need_to_fetch_extra_cols */ +#define ROW_RETRIEVE_PRIMARY_KEY 1 +#define ROW_RETRIEVE_ALL_COLS 2 + +/* Values for row_read_type */ +#define ROW_READ_WITH_LOCKS 0 +#define ROW_READ_TRY_SEMI_CONSISTENT 1 +#define ROW_READ_DID_SEMI_CONSISTENT 2 + +#endif /* row0mysql.h */ diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h new file mode 100644 index 00000000..1daf4d4a --- /dev/null +++ b/storage/innobase/include/row0purge.h @@ -0,0 +1,149 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0purge.h +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "que0types.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "trx0types.h" +#include "row0types.h" +#include "row0mysql.h" +#include "mysqld.h" +#include <queue> +#include <unordered_map> + +class MDL_ticket; +/** Determines if it is possible to remove a secondary index entry. +Removal is possible if the secondary index entry does not refer to any +not delete marked version of a clustered index record where DB_TRX_ID +is newer than the purge view. + +NOTE: This function should only be called by the purge thread, only +while holding a latch on the leaf page of the secondary index entry +(or keeping the buffer pool watch on the page). It is possible that +this function first returns true and then false, if a user transaction +inserts a record that the secondary index entry would refer to. +However, in that case, the user transaction would also re-insert the +secondary index entry after purge has removed it and released the leaf +page latch.
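To make that contract concrete, a hedged sketch of a purge-thread caller follows; the helper name and surrounding logic are illustrative, not upstream code, and the leaf-page latch is assumed to be held via mtr:

    /* Sketch only: consult row_purge_poss_sec() while the secondary index
       leaf-page latch is held, and act on the answer within the same mtr. */
    static bool try_purge_sec_entry(purge_node_t* node, dict_index_t* index,
                                    const dtuple_t* entry,
                                    btr_pcur_t* pcur, mtr_t* mtr)
    {
        if (!row_purge_poss_sec(node, index, entry, pcur, mtr, false)) {
            /* A not-delete-marked version newer than the purge view
               still refers to this entry: leave it in place. */
            return false;
        }
        /* ...remove or delete-mark the record here, before the
           leaf-page latch is released... */
        return true;
    }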
+@param[in,out] node row purge node +@param[in] index secondary index +@param[in] entry secondary index entry +@param[in,out] sec_pcur secondary index cursor or NULL + if it is called for purge buffering + operation. +@param[in,out] sec_mtr mini-transaction which holds + secondary index entry or NULL if it is + called for purge buffering operation. +@param[in] is_tree true=pessimistic purge, + false=optimistic (leaf-page only) +@return true if the secondary index record can be purged */ +bool +row_purge_poss_sec( + purge_node_t* node, + dict_index_t* index, + const dtuple_t* entry, + btr_pcur_t* sec_pcur=NULL, + mtr_t* sec_mtr=NULL, + bool is_tree=false); + +/*************************************************************** +Does the purge operation. +@return query thread to run next */ +que_thr_t* +row_purge_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Purge worker context */ +struct purge_node_t +{ + /** node type: QUE_NODE_PURGE */ + que_common_t common; + + /** DB_TRX_ID of the undo log record */ + trx_id_t trx_id; + /** DB_ROLL_PTR pointing to undo log record */ + roll_ptr_t roll_ptr; + + /** undo number of the record */ + undo_no_t undo_no; + + /** record type: TRX_UNDO_INSERT_REC, ... */ + byte rec_type; + /** compiler analysis info of an update */ + byte cmpl_info; + /** whether the clustered index record determined by ref was found + in the clustered index of the table, and we were able to position + pcur on it */ + bool found_clust; +#ifdef UNIV_DEBUG + /** whether the operation is in progress */ + bool in_progress= false; +#endif + /** table where purge is done */ + dict_table_t *table= nullptr; + /** update vector for a clustered index record */ + upd_t *update; + /** row reference to the next row to handle, or nullptr */ + const dtuple_t *ref; + /** nullptr, or a deep copy of the indexed fields of the row to handle */ + dtuple_t *row; + /** nullptr, or the next index of table whose record should be handled */ + dict_index_t *index; + /** memory heap used as auxiliary storage; must be emptied between rows */ + mem_heap_t *heap; + /** persistent cursor to the clustered index record */ + btr_pcur_t pcur; + + /** Undo recs to purge */ + std::queue<trx_purge_rec_t> undo_recs; + + /** map of table identifiers to table handles and meta-data locks */ + std::unordered_map<table_id_t, std::pair<dict_table_t*, MDL_ticket*>> tables; + + /** Constructor */ + explicit purge_node_t(que_thr_t *parent) : + common(QUE_NODE_PURGE, parent), heap(mem_heap_create(256)), + tables(TRX_PURGE_TABLE_BUCKETS) {} + +#ifdef UNIV_DEBUG + /** Validate the persistent cursor. The purge node has two references + to the clustered index record: ref and pcur, which must match + each other if found_clust. + @return whether pcur is consistent with ref */ + bool validate_pcur(); +#endif + + /** Start processing an undo log record. */ + inline void start(); + + /** Reset the state at end + @return the query graph parent */ + inline que_node_t *end(THD *); +}; diff --git a/storage/innobase/include/row0quiesce.h b/storage/innobase/include/row0quiesce.h new file mode 100644 index 00000000..b05b7666 --- /dev/null +++ b/storage/innobase/include/row0quiesce.h @@ -0,0 +1,67 @@ +/***************************************************************************** + +Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation.
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0quiesce.h + +Header file for tablespace quiesce functions. + +Created 2012-02-08 by Sunny Bains +*******************************************************/ + +#ifndef row0quiesce_h +#define row0quiesce_h + +#include "dict0types.h" + +struct trx_t; + +/** The version number of the export meta-data text file. */ +#define IB_EXPORT_CFG_VERSION_V1 0x1UL + +/*********************************************************************//** +Quiesce the tablespace that the table resides in. */ +void +row_quiesce_table_start( +/*====================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ + MY_ATTRIBUTE((nonnull)); + +/*********************************************************************//** +Set a table's quiesce state. +@return DB_SUCCESS or error code. */ +dberr_t +row_quiesce_set_state( +/*==================*/ + dict_table_t* table, /*!< in: quiesce this table */ + ib_quiesce_t state, /*!< in: quiesce state to set */ + trx_t* trx) /*!< in/out: transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Cleanup after table quiesce. */ +void +row_quiesce_table_complete( +/*=======================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ + MY_ATTRIBUTE((nonnull)); + +#endif /* row0quiesce_h */ diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h new file mode 100644 index 00000000..a1350740 --- /dev/null +++ b/storage/innobase/include/row0row.h @@ -0,0 +1,431 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0row.h +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0row_h +#define row0row_h + +#include "que0types.h" +#include "ibuf0ibuf.h" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "rem0types.h" +#include "row0types.h" +#include "btr0types.h" + +/*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. +@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: record offsets */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Reads the trx id field from a clustered index record. +@return value of the field */ +UNIV_INLINE +trx_id_t +row_get_rec_trx_id( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Reads the roll pointer field from a clustered index record. +@return value of the field */ +UNIV_INLINE +roll_ptr_t +row_get_rec_roll_ptr( +/*=================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/* Flags for row build type. */ +#define ROW_BUILD_NORMAL 0 /*!< build index row */ +#define ROW_BUILD_FOR_PURGE 1 /*!< build row for purge. */ +#define ROW_BUILD_FOR_UNDO 2 /*!< build row for undo. */ +#define ROW_BUILD_FOR_INSERT 3 /*!< build row for insert. */ + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged +@retval NULL if the externally stored columns in the clustered index record +are unavailable and ext != NULL, or row is missing some needed columns. */ +dtuple_t* +row_build_index_entry_low( +/*======================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap, /*!< in,out: memory heap from which + the memory for the index entry + is allocated */ + ulint flag) /*!< in: ROW_BUILD_NORMAL, + ROW_BUILD_FOR_PURGE + or ROW_BUILD_FOR_UNDO */ + MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4))); +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. 
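As a usage note, a hedged sketch of how these entry builders are typically invoked when maintaining a secondary index; the variable names (row, ext, sec_index, heap) are illustrative and assumed to be in scope:

    /* Sketch: build the sec_index entry for a row that is being inserted
       or purged. A NULL result means required externally stored columns
       were unavailable (see the @retval note above). */
    dtuple_t* entry = row_build_index_entry(row, ext, sec_index, heap);
    if (!entry) {
        /* the caller must treat the entry as impossible to construct */
    }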
+@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INLINE +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in,out: memory heap from which + the memory for the index entry + is allocated */ + MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4))); +/*******************************************************************//** +An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index. +@return own: row built; see the NOTE below! */ +dtuple_t* +row_build( +/*======*/ + ulint type, /*!< in: ROW_COPY_POINTERS or + ROW_COPY_DATA; the latter + copies also the data fields to + heap while the first only + places pointers to data fields + on the index page, and thus is + more efficient */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_t* rec, /*!< in: record in the clustered + index; NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the row dtuple is used! */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + const dict_table_t* col_table, + /*!< in: table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead; the user + columns in this table should be + the same columns as in index->table */ + const dtuple_t* defaults, + /*!< in: default values of + added, changed columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL */ + row_ext_t** ext, /*!< out, own: cache of + externally stored column + prefixes, or NULL */ + mem_heap_t* heap); /*!< in: memory heap from which + the memory needed is allocated */ + +/** An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index, with possible indexing on ongoing +addition of new virtual columns. +@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA; +@param[in] index clustered index +@param[in] rec record in the clustered index +@param[in] offsets rec_get_offsets(rec,index) or NULL +@param[in] col_table table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead +@param[in] defaults default values of added, changed columns, or NULL +@param[in] add_v new virtual columns added + along with new indexes +@param[in] col_map mapping of old column + numbers to new ones, or NULL +@param[in] ext cache of externally stored column + prefixes, or NULL +@param[in] heap memory heap from which + the memory needed is allocated +@return own: row built */ +dtuple_t* +row_build_w_add_vcol( + ulint type, + const dict_index_t* index, + const rec_t* rec, + const rec_offs* offsets, + const dict_table_t* col_table, + const dtuple_t* defaults, + const dict_add_v_col_t* add_v, + const ulint* col_map, + row_ext_t** ext, + mem_heap_t* heap); + +/*******************************************************************//** +Converts an index record to a typed data tuple. 
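The ROW_COPY_DATA / ROW_COPY_POINTERS distinction above has a latching consequence worth spelling out; a hedged sketch with illustrative arguments (the optional parameters passed as NULL):

    /* ROW_COPY_DATA: field values are copied into heap, so the row
       remains valid after the buffer page latch is released. */
    row_ext_t* ext;
    dtuple_t* row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
                              NULL, NULL, NULL, &ext, heap);

    /* With ROW_COPY_POINTERS the fields would point into the page
       itself: the page must stay at least s-latched while row is used. */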
+@return index entry built; does not set info_bits, and the data fields +in the entry will point directly to rec */ +dtuple_t* +row_rec_to_index_entry_low( +/*=======================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************************//** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. +@return own: index entry built */ +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in/out: rec_get_offsets(rec) */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Convert a metadata record to a data tuple. +@param[in] rec metadata record +@param[in] index clustered index after instant ALTER TABLE +@param[in] offsets rec_get_offsets(rec) +@param[in,out] heap memory heap for allocations +@param[in] info_bits the info_bits after an update +@param[in] pad whether to pad to index->n_fields */ +dtuple_t* +row_metadata_to_tuple( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap, + ulint info_bits, + bool pad) + MY_ATTRIBUTE((nonnull,warn_unused_result)); + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. +@return own: row reference built; see the NOTE below! */ +dtuple_t* +row_build_row_ref( +/*==============*/ + ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /*!< in: secondary index */ + const rec_t* rec, /*!< in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /*!< in/out: row reference built; + see the NOTE below! */ + const rec_t* rec, /*!< in: record in the index; + NOTE: the data fields in ref + will point directly into this + record, therefore, the buffer + page of this record must be at + least s-latched and the latch + held as long as the row + reference is used! */ + const dict_index_t* index, /*!< in: secondary index */ + rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) + or NULL */ + MY_ATTRIBUTE((nonnull(1,2,3))); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. 
*/ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /*!< in/out: typed data tuple where the + reference is built */ + const ulint* map, /*!< in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /*!< in: secondary index record; + must be preserved while ref is used, as we do + not copy field values to heap */ + const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */ +/***************************************************************//** +Searches the clustered index record for a row, if we have the row +reference. +@return true if found */ +bool +row_search_on_row_ref( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor, which must + be closed by the caller */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const dict_table_t* table, /*!< in: table */ + const dtuple_t* ref, /*!< in: row reference */ + mtr_t* mtr) /*!< in/out: mtr */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. +@return record or NULL, if no record found */ +rec_t* +row_get_clust_rec( +/*==============*/ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: secondary index */ + dict_index_t** clust_index,/*!< out: clustered index */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Parse the integer data from specified data, which could be +DATA_INT, DATA_FLOAT or DATA_DOUBLE. If the value is less than 0 +and the type is not unsigned then we reset the value to 0 +@param[in] data data to read +@param[in] len length of data +@param[in] mtype mtype of data +@param[in] unsigned_type if the data is unsigned +@return the integer value from the data */ +inline +ib_uint64_t +row_parse_int( + const byte* data, + ulint len, + ulint mtype, + bool unsigned_type); + +/** Result of row_search_index_entry */ +enum row_search_result { + ROW_FOUND = 0, /*!< the record was found */ + ROW_NOT_FOUND, /*!< record not found */ + ROW_BUFFERED, /*!< one of BTR_INSERT, BTR_DELETE, or + BTR_DELETE_MARK was specified, the + secondary index leaf page was not in + the buffer pool, and the operation was + enqueued in the insert/delete buffer */ + ROW_NOT_DELETED_REF /*!< BTR_DELETE was specified, and + row_purge_poss_sec() failed */ +}; + +/***************************************************************//** +Searches an index record. +@return whether the record was found or buffered */ +enum row_search_result +row_search_index_entry( +/*===================*/ + const dtuple_t* entry, /*!< in: index entry */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#define ROW_COPY_DATA 1 +#define ROW_COPY_POINTERS 2 + +/* The allowed latching order of index records is the following: +(1) a secondary index record -> +(2) the clustered index record -> +(3) rollback segment data for the clustered index record. */ + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) using +"dict_field" and writes the result to "buf". 
+Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size is positive) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +ulint +row_raw_format( +/*===========*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + const dict_field_t* dict_field, /*!< in: index field */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Prepare to start a mini-transaction to modify an index. +@param[in,out] mtr mini-transaction +@param[in,out] index possibly secondary index +@param[in] pessimistic whether this is a pessimistic operation */ +inline +void +row_mtr_start(mtr_t* mtr, dict_index_t* index, bool pessimistic) +{ + mtr->start(); + + switch (index->table->space_id) { + case IBUF_SPACE_ID: + if (pessimistic + && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) { + ibuf_free_excess_pages(); + } + break; + case SRV_TMP_SPACE_ID: + mtr->set_log_mode(MTR_LOG_NO_REDO); + break; + default: + index->set_modified(*mtr); + break; + } + + log_free_check(); +} + +#include "row0row.inl" + +#endif diff --git a/storage/innobase/include/row0row.inl b/storage/innobase/include/row0row.inl new file mode 100644 index 00000000..e89adb58 --- /dev/null +++ b/storage/innobase/include/row0row.inl @@ -0,0 +1,221 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2018, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0row.ic +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0dict.h" +#include "rem0rec.h" +#include "trx0undo.h" + +/*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. +@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: record offsets */ +{ + ulint offset; + ulint len; + + ut_ad(rec_offs_validate(NULL, index, offsets)); + + offset = rec_get_nth_field_offs(offsets, index->db_trx_id(), &len); + + ut_ad(len == DATA_TRX_ID_LEN); + + return(offset); +} + +/*********************************************************************//** +Reads the trx id field from a clustered index record. 
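A short hedged sketch of the two accessors declared earlier and defined in this file; rec must be a clustered index record and offsets must come from rec_get_offsets():

    /* Sketch: read the hidden system columns of a clustered index record. */
    trx_id_t   trx_id   = row_get_rec_trx_id(rec, clust_index, offsets);
    roll_ptr_t roll_ptr = row_get_rec_roll_ptr(rec, clust_index, offsets);
    /* Both accessors use the cached index->trx_id_offset when it is
       nonzero and fall back to row_get_trx_id_offset() otherwise. */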
+@return value of the field */ +UNIV_INLINE +trx_id_t +row_get_rec_trx_id( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + + return(trx_read_trx_id(rec + offset)); +} + +/*********************************************************************//** +Reads the roll pointer field from a clustered index record. +@return value of the field */ +UNIV_INLINE +roll_ptr_t +row_get_rec_roll_ptr( +/*=================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + + return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); +} + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INLINE +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in,out: memory heap from which + the memory for the index entry + is allocated */ +{ + dtuple_t* entry; + + ut_ad(dtuple_check_typed(row)); + entry = row_build_index_entry_low(row, ext, index, heap, + ROW_BUILD_NORMAL); + ut_ad(!entry || dtuple_check_typed(entry)); + return(entry); +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /*!< in/out: typed data tuple where the + reference is built */ + const ulint* map, /*!< in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /*!< in: secondary index record; + must be preserved while ref is used, as we do + not copy field values to heap */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + dfield_t* dfield; + const byte* field; + ulint len; + ulint ref_len; + ulint field_no; + ulint i; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_any_extern(offsets)); + ref_len = dtuple_get_n_fields(ref); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + field_no = *(map + i); + + if (field_no != ULINT_UNDEFINED) { + + field = rec_get_nth_field(rec, offsets, + field_no, &len); + dfield_set_data(dfield, field, len); + } + } +} + +/** Parse the integer data from specified data, which could be +DATA_INT, DATA_FLOAT or DATA_DOUBLE. 
If the value is less than 0 +and the type is not unsigned then we reset the value to 0 +@param[in] data data to read +@param[in] len length of data +@param[in] mtype mtype of data +@param[in] unsigned_type if the data is unsigned +@return the integer value from the data */ +ib_uint64_t +row_parse_int( + const byte* data, + ulint len, + ulint mtype, + bool unsigned_type) +{ + ib_uint64_t value = 0; + + switch (mtype) { + case DATA_INT: + + ut_a(len <= sizeof value); + value = mach_read_int_type(data, len, unsigned_type); + break; + + case DATA_FLOAT: + + ut_a(len == sizeof(float)); + value = static_cast<ib_uint64_t>(mach_float_read(data)); + break; + + case DATA_DOUBLE: + + ut_a(len == sizeof(double)); + value = static_cast<ib_uint64_t>(mach_double_read(data)); + break; + + default: + ut_error; + + } + + if (!unsigned_type && static_cast<int64_t>(value) < 0) { + value = 0; + } + + return(value); +} + diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h new file mode 100644 index 00000000..8134c60f --- /dev/null +++ b/storage/innobase/include/row0sel.h @@ -0,0 +1,457 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0sel.h +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "data0data.h" +#include "que0types.h" +#include "trx0types.h" +#include "read0types.h" +#include "row0types.h" +#include "que0types.h" +#include "pars0sym.h" +#include "btr0pcur.h" +#include "row0mysql.h" + +/*********************************************************************//** +Creates a select node struct. +@return own: select node struct */ +sel_node_t* +sel_node_create( +/*============*/ + mem_heap_t* heap); /*!< in: memory heap where created */ +/*********************************************************************//** +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ +void +sel_node_free_private( +/*==================*/ + sel_node_t* node); /*!< in: select node struct */ +/*********************************************************************//** +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */ +/**********************************************************************//** +Performs a select step. This is a high-level function used in SQL execution +graphs.
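A worked example for row_parse_int() defined above; it assumes the usual InnoDB on-disk convention that DATA_INT is stored big-endian with the sign bit flipped for signed types, so 0x80000001 encodes +1:

    /* Sketch: parse a signed 4-byte DATA_INT value. */
    const byte data[4] = {0x80, 0x00, 0x00, 0x01};
    ib_uint64_t v = row_parse_int(data, 4, DATA_INT, false);
    /* v == 1; a stored negative value would be clamped to 0 by the
       final sign check in row_parse_int(). */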
+@return query thread to run next or NULL */ +que_thr_t* +row_sel_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs a fetch for a cursor. +@return query thread to run next or NULL */ +que_thr_t* +fetch_step( +/*=======*/ + que_thr_t* thr); /*!< in: query thread */ +/***********************************************************//** +Prints a row in a select result. +@return query thread to run next or NULL */ +que_thr_t* +row_printf_step( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ + +/** Copy used fields from cached row. +Copy cache record field by field, don't touch fields that +are not covered by the current key. +@param[out] buf Where to copy the MySQL row. +@param[in] cached_rec What to copy (in MySQL row format). +@param[in] prebuilt prebuilt struct. */ +void +row_sel_copy_cached_fields_for_mysql( + byte* buf, + const byte* cached_rec, + row_prebuilt_t* prebuilt); + +/****************************************************************//** +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. Currently we do not allow search keys where the +last field is only a prefix of the full key field length; a warning is +printed if such a key appears. */ +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /*!< in/out: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /*!< in: buffer to use in field + conversions; NOTE that dtuple->data + may end up pointing inside buf so + do not discard that buffer while + the tuple is being used. See + row_mysql_store_col_in_innobase_format() + in the case of DATA_INT */ + ulint buf_len, /*!< in: buffer length */ + dict_index_t* index, /*!< in: index of the key value */ + const byte* key_ptr, /*!< in: MySQL key value */ + ulint key_len); /*!< in: MySQL key value length */ + + +/** Search for rows in the database using a cursor. +The function is mainly used for tables that are shared across connections, +so it employs techniques that help re-construct the rows that the +transaction is supposed to see. +It also has optimizations such as pre-caching rows and using the adaptive +hash index (AHI). + +@param[out] buf buffer for the fetched row in MySQL format +@param[in] mode search mode, e.g. PAGE_CUR_L +@param[in,out] prebuilt prebuilt struct for the table handler; + this contains the info of search_tuple, + index; if the search tuple contains 0 fields, + then we position the cursor at the start or + the end of the index, depending on 'mode' +@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX +@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV; + Note: if this is != 0, then prebuilt must have + a pcur with a stored position! When opening a + cursor, 'direction' should be 0. +@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, +DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */ +dberr_t +row_search_mvcc( + byte* buf, + page_cur_mode_t mode, + row_prebuilt_t* prebuilt, + ulint match_mode, + ulint direction) + MY_ATTRIBUTE((warn_unused_result)); + +/********************************************************************//** +Count rows in an R-Tree leaf level.
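A hedged sketch of the fetch pattern implied by the 'direction' rule above (the first call with direction 0 opens the cursor; later calls resume from the stored pcur position). The loop shape is illustrative, not the actual handler code; buf and prebuilt are assumed to be set up:

    dberr_t err = row_search_mvcc(buf, PAGE_CUR_GE, prebuilt,
                                  0 /* match_mode: range scan */,
                                  0 /* open the cursor */);
    while (err == DB_SUCCESS) {
        /* one row, in MySQL format, is now in buf */
        err = row_search_mvcc(buf, PAGE_CUR_GE, prebuilt, 0, ROW_SEL_NEXT);
    }
    /* on a normal end of scan err is DB_RECORD_NOT_FOUND or
       DB_END_OF_INDEX; the other codes listed above indicate errors */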
+@return DB_SUCCESS if successful */ +dberr_t +row_count_rtree_recs( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint* n_rows); /*!< out: number of entries + seen in the consistent read */ + +/** +Check the index records in CHECK TABLE. +The index must contain entries in ascending order, +unique constraint must not be violated by duplicated keys, +and the number of index entries is counted according to the +current read view. + +@param prebuilt index and transaction +@param n_rows number of records counted + +@return error code +@retval DB_SUCCESS if no error was found */ +dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Read the max AUTOINC value from an index. +@param[in] index index starting with an AUTO_INCREMENT column +@return the largest AUTO_INCREMENT value +@retval 0 if no records were found */ +ib_uint64_t +row_search_max_autoinc(dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** A structure for caching column values for prefetched rows */ +struct sel_buf_t{ + byte* data; /*!< data, or NULL; if not NULL, this field + has allocated memory which must be explicitly + freed; can be != NULL even when len is + UNIV_SQL_NULL */ + ulint len; /*!< data length or UNIV_SQL_NULL */ + ulint val_buf_size; + /*!< size of memory buffer allocated for data: + this can be more than len; this is defined + when data != NULL */ +}; + +/** Copy used fields from cached row. +Copy cache record field by field, don't touch fields that +are not covered by the current key. +@param[out] buf Where to copy the MySQL row. +@param[in] cached_rec What to copy (in MySQL row format). +@param[in] prebuilt prebuilt struct. */ +void +row_sel_copy_cached_fields_for_mysql( + byte* buf, + const byte* cached_rec, + row_prebuilt_t* prebuilt); + +/** Query plan */ +struct plan_t{ + dict_table_t* table; /*!< table struct in the dictionary + cache */ + dict_index_t* index; /*!< table index used in the search */ + btr_pcur_t pcur; /*!< persistent cursor used to search + the index */ + ibool asc; /*!< TRUE if the cursor is traveling + upwards */ + ibool pcur_is_open; /*!< TRUE if pcur has been positioned + and we can try to fetch new rows */ + ibool cursor_at_end; /*!< TRUE if the cursor is open but + we know that there are no more + qualifying rows left to retrieve from + the index tree; NOTE though, that + there may still be unprocessed rows in + the prefetch stack; always FALSE when + pcur_is_open is FALSE */ + ibool stored_cursor_rec_processed; + /*!< TRUE if the pcur position has been + stored and the record it is positioned + on has already been processed */ + que_node_t** tuple_exps; /*!< array of expressions + which are used to calculate + the field values in the search + tuple: there is one expression + for each field in the search + tuple */ + dtuple_t* tuple; /*!< search tuple */ + page_cur_mode_t mode; /*!< search mode: PAGE_CUR_G, ...
*/ + ulint n_exact_match; /*!< number of first fields in + the search tuple which must be + exactly matched */ + ibool unique_search; /*!< TRUE if we are searching an + index record with a unique key */ + ulint n_rows_fetched; /*!< number of rows fetched using pcur + after it was opened */ + ulint n_rows_prefetched;/*!< number of prefetched rows cached + for fetch: fetching several rows in + the same mtr saves CPU time */ + ulint first_prefetched;/*!< index of the first cached row in + select buffer arrays for each column */ + ibool no_prefetch; /*!< no prefetch for this table */ + sym_node_list_t columns; /*!< symbol table nodes for the columns + to retrieve from the table */ + UT_LIST_BASE_NODE_T(func_node_t) + end_conds; /*!< conditions which determine the + fetch limit of the index segment we + have to look at: when one of these + fails, the result set has been + exhausted for the cursor in this + index; these conditions are normalized + so that in a comparison the column + for this table is the first argument */ + UT_LIST_BASE_NODE_T(func_node_t) + other_conds; /*!< the rest of search conditions we can + test at this table in a join */ + ibool must_get_clust; /*!< TRUE if index is a non-clustered + index and we must also fetch the + clustered index record; this is the + case if the non-clustered record does + not contain all the needed columns, or + if this is a single-table explicit + cursor, or a searched update or + delete */ + ulint* clust_map; /*!< map telling how clust_ref is built + from the fields of a non-clustered + record */ + dtuple_t* clust_ref; /*!< the reference to the clustered + index entry is built here if index is + a non-clustered index */ + btr_pcur_t clust_pcur; /*!< if index is non-clustered, we use + this pcur to search the clustered + index */ + mem_heap_t* old_vers_heap; /*!< memory heap used in building an old + version of a row, or NULL */ +}; + +/** Select node states */ +enum sel_node_state { + SEL_NODE_CLOSED, /*!< it is a declared cursor which is not + currently open */ + SEL_NODE_OPEN, /*!< intention locks not yet set on tables */ + SEL_NODE_FETCH, /*!< intention locks have been set */ + SEL_NODE_NO_MORE_ROWS /*!< cursor has reached the result set end */ +}; + +/** Select statement node */ +struct sel_node_t{ + que_common_t common; /*!< node type: QUE_NODE_SELECT */ + enum sel_node_state + state; /*!< node state */ + que_node_t* select_list; /*!< select list */ + sym_node_t* into_list; /*!< variables list or NULL */ + sym_node_t* table_list; /*!< table list */ + ibool asc; /*!< TRUE if the rows should be fetched + in an ascending order */ + ibool set_x_locks; /*!< TRUE if the cursor is for update or + delete, which means that a row x-lock + should be placed on the cursor row */ + lock_mode row_lock_mode; /*!< LOCK_X or LOCK_S */ + ulint n_tables; /*!< number of tables */ + ulint fetch_table; /*!< number of the next table to access + in the join */ + plan_t* plans; /*!< array of n_tables many plan nodes + containing the search plan and the + search data structures */ + que_node_t* search_cond; /*!< search condition */ + ReadView* read_view; /*!< if the query is a non-locking + consistent read, its read view is + placed here, otherwise NULL */ + ibool consistent_read;/*!< TRUE if the select is a consistent, + non-locking read */ + order_node_t* order_by; /*!< order by column definition, or + NULL */ + ibool is_aggregate; /*!< TRUE if the select list consists of + aggregate functions */ + ibool aggregate_already_fetched; + /*!< TRUE if the aggregate row 
has + already been fetched for the current + cursor */ + ibool can_get_updated;/*!< this is TRUE if the select + is in a single-table explicit + cursor which can get updated + within the stored procedure, + or in a searched update or + delete; NOTE that to determine + whether an explicit cursor can + get updated, the parser checks + whether the stored procedure + contains positioned update or + delete statements */ + sym_node_t* explicit_cursor;/*!< not NULL if an explicit cursor */ + UT_LIST_BASE_NODE_T(sym_node_t) + copy_variables; /*!< variables whose values we have to + copy when an explicit cursor is opened, + so that they do not change between + fetches */ +}; + +/** +Get the plan node for a table in a join. +@param node query graph node for SELECT +@param i plan node element +@return ith plan node */ +inline plan_t *sel_node_get_nth_plan(sel_node_t *node, ulint i) +{ + ut_ad(i < node->n_tables); + return &node->plans[i]; +} + +/** Fetch statement node */ +struct fetch_node_t{ + que_common_t common; /*!< type: QUE_NODE_FETCH */ + sel_node_t* cursor_def; /*!< cursor definition */ + sym_node_t* into_list; /*!< variables to set */ + + pars_user_func_t* + func; /*!< User callback function or NULL. + The first argument to the function + is a sel_node_t*, containing the + results of the SELECT operation for + one row. If the function returns + NULL, it is not interested in + further rows and the cursor is + modified so (cursor % NOTFOUND) is + true. If it returns non-NULL, + processing continues normally. */ +}; + +/** Open or close cursor operation type */ +enum open_node_op { + ROW_SEL_OPEN_CURSOR, /*!< open cursor */ + ROW_SEL_CLOSE_CURSOR /*!< close cursor */ +}; + +/** Open or close cursor statement node */ +struct open_node_t{ + que_common_t common; /*!< type: QUE_NODE_OPEN */ + enum open_node_op + op_type; /*!< operation type: open or + close cursor */ + sel_node_t* cursor_def; /*!< cursor definition */ +}; + +/** Row printf statement node */ +struct row_printf_node_t{ + que_common_t common; /*!< type: QUE_NODE_ROW_PRINTF */ + sel_node_t* sel_node; /*!< select */ +}; + +/** Search direction for the MySQL interface */ +enum row_sel_direction { + ROW_SEL_NEXT = 1, /*!< ascending direction */ + ROW_SEL_PREV = 2 /*!< descending direction */ +}; + +/** Match mode for the MySQL interface */ +enum row_sel_match_mode { + ROW_SEL_EXACT = 1, /*!< search using a complete key value */ + ROW_SEL_EXACT_PREFIX /*!< search using a key prefix which + must match rows: the prefix may + contain an incomplete field (the last + field in prefix may be just a prefix + of a fixed length column) */ +}; + +#ifdef UNIV_DEBUG +/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */ +# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \ + row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len) +#else /* UNIV_DEBUG */ +/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */ +# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \ + row_sel_field_store_in_mysql_format_func(dest,templ,src,len) +#endif /* UNIV_DEBUG */ + +/**************************************************************//** +Stores a non-SQL-NULL field in the MySQL format. The counterpart of this +function is row_mysql_store_col_in_innobase_format() in row0mysql.cc.
*/ + +void +row_sel_field_store_in_mysql_format_func( +/*=====================================*/ + byte* dest, /*!< in/out: buffer where to store; NOTE + that BLOBs are not in themselves + stored here: the caller must allocate + and copy the BLOB into buffer before, + and pass the pointer to the BLOB in + 'data' */ + const mysql_row_templ_t* templ, + /*!< in: MySQL column template. + Its following fields are referenced: + type, is_unsigned, mysql_col_len, + mbminlen, mbmaxlen */ +#ifdef UNIV_DEBUG + const dict_index_t* index, + /*!< in: InnoDB index */ + ulint field_no, + /*!< in: templ->rec_field_no or + templ->clust_rec_field_no or + templ->icp_rec_field_no */ +#endif /* UNIV_DEBUG */ + const byte* data, /*!< in: data to store */ + ulint len); /*!< in: length of the data */ diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h new file mode 100644 index 00000000..5e737c1c --- /dev/null +++ b/storage/innobase/include/row0types.h @@ -0,0 +1,54 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0types.h +Row operation global types + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#pragma once +#include "buf0types.h" + +struct plan_t; + +struct upd_t; +struct upd_field_t; +struct upd_node_t; +struct del_node_t; +struct ins_node_t; +struct sel_node_t; +struct open_node_t; +struct fetch_node_t; + +struct row_printf_node_t; +struct sel_buf_t; + +struct undo_node_t; + +struct purge_node_t; + +struct row_ext_t; + +/** Buffer for logging modifications during online index creation */ +struct row_log_t; + +/* MySQL data types */ +struct TABLE; diff --git a/storage/innobase/include/row0uins.h b/storage/innobase/include/row0uins.h new file mode 100644 index 00000000..a9877969 --- /dev/null +++ b/storage/innobase/include/row0uins.h @@ -0,0 +1,50 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0uins.h +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0uins_h +#define row0uins_h + +#include "data0data.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/***********************************************************//** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. +@return DB_SUCCESS */ +dberr_t +row_undo_ins( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#endif diff --git a/storage/innobase/include/row0umod.h b/storage/innobase/include/row0umod.h new file mode 100644 index 00000000..5032e103 --- /dev/null +++ b/storage/innobase/include/row0umod.h @@ -0,0 +1,46 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0umod.h +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0umod_h +#define row0umod_h + +#include "data0data.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/***********************************************************//** +Undoes a modify operation on a row of a table. +@return DB_SUCCESS or error code */ +dberr_t +row_undo_mod( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); + +#endif diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h new file mode 100644 index 00000000..ae067a8a --- /dev/null +++ b/storage/innobase/include/row0undo.h @@ -0,0 +1,114 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0undo.h +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0undo_h +#define row0undo_h + +#include "trx0sys.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "que0types.h" +#include "row0types.h" + +/********************************************************************//** +Creates a row undo node for a query graph. +@return own: undo node */ +undo_node_t* +row_undo_node_create( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ + mem_heap_t* heap); /*!< in: memory heap where created */ +/***********************************************************//** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. +@return true if found; NOTE the node->pcur must be closed by the +caller, regardless of the return value */ +bool +row_undo_search_clust_to_pcur( +/*==========================*/ + undo_node_t* node) /*!< in/out: row undo node */ + MY_ATTRIBUTE((warn_unused_result)); +/***********************************************************//** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_undo_step( +/*==========*/ + que_thr_t* thr); /*!< in: query thread */ + +/* A single query thread will try to perform the undo for all successive +versions of a clustered index record, if the transaction has modified it +several times during the execution which is rolled back. It may happen +that the task is transferred to another query thread, if the other thread +is assigned to handle an undo log record in the chain of different versions +of the record, and the other thread happens to get the x-latch to the +clustered index record at the right time. + If a query thread notices that the clustered index record it is looking +for is missing, or the roll ptr field in the record does not point to the +undo log record the thread was assigned to handle, then it gives up the undo +task for that undo log record, and fetches the next. This situation can occur +only in the case where the transaction modified the same record several times +and another thread is currently doing the undo for successive versions of +that index record.
*/ + +/** Undo node structure */ +struct undo_node_t{ + que_common_t common; /*!< node type: QUE_NODE_UNDO */ + bool is_temp;/*!< whether this is a temporary table */ + trx_t* trx; /*!< trx for which undo is done */ + roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */ + trx_undo_rec_t* undo_rec;/*!< undo log record */ + undo_no_t undo_no;/*!< undo number of the record */ + byte rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC, + ... */ + trx_id_t new_trx_id; /*!< trx id to restore to clustered index + record */ + btr_pcur_t pcur; /*!< persistent cursor used in searching the + clustered index record */ + dict_table_t* table; /*!< table where undo is done */ + ulint cmpl_info;/*!< compiler analysis of an update */ + upd_t* update; /*!< update vector for a clustered index + record */ + const dtuple_t* ref; /*!< row reference to the next row to handle */ + dtuple_t* row; /*!< a copy (also fields copied to heap) of the + row to handle */ + row_ext_t* ext; /*!< NULL, or prefixes of the externally + stored columns of the row */ + dtuple_t* undo_row;/*!< NULL, or the row after undo */ + row_ext_t* undo_ext;/*!< NULL, or prefixes of the externally + stored columns of undo_row */ + dict_index_t* index; /*!< the next index whose record should be + handled */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage for + row; this must be emptied after undo is tried + on a row */ +}; + +#endif diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h new file mode 100644 index 00000000..f60fc359 --- /dev/null +++ b/storage/innobase/include/row0upd.h @@ -0,0 +1,559 @@ +/***************************************************************************** + +Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0upd.h +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0upd_h +#define row0upd_h + +#include "data0data.h" +#include "rem0types.h" +#include "row0types.h" +#include "btr0types.h" +#include "trx0types.h" +#include "btr0pcur.h" +#include "que0types.h" +#include "pars0types.h" + +/*********************************************************************//** +Creates an update vector object. +@return own: update vector object */ +UNIV_INLINE +upd_t* +upd_create( +/*=======*/ + ulint n, /*!< in: number of fields */ + mem_heap_t* heap); /*!< in: heap from which memory allocated */ +/*********************************************************************//** +Returns the number of fields in the update vector == number of columns +to be updated by an update vector. 
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+	const upd_t*	update);	/*!< in: update vector */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		n);	/*!< in: field position in update vector */
+#else
+# define upd_get_nth_field(update, n) ((update)->fields + (n))
+#endif
+
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+	upd_field_t*	upd_field,	/*!< in: update vector field */
+	uint16_t	field_no,	/*!< in: field number in a clustered
+					index */
+	dict_index_t*	index);
+
+/** Set the field number of an update vector field, marking the field
+as updated.
+@param[in,out]	upd_field	update vector field
+@param[in]	field_no	virtual column sequence num
+@param[in]	index		index */
+UNIV_INLINE
+void
+upd_field_set_v_field_no(
+	upd_field_t*	upd_field,
+	uint16_t	field_no,
+	dict_index_t*	index);
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+	const upd_t*	update,	/*!< in: update vector */
+	uint16_t	no,	/*!< in: field_no */
+	bool		is_virtual)	/*!< in: if it is a virtual column */
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+upd_node_t*
+upd_node_create(
+/*============*/
+	mem_heap_t*	heap);	/*!< in: mem heap where created */
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+	dict_index_t*	index,	/*!< in: index */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const upd_t*	update);/*!< in: update vector */
+
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+	const rec_t*	rec,	/*!< in: secondary index record */
+	dict_index_t*	index,	/*!< in: index */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const dtuple_t*	entry,	/*!< in: entry to insert */
+	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
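(Editorial aside, not part of the upstream patch: a minimal sketch of how the
helpers above combine to build a one-field update vector. The dict_index_t*
"index" and the column number 2 are assumed for illustration; upd_create()
is defined in row0upd.inl below, and dfield_set_data()/mem_heap_create() are
the usual InnoDB data0data/mem0mem helpers.)

	mem_heap_t*	heap = mem_heap_create(256);
	upd_t*		update = upd_create(1, heap);
	upd_field_t*	uf = upd_get_nth_field(update, 0);

	upd_field_set_field_no(uf, 2, index);	 /* copies the column type */
	dfield_set_data(&uf->new_val, "abc", 3); /* new value for column 2 */
	ut_ad(upd_get_n_fields(update) == 1);
	/* ... apply the vector ... */
	mem_heap_free(heap);

+/** Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!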
+@param[in]	index		clustered index
+@param[in]	entry		clustered index entry to insert
+@param[in]	rec		clustered index record
+@param[in]	offsets		rec_get_offsets(rec,index), or NULL
+@param[in]	no_sys		skip the system columns
+				DB_TRX_ID and DB_ROLL_PTR
+@param[in]	ignore_warnings	ignore warnings during vcol calculation, which
+				means that this calculation is internal only
+@param[in]	trx		transaction (for diagnostics),
+				or NULL
+@param[in]	heap		memory heap from which allocated
+@param[in,out]	mysql_table	NULL, or mysql table object when
+				user thread invokes dml
+@param[out]	error		error number in case of failure
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+upd_t*
+row_upd_build_difference_binary(
+	dict_index_t*	index,
+	const dtuple_t*	entry,
+	const rec_t*	rec,
+	const rec_offs*	offsets,
+	bool		no_sys,
+	bool		ignore_warnings,
+	trx_t*		trx,
+	mem_heap_t*	heap,
+	TABLE*		mysql_table,
+	dberr_t*	error)
+	MY_ATTRIBUTE((nonnull(1,2,3,8,10), warn_unused_result));
+/** Apply an update vector to an index entry.
+@param[in,out]	entry	index entry to be updated; the clustered index record
+			must be covered by a lock or a page latch to prevent
+			deletion (rollback or purge)
+@param[in]	index	index of the entry
+@param[in]	update	update vector built for the entry
+@param[in,out]	heap	memory heap for copying off-page columns */
+void
+row_upd_index_replace_new_col_vals_index_pos(
+	dtuple_t*		entry,
+	const dict_index_t*	index,
+	const upd_t*		update,
+	mem_heap_t*		heap)
+	MY_ATTRIBUTE((nonnull));
+/** Replace the new column values stored in the update vector,
+during trx_undo_prev_version_build().
+@param entry	clustered index tuple where the values are replaced
+		(the clustered index leaf page latch must be held)
+@param index	clustered index
+@param update	update vector for the clustered index
+@param heap	memory heap for allocating and copying values
+@return whether the previous version was built successfully */
+bool
+row_upd_index_replace_new_col_vals(dtuple_t *entry, const dict_index_t &index,
+				   const upd_t *update, mem_heap_t *heap)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Replaces the new column values stored in the update vector. */
+void
+row_upd_replace(
+/*============*/
+	dtuple_t*		row,	/*!< in/out: row where replaced,
+					indexed by col_no;
+					the clustered index record must be
+					covered by a lock or a page latch to
+					prevent deletion (rollback or purge) */
+	row_ext_t**		ext,	/*!< out, own: NULL, or externally
+					stored column prefixes */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const upd_t*		update,	/*!< in: an update vector built for the
+					clustered index */
+	mem_heap_t*		heap);	/*!< in: memory heap */
+/** Replaces the virtual column values stored in a dtuple with those of
+an update vector.
+@param[in,out]	row	dtuple whose columns are to be updated
+@param[in]	table	table
+@param[in]	update	an update vector built for the clustered index
+@param[in]	upd_new	update to new or old value
+@param[in,out]	undo_row undo row (if it needs to be updated)
+@param[in]	ptr	remaining part in update undo log */
+void
+row_upd_replace_vcol(
+	dtuple_t*	row,
+	const dict_table_t*	table,
+	const upd_t*	update,
+	bool		upd_new,
+	dtuple_t*	undo_row,
+	const byte*	ptr);
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+ +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector changes an ordering field in the index record */ +ibool +row_upd_changes_ord_field_binary_func( +/*==================================*/ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! */ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + const dtuple_t* row, /*!< in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + const row_ext_t*ext, /*!< NULL, or prefixes of the externally + stored columns in the old row */ + ulint flag) /*!< in: ROW_BUILD_NORMAL, + ROW_BUILD_FOR_PURGE or ROW_BUILD_FOR_UNDO */ + MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)); +#ifdef UNIV_DEBUG +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,thr,row,ext,0) +#else /* UNIV_DEBUG */ +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,row,ext,0) +#endif /* UNIV_DEBUG */ +/***********************************************************//** +Checks if an FTS indexed column is affected by an UPDATE. +@return offset within fts_t::indexes if FTS indexed column updated else +ULINT_UNDEFINED */ +ulint +row_upd_changes_fts_column( +/*=======================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field); /*!< in: field to check */ +/***********************************************************//** +Checks if an FTS Doc ID column is affected by an UPDATE. +@return whether Doc ID column is affected */ +bool +row_upd_changes_doc_id( +/*===================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field) /*!< in: field to check */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector may change an ordering field in an index +record */ +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + const dict_table_t* table, /*!< in: table */ + const upd_t* update);/*!< in: update vector for the row */ +/***********************************************************//** +Updates a row in a table. This is a high-level function used +in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_upd_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ + +/* Update vector field */ +struct upd_field_t{ + uint16_t field_no; /*!< field number in an index, usually + the clustered index, but in updating + a secondary index record in btr0cur.cc + this is the position in the secondary + index. 
If this field is a virtual
+					column, then field_no represents
+					the nth virtual column in the table */
+	uint16_t	orig_len;	/*!< original length of the locally
+					stored part of an externally stored
+					column, or 0 */
+	que_node_t*	exp;	/*!< expression for calculating a new
+					value: it refers to column values and
+					constants in the symbol table of the
+					query graph */
+	dfield_t	new_val;	/*!< new value for the column */
+	dfield_t*	old_v_val;	/*!< old value for the virtual column */
+};
+
+
+/* check whether an update field is on a virtual column */
+#define upd_fld_is_virtual_col(upd_fld) \
+	(((upd_fld)->new_val.type.prtype & DATA_VIRTUAL) == DATA_VIRTUAL)
+
+/* set the DATA_VIRTUAL bit on an update field to show it is a virtual column */
+#define upd_fld_set_virtual_col(upd_fld) \
+	((upd_fld)->new_val.type.prtype |= DATA_VIRTUAL)
+
+/* Update vector structure */
+struct upd_t{
+	mem_heap_t*	heap;		/*!< heap from which memory allocated */
+	byte		info_bits;	/*!< new value of info bits to record;
+					default is 0 */
+	dtuple_t*	old_vrow;	/*!< pointer to old row, used for
+					virtual column update now */
+	ulint		n_fields;	/*!< number of update fields */
+	upd_field_t*	fields;		/*!< array of update fields */
+	byte		vers_sys_value[8];	/*!< buffer for updating system fields */
+
+	/** Append an update field to the end of the array
+	@param[in]	field	an update field */
+	void append(const upd_field_t& field)
+	{
+		fields[n_fields++] = field;
+	}
+
+	void remove_element(ulint i)
+	{
+		ut_ad(n_fields > 0);
+		ut_ad(i < n_fields);
+		while (i < n_fields - 1)
+		{
+			fields[i]= fields[i + 1];
+			i++;
+		}
+		n_fields--;
+	}
+
+	bool remove(const ulint field_no)
+	{
+		for (ulint i= 0; i < n_fields; ++i)
+		{
+			if (field_no == fields[i].field_no)
+			{
+				remove_element(i);
+				return true;
+			}
+		}
+		return false;
+	}
+
+	/** Determine if the given field_no is modified.
+	@return true if modified, false otherwise. */
+	bool is_modified(uint16_t field_no) const
+	{
+		for (ulint i = 0; i < n_fields; ++i) {
+			if (field_no == fields[i].field_no) {
+				return(true);
+			}
+		}
+		return(false);
+	}
+
+	/** Determine if the update affects a system versioned column or row_end.
*/ + bool affects_versioned() const + { + for (ulint i = 0; i < n_fields; i++) { + dtype_t type = fields[i].new_val.type; + if (type.is_versioned()) { + return true; + } + // versioned DELETE is UPDATE SET row_end=NOW + if (type.vers_sys_end()) { + return true; + } + } + return false; + } + + /** @return whether this is for a hidden metadata record + for instant ALTER TABLE */ + bool is_metadata() const { return dtuple_t::is_metadata(info_bits); } + /** @return whether this is for a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + bool is_alter_metadata() const + { return dtuple_t::is_alter_metadata(info_bits); } + +#ifdef UNIV_DEBUG + bool validate() const + { + for (ulint i = 0; i < n_fields; ++i) { + dfield_t* field = &fields[i].new_val; + if (dfield_is_ext(field)) { + ut_ad(dfield_get_len(field) + >= BTR_EXTERN_FIELD_REF_SIZE); + } + } + return(true); + } +#endif // UNIV_DEBUG +}; + +/** Kinds of update operation */ +enum delete_mode_t { + NO_DELETE = 0, /*!< this operation does not delete */ + PLAIN_DELETE, /*!< ordinary delete */ + VERSIONED_DELETE /*!< update old and insert a new row */ +}; + +/* Update node structure which also implements the delete operation +of a row */ + +struct upd_node_t{ + que_common_t common; /*!< node type: QUE_NODE_UPDATE */ + delete_mode_t is_delete; /*!< kind of DELETE */ + ibool searched_update; + /* TRUE if searched update, FALSE if + positioned */ + bool in_mysql_interface; + /* whether the update node was created + for the MySQL interface */ + dict_foreign_t* foreign;/* NULL or pointer to a foreign key + constraint if this update node is used in + doing an ON DELETE or ON UPDATE operation */ + upd_node_t* cascade_node;/* NULL or an update node template which + is used to implement ON DELETE/UPDATE CASCADE + or ... 
SET NULL for foreign keys */
+	mem_heap_t*	cascade_heap;
+				/*!< NULL or a mem heap where cascade
+				node is created.*/
+	sel_node_t*	select;	/*!< query graph subtree implementing a base
+				table cursor: the rows returned will be
+				updated */
+	btr_pcur_t*	pcur;	/*!< persistent cursor placed on the clustered
+				index record which should be updated or
+				deleted; the cursor is stored in the graph
+				of 'select' field above, except in the case
+				of the MySQL interface */
+	dict_table_t*	table;	/*!< table where updated */
+	upd_t*		update;	/*!< update vector for the row */
+	ulint		update_n_fields;
+				/* when this struct is used to implement
+				a cascade operation for foreign keys, we store
+				here the size of the buffer allocated for use
+				as the update vector */
+	sym_node_list_t	columns;/* symbol table nodes for the columns
+				to retrieve from the table */
+	ibool		has_clust_rec_x_lock;
+				/* TRUE if the select which retrieves the
+				records to update already sets an x-lock on
+				the clustered record; note that it must always
+				set at least an s-lock */
+	ulint		cmpl_info;/* information extracted during query
+				compilation; speeds up execution:
+				UPD_NODE_NO_ORD_CHANGE and
+				UPD_NODE_NO_SIZE_CHANGE, ORed */
+	/*----------------------*/
+	/* Local storage for this graph node */
+	ulint		state;	/*!< node execution state */
+	dict_index_t*	index;	/*!< NULL, or the next index whose record should
+				be updated */
+	dtuple_t*	row;	/*!< NULL, or a copy (also fields copied to
+				heap) of the row to update; this must be reset
+				to NULL after a successful update */
+	dtuple_t*	historical_row;	/*!< historical row used in
+				CASCADE UPDATE/SET NULL;
+				allocated from historical_heap */
+	mem_heap_t*	historical_heap; /*!< heap for historical row insertion;
+				created when row to update is located;
+				freed right before row update */
+	row_ext_t*	ext;	/*!< NULL, or prefixes of the externally
+				stored columns in the old row */
+	dtuple_t*	upd_row;/* NULL, or a copy of the updated row */
+	row_ext_t*	upd_ext;/* NULL, or prefixes of the externally
+				stored columns in upd_row */
+	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage;
+				this must be emptied after a successful
+				update */
+	/*----------------------*/
+	sym_node_t*	table_sym;/* table node in symbol table */
+	que_node_t*	col_assign_list;
+				/* column assignment list */
+	ulint		magic_n;
+
+private:
+	/** Appends the row_start or row_end field to the update vector and
+	sets a CURRENT_TIMESTAMP/trx->id value for it.
+	Supposed to be called only by vers_make_update() and
+	vers_make_delete().
+	@param[in]	trx	transaction
+	@param[in]	idx	table->vers_start or table->vers_end */
+	void vers_update_fields(const trx_t *trx, ulint idx);
+
+public:
+	/** Also set row_start = CURRENT_TIMESTAMP/trx->id
+	@param[in]	trx	transaction */
+	void vers_make_update(const trx_t *trx)
+	{
+		vers_update_fields(trx, table->vers_start);
+	}
+
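(Editorial aside, not part of the upstream patch: conceptually, a
system-versioned DELETE keeps the row and merely stamps its end timestamp,
i.e. it behaves like "UPDATE t SET row_end = trx->id or CURRENT_TIMESTAMP".
Assuming an upd_node_t* node and a trx_t* trx, the two entry points here
are used roughly as follows.)

	node->vers_make_update(trx);	/* UPDATE path: stamp row_start */
	node->vers_make_delete(trx);	/* DELETE path: stamp row_end */

+	/** Prepare update vector for versioned delete.
+	Set row_end to CURRENT_TIMESTAMP or trx->id.
+	Initialize fts_next_doc_id for versioned delete.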
+	@param[in]	trx	transaction */
+	void vers_make_delete(trx_t *trx);
+};
+
+#define	UPD_NODE_MAGIC_N	1579975
+
+/* Node execution states */
+#define UPD_NODE_SET_IX_LOCK	   1	/* execution came to the node from
+					a node above and if the field
+					has_clust_rec_x_lock is FALSE, we
+					should set an intention x-lock on
+					the table */
+#define UPD_NODE_UPDATE_CLUSTERED  2	/* clustered index record should be
+					updated */
+#define UPD_NODE_INSERT_CLUSTERED  3	/* clustered index record should be
+					inserted, old record is already delete
+					marked */
+#define UPD_NODE_UPDATE_ALL_SEC	   5	/* an ordering field of the clustered
+					index record was changed, or this is
+					a delete operation: should update
+					all the secondary index records */
+#define UPD_NODE_UPDATE_SOME_SEC   6	/* secondary index entries should be
+					looked at and updated if an ordering
+					field changed */
+
+/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */
+#define UPD_NODE_NO_ORD_CHANGE	1	/* no secondary index record will be
+					changed in the update and no ordering
+					field of the clustered index */
+#define UPD_NODE_NO_SIZE_CHANGE	2	/* no record field size will be
+					changed in the update */
+
+
+#include "row0upd.inl"
+
+#endif
diff --git a/storage/innobase/include/row0upd.inl b/storage/innobase/include/row0upd.inl
new file mode 100644
index 00000000..13aacf3f
--- /dev/null
+++ b/storage/innobase/include/row0upd.inl
@@ -0,0 +1,153 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.inl
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "row0row.h"
+#include "lock0lock.h"
+#include "page0zip.h"
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+	ulint		n,	/*!< in: number of fields */
+	mem_heap_t*	heap)	/*!< in: heap from which memory allocated */
+{
+	upd_t*	update;
+
+	update = static_cast<upd_t*>(mem_heap_zalloc(
+			heap, sizeof(upd_t) + sizeof(upd_field_t) * n));
+
+	update->n_fields = n;
+	update->fields = reinterpret_cast<upd_field_t*>(&update[1]);
+	update->heap = heap;
+
+	return(update);
+}
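(Editorial aside: header and field array share one zero-initialized block, so
the whole vector is released together with its heap and needs no separate
free. A sanity check one could write against this layout, assuming a
mem_heap_t* heap:)

	upd_t*	u = upd_create(2, heap);
	ut_ad(reinterpret_cast<byte*>(u->fields)
	      == reinterpret_cast<byte*>(u) + sizeof(upd_t));
	ut_ad(!u->fields[0].new_val.data);	/* zeroed by mem_heap_zalloc() */

+/*********************************************************************//**
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector.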
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+	const upd_t*	update)	/*!< in: update vector */
+{
+	ut_ad(update);
+
+	return(update->n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		n)	/*!< in: field position in update vector */
+{
+	ut_ad(update);
+	ut_ad(n < update->n_fields);
+
+	return((upd_field_t*) update->fields + n);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+	upd_field_t*	upd_field,	/*!< in: update vector field */
+	uint16_t	field_no,	/*!< in: field number in a clustered
+					index */
+	dict_index_t*	index)	/*!< in: index */
+{
+	upd_field->field_no = field_no;
+	upd_field->orig_len = 0;
+	dict_col_copy_type(dict_index_get_nth_col(index, field_no),
+			   dfield_get_type(&upd_field->new_val));
+}
+
+/** Set the field number of an update vector field, marking the field
+as updated.
+@param[in,out]	upd_field	update vector field
+@param[in]	field_no	virtual column sequence num
+@param[in]	index		index */
+UNIV_INLINE
+void
+upd_field_set_v_field_no(
+	upd_field_t*	upd_field,
+	uint16_t	field_no,
+	dict_index_t*	index)
+{
+	ut_a(field_no < dict_table_get_n_v_cols(index->table));
+	upd_field->field_no = field_no;
+	upd_field->orig_len = 0;
+
+	dict_col_copy_type(&dict_table_get_nth_v_col(
+				   index->table, field_no)->m_col,
+			   dfield_get_type(&upd_field->new_val));
+}
+
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+	const upd_t*	update,	/*!< in: update vector */
+	uint16_t	no,	/*!< in: field_no */
+	bool		is_virtual)	/*!< in: if it is a virtual column */
+{
+	ulint	i;
+	for (i = 0; i < upd_get_n_fields(update); i++) {
+		const upd_field_t*	uf = upd_get_nth_field(update, i);
+
+		/* matches only if the field matches that of is_virtual */
+		if ((!is_virtual) != (!upd_fld_is_virtual_col(uf))) {
+			continue;
+		}
+
+		if (uf->field_no == no) {
+
+			return(uf);
+		}
+	}
+
+	return(NULL);
+}
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h
new file mode 100644
index 00000000..60f310e1
--- /dev/null
+++ b/storage/innobase/include/row0vers.h
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.h
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0vers_h
+#define row0vers_h
+
+#include "data0data.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "rem0types.h"
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "row0types.h"
+
+// Forward declaration
+class ReadView;
+
+/** Determine if an active transaction has inserted or modified a secondary
+index record.
+@param[in,out]	caller_trx	trx of current thread
+@param[in]	rec	secondary index record
+@param[in]	index	secondary index
+@param[in]	offsets	rec_get_offsets(rec, index)
+@return the active transaction; state must be rechecked after
+acquiring trx->mutex, and trx->release_reference() must be invoked
+@retval NULL if the record was committed */
+trx_t*
+row_vers_impl_x_locked(
+	trx_t*		caller_trx,
+	const rec_t*	rec,
+	dict_index_t*	index,
+	const rec_offs*	offsets);
+
+/** Finds out if a version of the record, where the version >= the current
+purge_sys.view, should have ientry as its secondary index entry. We check
+if there is any non-delete-marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE.
+@param[in]	also_curr	TRUE if also rec is included in the versions
+				to search; otherwise only versions prior
+				to it are searched
+@param[in]	rec	record in the clustered index; the caller
+			must have a latch on the page
+@param[in]	mtr	mtr holding the latch on rec; it will
+			also hold the latch on purge_view
+@param[in]	index	secondary index
+@param[in]	ientry	secondary index entry
+@param[in]	roll_ptr	roll_ptr for the purge record
+@param[in]	trx_id	transaction ID on the purging record
+@return TRUE if some earlier version should have the index entry ientry */
+bool
+row_vers_old_has_index_entry(
+	bool		also_curr,
+	const rec_t*	rec,
+	mtr_t*		mtr,
+	dict_index_t*	index,
+	const dtuple_t*	ientry,
+	roll_ptr_t	roll_ptr,
+	trx_id_t	trx_id);
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return error code
+@retval DB_SUCCESS if a previous version was fetched
+@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+	const rec_t*	rec,	/*!< in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec; it will
+				also hold the latch on purge_view */
+	dict_index_t*	index,	/*!< in: the clustered index */
+	rec_offs**	offsets,/*!< in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	ReadView*	view,	/*!< in: the consistent read view */
+	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	rec_t**		old_vers,/*!< out, own: old version, or NULL
+				if the history is missing or the record
+				does not exist in the view, that is,
+				it was freshly inserted afterwards */
+	dtuple_t**	vrow);	/*!< out: reports virtual column info if any */
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+	trx_t*		caller_trx,/*!< in/out: trx of current thread */
+	const rec_t*	rec,	/*!< in: record in a clustered index; the
+				caller must have a latch on the page */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec */
+	dict_index_t*	index,	/*!< in: the clustered index */
+	rec_offs**	offsets,/*!< in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	const rec_t**	old_vers,/*!< out: rec, old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+	dtuple_t**	vrow);	/*!< out: holds virtual column info if any
+				is updated in the view */
+
+#endif
diff --git a/storage/innobase/include/rw_lock.h b/storage/innobase/include/rw_lock.h
new file mode 100644
--- /dev/null
+++ b/storage/innobase/include/rw_lock.h
+/*****************************************************************************
+
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include <atomic>
+#include "my_dbug.h"
+
+/** Simple read-write lock based on std::atomic */
+class rw_lock
+{
+  /** The lock word */
+  std::atomic<uint32_t> lock;
+
+protected:
+  /** Available lock */
+  static constexpr uint32_t UNLOCKED= 0;
+  /** Flag to indicate that write_lock() is being held */
+  static constexpr uint32_t WRITER= 1U << 31;
+  /** Flag to indicate that write_lock_wait() is pending */
+  static constexpr uint32_t WRITER_WAITING= 1U << 30;
+  /** Flag to indicate that write_lock() or write_lock_wait() is pending */
+  static constexpr uint32_t WRITER_PENDING= WRITER | WRITER_WAITING;
+
+  /** Start waiting for an exclusive lock. */
+  void write_lock_wait_start()
+  {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    static_assert(WRITER_WAITING == 1U << 30, "compatibility");
+    __asm__ __volatile__("lock btsl $30, %0" : "+m" (lock));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    static_assert(WRITER_WAITING == 1U << 30, "compatibility");
+    _interlockedbittestandset(reinterpret_cast<volatile long*>(&lock), 30);
+#else
+    lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed);
+#endif
+  }
+  /** Start waiting for an exclusive lock.
+  @return current value of the lock word */
+  uint32_t write_lock_wait_start_read()
+  { return lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed); }
+  /** Wait for an exclusive lock.
+  @param l the value of the lock word
+  @return whether the exclusive lock was acquired */
+  bool write_lock_wait_try(uint32_t &l)
+  {
+    return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+                                        std::memory_order_relaxed);
+  }
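(Editorial aside, not part of the upstream patch: the 32-bit lock word encodes
the whole state. Bit 31 is WRITER, bit 30 is WRITER_WAITING, and bits 0..29
count the shared holders, so a word value of 3 means three concurrent readers.
Typical try-lock usage of the public interface defined below:)

	rw_lock latch;
	if (latch.read_trylock())	/* fails if either writer bit is set */
	{
		/* ... shared access ... */
		latch.read_unlock();
	}
	if (latch.write_trylock())	/* succeeds only from UNLOCKED */
	{
		/* ... exclusive access ... */
		latch.write_unlock();
	}

+  /** Try to acquire a shared lock.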
+ @param l the value of the lock word + @return whether the lock was acquired */ + bool read_trylock(uint32_t &l) + { + l= UNLOCKED; + while (!lock.compare_exchange_strong(l, l + 1, std::memory_order_acquire, + std::memory_order_relaxed)) + { + DBUG_ASSERT(!(WRITER & l) || !(~WRITER_PENDING & l)); + if (l & WRITER_PENDING) + return false; + } + return true; + } + + /** Wait for an exclusive lock. + @return whether the exclusive lock was acquired */ + bool write_lock_poll() + { + auto l= WRITER_WAITING; + if (write_lock_wait_try(l)) + return true; + if (!(l & WRITER_WAITING)) + /* write_lock() must have succeeded for another thread */ + write_lock_wait_start(); + return false; + } + /** @return the lock word value */ + uint32_t value() const { return lock.load(std::memory_order_acquire); } + +public: + /** Default constructor */ + rw_lock() : lock(UNLOCKED) {} + + /** Release a shared lock. + @return whether any writers may have to be woken up */ + bool read_unlock() + { + auto l= lock.fetch_sub(1, std::memory_order_release); + DBUG_ASSERT(!(l & WRITER)); /* no write lock must have existed */ + DBUG_ASSERT(~(WRITER_PENDING) & l); /* at least one read lock */ + return (~WRITER_PENDING & l) == 1; + } + /** Release an exclusive lock */ + void write_unlock() + { + /* Below, we use fetch_sub(WRITER) instead of fetch_and(~WRITER). + The reason is that on IA-32 and AMD64 it translates into the 80486 + instruction LOCK XADD, while fetch_and() translates into a loop + around LOCK CMPXCHG. For other ISA either form should be fine. */ + static_assert(WRITER == 1U << 31, "compatibility"); + IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(WRITER, std::memory_order_release); + /* the write lock must have existed */ + DBUG_ASSERT(l & WRITER); + } + /** Try to acquire a shared lock. + @return whether the lock was acquired */ + bool read_trylock() { uint32_t l; return read_trylock(l); } + /** Try to acquire an exclusive lock. + @return whether the lock was acquired */ + bool write_trylock() + { + auto l= UNLOCKED; + return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire, + std::memory_order_relaxed); + } + + /** @return whether an exclusive lock is being held by any thread */ + bool is_write_locked() const { return !!(value() & WRITER); } + /** @return whether any lock is being held or waited for by any thread */ + bool is_locked_or_waiting() const { return value() != 0; } + /** @return whether any lock is being held by any thread */ + bool is_locked() const { return (value() & ~WRITER_WAITING) != 0; } +}; diff --git a/storage/innobase/include/small_vector.h b/storage/innobase/include/small_vector.h new file mode 100644 index 00000000..d28a3618 --- /dev/null +++ b/storage/innobase/include/small_vector.h @@ -0,0 +1,100 @@ +/***************************************************************************** + +Copyright (c) 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+/* A normally small vector, inspired by llvm::SmallVector */
+#include "my_global.h"
+#include <iterator>
+#include <algorithm>
+
+class small_vector_base
+{
+protected:
+  typedef uint32_t Size_T;
+  void *BeginX;
+  Size_T Size= 0, Capacity;
+  small_vector_base()= delete;
+  small_vector_base(void *small, size_t small_size)
+    : BeginX(small), Capacity(Size_T(small_size)) {}
+  ATTRIBUTE_COLD void grow_by_1(void *small, size_t element_size);
+public:
+  size_t size() const { return Size; }
+  size_t capacity() const { return Capacity; }
+  bool empty() const { return !Size; }
+  void clear() { Size= 0; }
+protected:
+  void set_size(size_t N) { Size= Size_T(N); }
+};
+
+template <typename T, size_t N>
+class small_vector : public small_vector_base
+{
+  /** The fixed storage allocation */
+  T small[N];
+
+  using small_vector_base::set_size;
+
+  void grow_if_needed()
+  {
+    if (unlikely(size() >= capacity()))
+      grow_by_1(small, sizeof *small);
+  }
+
+public:
+  small_vector() : small_vector_base(small, N)
+  {
+    TRASH_ALLOC(small, sizeof small);
+  }
+  ~small_vector()
+  {
+    if (small != begin())
+      my_free(begin());
+    MEM_MAKE_ADDRESSABLE(small, sizeof small);
+  }
+
+  using iterator= T *;
+  using const_iterator= const T *;
+  using reverse_iterator= std::reverse_iterator<iterator>;
+  using reference= T &;
+  using const_reference= const T&;
+
+  iterator begin() { return static_cast<iterator>(BeginX); }
+  const_iterator begin() const { return static_cast<const_iterator>(BeginX); }
+  iterator end() { return begin() + size(); }
+  const_iterator end() const { return begin() + size(); }
+
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+
+  reference operator[](size_t i) { assert(i < size()); return begin()[i]; }
+  const_reference operator[](size_t i) const
+  { return const_cast<small_vector&>(*this)[i]; }
+
+  void erase(const_iterator S, const_iterator E)
+  {
+    set_size(std::move(const_cast<iterator>(E), end(),
+                       const_cast<iterator>(S)) - begin());
+  }
+
+  void emplace_back(T &&arg)
+  {
+    grow_if_needed();
+    ::new (end()) T(arg);
+    set_size(size() + 1);
+  }
+};
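(Editorial aside, not part of the upstream patch: a usage sketch of the
container; the element type and sizes are arbitrary. The first N elements
live in the inline buffer, and grow_by_1() moves the contents to the heap on
the (N+1)th insertion.)

	small_vector<uint32_t, 4> v;
	for (uint32_t i= 0; i < 5; i++)
		v.emplace_back(uint32_t(i));	/* 5th call spills to the heap */
	size_t n= v.size();			/* 5; iterate via begin()/end() */
	v.clear();	/* resets the size; any heap block is freed
			only by the destructor */

diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
new file mode 100644
index 00000000..51f3049b
--- /dev/null
+++ b/storage/innobase/include/srv0mon.h
@@ -0,0 +1,846 @@
+/***********************************************************************
+
+Copyright (c) 2010, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.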
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/srv0mon.h
+Server monitor counter related defines
+
+Created 12/15/2009 Jimmy Yang
+*******************************************************/
+
+#ifndef srv0mon_h
+#define srv0mon_h
+
+#include "univ.i"
+
+#ifndef __STDC_LIMIT_MACROS
+/* Required for FreeBSD so that INT64_MAX is defined. */
+#define __STDC_LIMIT_MACROS
+#endif /* __STDC_LIMIT_MACROS */
+
+#include <stdint.h>
+#include "my_atomic.h"
+#include "my_atomic_wrapper.h"
+
+/** Possible status values for "mon_status" in "struct monitor_value" */
+enum monitor_running_status {
+	MONITOR_STARTED = 1,	/*!< Monitor has been turned on */
+	MONITOR_STOPPED = 2	/*!< Monitor has been turned off */
+};
+
+typedef enum monitor_running_status	monitor_running_t;
+
+/** Monitor counter value type */
+typedef	int64_t	mon_type_t;
+
+/** Two monitor structures are defined in this file. One is
+"monitor_value_t" which contains dynamic counter values for each
+counter. The other is "monitor_info_t", which contains
+static information (counter name, desc etc.) for each counter.
+In addition, an enum datatype "monitor_id_t" is also defined,
+it identifies each monitor with an internally used symbol, whose
+integer value indexes into the above two structures for its dynamic
+and static information.
+Developers who intend to add new counters are required to fill in
+the counter information as described in "monitor_info_t" and to
+create the internal counter ID in "monitor_id_t". */
+
+/** Structure containing the actual values of a monitor counter. */
+struct monitor_value_t {
+	time_t	mon_start_time;	/*!< Start time of monitoring  */
+	time_t	mon_stop_time;	/*!< Stop time of monitoring */
+	time_t	mon_reset_time;	/*!< Time of resetting the counter */
+	mon_type_t	mon_value;	/*!< Current counter Value */
+	mon_type_t	mon_max_value;	/*!< Current Max value */
+	mon_type_t	mon_min_value;	/*!< Current Min value */
+	mon_type_t	mon_value_reset;/*!< value at last reset */
+	mon_type_t	mon_max_value_start; /*!< Max value since start */
+	mon_type_t	mon_min_value_start; /*!< Min value since start */
+	mon_type_t	mon_start_value;/*!< Value at the start time */
+	mon_type_t	mon_last_value;	/*!< Last set of values */
+	monitor_running_t mon_status;	/* whether monitor still running */
+};
+
+/** Following defines are possible values for "monitor_type" field in
+"struct monitor_info" */
+enum monitor_type_t {
+	MONITOR_NONE = 0,	/*!< No monitoring */
+	MONITOR_MODULE = 1,	/*!< This is a monitor module type,
+				not a counter */
+	MONITOR_EXISTING = 2,	/*!< The monitor carries information from
+				an existing system status variable */
+	MONITOR_NO_AVERAGE = 4,	/*!< Set this status if we don't want to
+				calculate the average value for the counter */
+	MONITOR_DISPLAY_CURRENT = 8, /*!< Display current value of the
+				counter, rather than incremental value
+				over the period. Mostly for counters
+				displaying current resource usage */
+	MONITOR_GROUP_MODULE = 16, /*!< Monitor can be turned on/off
+				only as a module, but not individually */
+	MONITOR_DEFAULT_ON = 32,/*!< Monitor will be turned on by default at
+				server start up */
+	MONITOR_SET_OWNER = 64,	/*!< Owner of "monitor set", a set of
+				monitor counters */
+	MONITOR_SET_MEMBER = 128,/*!< Being part of a "monitor set" */
+	MONITOR_HIDDEN = 256	/*!< Do not display this monitor in the
+				metrics table */
+};
+
+/** Counter minimum value is initialized to be max value of
+ mon_type_t (int64_t) */
+#ifndef INT64_MAX
+#define INT64_MAX		(9223372036854775807LL)
+#endif
+#ifndef INT64_MIN
+#define INT64_MIN		(-9223372036854775807LL-1)
+#endif
+#define	MIN_RESERVED		INT64_MAX
+#define	MAX_RESERVED		INT64_MIN
+
+/** This enumeration defines the internal monitor identifier used to
+identify each particular counter. Its value indexes into two arrays,
+one is the "innodb_counter_value" array which records actual monitor
+counter values, the other is "innodb_counter_info" array which describes
+each counter's basic information (name, desc etc.). A couple of
+naming rules here:
+1) If the monitor defines a module, it starts with MONITOR_MODULE
+2) If the monitor uses existing counters from "status variable", its ID
+name shall start with MONITOR_OVLD
+
+Please refer to "innodb_counter_info" in srv/srv0mon.cc for detailed
+information for each monitor counter */
+
+enum monitor_id_t {
+	/* This is to identify the default value set by the metrics
+	control global variables */
+	MONITOR_DEFAULT_START = 0,
+
+	/* Start of Metadata counter */
+	MONITOR_MODULE_METADATA,
+	MONITOR_TABLE_OPEN,
+
+	/* Lock manager related counters */
+	MONITOR_MODULE_LOCK,
+	MONITOR_DEADLOCK,
+	MONITOR_TIMEOUT,
+	MONITOR_LOCKREC_WAIT,
+	MONITOR_TABLELOCK_WAIT,
+	MONITOR_NUM_RECLOCK_REQ,
+	MONITOR_RECLOCK_CREATED,
+	MONITOR_RECLOCK_REMOVED,
+	MONITOR_NUM_RECLOCK,
+	MONITOR_TABLELOCK_CREATED,
+	MONITOR_TABLELOCK_REMOVED,
+	MONITOR_NUM_TABLELOCK,
+	MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT,
+	MONITOR_OVLD_LOCK_WAIT_TIME,
+	MONITOR_OVLD_LOCK_MAX_WAIT_TIME,
+	MONITOR_OVLD_ROW_LOCK_WAIT,
+	MONITOR_OVLD_LOCK_AVG_WAIT_TIME,
+
+	/* Buffer and I/O related counters.
*/ + MONITOR_MODULE_BUFFER, + MONITOR_OVLD_BUFFER_POOL_SIZE, + MONITOR_OVLD_BUF_POOL_READS, + MONITOR_OVLD_BUF_POOL_READ_REQUESTS, + MONITOR_OVLD_BUF_POOL_WRITE_REQUEST, + MONITOR_OVLD_BUF_POOL_WAIT_FREE, + MONITOR_OVLD_BUF_POOL_READ_AHEAD, + MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED, + MONITOR_OVLD_BUF_POOL_PAGE_TOTAL, + MONITOR_OVLD_BUF_POOL_PAGE_MISC, + MONITOR_OVLD_BUF_POOL_PAGES_DATA, + MONITOR_OVLD_BUF_POOL_BYTES_DATA, + MONITOR_OVLD_BUF_POOL_PAGES_DIRTY, + MONITOR_OVLD_BUF_POOL_BYTES_DIRTY, + MONITOR_OVLD_BUF_POOL_PAGES_FREE, + MONITOR_OVLD_PAGE_CREATED, + MONITOR_OVLD_PAGES_WRITTEN, + MONITOR_OVLD_PAGES_READ, + MONITOR_OVLD_BYTE_READ, + MONITOR_OVLD_BYTE_WRITTEN, + MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, + MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_COUNT, + MONITOR_FLUSH_NEIGHBOR_PAGES, + MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, + + MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, + MONITOR_FLUSH_ADAPTIVE_AVG_TIME, + + MONITOR_FLUSH_ADAPTIVE_AVG_PASS, + + MONITOR_LRU_GET_FREE_LOOPS, + MONITOR_LRU_GET_FREE_WAITS, + + MONITOR_FLUSH_AVG_PAGE_RATE, + MONITOR_FLUSH_LSN_AVG_RATE, + MONITOR_FLUSH_PCT_FOR_DIRTY, + MONITOR_FLUSH_PCT_FOR_LSN, + MONITOR_FLUSH_SYNC_WAITS, + MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, + MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT, + MONITOR_LRU_GET_FREE_SEARCH, + MONITOR_LRU_SEARCH_SCANNED, + MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_SEARCH_SCANNED_PER_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, + + /* Buffer Page I/O specific counters. 
*/ + MONITOR_MODULE_BUF_PAGE, + MONITOR_INDEX_LEAF_PAGE_READ, + MONITOR_INDEX_NON_LEAF_PAGE_READ, + MONITOR_INDEX_IBUF_LEAF_PAGE_READ, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ, + MONITOR_UNDO_LOG_PAGE_READ, + MONITOR_INODE_PAGE_READ, + MONITOR_IBUF_FREELIST_PAGE_READ, + MONITOR_IBUF_BITMAP_PAGE_READ, + MONITOR_SYSTEM_PAGE_READ, + MONITOR_TRX_SYSTEM_PAGE_READ, + MONITOR_FSP_HDR_PAGE_READ, + MONITOR_XDES_PAGE_READ, + MONITOR_BLOB_PAGE_READ, + MONITOR_ZBLOB_PAGE_READ, + MONITOR_ZBLOB2_PAGE_READ, + MONITOR_OTHER_PAGE_READ, + MONITOR_INDEX_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN, + MONITOR_UNDO_LOG_PAGE_WRITTEN, + MONITOR_INODE_PAGE_WRITTEN, + MONITOR_IBUF_FREELIST_PAGE_WRITTEN, + MONITOR_IBUF_BITMAP_PAGE_WRITTEN, + MONITOR_SYSTEM_PAGE_WRITTEN, + MONITOR_TRX_SYSTEM_PAGE_WRITTEN, + MONITOR_FSP_HDR_PAGE_WRITTEN, + MONITOR_XDES_PAGE_WRITTEN, + MONITOR_BLOB_PAGE_WRITTEN, + MONITOR_ZBLOB_PAGE_WRITTEN, + MONITOR_ZBLOB2_PAGE_WRITTEN, + MONITOR_OTHER_PAGE_WRITTEN, + + /* OS level counters (I/O) */ + MONITOR_MODULE_OS, + MONITOR_OVLD_OS_FILE_READ, + MONITOR_OVLD_OS_FILE_WRITE, + MONITOR_OVLD_OS_FSYNC, + MONITOR_OS_PENDING_READS, + MONITOR_OS_PENDING_WRITES, + MONITOR_OVLD_OS_LOG_WRITTEN, + + /* Transaction related counters */ + MONITOR_MODULE_TRX, + MONITOR_TRX_RW_COMMIT, + MONITOR_TRX_RO_COMMIT, + MONITOR_TRX_NL_RO_COMMIT, + MONITOR_TRX_COMMIT_UNDO, + MONITOR_TRX_ROLLBACK, + MONITOR_TRX_ROLLBACK_SAVEPOINT, + MONITOR_RSEG_HISTORY_LEN, + MONITOR_NUM_UNDO_SLOT_USED, + MONITOR_NUM_UNDO_SLOT_CACHED, + MONITOR_RSEG_CUR_SIZE, + + /* Purge related counters */ + MONITOR_MODULE_PURGE, + MONITOR_N_DEL_ROW_PURGE, + MONITOR_N_UPD_EXIST_EXTERN, + MONITOR_PURGE_INVOKED, + MONITOR_PURGE_N_PAGE_HANDLED, + MONITOR_DML_PURGE_DELAY, + MONITOR_PURGE_STOP_COUNT, + MONITOR_PURGE_RESUME_COUNT, + + /* Recovery related counters */ + MONITOR_MODULE_RECOVERY, + MONITOR_OVLD_CHECKPOINTS, + MONITOR_OVLD_LSN_FLUSHDISK, + MONITOR_OVLD_LSN_CHECKPOINT, + MONITOR_OVLD_LSN_CURRENT, + MONITOR_LSN_CHECKPOINT_AGE, + MONITOR_OVLD_BUF_OLDEST_LSN, + MONITOR_OVLD_MAX_AGE_ASYNC, + MONITOR_OVLD_LOG_WAITS, + MONITOR_OVLD_LOG_WRITE_REQUEST, + MONITOR_OVLD_LOG_WRITES, + + /* Page Manager related counters */ + MONITOR_MODULE_PAGE, + MONITOR_PAGE_COMPRESS, + MONITOR_PAGE_DECOMPRESS, + MONITOR_PAD_INCREMENTS, + MONITOR_PAD_DECREMENTS, + /* New monitor variables for page compression */ + MONITOR_OVLD_PAGE_COMPRESS_SAVED, + MONITOR_OVLD_PAGES_PAGE_COMPRESSED, + MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP, + MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED, + MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR, + + /* New monitor variables for page encryption */ + MONITOR_OVLD_PAGES_ENCRYPTED, + MONITOR_OVLD_PAGES_DECRYPTED, + + /* Index related counters */ + MONITOR_MODULE_INDEX, + MONITOR_INDEX_SPLIT, + MONITOR_INDEX_MERGE_ATTEMPTS, + MONITOR_INDEX_MERGE_SUCCESSFUL, + MONITOR_INDEX_REORG_ATTEMPTS, + MONITOR_INDEX_REORG_SUCCESSFUL, + MONITOR_INDEX_DISCARD, + +#ifdef BTR_CUR_HASH_ADAPT + /* Adaptive Hash Index related counters */ + MONITOR_MODULE_ADAPTIVE_HASH, + MONITOR_OVLD_ADAPTIVE_HASH_SEARCH, + MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE, + MONITOR_ADAPTIVE_HASH_PAGE_ADDED, + MONITOR_ADAPTIVE_HASH_PAGE_REMOVED, + MONITOR_ADAPTIVE_HASH_ROW_ADDED, + MONITOR_ADAPTIVE_HASH_ROW_REMOVED, + MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND, + MONITOR_ADAPTIVE_HASH_ROW_UPDATED, +#endif /* BTR_CUR_HASH_ADAPT */ + + /* Tablespace related counters */ + MONITOR_MODULE_FIL_SYSTEM, + 
MONITOR_OVLD_N_FILE_OPENED,
+
+	/* InnoDB Change Buffer related counters */
+	MONITOR_MODULE_IBUF_SYSTEM,
+	MONITOR_OVLD_IBUF_MERGE_INSERT,
+	MONITOR_OVLD_IBUF_MERGE_DELETE,
+	MONITOR_OVLD_IBUF_MERGE_PURGE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE,
+	MONITOR_OVLD_IBUF_MERGES,
+	MONITOR_OVLD_IBUF_SIZE,
+
+	/* Counters for server operations */
+	MONITOR_MODULE_SERVER,
+	MONITOR_MASTER_THREAD_SLEEP,
+	MONITOR_OVLD_SERVER_ACTIVITY,
+	MONITOR_MASTER_ACTIVE_LOOPS,
+	MONITOR_MASTER_IDLE_LOOPS,
+	MONITOR_SRV_LOG_FLUSH_MICROSECOND,
+	MONITOR_SRV_DICT_LRU_MICROSECOND,
+	MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE,
+	MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE,
+	MONITOR_OVLD_SRV_DBLWR_WRITES,
+	MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
+	MONITOR_OVLD_SRV_PAGE_SIZE,
+
+	/* Data DDL related counters */
+	MONITOR_MODULE_DDL_STATS,
+	MONITOR_BACKGROUND_DROP_INDEX,
+	MONITOR_ONLINE_CREATE_INDEX,
+	MONITOR_PENDING_ALTER_TABLE,
+	MONITOR_ALTER_TABLE_SORT_FILES,
+	MONITOR_ALTER_TABLE_LOG_FILES,
+
+	MONITOR_MODULE_ICP,
+	MONITOR_ICP_ATTEMPTS,
+	MONITOR_ICP_NO_MATCH,
+	MONITOR_ICP_OUT_OF_RANGE,
+	MONITOR_ICP_MATCH,
+
+	/* This is used only for control system to turn
+	on/off and reset all monitor counters */
+	MONITOR_ALL_COUNTER,
+
+	/* This must be the last member */
+	NUM_MONITOR
+};
+
+/** This informs the monitor control system to turn
+on/off and reset monitor counters through wild card match */
+#define	MONITOR_WILDCARD_MATCH		(NUM_MONITOR + 1)
+
+/** Cannot find monitor counter with a specified name */
+#define	MONITOR_NO_MATCH		(NUM_MONITOR + 2)
+
+/** struct monitor_info describes the basic/static information
+about each monitor counter. */
+struct monitor_info_t {
+	const char*	monitor_name;	/*!< Monitor name */
+	const char*	monitor_module;	/*!< Sub Module the monitor
+					belongs to */
+	const char*	monitor_desc;	/*!< Brief desc of monitor counter */
+	monitor_type_t	monitor_type;	/*!< Type of Monitor Info */
+	monitor_id_t	monitor_related_id;/*!< Monitor ID of counter that
+					related to this monitor. This is
+					set when the monitor belongs to
+					a "monitor set" */
+	monitor_id_t	monitor_id;	/*!< Monitor ID as defined in enum
+					monitor_id_t */
+};
+
+/** Following are the "set_option" values allowed for
+srv_mon_process_existing_counter() and the other monitor control
+functions, to turn on/off/reset the monitor counters. */
+enum mon_option_t {
+	MONITOR_TURN_ON = 1,		/*!< Turn on the counter */
+	MONITOR_TURN_OFF,		/*!< Turn off the counter */
+	MONITOR_RESET_VALUE,		/*!< Reset current values */
+	MONITOR_RESET_ALL_VALUE,	/*!< Reset all values */
+	MONITOR_GET_VALUE		/*!< Option for
+					srv_mon_process_existing_counter()
+					function */
+};
+
+/** Number of bits in a ulint datatype */
+#define	NUM_BITS_ULINT	(sizeof(ulint) * CHAR_BIT)
+
+/** This "monitor_set_tbl" is a bitmap that records whether a particular
+monitor counter has been turned on or off */
+extern Atomic_relaxed<ulint>
+	monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) / NUM_BITS_ULINT];
+
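(Editorial aside: each counter occupies one bit of a ulint word, so on a
64-bit build counter id 70 lives in monitor_set_tbl[70 / 64] == [1], at bit
70 % 64 == 6. With the macros defined just below, for some monitor id "id":)

	MONITOR_ON(id);			/* word |= 1 << bit */
	if (MONITOR_IS_ON(id)) { /* ... */ }
	MONITOR_OFF(id);		/* word &= ~(1 << bit) */

+/** Macros to turn on/off the control bit in monitor_set_tbl for a monitor
+counter option.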
 */
+#define MONITOR_ON(monitor)	\
+	(monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_or(	\
+		(ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+#define MONITOR_OFF(monitor)	\
+	(monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_and(	\
+		~(ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+/** Check whether the requested monitor is turned on/off */
+#define MONITOR_IS_ON(monitor)	\
+	(monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT] &	\
+	 (ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT)))
+
+/** The actual monitor counter array that records each monitor counter
+value */
+extern monitor_value_t	 innodb_counter_value[NUM_MONITOR];
+
+/** Following are macro defines for basic monitor counter manipulations.
+Please note we do not provide any synchronization for these monitor
+operations due to performance consideration. Most counters can
+be placed under existing mutex protections in respective code
+module. */
+
+/** Macros to access various fields of a monitor counters */
+#define MONITOR_FIELD(monitor, field)			\
+	(innodb_counter_value[monitor].field)
+
+#define MONITOR_VALUE(monitor)				\
+	MONITOR_FIELD(monitor, mon_value)
+
+#define MONITOR_MAX_VALUE(monitor)			\
+	MONITOR_FIELD(monitor, mon_max_value)
+
+#define MONITOR_MIN_VALUE(monitor)			\
+	MONITOR_FIELD(monitor, mon_min_value)
+
+#define MONITOR_VALUE_RESET(monitor)			\
+	MONITOR_FIELD(monitor, mon_value_reset)
+
+#define MONITOR_MAX_VALUE_START(monitor)		\
+	MONITOR_FIELD(monitor, mon_max_value_start)
+
+#define MONITOR_MIN_VALUE_START(monitor)		\
+	MONITOR_FIELD(monitor, mon_min_value_start)
+
+#define MONITOR_LAST_VALUE(monitor)			\
+	MONITOR_FIELD(monitor, mon_last_value)
+
+#define MONITOR_START_VALUE(monitor)			\
+	MONITOR_FIELD(monitor, mon_start_value)
+
+#define MONITOR_VALUE_SINCE_START(monitor)		\
+	(MONITOR_VALUE(monitor) + MONITOR_VALUE_RESET(monitor))
+
+#define MONITOR_STATUS(monitor)				\
+	MONITOR_FIELD(monitor, mon_status)
+
+#define	MONITOR_SET_START(monitor)					\
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STARTED;		\
+		MONITOR_FIELD((monitor), mon_start_time) = time(NULL);	\
+	} while (0)
+
+#define	MONITOR_SET_OFF(monitor)					\
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STOPPED;		\
+		MONITOR_FIELD((monitor), mon_stop_time) = time(NULL);	\
+	} while (0)
+
+#define	MONITOR_INIT_ZERO_VALUE		0
+
+/** Max and min values are initialized when we first turn on the monitor
+counter, and set the MONITOR_STATUS. */
+#define MONITOR_MAX_MIN_NOT_INIT(monitor)				\
+	(MONITOR_STATUS(monitor) == MONITOR_INIT_ZERO_VALUE		\
+	 && MONITOR_MIN_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE	\
+	 && MONITOR_MAX_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE)
+
+#define MONITOR_INIT(monitor)						\
+	if (MONITOR_MAX_MIN_NOT_INIT(monitor)) {			\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+	}
+
+/** Macros to increment/decrement the counters. The normal
+monitor counter operation expects appropriate synchronization
+already exists. No additional mutex is necessary when operating
+on the counters */
+#define	MONITOR_INC(monitor)						\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor)++;				\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
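(Editorial aside, not part of the upstream patch: a typical call site. The
counter ids are from monitor_id_t above, and the mutex that makes the plain
macros safe is assumed to be held by the caller.)

	MONITOR_INC(MONITOR_DEADLOCK);		/* +1 if the counter is on */
	MONITOR_INC_VALUE(MONITOR_RSEG_CUR_SIZE, 4); /* add 4; defined below */
	if (MONITOR_IS_ON(MONITOR_TABLE_OPEN)) {
		MONITOR_INC_NOCHECK(MONITOR_TABLE_OPEN); /* bit already checked */
	}

+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.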
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor	monitor to be incremented by 1
+@param enabled	whether the monitor is enabled */
+#define MONITOR_ATOMIC_INC_LOW(monitor, enabled)			\
+	if (enabled) {							\
+		ib_uint64_t	value;					\
+		value  = my_atomic_add64_explicit(			\
+			(int64*) &MONITOR_VALUE(monitor), 1,		\
+			MY_MEMORY_ORDER_RELAXED) + 1;			\
+		/* Note: This is not 100% accurate because of the	\
+		inherent race; we ignore it due to performance. */	\
+		if (value > (ib_uint64_t) MONITOR_MAX_VALUE(monitor)) {	\
+			MONITOR_MAX_VALUE(monitor) = value;		\
+		}							\
+	}
+
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor	monitor to be decremented by 1
+@param enabled	whether the monitor is enabled */
+#define MONITOR_ATOMIC_DEC_LOW(monitor, enabled)			\
+	if (enabled) {							\
+		ib_uint64_t	value;					\
+		value = my_atomic_add64_explicit(			\
+			(int64*) &MONITOR_VALUE(monitor), -1,		\
+			MY_MEMORY_ORDER_RELAXED) - 1;			\
+		/* Note: This is not 100% accurate because of the	\
+		inherent race; we ignore it due to performance. */	\
+		if (value < (ib_uint64_t) MONITOR_MIN_VALUE(monitor)) {	\
+			MONITOR_MIN_VALUE(monitor) = value;		\
+		}							\
+	}
+
+/** Atomically increment a monitor counter if it is enabled.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor	monitor to be incremented by 1 */
+#define MONITOR_ATOMIC_INC(monitor)					\
+	MONITOR_ATOMIC_INC_LOW(monitor, MONITOR_IS_ON(monitor))
+/** Atomically decrement a monitor counter if it is enabled.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor	monitor to be decremented by 1 */
+#define MONITOR_ATOMIC_DEC(monitor)					\
+	MONITOR_ATOMIC_DEC_LOW(monitor, MONITOR_IS_ON(monitor))
+
+#define	MONITOR_DEC(monitor)						\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor)--;				\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+#ifdef HAVE_MEM_CHECK
+# define MONITOR_CHECK_DEFINED(value) do {		\
+	mon_type_t m __attribute__((unused))= value;	\
+	MEM_CHECK_DEFINED(&m, sizeof m);		\
+} while (0)
+#else /* HAVE_MEM_CHECK */
+# define MONITOR_CHECK_DEFINED(value) (void) 0
+#endif /* HAVE_MEM_CHECK */
+
+#define	MONITOR_INC_VALUE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+#define	MONITOR_DEC_VALUE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value));	\
+		MONITOR_VALUE(monitor) -= (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/* Increment/decrement a counter without checking the monitor on/off bit,
+which could already have been checked as a module group */
+#define	MONITOR_INC_NOCHECK(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor)++;				\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	} while (0)
+
+#define	MONITOR_DEC_NOCHECK(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor)--;				\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	} while (0)
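+/* Editor's illustrative sketch (not part of the upstream header) of the
+pattern MONITOR_ATOMIC_INC_LOW above uses: the counter itself is updated with
+a relaxed atomic add, while the max-value bookkeeping is left racy on purpose,
+trading exactness for speed.  Standalone stand-in types, compiled out. */
+#if 0
+# include <atomic>
+# include <cstdint>
+static std::atomic<std::uint64_t> counter{0};
+static std::uint64_t counter_max = 0;	/* intentionally unsynchronized */
+
+static void racy_inc()
+{
+	std::uint64_t value =
+		counter.fetch_add(1, std::memory_order_relaxed) + 1;
+	/* two threads can interleave here, so counter_max may lag;
+	the monitor code accepts that inaccuracy */
+	if (value > counter_max) {
+		counter_max = value;
+	}
+}
+#endif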
+/** Directly set a monitor counter's value */
+#define	MONITOR_SET(monitor, value)					\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Add the time difference between now and the input "value" (in
+microseconds) to the monitor counter
+@param monitor	monitor to update for the time difference
+@param value	the start time value; it is updated to the current time */
+#define	MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value)			\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		uintmax_t	old_time = value;			\
+		value = microsecond_interval_timer();			\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value - old_time);\
+	}
+
+/** This macro updates 3 counters in one call. However, it only checks the
+main/first monitor counter 'monitor', to see whether it is on or off, to
+decide whether to do the update.
+@param monitor		the main monitor counter to update. It accounts for
+			the accumulative value for the counter.
+@param monitor_n_calls	counter that counts the number of times this macro is
+			called
+@param monitor_per_call	counter that records the current and max value of
+			each incremental value
+@param value		incremental value to record this time */
+#define MONITOR_INC_VALUE_CUMULATIVE(					\
+		monitor, monitor_n_calls, monitor_per_call, value)	\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor_n_calls)++;			\
+		MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor_per_call)			\
+		    > MONITOR_MAX_VALUE(monitor_per_call)) {		\
+			MONITOR_MAX_VALUE(monitor_per_call) =		\
+				 (mon_type_t) (value);			\
+		}							\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Directly set a monitor counter's value, and if the value
+is monotonically increasing, only the max value needs to be updated */
+#define	MONITOR_SET_UPD_MAX_ONLY(monitor, value)			\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Some values, such as the log sequence number, are monotonically
+increasing numbers and do not need to record max/min values */
+#define MONITOR_SET_SIMPLE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+	}
+
+/** Reset the monitor value and max/min values to zero. The reset
+operation is only conducted when the counter is turned off */
+#define MONITOR_RESET_ALL(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_VALUE_RESET(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_start_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_stop_time) =			\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_reset_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+	} while (0)
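+/* Editor's illustrative sketch (not part of the upstream header): how the
+time-accounting macro above is meant to be driven.  Note that it overwrites
+its "value" argument with the current timer reading, so the same variable can
+be reused across iterations.  The surrounding function is hypothetical. */
+#if 0
+static void example_timed_work()
+{
+	uintmax_t t = microsecond_interval_timer();	/* start stamp */
+	/* ... do the work being measured ... */
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_LOG_FLUSH_MICROSECOND, t);
+	/* t now holds the new timer reading for the next interval */
+}
+#endif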
+/** The following four macros define the operations needed to fetch and
+consolidate information from existing system status variables. */
+
+/** Save the passed-in value to the mon_start_value field of the monitor
+counter */
+#define MONITOR_SAVE_START(monitor, value) do {				\
+	MONITOR_CHECK_DEFINED(value);					\
+	(MONITOR_START_VALUE(monitor) =					\
+		(mon_type_t) (value) - MONITOR_VALUE_RESET(monitor));	\
+	} while (0)
+
+/** Save the current counter value to the mon_last_value field of the
+monitor counter, and accumulate it into mon_start_value */
+#define MONITOR_SAVE_LAST(monitor)					\
+	do {								\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_VALUE(monitor);	\
+		MONITOR_START_VALUE(monitor) += MONITOR_VALUE(monitor);	\
+	} while (0)
+
+/** Set the monitor value to the difference of value and mon_start_value,
+compensated by mon_last_value if an accumulated value is required. */
+#define MONITOR_SET_DIFF(monitor, value)				\
+	MONITOR_SET_UPD_MAX_ONLY(monitor, ((value)			\
+	- MONITOR_VALUE_RESET(monitor)					\
+	- MONITOR_FIELD(monitor, mon_start_value)			\
+	+ MONITOR_FIELD(monitor, mon_last_value)))
+
+/****************************************************************//**
+Get a monitor's monitor_info_t by its monitor id (index into the
+innodb_counter_info array)
+@return pointer to the corresponding monitor_info_t, or NULL if no such
+monitor */
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+/****************************************************************//**
+Get a monitor's name by its monitor id (index into the
+innodb_counter_info array)
+@return corresponding monitor name, or NULL if no such
+monitor */
+const char*
+srv_mon_get_name(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+
+/****************************************************************//**
+Turn on/off/reset monitor counters in a module. If module_id
+is set to NUM_MONITOR, this applies to all monitor counters. */
+void
+srv_mon_set_module_control(
+/*=======================*/
+	monitor_id_t	module_id,	/*!< in: Module ID as in
+					monitor_counter_id. If it is
+					set to NUM_MONITOR, this means
+					we shall turn on all the counters */
+	mon_option_t	set_option);	/*!< in: Turn on/off/reset the
+					counter */
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+a mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and doing the appropriate
+mathematics to deduce the actual value. */
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+	monitor_id_t	monitor_id,	/*!< in: the monitor's ID as in
+					monitor_counter_id */
+	mon_option_t	set_option);	/*!< in: Turn on/off/reset the
+					counter */
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of the monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
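+/* Editor's worked example (not part of the upstream header) of the
+MONITOR_SAVE_START / MONITOR_SET_DIFF bookkeeping above.  Suppose a status
+variable that only ever grows reads 1000 when its monitor is turned on, and
+1500 at the next poll.  Hypothetical standalone arithmetic: */
+#if 0
+# include <cassert>
+int main()
+{
+	long start_value = 0, last_value = 0, value_reset = 0;
+	/* MONITOR_SAVE_START at turn-on time, raw reading = 1000 */
+	start_value = 1000 - value_reset;	/* = 1000 */
+	/* MONITOR_SET_DIFF at poll time, raw reading = 1500 */
+	long shown = 1500 - value_reset - start_value + last_value;
+	assert(shown == 500);	/* only the delta since turn-on is reported */
+	return 0;
+}
+#endif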
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of the monitor counter
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+Reset a monitor, creating a new base line with the current monitor
+value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */
+void
+srv_mon_reset(
+/*==========*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+void
+srv_mon_default_on(void);
+/*====================*/
+
+#include "srv0mon.inl"
+
+#endif
diff --git a/storage/innobase/include/srv0mon.inl b/storage/innobase/include/srv0mon.inl
new file mode 100644
index 00000000..158345b2
--- /dev/null
+++ b/storage/innobase/include/srv0mon.inl
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/srv0mon.inl
+Server monitoring system
+
+Created 1/20/2010 Jimmy Yang
+************************************************************************/
+
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of the monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	if (MONITOR_MAX_VALUE_START(monitor) == MAX_RESERVED) {
+
+		/* MONITOR_MAX_VALUE_START has not yet been
+		initialized; the max value since start is the
+		max count in MONITOR_MAX_VALUE */
+		MONITOR_MAX_VALUE_START(monitor) =
+			MONITOR_MAX_VALUE(monitor);
+
+	} else if (MONITOR_MAX_VALUE(monitor) != MAX_RESERVED
+		   && (MONITOR_MAX_VALUE(monitor)
+		       + MONITOR_VALUE_RESET(monitor)
+		       > MONITOR_MAX_VALUE_START(monitor))) {
+
+		/* If the max value since reset (as specified
+		in MONITOR_MAX_VALUE) plus the reset value is
+		larger than MONITOR_MAX_VALUE_START, reset
+		MONITOR_MAX_VALUE_START to this new max value */
+		MONITOR_MAX_VALUE_START(monitor) =
+			 MONITOR_MAX_VALUE(monitor)
+			 + MONITOR_VALUE_RESET(monitor);
+	}
+
+	return(MONITOR_MAX_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of the monitor counter
+@return min counter value since start.
*/ +UNIV_INLINE +mon_type_t +srv_mon_calc_min_since_start( +/*=========================*/ + monitor_id_t monitor) /*!< in: monitor id */ +{ + if (MONITOR_MIN_VALUE_START(monitor) == MIN_RESERVED) { + + /* MONITOR_MIN_VALUE_START has not yet been + initialized, the min value since start is the + min count in MONITOR_MIN_VALUE */ + MONITOR_MIN_VALUE_START(monitor) = + MONITOR_MIN_VALUE(monitor); + + } else if (MONITOR_MIN_VALUE(monitor) != MIN_RESERVED + && (MONITOR_MIN_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor) + < MONITOR_MIN_VALUE_START(monitor))) { + + /* If the min value since reset (as specified + in MONITOR_MIN_VALUE) plus the reset value is + less than MONITOR_MIN_VALUE_START, reset + MONITOR_MIN_VALUE_START to this new min value */ + MONITOR_MIN_VALUE_START(monitor) = + MONITOR_MIN_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor); + } + + return(MONITOR_MIN_VALUE_START(monitor)); +} + +/*************************************************************//** +This function resets all values of a monitor counter */ +UNIV_INLINE +void +srv_mon_reset_all( +/*==============*/ + monitor_id_t monitor) /*!< in: monitor id */ +{ + /* Do not reset all counter values if monitor is still on. */ + if (MONITOR_IS_ON(monitor)) { + fprintf(stderr, "InnoDB: Cannot reset all values for" + " monitor counter %s while it is on. Please" + " turn it off and retry.\n", + srv_mon_get_name(monitor)); + } else { + MONITOR_RESET_ALL(monitor); + } +} diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h new file mode 100644 index 00000000..db846795 --- /dev/null +++ b/storage/innobase/include/srv0srv.h @@ -0,0 +1,715 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2008, 2009, Google Inc. +Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2023, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.h
+The server main program
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0log.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "fil0fil.h"
+#include "ut0counter.h"
+
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+#include <tpool.h>
+#include <memory>
+
+/** Simple non-atomic counter
+@tparam Type  the integer type of the counter */
+template <typename Type>
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) simple_counter
+{
+  /** Increment the counter */
+  Type inc() { return add(1); }
+  /** Decrement the counter */
+  Type dec() { return add(Type(~0)); }
+
+  /** Add to the counter
+  @param i  amount to be added
+  @return the value of the counter after adding */
+  Type add(Type i) { return m_counter += i; }
+
+  /** @return the value of the counter */
+  operator Type() const { return m_counter; }
+
+private:
+  /** The counter */
+  Type m_counter;
+};
+
+/** Global counters used inside InnoDB. */
+struct srv_stats_t
+{
+	typedef ib_counter_t<ulint> ulint_ctr_n_t;
+	typedef simple_counter<lsn_t> lsn_ctr_1_t;
+	typedef simple_counter<ulint> ulint_ctr_1_t;
+	typedef simple_counter<int64_t> int64_ctr_1_t;
+
+	/** Count the amount of data written in total (in bytes) */
+	ulint_ctr_1_t		data_written;
+	/** Number of bytes saved by page compression */
+	ulint_ctr_n_t		page_compression_saved;
+	/** Number of pages compressed with page compression */
+	ulint_ctr_n_t		pages_page_compressed;
+	/** Number of TRIM operations induced by page compression */
+	ulint_ctr_n_t		page_compressed_trim_op;
+	/** Number of pages decompressed with page compression */
+	ulint_ctr_n_t		pages_page_decompressed;
+	/** Number of page compression errors */
+	ulint_ctr_n_t		pages_page_compression_error;
+	/** Number of pages encrypted */
+	ulint_ctr_n_t		pages_encrypted;
+	/** Number of pages decrypted */
+	ulint_ctr_n_t		pages_decrypted;
+	/** Number of merge blocks encrypted */
+	ulint_ctr_n_t		n_merge_blocks_encrypted;
+	/** Number of merge blocks decrypted */
+	ulint_ctr_n_t		n_merge_blocks_decrypted;
+	/** Number of row log blocks encrypted */
+	ulint_ctr_n_t		n_rowlog_blocks_encrypted;
+	/** Number of row log blocks decrypted */
+	ulint_ctr_n_t		n_rowlog_blocks_decrypted;
+
+	/** Number of data bytes read in total */
+	ulint_ctr_1_t		data_read;
+
+	/** Number of encryption_get_latest_key_version calls */
+	ulint_ctr_n_t		n_key_requests;
+
+	/** Number of temporary tablespace blocks encrypted */
+	ulint_ctr_n_t		n_temp_blocks_encrypted;
+
+	/** Number of temporary tablespace blocks decrypted */
+	ulint_ctr_n_t		n_temp_blocks_decrypted;
+};
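+/* Editor's illustrative sketch (not part of the upstream header): the
+simple_counter template above is a plain, cache-line-aligned accumulator with
+no atomicity; srv_stats_t pairs it with ib_counter_t where sharded counting
+is wanted.  A hypothetical standalone use: */
+#if 0
+static simple_counter<unsigned long> bytes_written;
+
+static void example_account_write(unsigned long n)
+{
+	bytes_written.add(n);	/* caller provides any needed locking */
+	unsigned long so_far = bytes_written;	/* operator Type() reads it */
+	(void) so_far;
+}
+#endif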
+/** We are prepared for a situation where we have this many threads waiting
+for a transactional lock inside InnoDB. srv_start() sets the value. */
+extern ulint	srv_max_n_threads;
+
+extern const char*	srv_main_thread_op_info;
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+extern const char	srv_mysql50_table_name_prefix[10];
+
+/** The buffer pool dump/load file name */
+#define SRV_BUF_DUMP_FILENAME_DEFAULT	"ib_buffer_pool"
+extern char*	srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+extern char	srv_buffer_pool_dump_at_shutdown;
+extern char	srv_buffer_pool_load_at_startup;
+
+/* Whether to disable the file system cache, if one is defined */
+extern char	srv_disable_sort_file_cache;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT	(srv_sys_space.get_autoextend_increment())
+
+/** Mutex protecting page_zip_stat_per_index */
+extern mysql_mutex_t	page_zip_stat_per_index_mutex;
+/** Mutex for locking srv_monitor_file */
+extern mysql_mutex_t	srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+extern FILE*	srv_monitor_file;
+/** Mutex for locking srv_misc_tmpfile */
+extern mysql_mutex_t	srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+extern FILE*	srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char*	srv_data_home;
+
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+extern my_bool	srv_read_only_mode;
+/** Set if InnoDB operates in read-only mode or innodb-force-recovery
+is greater than SRV_FORCE_NO_IBUF_MERGE. */
+extern my_bool	high_level_read_only;
+/** Store each table created by a user in its own file; data
+dictionary tables are in the system tablespace 0 */
+extern my_bool	srv_file_per_table;
+
+/** Sort buffer size in index creation */
+extern ulong	srv_sort_buf_size;
+/** Maximum modification log file size for online index creation */
+extern unsigned long long	srv_online_max_size;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio.
+Currently we support native aio on windows and linux */
+extern my_bool	srv_use_native_aio;
+extern my_bool	srv_numa_interleave;
+
+/* Use atomic writes, i.e. disable the doublewrite buffer */
+extern my_bool	srv_use_atomic_writes;
+
+/* Compression algorithm */
+extern ulong	innodb_compression_algorithm;
+
+/** TRUE if the server was successfully started */
+extern bool	srv_was_started;
+
+/** Server undo tablespaces directory, can be an absolute path. */
+extern char*	srv_undo_dir;
+
+/** Number of undo tablespaces to use. */
+extern uint	srv_undo_tablespaces;
+
+/** The number of UNDO tablespaces that are active (hosting some rollback
+segment). It is quite possible that some of the tablespaces do not host
+any rollback segment, depending on the configuration used. */
+extern uint32_t	srv_undo_tablespaces_active;
+
+/** Maximum size of an undo tablespace. */
+extern unsigned long long	srv_max_undo_log_size;
+
+extern uint	srv_n_fil_crypt_threads;
+extern uint	srv_n_fil_crypt_threads_started;
+
+/** Rate at which UNDO records should be purged. */
+extern ulong	srv_purge_rseg_truncate_frequency;
+
+/** Enable or Disable Truncate of UNDO tablespace. */
+extern my_bool	srv_undo_log_truncate;
+
+/** Default size of UNDO tablespace (10MiB for innodb_page_size=16k) */
+constexpr ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
+  UNIV_PAGE_SIZE_DEF;
+
+extern char*	srv_log_group_home_dir;
+
+/** The InnoDB redo log file size, or 0 when changing the redo log format
+at startup (while disallowing writes to the redo log).
+*/
+extern ulonglong	srv_log_file_size;
+extern ulong	srv_flush_log_at_trx_commit;
+extern uint	srv_flush_log_at_timeout;
+extern my_bool	srv_adaptive_flushing;
+extern my_bool	srv_flush_sync;
+
+/** Requested size in bytes */
+extern ulint		srv_buf_pool_size;
+/** Requested buffer pool chunk size */
+extern size_t		srv_buf_pool_chunk_unit;
+/** Scan depth for LRU flush batch, i.e.: number of blocks scanned */
+extern ulong	srv_LRU_scan_depth;
+/** Whether or not to flush neighbors of a block */
+extern ulong	srv_flush_neighbors;
+/** Previously requested size */
+extern ulint	srv_buf_pool_old_size;
+/** Current size as scaling factor for the other components */
+extern ulint	srv_buf_pool_base_size;
+/** Current size in bytes */
+extern ulint	srv_buf_pool_curr_size;
+/** Dump this % of each buffer pool during BP dump */
+extern ulong	srv_buf_pool_dump_pct;
+#ifdef UNIV_DEBUG
+/** Abort load after this amount of pages */
+extern ulong	srv_buf_pool_load_pages_abort;
+#endif
+/** Lock table size in bytes */
+extern ulint	srv_lock_table_size;
+
+/** the value of innodb_checksum_algorithm */
+extern ulong	srv_checksum_algorithm;
+extern my_bool	srv_random_read_ahead;
+extern ulong	srv_read_ahead_threshold;
+extern uint	srv_n_read_io_threads;
+extern uint	srv_n_write_io_threads;
+
+/* Defragmentation. Originally Facebook's default value was 100, but
+that is too high */
+#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
+extern my_bool	srv_defragment;
+extern uint	srv_defragment_n_pages;
+extern uint	srv_defragment_stats_accuracy;
+extern uint	srv_defragment_fill_factor_n_recs;
+extern double	srv_defragment_fill_factor;
+extern uint	srv_defragment_frequency;
+extern ulonglong	srv_defragment_interval;
+
+extern uint	srv_change_buffer_max_size;
+
+/* Number of IO operations per second the server can do */
+extern ulong	srv_io_capacity;
+
+/* We use this dummy default value at startup for max_io_capacity.
+The real value is set based on the value of io_capacity. */
+#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT	(~0UL)
+#define SRV_MAX_IO_CAPACITY_LIMIT		(~0UL)
+extern ulong	srv_max_io_capacity;
+
+/* The "innodb_stats_method" setting decides how InnoDB is going
+to treat NULL values when collecting statistics. It is not defined
+as an enum type because the configure option takes an unsigned integer type. */
+extern ulong	srv_innodb_stats_method;
+
+extern ulint	srv_max_n_open_files;
+
+extern double	srv_max_buf_pool_modified_pct;
+extern double	srv_max_dirty_pages_pct_lwm;
+
+extern double	srv_adaptive_flushing_lwm;
+extern ulong	srv_flushing_avg_loops;
+
+extern ulong	srv_force_recovery;
+
+/** innodb_fast_shutdown=1 skips purge and change buffer merge.
+innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint).
+innodb_fast_shutdown=3 is a clean shutdown that skips the rollback
+of active transactions (to be done on restart).
+*/
+extern uint	srv_fast_shutdown;
+
+extern ibool	srv_innodb_status;
+
+extern unsigned long long	srv_stats_transient_sample_pages;
+extern my_bool	srv_stats_persistent;
+extern unsigned long long	srv_stats_persistent_sample_pages;
+extern my_bool	srv_stats_auto_recalc;
+extern my_bool	srv_stats_include_delete_marked;
+extern unsigned long long	srv_stats_modified_counter;
+extern my_bool	srv_stats_sample_traditional;
+
+extern my_bool	srv_use_doublewrite_buf;
+
+extern my_bool	srv_force_primary_key;
+
+extern ulong	srv_max_purge_lag;
+extern ulong	srv_max_purge_lag_delay;
+
+extern my_bool	innodb_encrypt_temporary_tables;
+
+extern my_bool	srv_immediate_scrub_data_uncompressed;
+/*-------------------------------------------*/
+
+/** Modes of operation */
+enum srv_operation_mode {
+	/** Normal mode (MariaDB Server) */
+	SRV_OPERATION_NORMAL,
+	/** Mariabackup is executing the server to export already restored
+	tablespaces */
+	SRV_OPERATION_EXPORT_RESTORED,
+	/** Mariabackup taking a backup */
+	SRV_OPERATION_BACKUP,
+	/** Mariabackup restoring a backup for subsequent --copy-back */
+	SRV_OPERATION_RESTORE,
+	/** Mariabackup restoring the incremental part of a backup */
+	SRV_OPERATION_RESTORE_DELTA,
+	/** Mariabackup restoring a backup for subsequent --export */
+	SRV_OPERATION_RESTORE_EXPORT,
+	/** Mariabackup taking a backup and avoiding deferring
+	any tablespace */
+	SRV_OPERATION_BACKUP_NO_DEFER
+};
+
+/** Current mode of operation */
+extern enum srv_operation_mode srv_operation;
+
+/** whether this is the server's first start after mariabackup --prepare */
+extern bool srv_start_after_restore;
+
+extern my_bool	srv_print_innodb_monitor;
+extern my_bool	srv_print_innodb_lock_monitor;
+extern ibool	srv_print_verbose_log;
+
+extern bool	srv_monitor_active;
+
+
+extern ulong	srv_n_spin_wait_rounds;
+extern uint	srv_spin_wait_delay;
+
+/** Number of initialized rollback segments for persistent undo log */
+extern ulong	srv_available_undo_logs;
+/** Iterations of the loop bounded by the 'srv_active' label. */
+extern ulint	srv_main_active_loops;
+/** Iterations of the loop bounded by the 'srv_idle' label. */
+extern ulint	srv_main_idle_loops;
+/** Log writes involving flush.
+*/
+extern ulint srv_log_writes_and_flush;
+
+#ifdef UNIV_DEBUG
+extern my_bool	innodb_evict_tables_on_commit_debug;
+extern my_bool	srv_purge_view_update_only_debug;
+
+/** InnoDB system tablespace size to set during recovery */
+extern uint	srv_sys_space_size_debug;
+/** whether the redo log file has been created at startup */
+extern bool	srv_log_file_created;
+#endif /* UNIV_DEBUG */
+
+extern ulint	srv_dml_needed_delay;
+
+/** innodb_purge_threads; the number of purge tasks to use */
+extern uint srv_n_purge_threads;
+
+/* the number of pages to purge in one batch */
+extern ulong srv_purge_batch_size;
+
+/* print all user-level transaction deadlocks to mysqld stderr */
+extern my_bool srv_print_all_deadlocks;
+
+extern my_bool	srv_cmp_per_index_enabled;
+
+/** innodb_encrypt_log */
+extern my_bool	srv_encrypt_log;
+
+/* is encryption enabled */
+extern ulong	srv_encrypt_tables;
+
+
+/** Status variables to be passed to MySQL */
+extern struct export_var_t export_vars;
+
+/** Global counters */
+extern srv_stats_t	srv_stats;
+
+/** Fatal semaphore wait threshold = maximum number of seconds that a
+semaphore wait may last before InnoDB considers it a fatal timeout */
+#define DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT 600
+extern ulong	srv_fatal_semaphore_wait_threshold;
+
+/** Buffer pool dump status frequency in percentages */
+extern ulong	srv_buf_dump_status_frequency;
+
+# ifdef UNIV_PFS_THREAD
+extern mysql_pfs_key_t	page_cleaner_thread_key;
+extern mysql_pfs_key_t	trx_rollback_clean_thread_key;
+extern mysql_pfs_key_t	thread_pool_thread_key;
+
+/* This macro registers the current thread and its key with the performance
+schema */
+# define pfs_register_thread(key)			\
+do {							\
+	struct PSI_thread* psi __attribute__((unused))	\
+		= PSI_CALL_new_thread(key, NULL, 0);	\
+	PSI_CALL_set_thread_os_id(psi);			\
+	PSI_CALL_set_thread(psi);			\
+} while (0)
+
+/* This macro delists the current thread from the performance schema */
+# define pfs_delete_thread()				\
+do {							\
+	PSI_CALL_delete_current_thread();		\
+} while (0)
+# else
+# define pfs_register_thread(key)
+# define pfs_delete_thread()
+# endif /* UNIV_PFS_THREAD */
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in ha_innobase::commit_inplace_alter_table(). */
+extern PSI_stage_info	srv_stage_alter_table_end;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_merge_insert_index_tuples(). */
+extern PSI_stage_info	srv_stage_alter_table_insert;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_log_apply(). */
+extern PSI_stage_info	srv_stage_alter_table_log_index;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_log_table_apply(). */
+extern PSI_stage_info	srv_stage_alter_table_log_table;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_merge_sort(). */
+extern PSI_stage_info	srv_stage_alter_table_merge_sort;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_merge_read_clustered_index(). */
+extern PSI_stage_info	srv_stage_alter_table_read_pk_internal_sort;
+
+/** Performance schema stage event for monitoring buffer pool load progress. */
+extern PSI_stage_info	srv_stage_buffer_pool_load;
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+/** Alternatives for srv_force_recovery. Non-zero values are intended
+to help the user get a damaged database up so that intact tables
+and rows can be dumped with SELECT INTO OUTFILE. The database must not
+otherwise be used with these options!
A bigger number below means that all precautions +of lower numbers are included. */ +enum { + SRV_FORCE_IGNORE_CORRUPT = 1, /*!< let the server run even if it + detects a corrupt page */ + SRV_FORCE_NO_BACKGROUND = 2, /*!< prevent the main thread from + running: if a crash would occur + in purge, this prevents it */ + SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run DML rollback after + recovery */ + SRV_FORCE_NO_DDL_UNDO = 4, /*!< prevent also DDL rollback */ + SRV_FORCE_NO_UNDO_LOG_SCAN = 5, /*!< do not look at undo logs when + starting the database: InnoDB will + treat even incomplete transactions + as committed */ + SRV_FORCE_NO_LOG_REDO = 6 /*!< do not do the log roll-forward + in connection with recovery */ +}; + +/* Alternatives for srv_innodb_stats_method, which could be changed by +setting innodb_stats_method */ +enum srv_stats_method_name_enum { + SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as + equal. This is the default setting + for innodb_stats_method */ + SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as + NOT equal. */ + SRV_STATS_NULLS_IGNORED /* NULL values are ignored */ +}; + +typedef enum srv_stats_method_name_enum srv_stats_method_name_t; + +/*********************************************************************//** +Boots Innobase server. */ +void +srv_boot(void); +/*==========*/ +/*********************************************************************//** +Frees the data structures created in srv_init(). */ +void +srv_free(void); + +/******************************************************************//** +Outputs to a file the output of the InnoDB Monitor. +@return FALSE if not all information printed +due to failure to obtain necessary mutex */ +ibool +srv_printf_innodb_monitor( +/*======================*/ + FILE* file, /*!< in: output stream */ + ibool nowait, /*!< in: whether to wait for lock_sys.latch */ + ulint* trx_start, /*!< out: file position of the start of + the list of active transactions */ + ulint* trx_end); /*!< out: file position of the end of + the list of active transactions */ + +/******************************************************************//** +Function to pass InnoDB status variables to MySQL */ +void +srv_export_innodb_status(void); +/*==========================*/ +/*******************************************************************//** +Get current server activity count. +@return activity count. */ +ulint +srv_get_activity_count(void); +/*========================*/ + +/******************************************************************//** +Increment the server activity counter. */ +void +srv_inc_activity_count(void); +/*=========================*/ + +/**********************************************************************//** +Enqueues a task to server task queue and releases a worker thread, if there +is a suspended one. */ +void +srv_que_task_enqueue_low( +/*=====================*/ + que_thr_t* thr); /*!< in: query thread */ + +#ifdef UNIV_DEBUG +/** @return whether purge or master task is active */ +bool srv_any_background_activity(); +#endif + +extern "C" { + + +/** Periodic task which prints the info output by various InnoDB monitors.*/ +void srv_monitor_task(void*); + + +/** The periodic master task controlling the server. */ +void srv_master_callback(void*); + + +/** +Complete the shutdown tasks such as background DROP TABLE, +and optionally change buffer merge (on innodb_fast_shutdown=0). 
+*/
+void srv_shutdown(bool ibuf_merge);
+
+} /* extern "C" */
+
+#ifdef UNIV_DEBUG
+/** @return number of tasks in queue */
+ulint srv_get_task_queue_length();
+#endif
+
+/** Shut down the purge threads. */
+void srv_purge_shutdown();
+
+/** Init purge tasks */
+void srv_init_purge_tasks();
+
+/** Status variables to be passed to MySQL */
+struct export_var_t{
+#ifdef BTR_CUR_HASH_ADAPT
+	ulint innodb_ahi_hit;
+	ulint innodb_ahi_miss;
+#endif /* BTR_CUR_HASH_ADAPT */
+	char  innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */
+	char  innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */
+	char  innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */
+	my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */
+	ulint innodb_buffer_pool_pages_total;	/*!< Buffer pool size */
+	ulint innodb_buffer_pool_bytes_data;	/*!< File bytes used */
+	ulint innodb_buffer_pool_pages_misc;	/*!< Miscellaneous pages */
+#ifdef UNIV_DEBUG
+	ulint innodb_buffer_pool_pages_latched;	/*!< Latched pages */
+#endif /* UNIV_DEBUG */
+	/** buf_pool.stat.n_page_gets (a sharded counter) */
+	ulint innodb_buffer_pool_read_requests;
+	ulint innodb_checkpoint_age;
+	ulint innodb_checkpoint_max_age;
+	ulint innodb_data_pending_reads;	/*!< Pending reads */
+	ulint innodb_data_pending_writes;	/*!< Pending writes */
+	ulint innodb_data_read;			/*!< Data bytes read */
+	ulint innodb_data_writes;		/*!< I/O write requests */
+	ulint innodb_data_written;		/*!< Data bytes written */
+	ulint innodb_data_reads;		/*!< I/O read requests */
+	ulint innodb_dblwr_pages_written;	/*!< srv_dblwr_pages_written */
+	ulint innodb_dblwr_writes;		/*!< srv_dblwr_writes */
+	ulint innodb_deadlocks;
+	ulint innodb_history_list_length;
+	lsn_t innodb_lsn_current;
+	lsn_t innodb_lsn_flushed;
+	lsn_t innodb_lsn_last_checkpoint;
+	trx_id_t innodb_max_trx_id;
+#ifdef BTR_CUR_HASH_ADAPT
+	ulint innodb_mem_adaptive_hash;
+#endif
+	ulint innodb_mem_dictionary;
+	/** log_sys.get_lsn() - recv_sys.lsn */
+	lsn_t innodb_os_log_written;
+	ulint innodb_row_lock_waits;		/*!< srv_n_lock_wait_count */
+	ulint innodb_row_lock_current_waits;	/*!< srv_n_lock_wait_current_count */
+	int64_t innodb_row_lock_time;		/*!< srv_n_lock_wait_time
+						/ 1000 */
+	uint64_t innodb_row_lock_time_avg;	/*!< srv_n_lock_wait_time
+						/ srv_n_lock_wait_count */
+	uint64_t innodb_row_lock_time_max;	/*!< srv_n_lock_max_wait_time */
+
+	/** Number of undo tablespace truncation operations */
+	ulong innodb_undo_truncations;
+	ulint innodb_defragment_compression_failures; /*!< Number of
+						defragment re-compression
+						failures */
+
+	ulint innodb_defragment_failures;	/*!< Number of defragment
+						failures */
+	ulint innodb_defragment_count;		/*!< Number of defragment
+						operations */
+
+	/** Number of instant ALTER TABLE operations that affect columns */
+	ulong innodb_instant_alter_column;
+
+	ulint innodb_onlineddl_rowlog_rows;	/*!< Online alter rows */
+	ulint innodb_onlineddl_rowlog_pct_used;	/*!< Online alter percentage
+						of used row log buffer */
+	ulint innodb_onlineddl_pct_progress;	/*!< Online alter progress */
+
+	int64_t innodb_page_compression_saved;/*!< Number of bytes saved
+						by page compression */
+	int64_t innodb_pages_page_compressed;/*!< Number of pages
+						compressed by page compression */
+	int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations
+						induced by page compression */
+	int64_t innodb_pages_page_decompressed;/*!< Number of pages
+						decompressed by page
+						compression */
+	int64_t innodb_pages_page_compression_error;/*!< Number of page
+						compression errors */
+	int64_t innodb_pages_encrypted;	/*!< Number of pages
+						encrypted */
+	int64_t innodb_pages_decrypted;	/*!< Number of pages
+						decrypted */
+
+	/*!< Number of merge blocks encrypted */
+	ib_int64_t innodb_n_merge_blocks_encrypted;
+	/*!< Number of merge blocks decrypted */
+	ib_int64_t innodb_n_merge_blocks_decrypted;
+	/*!< Number of row log blocks encrypted */
+	ib_int64_t innodb_n_rowlog_blocks_encrypted;
+	/*!< Number of row log blocks decrypted */
+	ib_int64_t innodb_n_rowlog_blocks_decrypted;
+
+	/* Number of temporary tablespace pages encrypted */
+	ib_int64_t innodb_n_temp_blocks_encrypted;
+
+	/* Number of temporary tablespace pages decrypted */
+	ib_int64_t innodb_n_temp_blocks_decrypted;
+
+	ulint innodb_encryption_rotation_pages_read_from_cache;
+	ulint innodb_encryption_rotation_pages_read_from_disk;
+	ulint innodb_encryption_rotation_pages_modified;
+	ulint innodb_encryption_rotation_pages_flushed;
+	ulint innodb_encryption_rotation_estimated_iops;
+	int64_t innodb_encryption_key_requests;
+};
+
+extern tpool::thread_pool *srv_thread_pool;
+extern std::unique_ptr<tpool::timer> srv_master_timer;
+extern std::unique_ptr<tpool::timer> srv_monitor_timer;
+
+/** The interval at which srv_monitor_task is invoked, in milliseconds */
+constexpr unsigned SRV_MONITOR_INTERVAL= 15000; /* 4 times per minute */
+
+static inline void srv_monitor_timer_schedule_now()
+{
+  srv_monitor_timer->set_time(0, SRV_MONITOR_INTERVAL);
+}
+static inline void srv_start_periodic_timer(std::unique_ptr<tpool::timer>& t,
+                                            void (*func)(void*), int period)
+{
+  t.reset(srv_thread_pool->create_timer(func));
+  t->set_time(0, period);
+}
+
+void srv_thread_pool_init();
+void srv_thread_pool_end();
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
new file mode 100644
index 00000000..c18cf1ce
--- /dev/null
+++ b/storage/innobase/include/srv0start.h
@@ -0,0 +1,124 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0start.h
+Starts the Innobase database server
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0log.h"
+#include "ut0byte.h"
+
+// Forward declaration
+struct dict_table_t;
+
+/** Open the configured number of dedicated undo tablespaces.
+@param[in]	create_new_undo	whether the undo tablespaces have to be created
+@param[in,out]	mtr		mini-transaction
+@return DB_SUCCESS or error code */
+dberr_t srv_undo_tablespaces_init(bool create_new_undo, mtr_t *mtr);
+
+/** Start InnoDB.
+@param[in] create_new_db whether to create a new database +@return DB_SUCCESS or error code */ +dberr_t srv_start(bool create_new_db); + +/** + Shutdown purge to make sure that there is no possibility that we call any + plugin code (e.g., audit) inside virtual column computation. +*/ +void innodb_preshutdown(); + +/** Shut down InnoDB. */ +void innodb_shutdown(); + +/*************************************************************//** +Copy the file path component of the physical file to parameter. It will +copy up to and including the terminating path separator. +@return number of bytes copied or ULINT_UNDEFINED if destination buffer + is smaller than the path to be copied. */ +ulint +srv_path_copy( +/*==========*/ + char* dest, /*!< out: destination buffer */ + ulint dest_len, /*!< in: max bytes to copy */ + const char* basedir, /*!< in: base directory */ + const char* table_name) /*!< in: source table name */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Get the meta-data filename from the table name for a +single-table tablespace. +@param[in] table table object +@param[out] filename filename +@param[in] max_len filename max length */ +void +srv_get_meta_data_filename( + dict_table_t* table, + char* filename, + ulint max_len); + +/** Get the encryption-data filename from the table name for a +single-table tablespace. +@param[in] table table object +@param[out] filename filename +@param[in] max_len filename max length */ +void +srv_get_encryption_data_filename( + dict_table_t* table, + char* filename, + ulint max_len); + +/** Log sequence number at shutdown */ +extern lsn_t srv_shutdown_lsn; + +/** TRUE if the server is being started */ +extern bool srv_is_being_started; +/** TRUE if the server is being started, before rolling back any +incomplete transactions */ +extern bool srv_startup_is_before_trx_rollback_phase; + +/** TRUE if a raw partition is in use */ +extern ibool srv_start_raw_disk_in_use; + +/** Shutdown state */ +enum srv_shutdown_t { + SRV_SHUTDOWN_NONE = 0, /*!< Database running normally */ + /** Shutdown initiated in srv_shutdown_bg_undo_sources() */ + SRV_SHUTDOWN_INITIATED, + SRV_SHUTDOWN_CLEANUP, /*!< Cleaning up in + logs_empty_and_mark_files_at_shutdown() */ + SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that + the buffer pool can be freed: flush + all file spaces and close all files */ + SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */ +}; + +/** Whether any undo log records can be generated */ +extern bool srv_undo_sources; + +/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to +SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */ +extern enum srv_shutdown_t srv_shutdown_state; + +/** Files comprising the system tablespace */ +extern pfs_os_file_t files[1000]; diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h new file mode 100644 index 00000000..1dca0cc1 --- /dev/null +++ b/storage/innobase/include/srw_lock.h @@ -0,0 +1,554 @@ +/***************************************************************************** + +Copyright (c) 2020, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "univ.i"
+#include "rw_lock.h"
+
+#if defined __linux__
+/* futex(2): FUTEX_WAIT_PRIVATE, FUTEX_WAKE_PRIVATE */
+#elif defined __OpenBSD__ || defined __FreeBSD__ || defined __DragonFly__
+/* system calls similar to Linux futex(2) */
+#elif defined _WIN32
+/* SRWLOCK as well as WaitOnAddress(), WakeByAddressSingle() */
+#else
+# define SUX_LOCK_GENERIC /* fall back to generic synchronization primitives */
+#endif
+
+#if !defined SUX_LOCK_GENERIC && 0 /* defined SAFE_MUTEX */
+# define SUX_LOCK_GENERIC /* Use dummy implementation for debugging purposes */
+#endif
+
+#ifdef SUX_LOCK_GENERIC
+/** An exclusive-only variant of srw_lock */
+template<bool spinloop>
+class pthread_mutex_wrapper final
+{
+  pthread_mutex_t lock;
+public:
+  void init()
+  {
+    if (spinloop)
+      pthread_mutex_init(&lock, MY_MUTEX_INIT_FAST);
+    else
+      pthread_mutex_init(&lock, nullptr);
+  }
+  void destroy() { pthread_mutex_destroy(&lock); }
+# ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+  void wr_lock() { pthread_mutex_lock(&lock); }
+# else
+private:
+  void wr_wait();
+public:
+  inline void wr_lock();
+# endif
+  void wr_unlock() { pthread_mutex_unlock(&lock); }
+  bool wr_lock_try() { return !pthread_mutex_trylock(&lock); }
+};
+
+# ifndef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+template<> void pthread_mutex_wrapper<true>::wr_wait();
+template<>
+inline void pthread_mutex_wrapper<false>::wr_lock()
+{ pthread_mutex_lock(&lock); }
+template<>
+inline void pthread_mutex_wrapper<true>::wr_lock()
+{ if (!wr_lock_try()) wr_wait(); }
+# endif
+#endif
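+/* Editor's illustrative sketch (not part of the upstream header) of the
+lock-word protocol used by srw_mutex_impl below: bit 31 (HOLDER) marks the
+lock as held and the low bits count pending waiters, so one 32-bit word
+encodes both.  Hypothetical standalone walk-through: */
+#if 0
+# include <atomic>
+# include <cstdint>
+static std::atomic<std::uint32_t> word{0};
+static const std::uint32_t HOLDER = 1U << 31;
+
+static bool try_acquire()	/* mirrors wr_lock_try() */
+{
+  std::uint32_t expected = 0;
+  /* 0 -> HOLDER+1: acquired with no waiters; any other value means
+  somebody holds the lock or is queued behind it */
+  return word.compare_exchange_strong(expected, HOLDER + 1,
+                                      std::memory_order_acquire,
+                                      std::memory_order_relaxed);
+}
+#endif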
+/** Futex-based mutex */
+template<bool spinloop>
+class srw_mutex_impl final
+{
+  /** The lock word, containing HOLDER + 1 if the lock is being held,
+  plus the number of waiters */
+  std::atomic<uint32_t> lock;
+  /** Identifies that the lock is being held */
+  static constexpr uint32_t HOLDER= 1U << 31;
+
+#ifdef SUX_LOCK_GENERIC
+public:
+  /** The mutex for the condition variables. */
+  pthread_mutex_t mutex;
+private:
+  /** Condition variable for the lock word. Used with mutex. */
+  pthread_cond_t cond;
+#endif
+
+  /** Wait until the mutex has been acquired */
+  void wait_and_lock();
+  /** Wait for lock!=lk */
+  inline void wait(uint32_t lk);
+  /** Wake up one wait() thread */
+  void wake();
+public:
+  /** @return whether the mutex is being held or waited for */
+  bool is_locked_or_waiting() const
+  { return lock.load(std::memory_order_acquire) != 0; }
+  /** @return whether the mutex is being held by any thread */
+  bool is_locked() const
+  { return (lock.load(std::memory_order_acquire) & HOLDER) != 0; }
+
+  void init()
+  {
+    DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+    pthread_mutex_init(&mutex, nullptr);
+    pthread_cond_init(&cond, nullptr);
+#endif
+  }
+  void destroy()
+  {
+    DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+    pthread_mutex_destroy(&mutex);
+    pthread_cond_destroy(&cond);
+#endif
+  }
+
+  /** @return whether the mutex was acquired */
+  bool wr_lock_try()
+  {
+    uint32_t lk= 0;
+    return lock.compare_exchange_strong(lk, HOLDER + 1,
+                                        std::memory_order_acquire,
+                                        std::memory_order_relaxed);
+  }
+
+  void wr_lock() { if (!wr_lock_try()) wait_and_lock(); }
+  void wr_unlock()
+  {
+    const uint32_t lk= lock.fetch_sub(HOLDER + 1, std::memory_order_release);
+    if (lk != HOLDER + 1)
+    {
+      DBUG_ASSERT(lk & HOLDER);
+      wake();
+    }
+  }
+};
+
+#ifdef SUX_LOCK_GENERIC
+typedef pthread_mutex_wrapper<true> srw_spin_mutex;
+typedef pthread_mutex_wrapper<false> srw_mutex;
+#else
+typedef srw_mutex_impl<true> srw_spin_mutex;
+typedef srw_mutex_impl<false> srw_mutex;
+#endif
+
+template<bool spinloop> class srw_lock_impl;
+
+/** Slim shared-update-exclusive lock with no recursion */
+template<bool spinloop>
+class ssux_lock_impl final
+{
+#ifdef UNIV_PFS_RWLOCK
+  friend class ssux_lock;
+# ifdef SUX_LOCK_GENERIC
+# elif defined _WIN32
+# else
+  friend srw_lock_impl<spinloop>;
+# endif
+#endif
+  /** mutex for synchronization; held by U or X lock holders */
+  srw_mutex_impl<spinloop> writer;
+#ifdef SUX_LOCK_GENERIC
+  /** Condition variable for "readers"; used with writer.mutex. */
+  pthread_cond_t readers_cond;
+#endif
+  /** S or U holders, and WRITER flag for X holder or waiter */
+  std::atomic<uint32_t> readers;
+  /** indicates an X request; readers=WRITER indicates granted X lock */
+  static constexpr uint32_t WRITER= 1U << 31;
+
+  /** Wait for readers!=lk */
+  inline void wait(uint32_t lk);
+
+  /** Wait for readers!=lk|WRITER */
+  void wr_wait(uint32_t lk);
+  /** Wake up wait() on the last rd_unlock() */
+  void wake();
+  /** Acquire a read lock */
+  void rd_wait();
+public:
+  void init()
+  {
+    writer.init();
+    DBUG_ASSERT(is_vacant());
+#ifdef SUX_LOCK_GENERIC
+    pthread_cond_init(&readers_cond, nullptr);
+#endif
+  }
+  void destroy()
+  {
+    DBUG_ASSERT(is_vacant());
+    writer.destroy();
+#ifdef SUX_LOCK_GENERIC
+    pthread_cond_destroy(&readers_cond);
+#endif
+  }
+  /** @return whether any writer is waiting */
+  bool is_waiting() const
+  { return (readers.load(std::memory_order_relaxed) & WRITER) != 0; }
+#ifndef DBUG_OFF
+  /** @return whether the lock is being held or waited for */
+  bool is_vacant() const { return !is_locked_or_waiting(); }
+#endif /* !DBUG_OFF */
+
+  bool rd_lock_try()
+  {
+    uint32_t lk= 0;
+    while (!readers.compare_exchange_weak(lk, lk + 1,
+                                          std::memory_order_acquire,
+                                          std::memory_order_relaxed))
+      if (lk & WRITER)
+        return false;
+    return true;
+  }
+
+  bool u_lock_try()
+  {
+    if (!writer.wr_lock_try())
+      return false;
+    IF_DBUG_ASSERT(uint32_t lk=,)
+    readers.fetch_add(1, std::memory_order_acquire);
+    DBUG_ASSERT(lk < WRITER - 1);
+    return true;
+  }
+
+  bool wr_lock_try()
+  {
+    if (!writer.wr_lock_try())
+      return false;
+    uint32_t lk= 0;
+    if (readers.compare_exchange_strong(lk, WRITER,
+                                        std::memory_order_acquire,
+                                        std::memory_order_relaxed))
+      return true;
+    writer.wr_unlock();
+    return false;
+  }
+
+  void rd_lock() { if (!rd_lock_try()) rd_wait(); }
+  void u_lock()
+  {
+    writer.wr_lock();
+    IF_DBUG_ASSERT(uint32_t lk=,)
+    readers.fetch_add(1, std::memory_order_acquire);
+    DBUG_ASSERT(lk < WRITER - 1);
+  }
+  void wr_lock()
+  {
+    writer.wr_lock();
+#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+    /* On IA-32 and AMD64, this type of fetch_or() can only be implemented
+    as a loop around LOCK CMPXCHG. In this particular case, setting the
+    most significant bit using fetch_add() is equivalent, and is
+    translated into a simple LOCK XADD. */
+    static_assert(WRITER == 1U << 31, "compatibility");
+    if (uint32_t lk= readers.fetch_add(WRITER, std::memory_order_acquire))
+      wr_wait(lk);
+#else
+    if (uint32_t lk= readers.fetch_or(WRITER, std::memory_order_acquire))
+      wr_wait(lk);
+#endif
+  }
+
+  void u_wr_upgrade()
+  {
+    DBUG_ASSERT(writer.is_locked());
+    uint32_t lk= readers.fetch_add(WRITER - 1, std::memory_order_acquire);
+    if (lk != 1)
+      wr_wait(lk - 1);
+  }
+  void wr_u_downgrade()
+  {
+    DBUG_ASSERT(writer.is_locked());
+    DBUG_ASSERT(is_write_locked());
+    readers.store(1, std::memory_order_release);
+    /* Note: Any pending rd_lock() will not be woken up until u_unlock() */
+  }
+
+  void rd_unlock()
+  {
+    uint32_t lk= readers.fetch_sub(1, std::memory_order_release);
+    ut_ad(~WRITER & lk);
+    if (lk == WRITER + 1)
+      wake();
+  }
+  void u_unlock()
+  {
+    IF_DBUG_ASSERT(uint32_t lk=,)
+    readers.fetch_sub(1, std::memory_order_release);
+    DBUG_ASSERT(lk);
+    DBUG_ASSERT(lk < WRITER);
+    writer.wr_unlock();
+  }
+  void wr_unlock()
+  {
+    DBUG_ASSERT(is_write_locked());
+    readers.store(0, std::memory_order_release);
+    writer.wr_unlock();
+  }
+  /** @return whether an exclusive lock may be held by any thread */
+  bool is_write_locked() const noexcept
+  { return readers.load(std::memory_order_acquire) == WRITER; }
+  /** @return whether any lock may be held by any thread */
+  bool is_locked() const noexcept
+  { return readers.load(std::memory_order_acquire) != 0; }
+  /** @return whether any lock may be held by any thread */
+  bool is_locked_or_waiting() const noexcept
+  { return is_locked() || writer.is_locked_or_waiting(); }
+
+  void lock_shared() { rd_lock(); }
+  void unlock_shared() { rd_unlock(); }
+  void lock() { wr_lock(); }
+  void unlock() { wr_unlock(); }
+};
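+/* Editor's worked example (not part of the upstream header) of the readers
+word in ssux_lock_impl above, with WRITER = 1<<31:
+  0x00000000  vacant
+  0x00000003  three S holders (or U + two S: the U holder also counts as 1)
+  0x80000002  X requested, two readers still to drain (wr_wait() runs here)
+  0x80000000  X granted
+A hypothetical standalone check of the encoding: */
+#if 0
+# include <cassert>
+# include <cstdint>
+int main()
+{
+  const std::uint32_t WRITER = 1U << 31;
+  std::uint32_t readers = 2;		/* two S holders */
+  readers += WRITER;			/* wr_lock(): raise the X request */
+  assert(readers == 0x80000002);	/* writer must wait for 2 unlocks */
+  readers -= 2;				/* both readers call rd_unlock() */
+  assert(readers == WRITER);		/* now exclusively locked */
+  return 0;
+}
+#endif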
+#if defined _WIN32 || defined SUX_LOCK_GENERIC
+/** Slim read-write lock */
+template<bool spinloop>
+class srw_lock_
+{
+# ifdef UNIV_PFS_RWLOCK
+  friend srw_lock_impl<spinloop>;
+# endif
+# ifdef _WIN32
+  SRWLOCK lk;
+# else
+  rw_lock_t lk;
+# endif
+
+  void rd_wait();
+  void wr_wait();
+public:
+  void init() { IF_WIN(,my_rwlock_init(&lk, nullptr)); }
+  void destroy() { IF_WIN(,rwlock_destroy(&lk)); }
+  inline void rd_lock();
+  inline void wr_lock();
+  bool rd_lock_try()
+  { return IF_WIN(TryAcquireSRWLockShared(&lk), !rw_tryrdlock(&lk)); }
+  void rd_unlock()
+  { IF_WIN(ReleaseSRWLockShared(&lk), rw_unlock(&lk)); }
+  bool wr_lock_try()
+  { return IF_WIN(TryAcquireSRWLockExclusive(&lk), !rw_trywrlock(&lk)); }
+  void wr_unlock()
+  { IF_WIN(ReleaseSRWLockExclusive(&lk), rw_unlock(&lk)); }
+#ifdef _WIN32
+  /** @return whether any lock may be held by any thread */
+  bool is_locked_or_waiting() const noexcept { return (size_t&)(lk) != 0; }
+  /** @return whether any lock may be held by any thread */
+  bool is_locked() const noexcept { return is_locked_or_waiting(); }
+  /** @return whether an exclusive lock may be held by any thread */
+  bool is_write_locked() const noexcept
+  {
+    // FIXME: this returns false positives for shared locks
+    return is_locked();
+  }
+
+  void lock_shared() { rd_lock(); }
+  void unlock_shared() { rd_unlock(); }
+  void lock() { wr_lock(); }
+  void unlock() { wr_unlock(); }
+#endif
+};
+
+template<> void srw_lock_<true>::rd_wait();
+template<> void srw_lock_<true>::wr_wait();
+
+template<>
+inline void srw_lock_<false>::rd_lock()
+{ IF_WIN(AcquireSRWLockShared(&lk), rw_rdlock(&lk)); }
+template<>
+inline void srw_lock_<false>::wr_lock()
+{ IF_WIN(AcquireSRWLockExclusive(&lk), rw_wrlock(&lk)); }
+
+template<>
+inline void srw_lock_<true>::rd_lock() { if (!rd_lock_try()) rd_wait(); }
+template<>
+inline void srw_lock_<true>::wr_lock() { if (!wr_lock_try()) wr_wait(); }
+
+typedef srw_lock_<false> srw_lock_low;
+typedef srw_lock_<true> srw_spin_lock_low;
+#else
+typedef ssux_lock_impl<false> srw_lock_low;
+typedef ssux_lock_impl<true> srw_spin_lock_low;
+#endif
+
+#ifndef UNIV_PFS_RWLOCK
+# define SRW_LOCK_INIT(key) init()
+# define SRW_LOCK_ARGS(file, line) /* nothing */
+# define SRW_LOCK_CALL /* nothing */
+typedef srw_lock_low srw_lock;
+typedef srw_spin_lock_low srw_spin_lock;
+#else
+# define SRW_LOCK_INIT(key) init(key)
+# define SRW_LOCK_ARGS(file, line) file, line
+# define SRW_LOCK_CALL __FILE__, __LINE__
+
+/** Slim shared-update-exclusive lock with PERFORMANCE_SCHEMA instrumentation */
+class ssux_lock
+{
+  PSI_rwlock *pfs_psi;
+  ssux_lock_impl<false> lock;
+
+  ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_u_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_u_wr_upgrade(const char *file, unsigned line);
+public:
+  void init(mysql_pfs_key_t key)
+  {
+    pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+    lock.init();
+  }
+  void destroy()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+    {
+      PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+      pfs_psi= nullptr;
+    }
+    lock.destroy();
+  }
+  void rd_lock(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_rd_lock(file, line);
+    else
+      lock.rd_lock();
+  }
+  void rd_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.rd_unlock();
+  }
+  void u_lock(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_u_lock(file, line);
+    else
+      lock.u_lock();
+  }
+  void u_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.u_unlock();
+  }
+  void wr_lock(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_wr_lock(file, line);
+    else
+      lock.wr_lock();
+  }
+  void wr_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.wr_unlock();
+  }
+  void u_wr_upgrade(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_u_wr_upgrade(file, line);
+    else
+      lock.u_wr_upgrade();
+  }
+  bool rd_lock_try() { return lock.rd_lock_try(); }
+  bool u_lock_try() { return lock.u_lock_try(); }
+  bool wr_lock_try() { return lock.wr_lock_try(); }
+  bool is_waiting() const { return lock.is_waiting(); }
+};
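+/* Editor's illustrative sketch (not part of the upstream header): call sites
+pass the acquisition's file/line to the PFS-instrumented lock through
+SRW_LOCK_CALL, which expands to __FILE__, __LINE__ when UNIV_PFS_RWLOCK is
+defined and to nothing otherwise.  Hypothetical caller; init(key)/destroy()
+omitted for brevity: */
+#if 0
+static ssux_lock example_latch;
+
+static void example_critical_section()
+{
+  example_latch.wr_lock(SRW_LOCK_CALL);	/* records this file:line in PFS */
+  /* ... protected work ... */
+  example_latch.wr_unlock();
+}
+#endif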
+
+/** Slim reader-writer lock with PERFORMANCE_SCHEMA instrumentation */
+template<bool spinloop>
+class srw_lock_impl
+{
+  PSI_rwlock *pfs_psi;
+# if defined _WIN32 || defined SUX_LOCK_GENERIC
+  srw_lock_<spinloop> lock;
+# else
+  ssux_lock_impl<spinloop> lock;
+# endif
+
+  ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+public:
+  void init(mysql_pfs_key_t key)
+  {
+    pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+    lock.init();
+  }
+  void destroy()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+    {
+      PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+      pfs_psi= nullptr;
+    }
+    lock.destroy();
+  }
+  void rd_lock(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_rd_lock(file, line);
+    else
+      lock.rd_lock();
+  }
+  void rd_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.rd_unlock();
+  }
+  void wr_lock(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_wr_lock(file, line);
+    else
+      lock.wr_lock();
+  }
+  void wr_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.wr_unlock();
+  }
+  bool rd_lock_try() { return lock.rd_lock_try(); }
+  bool wr_lock_try() { return lock.wr_lock_try(); }
+  void lock_shared() { return rd_lock(SRW_LOCK_CALL); }
+  void unlock_shared() { return rd_unlock(); }
+#ifndef SUX_LOCK_GENERIC
+  /** @return whether any lock may be held by any thread */
+  bool is_locked_or_waiting() const noexcept
+  { return lock.is_locked_or_waiting(); }
+  /** @return whether any lock may be held by any thread */
+  bool is_locked() const noexcept { return lock.is_locked(); }
+  /** @return whether an exclusive lock may be held by any thread */
+  bool is_write_locked() const noexcept { return lock.is_write_locked(); }
+#endif
+};
+
+typedef srw_lock_impl<false> srw_lock;
+typedef srw_lock_impl<true> srw_spin_lock;
+
+#endif
diff --git a/storage/innobase/include/sux_lock.h b/storage/innobase/include/sux_lock.h
new file mode 100644
index 00000000..2c0167ac
--- /dev/null
+++ b/storage/innobase/include/sux_lock.h
@@ -0,0 +1,472 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "srw_lock.h"
+#include "my_atomic_wrapper.h"
+#ifdef UNIV_DEBUG
+# include <unordered_set>
+#endif
+
+/** A "fat" rw-lock that supports
+S (shared), U (update, or shared-exclusive), and X (exclusive) modes
+as well as recursive U and X latch acquisition
+@tparam ssux ssux_lock_impl or ssux_lock */
+template<typename ssux>
+class sux_lock final
+{
+  /** The underlying non-recursive lock */
+  ssux lock;
+  /** Numbers of U and X locks. Protected by lock.
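+  The X lock count is stored in the least significant 16 bits and the
+  U lock count in the next 16 bits (see RECURSIVE_X and RECURSIVE_U
+  below); for example, one U latch held together with two recursive X
+  latches is represented as recursive == RECURSIVE_U + 2 * RECURSIVE_X.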
+  */
+  uint32_t recursive;
+  /** The owner of the U or X lock (0 if none); protected by lock */
+  std::atomic<pthread_t> writer;
+  /** Special writer!=0 value to indicate that the lock is non-recursive
+  and will be released by an I/O thread */
+#if defined __linux__ || defined _WIN32
+  static constexpr pthread_t FOR_IO= pthread_t(~0UL);
+#else
+# define FOR_IO ((pthread_t) ~0UL) /* it could be a pointer */
+#endif
+#ifdef UNIV_DEBUG
+  /** Protects readers */
+  mutable srw_mutex readers_lock;
+  /** Threads that hold the lock in shared mode */
+  std::atomic<std::unordered_multiset<pthread_t>*> readers;
+#endif
+
+  /** The multiplier in recursive for X locks */
+  static constexpr uint32_t RECURSIVE_X= 1U;
+  /** The multiplier in recursive for U locks */
+  static constexpr uint32_t RECURSIVE_U= 1U << 16;
+  /** The maximum allowed level of recursion */
+  static constexpr uint32_t RECURSIVE_MAX= RECURSIVE_U - 1;
+
+public:
+#ifdef UNIV_PFS_RWLOCK
+  inline void init();
+#endif
+  void SRW_LOCK_INIT(mysql_pfs_key_t key)
+  {
+    lock.SRW_LOCK_INIT(key);
+    ut_ad(!writer.load(std::memory_order_relaxed));
+    ut_ad(!recursive);
+    ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+    if (auto r= readers.load(std::memory_order_relaxed))
+      ut_ad(r->empty());
+#endif
+  }
+
+  /** Free the rw-lock after init() */
+  void free()
+  {
+    ut_ad(!writer.load(std::memory_order_relaxed));
+    ut_ad(!recursive);
+#ifdef UNIV_DEBUG
+    readers_lock.destroy();
+    if (auto r= readers.load(std::memory_order_relaxed))
+    {
+      ut_ad(r->empty());
+      delete r;
+      readers.store(nullptr, std::memory_order_relaxed);
+    }
+#endif
+    lock.destroy();
+  }
+
+  /** needed for dict_index_t::clone() */
+  inline void operator=(const sux_lock&);
+
+#ifdef UNIV_DEBUG
+  /** @return whether no recursive locks are being held */
+  bool not_recursive() const
+  {
+    ut_ad(recursive);
+    return recursive == RECURSIVE_X || recursive == RECURSIVE_U;
+  }
+
+  /** @return the number of X locks being held (by any thread) */
+  unsigned x_lock_count() const { return recursive & RECURSIVE_MAX; }
+#endif
+
+  /** Acquire a recursive lock */
+  template<bool allow_readers> void writer_recurse()
+  {
+    ut_ad(writer == pthread_self());
+    ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+         RECURSIVE_MAX);
+    ut_ad(allow_readers ? recursive : rec);
+    ut_ad(rec < RECURSIVE_MAX);
+    recursive+= allow_readers ? RECURSIVE_U : RECURSIVE_X;
+  }
+
+private:
+  /** Transfer the ownership of a write lock to another thread
+  @param id the new owner of the U or X lock */
+  void set_new_owner(pthread_t id)
+  {
+    IF_DBUG(DBUG_ASSERT(writer.exchange(id, std::memory_order_relaxed)),
+            writer.store(id, std::memory_order_relaxed));
+  }
+  /** Assign the ownership of a write lock to a thread
+  @param id the owner of the U or X lock */
+  void set_first_owner(pthread_t id)
+  {
+    IF_DBUG(DBUG_ASSERT(!writer.exchange(id, std::memory_order_relaxed)),
+            writer.store(id, std::memory_order_relaxed));
+  }
+#ifdef UNIV_DEBUG
+  /** Register the current thread as a holder of a shared lock */
+  void s_lock_register()
+  {
+    const pthread_t id= pthread_self();
+    readers_lock.wr_lock();
+    auto r= readers.load(std::memory_order_relaxed);
+    if (!r)
+    {
+      r= new std::unordered_multiset<pthread_t>();
+      readers.store(r, std::memory_order_relaxed);
+    }
+    r->emplace(id);
+    readers_lock.wr_unlock();
+  }
+#endif
+
+public:
+  /** In crash recovery or the change buffer, claim the ownership
+  of the exclusive block lock to the current thread */
+  void claim_ownership() { set_new_owner(pthread_self()); }
+
+  /** @return whether the current thread is holding X or U latch */
+  bool have_u_or_x() const
+  {
+    if (pthread_self() != writer.load(std::memory_order_relaxed))
+      return false;
+    ut_ad(recursive);
+    return true;
+  }
+  /** @return whether the current thread is holding U but not X latch */
+  bool have_u_not_x() const
+  { return have_u_or_x() && !((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+  /** @return whether the current thread is holding X latch */
+  bool have_x() const
+  { return have_u_or_x() && ((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+#ifdef UNIV_DEBUG
+  /** @return whether the current thread is holding S latch */
+  bool have_s() const
+  {
+    if (auto r= readers.load(std::memory_order_relaxed))
+    {
+      readers_lock.wr_lock();
+      bool found= r->find(pthread_self()) != r->end();
+      readers_lock.wr_unlock();
+      return found;
+    }
+    return false;
+  }
+  /** @return whether the current thread is holding the latch */
+  bool have_any() const { return have_u_or_x() || have_s(); }
+#endif
+
+  /** Acquire a shared lock */
+  inline void s_lock();
+  inline void s_lock(const char *file, unsigned line);
+  /** Acquire an update lock */
+  inline void u_lock();
+  inline void u_lock(const char *file, unsigned line);
+  /** Acquire an exclusive lock */
+  inline void x_lock(bool for_io= false);
+  inline void x_lock(const char *file, unsigned line);
+  /** Acquire a recursive exclusive lock */
+  void x_lock_recursive() { writer_recurse<false>(); }
+  /** Upgrade an update lock */
+  inline void u_x_upgrade();
+  inline void u_x_upgrade(const char *file, unsigned line);
+  /** Downgrade a single exclusive lock to an update lock */
+  void x_u_downgrade()
+  {
+    ut_ad(have_u_or_x());
+    ut_ad(recursive <= RECURSIVE_MAX);
+    recursive*= RECURSIVE_U;
+    lock.wr_u_downgrade();
+  }
+
+  /** Acquire an exclusive lock or upgrade an update lock
+  @return whether U locks were upgraded to X */
+  inline bool x_lock_upgraded();
+
+  /** @return whether a shared lock was acquired */
+  bool s_lock_try()
+  {
+    bool acquired= lock.rd_lock_try();
+    ut_d(if (acquired) s_lock_register());
+    return acquired;
+  }
+
+  /** Try to acquire an update lock
+  @param for_io whether the lock will be released by another thread
+  @return whether the update lock was acquired */
+  inline bool u_lock_try(bool for_io);
+
+  /** Try to acquire an exclusive lock
+  @return whether an exclusive lock was acquired */
+  inline bool x_lock_try();
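+
+  /* An illustrative latching sequence over these primitives (a sketch
+  only, not a verbatim caller):
+
+     l.u_lock();        // concurrent S latches remain possible
+     l.u_x_upgrade();   // block readers before modifying the contents
+     l.x_unlock();      // release the exclusive latch
+  */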
+
+  /** Release a shared lock */
+  void s_unlock()
+  {
+#ifdef UNIV_DEBUG
+    const pthread_t id= pthread_self();
+    auto r= readers.load(std::memory_order_relaxed);
+    ut_ad(r);
+    readers_lock.wr_lock();
+    auto i= r->find(id);
+    ut_ad(i != r->end());
+    r->erase(i);
+    readers_lock.wr_unlock();
+#endif
+    lock.rd_unlock();
+  }
+  /** Release an update or exclusive lock
+  @param allow_readers    whether we are releasing a U lock
+  @param claim_ownership  whether the lock was acquired by another thread */
+  void u_or_x_unlock(bool allow_readers, bool claim_ownership= false)
+  {
+    ut_d(auto owner= writer.load(std::memory_order_relaxed));
+    ut_ad(owner == pthread_self() ||
+          (owner == FOR_IO && claim_ownership &&
+           recursive == (allow_readers ? RECURSIVE_U : RECURSIVE_X)));
+    ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+         RECURSIVE_MAX);
+    ut_ad(rec);
+    if (!(recursive-= allow_readers ? RECURSIVE_U : RECURSIVE_X))
+    {
+      set_new_owner(0);
+      if (allow_readers)
+        lock.u_unlock();
+      else
+        lock.wr_unlock();
+    }
+  }
+  /** Release an update lock */
+  void u_unlock(bool claim_ownership= false)
+  { u_or_x_unlock(true, claim_ownership); }
+  /** Release an exclusive lock */
+  void x_unlock(bool claim_ownership= false)
+  { u_or_x_unlock(false, claim_ownership); }
+
+  /** @return whether any writer is waiting */
+  bool is_waiting() const { return lock.is_waiting(); }
+
+  bool is_write_locked() const { return lock.is_write_locked(); }
+
+  bool is_locked_or_waiting() const { return lock.is_locked_or_waiting(); }
+
+  inline void lock_shared();
+  inline void unlock_shared();
+};
+
+typedef sux_lock<ssux_lock_impl<true>> block_lock;
+
+#ifndef UNIV_PFS_RWLOCK
+typedef sux_lock<ssux_lock_impl<false>> index_lock;
+#else
+typedef sux_lock<ssux_lock> index_lock;
+
+template<> inline void sux_lock<ssux_lock_impl<true>>::init()
+{
+  lock.init();
+  ut_ad(!writer.load(std::memory_order_relaxed));
+  ut_ad(!recursive);
+  ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+  if (auto r= readers.load(std::memory_order_relaxed))
+    ut_ad(r->empty());
+#endif
+}
+
+template<>
+inline void sux_lock<ssux_lock>::s_lock(const char *file, unsigned line)
+{
+  ut_ad(!have_x());
+  ut_ad(!have_s());
+  lock.rd_lock(file, line);
+  ut_d(s_lock_register());
+}
+
+template<>
+inline void sux_lock<ssux_lock>::u_lock(const char *file, unsigned line)
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+    writer_recurse<true>();
+  else
+  {
+    lock.u_lock(file, line);
+    ut_ad(!recursive);
+    recursive= RECURSIVE_U;
+    set_first_owner(id);
+  }
+}
+
+template<>
+inline void sux_lock<ssux_lock>::x_lock(const char *file, unsigned line)
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+    writer_recurse<false>();
+  else
+  {
+    lock.wr_lock(file, line);
+    ut_ad(!recursive);
+    recursive= RECURSIVE_X;
+    set_first_owner(id);
+  }
+}
+
+template<>
+inline void sux_lock<ssux_lock>::u_x_upgrade(const char *file, unsigned line)
+{
+  ut_ad(have_u_not_x());
+  lock.u_wr_upgrade(file, line);
+  recursive/= RECURSIVE_U;
+}
+#endif
+
+/** needed for dict_index_t::clone() */
+template<> inline void index_lock::operator=(const sux_lock&)
+{
+  memset((void*) this, 0, sizeof *this);
+}
+
+template<typename ssux> inline void sux_lock<ssux>::s_lock()
+{
+  ut_ad(!have_x());
+  ut_ad(!have_s());
+  lock.rd_lock();
+  ut_d(s_lock_register());
+}
+
+template<typename ssux>
+inline void sux_lock<ssux>::lock_shared() { s_lock(); }
+template<typename ssux>
+inline void sux_lock<ssux>::unlock_shared() { s_unlock(); }
+
+template<typename ssux> inline void sux_lock<ssux>::u_lock()
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+    writer_recurse<true>();
+  else
+  {
+    lock.u_lock();
+    ut_ad(!recursive);
+    recursive= RECURSIVE_U;
+    set_first_owner(id);
+  }
+}
+
+template<typename ssux> inline void sux_lock<ssux>::x_lock(bool for_io)
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+  {
+    ut_ad(!for_io);
+    writer_recurse<false>();
+  }
+  else
+  {
+    lock.wr_lock();
+    ut_ad(!recursive);
+    recursive= RECURSIVE_X;
+    set_first_owner(for_io ? FOR_IO : id);
+  }
+}
+
+template<typename ssux> inline void sux_lock<ssux>::u_x_upgrade()
+{
+  ut_ad(have_u_not_x());
+  lock.u_wr_upgrade();
+  recursive/= RECURSIVE_U;
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_upgraded()
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+  {
+    ut_ad(recursive);
+    static_assert(RECURSIVE_X == 1, "compatibility");
+    if (recursive & RECURSIVE_MAX)
+    {
+      writer_recurse<false>();
+      return false;
+    }
+    /* Upgrade the lock. */
+    lock.u_wr_upgrade();
+    recursive/= RECURSIVE_U;
+    return true;
+  }
+  else
+  {
+    lock.wr_lock();
+    ut_ad(!recursive);
+    recursive= RECURSIVE_X;
+    set_first_owner(id);
+    return false;
+  }
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::u_lock_try(bool for_io)
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+  {
+    if (for_io)
+      return false;
+    writer_recurse<true>();
+    return true;
+  }
+  if (lock.u_lock_try())
+  {
+    ut_ad(!recursive);
+    recursive= RECURSIVE_U;
+    set_first_owner(for_io ? FOR_IO : id);
+    return true;
+  }
+  return false;
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_try()
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+  {
+    writer_recurse<false>();
+    return true;
+  }
+  if (lock.wr_lock_try())
+  {
+    ut_ad(!recursive);
+    recursive= RECURSIVE_X;
+    set_first_owner(id);
+    return true;
+  }
+  return false;
+}
diff --git a/storage/innobase/include/transactional_lock_guard.h b/storage/innobase/include/transactional_lock_guard.h
new file mode 100644
index 00000000..168a6897
--- /dev/null
+++ b/storage/innobase/include/transactional_lock_guard.h
@@ -0,0 +1,174 @@
+/*****************************************************************************
+
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+
+#if defined __powerpc64__
+#elif defined __s390__
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64) && !defined(__clang__)
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+# if __GNUC__ >= 8
+# elif defined __clang_major__ && __clang_major__ > 6
+# else
+#  define NO_ELISION
+# endif
+#else /* Transactional memory has not been implemented for this ISA */
+# define NO_ELISION
+#endif
+
+#ifdef NO_ELISION
+constexpr bool have_transactional_memory= false;
+# ifdef UNIV_DEBUG
+static inline bool xtest() { return false; }
+# endif
+# define TRANSACTIONAL_TARGET /* nothing */
+# define TRANSACTIONAL_INLINE /* nothing */
+#else
+# if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+
+# include <immintrin.h>
+# if defined __GNUC__ && !defined __INTEL_COMPILER
+#  define TRANSACTIONAL_TARGET __attribute__((target("rtm"),hot))
+#  define TRANSACTIONAL_INLINE __attribute__((target("rtm"),hot,always_inline))
+# else
+#  define TRANSACTIONAL_TARGET /* nothing */
+#  define TRANSACTIONAL_INLINE /* nothing */
+# endif
+
+TRANSACTIONAL_INLINE static inline bool xbegin()
+{
+  return have_transactional_memory && _xbegin() == _XBEGIN_STARTED;
+}
+
+# ifdef UNIV_DEBUG
+#  ifdef __GNUC__
+/** @return whether a memory transaction is active */
+bool xtest();
+#  else
+static inline bool xtest() { return have_transactional_memory && _xtest(); }
+#  endif
+# endif
+
+TRANSACTIONAL_INLINE static inline void xabort() { _xabort(0); }
+
+TRANSACTIONAL_INLINE static inline void xend() { _xend(); }
+# elif defined __powerpc64__ || defined __s390__
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+#  define TRANSACTIONAL_TARGET __attribute__((hot))
+#  define TRANSACTIONAL_INLINE __attribute__((hot,always_inline))
+
+/**
+  Newer gcc compilers only provide __builtin_{htm}
+  functions when the -mhtm CFLAG is actually provided. So
+  we've got the option of including it globally, or
+  pushing down the inclusion of htmxlintrin.h to one
+  file with -mhtm enabled and removing the inline
+  optimization.
+
+  Per FIXME in s390x's htmxlintrin.h, the __TM_simple_begin
+  isn't always_inline resulting in duplicate definitions if
+  it were included more than once. While xabort and xend
+  could be implemented here, we keep the implementation the
+  same as ppc64.
+ */
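+
+/* The elision pattern implemented by transactional_lock_guard below is,
+in outline (an illustrative sketch only):
+
+   if (xbegin())                        // memory transaction started
+   {
+     if (!lock.is_locked_or_waiting())  // lock appears free: elide it
+       return;
+     xabort();                          // contended: abort, take the lock
+   }
+   lock.lock();                         // normal acquisition
+
+with xend() invoked on destruction when the lock was elided. */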
+TRANSACTIONAL_TARGET bool xbegin();
+TRANSACTIONAL_TARGET void xabort();
+TRANSACTIONAL_TARGET void xend();
+# ifdef UNIV_DEBUG
+bool xtest();
+# endif
+
+# endif
+#endif
+
+template<class mutex>
+class transactional_lock_guard
+{
+  mutex &m;
+
+public:
+  TRANSACTIONAL_INLINE transactional_lock_guard(mutex &m) : m(m)
+  {
+#ifndef NO_ELISION
+    if (xbegin())
+    {
+      if (was_elided())
+        return;
+      xabort();
+    }
+#endif
+    m.lock();
+  }
+  transactional_lock_guard(const transactional_lock_guard &)= delete;
+  TRANSACTIONAL_INLINE ~transactional_lock_guard()
+  {
+#ifndef NO_ELISION
+    if (was_elided()) xend(); else
+#endif
+    m.unlock();
+  }
+
+#ifndef NO_ELISION
+  bool was_elided() const noexcept { return !m.is_locked_or_waiting(); }
+#else
+  bool was_elided() const noexcept { return false; }
+#endif
+};
+
+template<class mutex>
+class transactional_shared_lock_guard
+{
+  mutex &m;
+#ifndef NO_ELISION
+  bool elided;
+#else
+  static constexpr bool elided= false;
+#endif
+
+public:
+  TRANSACTIONAL_INLINE transactional_shared_lock_guard(mutex &m) : m(m)
+  {
+#ifndef NO_ELISION
+    if (xbegin())
+    {
+      if (!m.is_write_locked())
+      {
+        elided= true;
+        return;
+      }
+      xabort();
+    }
+    elided= false;
+#endif
+    m.lock_shared();
+  }
+  transactional_shared_lock_guard(const transactional_shared_lock_guard &)=
+    delete;
+  TRANSACTIONAL_INLINE ~transactional_shared_lock_guard()
+  {
+#ifndef NO_ELISION
+    if (was_elided()) xend(); else
+#endif
+    m.unlock_shared();
+  }
+
+  bool was_elided() const noexcept { return elided; }
+};
diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h
new file mode 100644
index 00000000..caacfa09
--- /dev/null
+++ b/storage/innobase/include/trx0i_s.h
@@ -0,0 +1,277 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0i_s.h
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables cache structures and public
+functions.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef trx0i_s_h
+#define trx0i_s_h
+
+#include "trx0types.h"
+#include "dict0types.h"
+#include "buf0types.h"
+
+/** The maximum amount of memory that can be consumed by innodb_trx,
+innodb_locks and innodb_lock_waits information schema tables. */
+#define TRX_I_S_MEM_LIMIT	16777216 /* 16 MiB */
+
+/** The maximum length of a string that can be stored in
+i_s_locks_row_t::lock_data */
+#define TRX_I_S_LOCK_DATA_MAX_LEN	8192
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_query */
+#define TRX_I_S_TRX_QUERY_MAX_LEN	1024
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_foreign_key_error */
+#define TRX_I_S_TRX_FK_ERROR_MAX_LEN	256
+
+/** Safely copy strings into the INNODB_TRX table's
+string-based columns */
+#define TRX_I_S_STRING_COPY(data, field, constraint, tcache)	\
+do {								\
+	if (strlen(data) > constraint) {			\
+		char	buff[constraint + 1];			\
+		strncpy(buff, data, constraint);		\
+		buff[constraint] = '\0';			\
+								\
+		field = static_cast<const char*>(		\
+			ha_storage_put_memlim(			\
+			(tcache)->storage, buff, constraint + 1,\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
+	} else {						\
+		field = static_cast<const char*>(		\
+			ha_storage_put_str_memlim(		\
+			(tcache)->storage, data,		\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
+	}							\
+} while (0)
+
+/** A row of INFORMATION_SCHEMA.innodb_locks */
+struct i_s_locks_row_t;
+
+/** Objects of trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t;
+
+/** Objects of this type are added to the hash table
+trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t {
+	i_s_locks_row_t*	value;	/*!< row of
+					INFORMATION_SCHEMA.innodb_locks*/
+	i_s_hash_chain_t*	next;	/*!< next item in the hash chain */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_locks row */
+struct i_s_locks_row_t {
+	trx_id_t	lock_trx_id;	/*!< transaction identifier */
+	const char*	lock_table;	/*!< table name from
+					lock_get_table_name() */
+	/** index name of a record lock; NULL for table locks */
+	const char*	lock_index;
+	/** page identifier of the record; (0,0) if !lock_index */
+	page_id_t	lock_page;
+	/** heap number of the record; 0 if !lock_index */
+	uint16_t	lock_rec;
+	/** lock mode corresponding to lock_mode_values_typelib */
+	uint8_t		lock_mode;
+	/** (some) content of the record, if available in the buffer pool;
+	NULL if !lock_index */
+	const char*	lock_data;
+
+	/** The following are auxiliary and not included in the table */
+	/* @{ */
+	table_id_t	lock_table_id;
+					/*!< table identifier from
+					lock_get_table_id */
+	i_s_hash_chain_t hash_chain;	/*!< hash table chain node for
+					trx_i_s_cache_t::locks_hash */
+	/* @} */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
+struct i_s_trx_row_t {
+	trx_id_t	trx_id;		/*!< transaction identifier */
+	const char*	trx_state;
+	time_t		trx_started;	/*!< trx_t::start_time */
+	const i_s_locks_row_t* requested_lock_row;
+					/*!< pointer to a row
+					in innodb_locks if trx
+					is waiting, or NULL */
+	time_t		trx_wait_started; /*!< trx_t->lock.wait_started */
+	uintmax_t	trx_weight;	/*!< TRX_WEIGHT() */
+	ulint		trx_mysql_thread_id; /*!< thd_get_thread_id() */
+	const char*	trx_query;	/*!< MySQL statement being
+					executed in the transaction */
+	CHARSET_INFO*	trx_query_cs;	/*!< the charset of trx_query */
+	const char*	trx_operation_state; /*!< trx_t::op_info */
+	ulint		trx_tables_in_use; /*!< n_mysql_tables_in_use in
+					trx_t */
+	ulint		trx_tables_locked;
+					/*!< mysql_n_tables_locked in
+					trx_t */
+	ulint		trx_lock_structs; /*!< list len of trx_locks in
+					trx_t */
+	ulint		trx_lock_memory_bytes;
+					/*!< mem_heap_get_size(
+					trx->lock_heap) */
+	ulint		trx_rows_locked; /*!< trx_lock_t::n_rec_locks */
+	uintmax_t	trx_rows_modified; /*!< trx_t::undo_no */
+	uint		trx_isolation_level;
+					/*!< trx_t::isolation_level */
+	bool		trx_unique_checks;
+					/*!< check_unique_secondary in trx_t*/
+	bool		trx_foreign_key_checks;
+					/*!< check_foreigns in trx_t */
+	const char*	trx_foreign_key_error;
+					/*!< detailed_error in trx_t */
+	bool		trx_is_read_only;
+					/*!< trx_t::read_only */
+	bool		trx_is_autocommit_non_locking;
+					/*!< trx_t::is_autocommit_non_locking()
+					*/
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
+struct i_s_lock_waits_row_t {
+	const i_s_locks_row_t*	requested_lock_row;	/*!< requested lock */
+	const i_s_locks_row_t*	blocking_lock_row;	/*!< blocking lock */
+};
+
+/** Cache of INFORMATION_SCHEMA table data */
+struct trx_i_s_cache_t;
+
+/** Auxiliary enum used by functions that need to select one of the
+INFORMATION_SCHEMA tables */
+enum i_s_table {
+	I_S_INNODB_TRX,		/*!< INFORMATION_SCHEMA.innodb_trx */
+	I_S_INNODB_LOCKS,	/*!< INFORMATION_SCHEMA.innodb_locks */
+	I_S_INNODB_LOCK_WAITS	/*!< INFORMATION_SCHEMA.innodb_lock_waits */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+extern trx_i_s_cache_t*	trx_i_s_cache;
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_init(
+/*===============*/
+	trx_i_s_cache_t*	cache);	/*!< out: cache to init */
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_free(
+/*===============*/
+	trx_i_s_cache_t*	cache);	/*!< in/out: cache to free */
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_end_read(
+/*===================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_start_write(
+/*======================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_end_write(
+/*====================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table);	/*!< in: which table */
+
+/*******************************************************************//**
+Retrieves the nth row in the cache for a given INFORMATION SCHEMA
+table.
+@return row */
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table,	/*!< in: which table */
+	ulint			n);	/*!< in: row number */
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+@return 0 - fetched, 1 - not */
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+	trx_i_s_cache_t*	cache);	/*!< in/out: cache */
+
+/*******************************************************************//**
+Returns true if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return true if truncated */
+bool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+/** The maximum length of a resulting lock_id_size in
+trx_i_s_create_lock_id(), not including the terminating NUL.
+":%lu:%lu:%lu" -> 63 chars */
+#define TRX_I_S_LOCK_ID_MAX_LEN	(TRX_ID_MAX_LEN + 63)
+
+/*******************************************************************//**
+Crafts a lock id string from a i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return resulting lock id */
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+	const i_s_locks_row_t*	row,	/*!< in: innodb_locks row */
+	char*			lock_id,/*!< out: resulting lock_id */
+	ulint			lock_id_size);/*!< in: size of the lock id
+					buffer */
+
+#endif /* trx0i_s_h */
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
new file mode 100644
index 00000000..3ddd2e98
--- /dev/null
+++ b/storage/innobase/include/trx0purge.h
@@ -0,0 +1,427 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.h
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "trx0sys.h"
+#include "que0types.h"
+#include "srw_lock.h"
+
+#include <queue>
+#include <unordered_map>
+
+/** Prepend the history list with an undo log.
+Remove the undo log segment from the rseg slot if it is too big for reuse.
+@param[in]	trx	transaction
+@param[in,out]	undo	undo log
+@param[in,out]	mtr	mini-transaction */
+void
+trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr);
+
+/**
+Remove unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller (purge_coordinator_callback)
+must not have any latches on undo log pages!
+*/
+void trx_purge_truncate_history();
+
+/**
+Run a purge batch.
+@param n_tasks       number of purge tasks to submit to the queue
+@param history_size  trx_sys.history_size()
+@return number of undo log pages handled in the batch */
+ulint trx_purge(ulint n_tasks, ulint history_size);
+
+/** Rollback segments from a given transaction with trx-no
+scheduled for purge. */
+class TrxUndoRsegs {
+private:
+	typedef std::vector<trx_rseg_t*, ut_allocator<trx_rseg_t*> >
+		trx_rsegs_t;
+public:
+	typedef trx_rsegs_t::iterator iterator;
+	typedef trx_rsegs_t::const_iterator const_iterator;
+
+	TrxUndoRsegs() = default;
+
+	/** Constructor */
+	TrxUndoRsegs(trx_rseg_t& rseg)
+		: trx_no(rseg.last_trx_no()), m_rsegs(1, &rseg) {}
+	/** Constructor */
+	TrxUndoRsegs(trx_id_t trx_no, trx_rseg_t& rseg)
+		: trx_no(trx_no), m_rsegs(1, &rseg) {}
+
+	bool operator!=(const TrxUndoRsegs& other) const
+	{ return trx_no != other.trx_no; }
+	bool empty() const { return m_rsegs.empty(); }
+	void erase(iterator& it) { m_rsegs.erase(it); }
+	iterator begin() { return(m_rsegs.begin()); }
+	iterator end() { return(m_rsegs.end()); }
+	const_iterator begin() const { return m_rsegs.begin(); }
+	const_iterator end() const { return m_rsegs.end(); }
+
+	/** Compare two TrxUndoRsegs based on trx_no.
+	@param lhs first element to compare
+	@param rhs second element to compare
+	@return true if lhs > rhs else false.*/
+	bool operator()(const TrxUndoRsegs& lhs, const TrxUndoRsegs& rhs)
+	{
+		return(lhs.trx_no > rhs.trx_no);
+	}
+
+	/** Copy of trx_rseg_t::last_trx_no() */
+	trx_id_t	trx_no= 0;
+private:
+	/** Rollback segments of a transaction, scheduled for purge. */
+	trx_rsegs_t	m_rsegs{};
+};
+
+typedef std::priority_queue<
+	TrxUndoRsegs,
+	std::vector<TrxUndoRsegs, ut_allocator<TrxUndoRsegs> >,
+	TrxUndoRsegs>	purge_pq_t;
+
+/** Chooses the rollback segment with the oldest committed transaction */
+struct TrxUndoRsegsIterator {
+	/** Constructor */
+	TrxUndoRsegsIterator();
+	/** Sets the next rseg to purge in purge_sys.
+	Executed in the purge coordinator thread.
+	@retval false when nothing is to be purged
+	@retval true  when purge_sys.rseg->latch was locked */
+	inline bool set_next();
+
+private:
+	// Disable copying
+	TrxUndoRsegsIterator(const TrxUndoRsegsIterator&);
+	TrxUndoRsegsIterator& operator=(const TrxUndoRsegsIterator&);
+
+	/** The current element to process */
+	TrxUndoRsegs			m_rsegs;
+	/** Track the current element in m_rsegs */
+	TrxUndoRsegs::const_iterator	m_iter;
+};
+
+/** The control structure used in the purge operation */
+class purge_sys_t
+{
+  friend TrxUndoRsegsIterator;
+public:
+  /** latch protecting view, m_enabled */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable srw_spin_lock latch;
+private:
+  /** Read view at the start of a purge batch. Any encountered index records
+  that are older than view will be removed. */
+  ReadViewBase view;
+  /** whether the subsystem has been initialized */
+  bool m_initialized{false};
+  /** whether purge is enabled; protected by latch and std::atomic */
+  std::atomic<bool> m_enabled{false};
+public:
+  /** whether purge is active (may hold table handles) */
+  std::atomic<bool> m_active{false};
+private:
+  /** number of pending stop() calls without resume() */
+  Atomic_counter<uint32_t> m_paused;
+  /** number of stop_SYS() calls without resume_SYS() */
+  Atomic_counter<uint32_t> m_SYS_paused;
+  /** number of stop_FTS() calls without resume_FTS() */
+  Atomic_counter<uint32_t> m_FTS_paused;
+
+  /** latch protecting end_view */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock_low end_latch;
+  /** Read view at the end of a purge batch (copied from view). Any undo pages
+  containing records older than end_view may be freed.
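+  (end_view is copied from view in batch_cleanup() at the end of each
+  batch; unlike view, it is protected by end_latch rather than latch.)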
+  */
+  ReadViewBase end_view;
+
+  struct hasher
+  {
+    size_t operator()(const page_id_t &id) const { return size_t(id.raw()); }
+  };
+
+  using unordered_map =
+    std::unordered_map<page_id_t, buf_block_t*, hasher,
+#if defined __GNUC__ && __GNUC__ >= 8
+                       std::equal_to<page_id_t>
+                       /* GCC 4.8.5 would fail to find a matching allocator */
+#else
+                       std::equal_to<page_id_t>,
+                       ut_allocator<std::pair<const page_id_t, buf_block_t*>>
+#endif
+                       >;
+  /** map of buffer-fixed undo log pages processed during a purge batch */
+  unordered_map pages;
+public:
+  /** @return the number of processed undo pages */
+  size_t n_pages_handled() const { return pages.size(); }
+
+  /** Look up an undo log page.
+  @param id  undo page identifier
+  @return undo page
+  @retval nullptr in case the page is corrupted */
+  buf_block_t *get_page(page_id_t id);
+
+  que_t*	query;	/*!< The query graph which will do the
+			parallelized purge operation */
+
+  /** Iterator to the undo log records of committed transactions */
+  struct iterator
+  {
+    bool operator<=(const iterator& other) const
+    {
+      if (trx_no < other.trx_no) return true;
+      if (trx_no > other.trx_no) return false;
+      return undo_no <= other.undo_no;
+    }
+
+    /** Free the undo pages up to this. */
+    dberr_t free_history() const;
+
+    /** trx_t::no of the committed transaction */
+    trx_id_t trx_no;
+    /** The record number within the committed transaction's undo
+    log, increasing, purged from 0 onwards */
+    undo_no_t undo_no;
+  };
+
+  /** The tail of the purge queue; the last parsed undo log of a
+  committed transaction. */
+  iterator tail;
+  /** The head of the purge queue; any older undo logs of committed
+  transactions may be discarded (history list truncation).
+  Protected by latch. */
+  iterator head;
+  /*-----------------------------*/
+  bool next_stored;	/*!< whether rseg holds the next record
+			to purge */
+  trx_rseg_t*	rseg;	/*!< Rollback segment for the next undo
+			record to purge */
+private:
+  uint32_t page_no;	/*!< Page number for the next undo
+			record to purge, page number of the
+			log header, if dummy record */
+  uint32_t hdr_page_no;	/*!< Header page of the undo log where
+			the next record to purge belongs */
+  uint16_t offset;	/*!< Page offset for the next undo
+			record to purge, 0 if the dummy
+			record */
+  uint16_t hdr_offset;	/*!< Header byte offset on the page */
+
+
+  TrxUndoRsegsIterator
+		rseg_iter;	/*!< Iterator to get the next rseg
+				to process */
+public:
+  purge_pq_t	purge_queue;	/*!< Binary min-heap, ordered on
+				TrxUndoRsegs::trx_no. It is protected
+				by the pq_mutex */
+  mysql_mutex_t	pq_mutex;	/*!< Mutex protecting purge_queue */
+
+  /** Undo tablespace file truncation (only accessed by the
+  srv_purge_coordinator_thread) */
+  struct {
+    /** The undo tablespace that is currently being truncated */
+    fil_space_t*	current;
+    /** The undo tablespace that was last truncated */
+    fil_space_t*	last;
+  } truncate;
+
+  /** Create the instance */
+  void create();
+
+  /** Close the purge system on shutdown */
+  void close();
+
+  /** @return whether purge is enabled */
+  bool enabled() { return m_enabled.load(std::memory_order_relaxed); }
+  /** @return whether the purge coordinator is paused */
+  bool paused()
+  { return m_paused != 0; }
+
+  /** Enable purge at startup. */
+  void coordinator_startup()
+  {
+    ut_ad(!enabled());
+    m_enabled.store(true, std::memory_order_relaxed);
+    wake_if_not_active();
+  }
+
+  /** Disable purge at shutdown */
+  void coordinator_shutdown()
+  {
+    ut_ad(enabled());
+    m_enabled.store(false, std::memory_order_relaxed);
+  }
+
+  /** @return whether the purge tasks are active */
+  static bool running();
+
+  /** Stop purge during FLUSH TABLES FOR EXPORT.
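+  Paired with resume(), which is invoked at UNLOCK TABLES; an
+  illustrative call sequence (not a verbatim caller):
+
+    purge_sys.stop();    // FLUSH TABLES ... FOR EXPORT
+    ...                  // the tablespace files are copied
+    purge_sys.resume();  // UNLOCK TABLES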
+  */
+  void stop();
+  /** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
+  void resume();
+
+  /** Close and reopen all tables in case of a MDL conflict with DDL */
+  dict_table_t *close_and_reopen(table_id_t id, THD *thd, MDL_ticket **mdl);
+private:
+  /** Suspend purge during a DDL operation on FULLTEXT INDEX tables */
+  void wait_FTS(bool also_sys);
+public:
+  /** Suspend purge in data dictionary tables */
+  void stop_SYS() { m_SYS_paused++; }
+  /** Resume purge in data dictionary tables */
+  static void resume_SYS(void *);
+
+  /** Pause purge during a DDL operation that could drop FTS_ tables. */
+  void stop_FTS();
+  /** Resume purge after stop_FTS(). */
+  void resume_FTS() { ut_d(const auto p=) m_FTS_paused--; ut_ad(p); }
+  /** @return whether stop_FTS() is in effect */
+  bool must_wait_FTS() const { return m_FTS_paused; }
+
+private:
+  /**
+  Get the next record to purge and update the info in the purge system.
+  @param roll_ptr  undo log pointer to the record
+  @return buffer-fixed reference to undo log record
+  @retval {nullptr,1} if the whole undo log can be skipped in purge
+  @retval {nullptr,0} if nothing is left, or on corruption */
+  inline trx_purge_rec_t get_next_rec(roll_ptr_t roll_ptr);
+
+  /** Choose the next undo log to purge.
+  @return whether anything is to be purged */
+  bool choose_next_log();
+
+  /** Update the last not yet purged history log info in rseg when
+  we have purged a whole undo log. Advances also purge_trx_no
+  past the purged log. */
+  void rseg_get_next_history_log();
+
+public:
+  /**
+  Fetch the next undo log record from the history list to purge.
+  @return buffer-fixed reference to undo log record
+  @retval {nullptr,1} if the whole undo log can be skipped in purge
+  @retval {nullptr,0} if nothing is left, or on corruption */
+  inline trx_purge_rec_t fetch_next_rec();
+
+  /** Determine if the history of a transaction is purgeable.
+  @param trx_id  transaction identifier
+  @return whether the history is purgeable */
+  TRANSACTIONAL_TARGET bool is_purgeable(trx_id_t trx_id) const;
+
+  /** A wrapper around ReadView::low_limit_no(). */
+  trx_id_t low_limit_no() const
+  {
+    /* This function may only be called by purge_coordinator_callback().
+
+    The purge coordinator task may call this without holding any latch,
+    because it is the only thread that may modify purge_sys.view.
+
+    Any other threads that access purge_sys.view must hold purge_sys.latch,
+    typically via purge_sys_t::view_guard. */
+    return view.low_limit_no();
+  }
+  /** A wrapper around ReadView::sees(). */
+  trx_id_t sees(trx_id_t id) const
+  {
+    /* This function may only be called by purge_coordinator_callback().
+
+    The purge coordinator task may call this without holding any latch,
+    because it is the only thread that may modify purge_sys.view.
+
+    Any other threads that access purge_sys.view must hold purge_sys.latch,
+    typically via purge_sys_t::view_guard. */
+    return view.sees(id);
+  }
+  /** A wrapper around trx_sys_t::clone_oldest_view(). */
+  template<bool also_end_view= false>
+  void clone_oldest_view()
+  {
+    if (!also_end_view)
+      wait_FTS(true);
+    latch.wr_lock(SRW_LOCK_CALL);
+    trx_sys.clone_oldest_view(&view);
+    if (also_end_view)
+      (end_view= view).
+        clamp_low_limit_id(head.trx_no ? head.trx_no : tail.trx_no);
+    latch.wr_unlock();
+  }
+
+  /** Wake up the purge threads if there is work to do. */
+  void wake_if_not_active();
+
+  /** Release undo pages and update end_view at the end of a purge batch. */
+  inline void batch_cleanup(const iterator &head);
+
+  struct view_guard
+  {
+    inline view_guard();
+    inline ~view_guard();
+
+    /** @return purge_sys.view */
+    inline const ReadViewBase &view() const;
+  };
+
+  struct end_view_guard
+  {
+    inline end_view_guard();
+    inline ~end_view_guard();
+
+    /** @return purge_sys.end_view */
+    inline const ReadViewBase &view() const;
+  };
+
+  /** Stop the purge thread and check n_ref_count of all auxiliary
+  and common tables associated with the FTS table.
+  @param table            parent FTS table
+  @param already_stopped  True indicates purge threads were
+                          already stopped */
+  void stop_FTS(const dict_table_t &table, bool already_stopped=false);
+};
+
+/** The global data structure coordinating a purge */
+extern purge_sys_t	purge_sys;
+
+purge_sys_t::view_guard::view_guard()
+{ purge_sys.latch.rd_lock(SRW_LOCK_CALL); }
+
+purge_sys_t::view_guard::~view_guard()
+{ purge_sys.latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::view_guard::view() const
+{ return purge_sys.view; }
+
+purge_sys_t::end_view_guard::end_view_guard()
+{ purge_sys.end_latch.rd_lock(); }
+
+purge_sys_t::end_view_guard::~end_view_guard()
+{ purge_sys.end_latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::end_view_guard::view() const
+{ return purge_sys.end_view; }
diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h
new file mode 100644
index 00000000..3d9b1868
--- /dev/null
+++ b/storage/innobase/include/trx0rec.h
@@ -0,0 +1,299 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.h
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "trx0types.h"
+#include "row0types.h"
+#include "page0types.h"
+#include "que0types.h"
+
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+inline undo_no_t trx_undo_rec_get_undo_no(const trx_undo_rec_t *undo_rec)
+{
+  return mach_u64_read_much_compressed(undo_rec + 3);
+}
+
+/**********************************************************************//**
+Returns the start of the undo record data area. */
+#define trx_undo_rec_get_ptr(undo_rec, undo_no)		\
+	((undo_rec) + trx_undo_rec_get_offset(undo_no))
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
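+(These are the type, the compilation info, the "updated extern" flag,
+the undo number and the table id that precede the record payload.)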
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_rec_get_pars(
+/*==================*/
+	const trx_undo_rec_t*	undo_rec,	/*!< in: undo log record */
+	byte*		type,		/*!< out: undo record type:
+					TRX_UNDO_INSERT_REC, ... */
+	byte*		cmpl_info,	/*!< out: compiler info, relevant only
+					for update type records */
+	bool*		updated_extern,	/*!< out: true if we updated an
+					externally stored field */
+	undo_no_t*	undo_no,	/*!< out: undo log record number */
+	table_id_t*	table_id)	/*!< out: table id */
+	MY_ATTRIBUTE((nonnull));
+
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+const byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+	const byte*	ptr,	/*!< in: remaining part of a copy of an undo log
+				record, at the start of the row reference;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the row reference is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const dtuple_t**ref,	/*!< out, own: row reference */
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
+				needed is allocated */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+	const byte*	ptr,		/*!< in: remaining part of undo
+					log record after reading
+					general parameters */
+	trx_id_t*	trx_id,		/*!< out: trx id */
+	roll_ptr_t*	roll_ptr,	/*!< out: roll ptr */
+	byte*		info_bits);	/*!< out: info bits state */
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+	const byte*	ptr,	/*!< in: remaining part in update undo log
+				record, after reading the row reference
+				NOTE that this copy of the undo log record must
+				be preserved as long as the update vector is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint		type,	/*!< in: TRX_UNDO_UPD_EXIST_REC,
+				TRX_UNDO_UPD_DEL_REC, or
+				TRX_UNDO_DEL_MARK_REC; in the last case,
+				only trx id and roll ptr fields are added to
+				the update vector */
+	trx_id_t	trx_id,	/*!< in: transaction id from this undo record */
+	roll_ptr_t	roll_ptr,/*!< in: roll pointer from this undo record */
+	byte		info_bits,/*!< in: info bits from this undo record */
+	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
+				needed is allocated */
+	upd_t**		upd);	/*!< out, own: update vector */
+/** Report a RENAME TABLE operation.
+@param[in,out]	trx	transaction
+@param[in]	table	table that is being renamed
+@return	DB_SUCCESS or error code */
+dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record.
+This information is used in a rollback of the transaction and in
+consistent reads that must look to the history of this transaction.
+@return DB_SUCCESS or error code */
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+	que_thr_t*	thr,		/*!< in: query thread */
+	dict_index_t*	index,		/*!< in: clustered index */
+	const dtuple_t*	clust_entry,	/*!< in: in the case of an insert,
+					index entry to insert into the
+					clustered index; in updates,
+					may contain a clustered index
+					record tuple that also contains
+					virtual columns of the table;
+					otherwise, NULL */
+	const upd_t*	update,		/*!< in: in the case of an update,
+					the update vector, otherwise NULL */
+	ulint		cmpl_info,	/*!< in: compiler info on secondary
+					index updates */
+	const rec_t*	rec,		/*!< in: case of an update or delete
+					marking, the record in the clustered
+					index; NULL if insert */
+	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec) */
+	roll_ptr_t*	roll_ptr)	/*!< out: DB_ROLL_PTR to the
+					undo log record */
+	MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
+
+/** status bits used for trx_undo_prev_version_build() */
+
+/** TRX_UNDO_PREV_IN_PURGE tells trx_undo_prev_version_build() that it
+is being called from purge, and that we would like to get the previous
+version even if it is in the purge view (in the normal case, it would
+return without fetching such a record). */
+static constexpr ulint	TRX_UNDO_PREV_IN_PURGE = 1;
+
+/** This tells trx_undo_prev_version_build() to fetch the old value in
+the undo log (which is the after image for an update) */
+static constexpr ulint	TRX_UNDO_GET_OLD_V_VALUE = 2;
+
+/** indicates a call from row_vers_old_has_index_entry() */
+static constexpr ulint	TRX_UNDO_CHECK_PURGEABILITY = 4;
+
+/** Build a previous version of a clustered index record. The caller
+must hold a latch on the index page of the clustered index record.
+@param rec      version of a clustered index record
+@param index    clustered index
+@param offsets  rec_get_offsets(rec, index)
+@param heap     memory heap from which the memory needed is
+                allocated
+@param old_vers previous version or NULL if rec is the
+                first inserted version, or if history data
+                has been deleted (an error), or if the purge
+                could have removed the version
+                though it has not yet done so
+@param v_heap   memory heap used to create vrow
+                dtuple if it is not yet created. This heap
+                differs from "heap" above in that it could be
+                prebuilt->old_vers_heap for selection
+@param vrow     virtual column info, if any
+@param v_status status flags determining whether this function is
+                being invoked by the purge thread, and whether the
+                "after image" of the undo log should be read
+@return error code
+@retval DB_SUCCESS if previous version was successfully built,
+or if it was an insert or the undo record refers to the table before rebuild
+@retval DB_MISSING_HISTORY if the history is missing */
+dberr_t
+trx_undo_prev_version_build(
+	const rec_t *rec,
+	dict_index_t *index,
+	rec_offs *offsets,
+	mem_heap_t *heap,
+	rec_t **old_vers,
+	mem_heap_t *v_heap,
+	dtuple_t **vrow,
+	ulint v_status);
+
+/** Read from an undo log record a non-virtual column value.
+@param ptr      pointer to remaining part of the undo record
+@param field    stored field
+@param len      length of the field, or UNIV_SQL_NULL
+@param orig_len original length of the locally stored part
+of an externally stored column, or 0
+@return remaining part of undo log record after reading these values */
+const byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
+                                     uint32_t *len, uint32_t *orig_len);
+
+/** Read virtual column value from undo log
+@param[in]	table		the table
+@param[in]	ptr		undo log pointer
+@param[in,out]	row		the dtuple to fill
+@param[in]	in_purge	whether this is called by purge */
+void
+trx_undo_read_v_cols(
+	const dict_table_t*	table,
+	const byte*		ptr,
+	dtuple_t*		row,
+	bool			in_purge);
+
+/** Read virtual column index from undo log if the undo log contains such
+info, and verify the column is still indexed, and output its position
+@param[in]	table		the table
+@param[in]	ptr		undo log pointer
+@param[in]	first_v_col	if this is the first virtual column, which
+				has the version marker
+@param[in,out]	is_undo_log	this function is used to parse both the undo
+				log and the online log for virtual columns;
+				whether we are parsing the undo log
+@param[out]	field_no	the column number, or FIL_NULL if not indexed
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_read_v_idx(
+	const dict_table_t*	table,
+	const byte*		ptr,
+	bool			first_v_col,
+	bool*			is_undo_log,
+	uint32_t*		field_no);
+
+/* Types of an undo log record: these have to be smaller than 16, as the
+compilation info multiplied by 16 is ORed to this value in an undo log
+record */
+
+/** Undo log records for DDL operations
+
+Note: special rollback and purge triggers exist for SYS_INDEXES records:
+@see dict_drop_index_tree() */
+enum trx_undo_ddl_type
+{
+  /** RENAME TABLE (logging the old table name).
+
+  Because SYS_TABLES has PRIMARY KEY(NAME), the row-level undo log records
+  for SYS_TABLES cannot be distinguished from DROP TABLE, CREATE TABLE. */
+  TRX_UNDO_RENAME_TABLE= 9,
+  /** insert a metadata pseudo-record for instant ALTER TABLE */
+  TRX_UNDO_INSERT_METADATA= 10
+};
+
+/* DML operations */
+#define	TRX_UNDO_INSERT_REC	11	/* fresh insert into clustered index */
+#define	TRX_UNDO_UPD_EXIST_REC	12	/* update of a non-delete-marked
+					record */
+#define	TRX_UNDO_UPD_DEL_REC	13	/* update of a delete marked record to
+					a not delete marked record; also the
+					fields of the record can change */
+#define	TRX_UNDO_DEL_MARK_REC	14	/* delete marking of a record; fields
+					do not change */
+/** Bulk insert operation. It is written only when the table is
+under exclusive lock and the clustered index root page latch is being held,
+and the clustered index is empty. Rollback will empty the table and
+free the leaf segment of all indexes, re-create the new
+leaf segment and re-initialize the root page alone. */
+#define	TRX_UNDO_EMPTY		15
+
+#define	TRX_UNDO_CMPL_INFO_MULT	16U	/* compilation info is multiplied by
+					this and ORed to the type above */
+#define	TRX_UNDO_UPD_EXTERN	128U	/* This bit can be ORed to type_cmpl
+					to denote that we updated external
+					storage fields: used by purge to
+					free the external storage */
+
+/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA */
+extern const dtuple_t trx_undo_metadata;
+
+/** Read the table id from an undo log record.
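+The id follows the two-byte pointer to the next record, the type byte
+and the much-compressed undo number; compare trx_undo_rec_get_undo_no()
+above, which reads at offset 3.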
+@param[in] rec Undo log record +@return table id stored as a part of undo log record */ +inline table_id_t trx_undo_rec_get_table_id(const trx_undo_rec_t *rec) +{ + rec+= 3; + mach_read_next_much_compressed(&rec); + return mach_read_next_much_compressed(&rec); +} diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h new file mode 100644 index 00000000..9ef9ebe9 --- /dev/null +++ b/storage/innobase/include/trx0roll.h @@ -0,0 +1,168 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0roll.h +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0roll_h +#define trx0roll_h + +#include "trx0trx.h" +#include "mtr0mtr.h" +#include "trx0sys.h" + +extern bool trx_rollback_is_active; +extern const trx_t* trx_roll_crash_recv_trx; + +/** Report progress when rolling back a row of a recovered transaction. */ +void trx_roll_report_progress(); +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +@param all true=roll back all recovered active transactions; +false=roll back any incomplete dictionary transaction */ +void +trx_rollback_recovered(bool all); +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. */ +void trx_rollback_all_recovered(void*); +/*********************************************************************//** +Creates a rollback command node struct. +@return own: rollback node struct */ +roll_node_t* +roll_node_create( +/*=============*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Performs an execution step for a rollback command node in a query graph. +@return query thread to run next, or NULL */ +que_thr_t* +trx_rollback_step( +/*==============*/ + que_thr_t* thr); /*!< in: query thread */ +/*******************************************************************//** +Rollback a transaction used in MySQL. 
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_for_mysql(
+/*===================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+	MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+	MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Rolls back a transaction to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return DB_NO_SAVEPOINT if no savepoint with the given name is found,
+otherwise DB_SUCCESS */
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name,		/*!< in: savepoint name */
+	int64_t*	mysql_binlog_cache_pos)	/*!< out: the MySQL binlog cache
+						position corresponding to this
+						savepoint; MySQL needs this
+						information to remove the
+						binlog entries of the queries
+						executed after the savepoint */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new one. Savepoints are deleted in a
+transaction commit or rollback.
+@return always DB_SUCCESS */
+dberr_t
+trx_savepoint_for_mysql(
+/*====================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name,		/*!< in: savepoint name */
+	int64_t		binlog_cache_pos)	/*!< in: MySQL binlog cache
+						position corresponding to this
+						connection at the time of the
+						savepoint */
+	MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Releases a named savepoint. Savepoints which
+were set after this savepoint are deleted.
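+
+A minimal caller sketch (illustration only; the savepoint name "s1" and the
+binlog cache position 0 are assumed values, not taken from a real call site):
+
+	trx_savepoint_for_mysql(trx, "s1", 0);
+	...	/* execute more statements on trx */
+	if (trx_release_savepoint_for_mysql(trx, "s1") == DB_NO_SAVEPOINT)
+	{
+		/* no savepoint named "s1" existed */
+	}
+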
+@return DB_NO_SAVEPOINT if no savepoint with the given name is found,
+otherwise DB_SUCCESS */
+dberr_t
+trx_release_savepoint_for_mysql(
+/*============================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name)		/*!< in: savepoint name */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Rollback node states */
+enum roll_node_state {
+	ROLL_NODE_NONE = 0,		/*!< Unknown state */
+	ROLL_NODE_SEND,			/*!< about to send a rollback signal to
+					the transaction */
+	ROLL_NODE_WAIT			/*!< rollback signal sent to the
+					transaction, waiting for completion */
+};
+
+/** Rollback command node in a query graph */
+struct roll_node_t{
+	que_common_t	common;	/*!< node type: QUE_NODE_ROLLBACK */
+	enum roll_node_state	state;	/*!< node execution state */
+	const trx_savept_t*	savept;	/*!< savepoint to which to
+					roll back, in the case of a
+					partial rollback */
+	que_thr_t*	undo_thr;/*!< undo query graph */
+};
+
+/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */
+struct trx_named_savept_t{
+	char*		name;		/*!< savepoint name */
+	trx_savept_t	savept;		/*!< the undo number corresponding to
+					the savepoint */
+	int64_t		mysql_binlog_cache_pos;
+					/*!< the MySQL binlog cache position
+					corresponding to this savepoint, not
+					defined if the MySQL binlogging is not
+					enabled */
+	UT_LIST_NODE_T(trx_named_savept_t)
+			trx_savepoints;	/*!< the list of savepoints of a
+					transaction */
+};
+
+#endif
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
new file mode 100644
index 00000000..43e0c290
--- /dev/null
+++ b/storage/innobase/include/trx0rseg.h
@@ -0,0 +1,301 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.h
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "trx0types.h"
+#include "fut0lst.h"
+
+/** Create a rollback segment header.
+@param[in,out]  space       system, undo, or temporary tablespace
+@param[in]      rseg_id     rollback segment identifier
+@param[in]      max_trx_id  new value of TRX_RSEG_MAX_TRX_ID
+@param[in,out]  mtr         mini-transaction
+@param[out]     err         error code
+@return the created rollback segment
+@retval nullptr on failure */
+buf_block_t *trx_rseg_header_create(fil_space_t *space, ulint rseg_id,
+                                    trx_id_t max_trx_id, mtr_t *mtr,
+                                    dberr_t *err)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Initialize or recover the rollback segments at startup. */
+dberr_t trx_rseg_array_init();
+
+/** Create the temporary rollback segments.
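+@param[in,out]  mtr  mini-transaction
+@return error code */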
+dberr_t trx_temp_rseg_create(mtr_t *mtr);
+
+/* Number of undo log slots in a rollback segment file copy */
+#define TRX_RSEG_N_SLOTS	(srv_page_size / 16)
+
+/* Maximum number of transactions supported by a single rollback segment */
+#define TRX_RSEG_MAX_N_TRXS	(TRX_RSEG_N_SLOTS / 2)
+
+/** The rollback segment memory object */
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t
+{
+  /** tablespace containing the rollback segment; constant after init() */
+  fil_space_t *space;
+  /** latch protecting everything except page_no, space */
+  srw_spin_lock latch;
+  /** rollback segment header page number; constant after init() */
+  uint32_t page_no;
+  /** length of the TRX_RSEG_HISTORY list (number of transactions) */
+  uint32_t history_size;
+
+  /** Last known transaction that has not been purged yet,
+  or 0 if everything has been purged. */
+  trx_id_t needs_purge;
+
+private:
+  /** Reference counter to track is_persistent() transactions,
+  with SKIP flag. */
+  std::atomic<uint32_t> ref;
+
+  /** Whether undo tablespace truncation is pending */
+  static constexpr uint32_t SKIP= 1;
+  /** Transaction reference count multiplier */
+  static constexpr uint32_t REF= 2;
+
+  uint32_t ref_load() const { return ref.load(std::memory_order_relaxed); }
+
+  /** Set the SKIP bit */
+  void ref_set_skip()
+  {
+    static_assert(SKIP == 1U, "compatibility");
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    __asm__ __volatile__("lock btsl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    _interlockedbittestandset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+    ref.fetch_or(SKIP, std::memory_order_relaxed);
+#endif
+  }
+  /** Clear a bit in ref */
+  void ref_reset_skip()
+  {
+    static_assert(SKIP == 1U, "compatibility");
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    __asm__ __volatile__("lock btrl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    _interlockedbittestandreset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+    ref.fetch_and(~SKIP, std::memory_order_relaxed);
+#endif
+  }
+
+public:
+
+  /** Initialize the fields that are not zero-initialized. */
+  void init(fil_space_t *space, uint32_t page);
+  /** Reinitialize the fields on undo tablespace truncation. */
+  void reinit(uint32_t page);
+  /** Clean up. */
+  void destroy();
+
+  /** Note that undo tablespace truncation was started. */
+  void set_skip_allocation() { ut_ad(is_persistent()); ref_set_skip(); }
+  /** Note that undo tablespace truncation was completed. */
+  void clear_skip_allocation()
+  {
+    ut_ad(is_persistent());
+#if defined DBUG_OFF
+    ref_reset_skip();
+#else
+    ut_d(auto r=) ref.fetch_and(~SKIP, std::memory_order_relaxed);
+    ut_ad(r == SKIP);
+#endif
+  }
+  /** @return whether the segment is marked for undo truncation */
+  bool skip_allocation() const
+  { return ref.load(std::memory_order_acquire) & SKIP; }
+  /** Increment the reference count */
+  void acquire()
+  { ut_d(auto r=) ref.fetch_add(REF); ut_ad(!(r & SKIP)); }
+  /** Increment the reference count if possible
+  @retval true   if the reference count was incremented
+  @retval false  if skip_allocation() holds */
+  bool acquire_if_available()
+  {
+    uint32_t r= 0;
+    while (!ref.compare_exchange_weak(r, r + REF,
+                                      std::memory_order_relaxed,
+                                      std::memory_order_relaxed))
+      if (r & SKIP)
+        return false;
+    return true;
+  }
+
+  /** Decrement the reference count */
+  void release()
+  {
+    ut_d(const auto r=)
+    ref.fetch_sub(REF, std::memory_order_relaxed);
+    ut_ad(r >= REF);
+  }
+  /** @return whether references exist */
+  bool is_referenced() const { return ref_load() >= REF; }
+
+  /** current size in pages */
+  uint32_t curr_size;
+
+  /** List of undo logs (transactions) */
+  UT_LIST_BASE_NODE_T(trx_undo_t) undo_list;
+  /** List of undo log segments cached for fast reuse */
+  UT_LIST_BASE_NODE_T(trx_undo_t) undo_cached;
+
+  /** Last not yet purged undo log header; FIL_NULL if all purged */
+  uint32_t last_page_no;
+
+  /** trx_t::no | last_offset << 48 */
+  uint64_t last_commit_and_offset;
+
+  /** @return the commit ID of the last committed transaction */
+  trx_id_t last_trx_no() const
+  { return last_commit_and_offset & ((1ULL << 48) - 1); }
+  /** @return header offset of the last committed transaction */
+  uint16_t last_offset() const
+  { return static_cast<uint16_t>(last_commit_and_offset >> 48); }
+
+  void set_last_commit(uint16_t last_offset, trx_id_t trx_no)
+  {
+    last_commit_and_offset= static_cast<uint64_t>(last_offset) << 48 | trx_no;
+  }
+
+  /** @return the page identifier */
+  page_id_t page_id() const { return page_id_t{space->id, page_no}; }
+
+  /** @return the rollback segment header page, exclusively latched */
+  buf_block_t *get(mtr_t *mtr, dberr_t *err) const;
+
+  /** @return whether the rollback segment is persistent */
+  bool is_persistent() const
+  {
+    ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+          (srv_undo_space_id_start > 0 &&
+           space->id >= srv_undo_space_id_start &&
+           space->id <= srv_undo_space_id_start + TRX_SYS_MAX_UNDO_SPACES));
+    ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+          !srv_was_started ||
+          (srv_undo_space_id_start > 0 &&
+           space->id >= srv_undo_space_id_start
+           && space->id <= srv_undo_space_id_start
+           + srv_undo_tablespaces_open));
+    return space->id != SRV_TMP_SPACE_ID;
+  }
+};
+
+/* Undo log segment slot in a rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_SLOT_PAGE_NO	0	/* Page number of the header page of
+					an undo log segment */
+/*-------------------------------------------------------------*/
+/* Slot size */
+#define TRX_RSEG_SLOT_SIZE	4
+
+/* The offset of the rollback segment header on its page */
+#define TRX_RSEG	FSEG_PAGE_DATA
+
+/* Transaction rollback segment header */
+/*-------------------------------------------------------------*/
+/** 0xfffffffe = pre-MariaDB 10.3.5 format; 0=MariaDB 10.3.5 or later */
+#define TRX_RSEG_FORMAT		0
+/** Number of pages in the TRX_RSEG_HISTORY list */
+#define TRX_RSEG_HISTORY_SIZE	4
+/** Committed
transaction logs that have not been purged yet */ +#define TRX_RSEG_HISTORY 8 +#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE) + /* Header for the file segment where + this page is placed */ +#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE) + /* Undo log segment slots */ +/** Maximum transaction ID (valid only if TRX_RSEG_FORMAT is 0) */ +#define TRX_RSEG_MAX_TRX_ID (TRX_RSEG_UNDO_SLOTS + TRX_RSEG_N_SLOTS \ + * TRX_RSEG_SLOT_SIZE) + +/** 8 bytes offset within the binlog file */ +#define TRX_RSEG_BINLOG_OFFSET TRX_RSEG_MAX_TRX_ID + 8 +/** MySQL log file name, 512 bytes, including terminating NUL +(valid only if TRX_RSEG_FORMAT is 0). +If no binlog information is present, the first byte is NUL. */ +#define TRX_RSEG_BINLOG_NAME TRX_RSEG_MAX_TRX_ID + 16 +/** Maximum length of binlog file name, including terminating NUL, in bytes */ +#define TRX_RSEG_BINLOG_NAME_LEN 512 + +#ifdef WITH_WSREP +# include "trx0xa.h" + +/** Update the WSREP XID information in rollback segment header. +@param[in,out] rseg_header rollback segment header +@param[in] xid WSREP XID +@param[in,out] mtr mini-transaction */ +void +trx_rseg_update_wsrep_checkpoint( + buf_block_t* rseg_header, + const XID* xid, + mtr_t* mtr); + +/** Update WSREP checkpoint XID in first rollback segment header +as part of wsrep_set_SE_checkpoint() when it is guaranteed that there +are no wsrep transactions committing. +If the UUID part of the WSREP XID does not match to the UUIDs of XIDs already +stored into rollback segments, the WSREP XID in all the remaining rollback +segments will be reset. +@param[in] xid WSREP XID */ +void trx_rseg_update_wsrep_checkpoint(const XID* xid); + +/** Recover the latest WSREP checkpoint XID. +@param[out] xid WSREP XID +@return whether the WSREP XID was found */ +bool trx_rseg_read_wsrep_checkpoint(XID& xid); +#endif /* WITH_WSREP */ + +/** Read the page number of an undo log slot. +@param[in] rseg_header rollback segment header +@param[in] n slot number */ +inline uint32_t trx_rsegf_get_nth_undo(const buf_block_t *rseg_header, ulint n) +{ + ut_ad(n < TRX_RSEG_N_SLOTS); + return mach_read_from_4(TRX_RSEG + TRX_RSEG_UNDO_SLOTS + + n * TRX_RSEG_SLOT_SIZE + rseg_header->page.frame); +} + +/** Upgrade a rollback segment header page to MariaDB 10.3 format. +@param[in,out] rseg_header rollback segment header page +@param[in,out] mtr mini-transaction */ +void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr); + +/** Update the offset information about the end of the binlog entry +which corresponds to the transaction just being committed. +In a replication slave, this updates the master binlog position +up to which replication has proceeded. +@param[in,out] rseg_header rollback segment header +@param[in] log_file_name binlog file name +@param[in] log_offset binlog offset value +@param[in,out] mtr mini-transaction */ +void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, + const char *log_file_name, + ulonglong log_offset, + mtr_t *mtr); diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h new file mode 100644 index 00000000..5dd0169f --- /dev/null +++ b/storage/innobase/include/trx0sys.h @@ -0,0 +1,1274 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.h
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "trx0rseg.h"
+#include "mem0mem.h"
+#include "mtr0mtr.h"
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "read0types.h"
+#include "page0types.h"
+#include "trx0trx.h"
+#include "ilist.h"
+#include "my_cpu.h"
+
+#ifdef UNIV_PFS_MUTEX
+extern mysql_pfs_key_t trx_sys_mutex_key;
+#endif
+
+/** Checks if a page address is the trx sys header page.
+@param[in]  page_id  page id
+@return true if trx sys header page */
+inline bool trx_sys_hdr_page(const page_id_t page_id)
+{
+  return page_id == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO);
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+dberr_t trx_sys_create_sys_pages(mtr_t *mtr);
+
+/** Find an available rollback segment.
+@param[in]  sys_header
+@return an unallocated rollback segment slot in the TRX_SYS header
+@retval ULINT_UNDEFINED if not found */
+ulint
+trx_sys_rseg_find_free(const buf_block_t* sys_header);
+/** Request the TRX_SYS page.
+@param[in]  rw  whether to lock the page for writing
+@return the TRX_SYS page
+@retval NULL if the page cannot be read */
+inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true)
+{
+  return buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+                      0, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
+}
+
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+extern uint	trx_rseg_n_slots_debug;
+#endif
+
+/** Write DB_TRX_ID.
+@param[out]  db_trx_id  the DB_TRX_ID field to be written to
+@param[in]   id         transaction ID */
+UNIV_INLINE
+void
+trx_write_trx_id(byte* db_trx_id, trx_id_t id)
+{
+	compile_time_assert(DATA_TRX_ID_LEN == 6);
+	mach_write_to_6(db_trx_id, id);
+}
+
+/** Read a transaction identifier.
+@return id */
+inline
+trx_id_t
+trx_read_trx_id(const byte* ptr)
+{
+	compile_time_assert(DATA_TRX_ID_LEN == 6);
+	return(mach_read_from_6(ptr));
+}
+
+#ifdef UNIV_DEBUG
+/** Check that the DB_TRX_ID in a record is valid.
+@param[in]  db_trx_id  the DB_TRX_ID column to validate
+@param[in]  trx_id     the id of the ALTER TABLE transaction */
+inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id)
+{
+	trx_id_t id = trx_read_trx_id(static_cast<const byte*>(db_trx_id));
+	ut_ad(id == 0 || id > trx_id);
+	return true;
+}
+#endif
+
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave, this updates the latest master binlog position up to which
+replication has proceeded. */
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+	const char*	file_name,/*!< in: MySQL log file name */
+	int64_t		offset,	/*!< in: position in that log file */
+	buf_block_t*	sys_header, /*!< in,out: trx sys header */
+	mtr_t*		mtr);	/*!< in,out: mini-transaction */
+/** Display the MySQL binlog offset info if it is present in the trx
+system header. */
+void
+trx_sys_print_mysql_binlog_offset();
+
+/** Create the rollback segments.
+@return whether the creation succeeded */
+bool
+trx_sys_create_rsegs();
+
+/** The offset of the transaction system header on the page */
+#define TRX_SYS		FSEG_PAGE_DATA
+
+/** Transaction system header */
+/*------------------------------------------------------------- @{ */
+/** In old versions of InnoDB, this persisted the value of
+trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5,
+the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages
+and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages
+are used instead. The field only exists for the purpose of upgrading
+from older MySQL or MariaDB versions. */
+#define TRX_SYS_TRX_ID_STORE	0
+#define TRX_SYS_FSEG_HEADER	8	/*!< segment header for the
+					tablespace segment the trx
+					system is created into */
+#define TRX_SYS_RSEGS		(8 + FSEG_HEADER_SIZE)
+					/*!< the start of the array of
+					rollback segment specification
+					slots */
+
+/* Rollback segment specification slot offsets */
+
+/** the tablespace ID of an undo log header; starting with
+MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */
+#define TRX_SYS_RSEG_SPACE	0
+/** the page number of an undo log header, or FIL_NULL if unused */
+#define TRX_SYS_RSEG_PAGE_NO	4
+/** Size of a rollback segment specification slot */
+#define TRX_SYS_RSEG_SLOT_SIZE	8
+
+/** Read the tablespace ID of a rollback segment slot.
+@param[in]  sys_header  TRX_SYS page
+@param[in]  rseg_id     rollback segment identifier
+@return undo tablespace id */
+inline
+uint32_t
+trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
+{
+	ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+	return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+				+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+				+ sys_header->page.frame);
+}
+
+/** Read the page number of a rollback segment slot.
+@param[in]  sys_header  TRX_SYS page
+@param[in]  rseg_id     rollback segment identifier
+@return undo page number */
+inline uint32_t
+trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id)
+{
+  ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+  return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
+                          rseg_id * TRX_SYS_RSEG_SLOT_SIZE +
+                          sys_header->page.frame);
+}
+
+/** Maximum length of MySQL binlog file name, in bytes.
+(Used before MariaDB 10.3.5.)
*/ +#define TRX_SYS_MYSQL_LOG_NAME_LEN 512 +/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */ +#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344 + +#if UNIV_PAGE_SIZE_MIN < 4096 +# error "UNIV_PAGE_SIZE_MIN < 4096" +#endif +/** The offset of the MySQL binlog offset info in the trx system header */ +#define TRX_SYS_MYSQL_LOG_INFO (srv_page_size - 1000) +#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is + TRX_SYS_MYSQL_LOG_MAGIC_N + if we have valid data in the + MySQL binlog info */ +#define TRX_SYS_MYSQL_LOG_OFFSET 4 /*!< the 64-bit offset + within that file */ +#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */ + +/** Memory map TRX_SYS_PAGE_NO = 5 when srv_page_size = 4096 + +0...37 FIL_HEADER +38...45 TRX_SYS_TRX_ID_STORE +46...55 TRX_SYS_FSEG_HEADER (FSEG_HEADER_SIZE == 10) +56 TRX_SYS_RSEGS + 56...59 TRX_SYS_RSEG_SPACE for slot 0 + 60...63 TRX_SYS_RSEG_PAGE_NO for slot 0 + 64...67 TRX_SYS_RSEG_SPACE for slot 1 + 68...71 TRX_SYS_RSEG_PAGE_NO for slot 1 +.... + 594..597 TRX_SYS_RSEG_SPACE for slot 72 + 598..601 TRX_SYS_RSEG_PAGE_NO for slot 72 +... + ...1063 TRX_SYS_RSEG_PAGE_NO for slot 126 + +(srv_page_size-3500 WSREP ::: FAIL would overwrite undo tablespace +space_id, page_no pairs :::) +596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD +600 TRX_SYS_WSREP_XID_FORMAT +604 TRX_SYS_WSREP_XID_GTRID_LEN +608 TRX_SYS_WSREP_XID_BQUAL_LEN +612 TRX_SYS_WSREP_XID_DATA (len = 128) +739 TRX_SYS_WSREP_XID_DATA_END + +FIXED WSREP XID info offsets for 4k page size 10.0.32-galera +(srv_page_size-2500) +1596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD +1600 TRX_SYS_WSREP_XID_FORMAT +1604 TRX_SYS_WSREP_XID_GTRID_LEN +1608 TRX_SYS_WSREP_XID_BQUAL_LEN +1612 TRX_SYS_WSREP_XID_DATA (len = 128) +1739 TRX_SYS_WSREP_XID_DATA_END + +(srv_page_size - 2000 MYSQL MASTER LOG) +2096 TRX_SYS_MYSQL_MASTER_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD +2100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH +2104 TRX_SYS_MYSQL_LOG_OFFSET_LOW +2108 TRX_SYS_MYSQL_LOG_NAME + +(srv_page_size - 1000 MYSQL LOG) +3096 TRX_SYS_MYSQL_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD +3100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH +3104 TRX_SYS_MYSQL_LOG_OFFSET_LOW +3108 TRX_SYS_MYSQL_LOG_NAME + +(srv_page_size - 200 DOUBLEWRITE) +3896 TRX_SYS_DOUBLEWRITE TRX_SYS_DOUBLEWRITE_FSEG +3906 TRX_SYS_DOUBLEWRITE_MAGIC +3910 TRX_SYS_DOUBLEWRITE_BLOCK1 +3914 TRX_SYS_DOUBLEWRITE_BLOCK2 +3918 TRX_SYS_DOUBLEWRITE_REPEAT +3930 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N + +(srv_page_size - 8, TAILER) +4088..4096 FIL_TAILER + +*/ +#ifdef WITH_WSREP +/** The offset to WSREP XID headers (used before MariaDB 10.3.5) */ +#define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL) +#define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0 +#define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265 + +/** XID field: formatID, gtrid_len, bqual_len, xid_data */ +#define TRX_SYS_WSREP_XID_LEN (4 + 4 + 4 + XIDDATASIZE) +#define TRX_SYS_WSREP_XID_FORMAT 4 +#define TRX_SYS_WSREP_XID_GTRID_LEN 8 +#define TRX_SYS_WSREP_XID_BQUAL_LEN 12 +#define TRX_SYS_WSREP_XID_DATA 16 +#endif /* WITH_WSREP*/ + +/** Doublewrite buffer */ +/* @{ */ +/** The offset of the doublewrite buffer header on the trx system header page */ +#define TRX_SYS_DOUBLEWRITE (srv_page_size - 200) +/*-------------------------------------------------------------*/ +#define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg + containing the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE + /*!< 4-byte magic number which + shows if we already have + created the doublewrite + buffer */ +#define 
TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE)
+					/*!< page number of the
+					first page in the first
+					sequence of 64
+					(= FSP_EXTENT_SIZE) consecutive
+					pages in the doublewrite
+					buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE)
+					/*!< page number of the
+					first page in the second
+					sequence of 64 consecutive
+					pages in the doublewrite
+					buffer */
+#define TRX_SYS_DOUBLEWRITE_REPEAT	12	/*!< we repeat
+					TRX_SYS_DOUBLEWRITE_MAGIC,
+					TRX_SYS_DOUBLEWRITE_BLOCK1,
+					TRX_SYS_DOUBLEWRITE_BLOCK2
+					so that if the trx sys
+					header is half-written
+					to disk, we still may
+					be able to recover the
+					information */
+/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+we must reset the doublewrite buffer, because starting from 4.1.x the
+space id of a data page is stored into
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED	(24 + FSEG_HEADER_SIZE)
+
+/*-------------------------------------------------------------*/
+/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
+constexpr uint32_t TRX_SYS_DOUBLEWRITE_MAGIC_N= 536853855;
+/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
+constexpr uint32_t TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N= 1783657386;
+/* @} */
+
+trx_t* current_trx();
+
+struct rw_trx_hash_element_t
+{
+  rw_trx_hash_element_t()
+  {
+    memset(reinterpret_cast<void*>(this), 0, sizeof *this);
+    mutex.init();
+  }
+
+
+  ~rw_trx_hash_element_t() { mutex.destroy(); }
+
+
+  trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
+
+  /**
+    Transaction serialization number.
+
+    Assigned shortly before the transaction is moved to COMMITTED_IN_MEMORY
+    state. Initially set to TRX_ID_MAX.
+  */
+  Atomic_counter<trx_id_t> no;
+  trx_t *trx;
+  srw_mutex mutex;
+};
+
+
+/**
+  Wrapper around LF_HASH to store set of in memory read-write transactions.
+*/
+
+class rw_trx_hash_t
+{
+  LF_HASH hash;
+
+
+  template <typename T>
+  using walk_action= my_bool(rw_trx_hash_element_t *element, T *action);
+
+
+  /**
+    Constructor callback for lock-free allocator.
+
+    Object is just allocated and is not yet accessible via rw_trx_hash by
+    concurrent threads. Object can be reused multiple times before it is
+    freed. Every time object is being reused initializer() callback is
+    called.
+  */
+
+  static void rw_trx_hash_constructor(uchar *arg)
+  {
+    new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t();
+  }
+
+
+  /**
+    Destructor callback for lock-free allocator.
+
+    Object is about to be freed and is not accessible via rw_trx_hash by
+    concurrent threads.
+  */
+
+  static void rw_trx_hash_destructor(uchar *arg)
+  {
+    reinterpret_cast<rw_trx_hash_element_t*>
+      (arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t();
+  }
+
+
+  /**
+    Destructor callback for lock-free allocator.
+
+    This destructor is used at shutdown. It frees remaining transaction
+    objects.
+
+    XA PREPARED transactions may remain if they haven't been committed or
+    rolled back. ACTIVE transactions may remain if startup was interrupted or
+    server is running in read-only mode or for certain srv_force_recovery
+    levels.
+  */
+
+  static void rw_trx_hash_shutdown_destructor(uchar *arg)
+  {
+    rw_trx_hash_element_t *element=
+      reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD);
+    if (trx_t *trx= element->trx)
+    {
+      ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) ||
+            trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
+            (trx_state_eq(trx, TRX_STATE_ACTIVE) &&
+             (!srv_was_started ||
+              srv_read_only_mode ||
+              srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO)));
+      trx_free_at_shutdown(trx);
+    }
+    element->~rw_trx_hash_element_t();
+  }
+
+
+  /**
+    Initializer callback for lock-free hash.
+
+    Object is not yet accessible via rw_trx_hash by concurrent threads, but
+    is about to become such. Object id can be changed only by this callback
+    and remains the same until all pins to this object are released.
+
+    Object trx can be changed to 0 by erase() under object mutex protection,
+    which indicates it is about to be removed from lock-free hash and become
+    not accessible by concurrent threads.
+  */
+
+  static void rw_trx_hash_initializer(LF_HASH *,
+                                      rw_trx_hash_element_t *element,
+                                      trx_t *trx)
+  {
+    ut_ad(element->trx == 0);
+    element->trx= trx;
+    element->id= trx->id;
+    element->no= TRX_ID_MAX;
+    trx->rw_trx_hash_element= element;
+  }
+
+
+  /**
+    Gets LF_HASH pins.
+
+    Pins are used to protect object from being destroyed or reused. They are
+    normally stored in trx object for quick access. If caller doesn't have
+    trx available, we try to get it using current_trx(). If caller doesn't
+    have trx at all, temporary pins are allocated.
+  */
+
+  LF_PINS *get_pins(trx_t *trx)
+  {
+    if (!trx->rw_trx_hash_pins)
+    {
+      trx->rw_trx_hash_pins= lf_hash_get_pins(&hash);
+      ut_a(trx->rw_trx_hash_pins);
+    }
+    return trx->rw_trx_hash_pins;
+  }
+
+
+  template <typename T> struct eliminate_duplicates_arg
+  {
+    trx_ids_t ids;
+    walk_action<T> *action;
+    T *argument;
+    eliminate_duplicates_arg(size_t size, walk_action<T> *act, T *arg):
+      action(act), argument(arg) { ids.reserve(size); }
+  };
+
+
+  template <typename T>
+  static my_bool eliminate_duplicates(rw_trx_hash_element_t *element,
+                                      eliminate_duplicates_arg<T> *arg)
+  {
+    for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++)
+    {
+      if (*it == element->id)
+        return 0;
+    }
+    arg->ids.push_back(element->id);
+    return arg->action(element, arg->argument);
+  }
+
+
+#ifdef UNIV_DEBUG
+  static void validate_element(trx_t *trx)
+  {
+    ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg);
+    ut_ad(!trx->is_autocommit_non_locking());
+    /* trx->state can be anything except TRX_STATE_NOT_STARTED */
+    ut_d(trx->mutex_lock());
+    ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+          trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) ||
+          trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
+          trx_state_eq(trx, TRX_STATE_PREPARED));
+    ut_d(trx->mutex_unlock());
+  }
+
+
+  template <typename T> struct debug_iterator_arg
+  {
+    walk_action<T> *action;
+    T *argument;
+  };
+
+
+  template <typename T>
+  static my_bool debug_iterator(rw_trx_hash_element_t *element,
+                                debug_iterator_arg<T> *arg)
+  {
+    element->mutex.wr_lock();
+    if (element->trx)
+      validate_element(element->trx);
+    element->mutex.wr_unlock();
+    ut_ad(element->id < element->no);
+    return arg->action(element, arg->argument);
+  }
+#endif
+
+
+public:
+  void init()
+  {
+    lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0,
+                 sizeof(trx_id_t), 0, &my_charset_bin);
+    hash.alloc.constructor= rw_trx_hash_constructor;
+    hash.alloc.destructor= rw_trx_hash_destructor;
+    hash.initializer=
+      reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer);
+  }
+
+
+  void destroy()
+  {
+    hash.alloc.destructor= rw_trx_hash_shutdown_destructor;
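+    /* The shutdown destructor (documented above) will additionally free
+    any transaction objects still registered when the remaining elements
+    are destroyed below. */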
+    lf_hash_destroy(&hash);
+  }
+
+
+  /**
+    Releases LF_HASH pins.
+
+    Must be called by thread that owns trx_t object when the latter is being
+    "detached" from thread (e.g. released to the pool by trx_t::free()). Can
+    be called earlier if thread is expected not to use rw_trx_hash.
+
+    Since pins are not allowed to be transferred to another thread,
+    initialisation thread calls this for recovered transactions.
+  */
+
+  void put_pins(trx_t *trx)
+  {
+    if (trx->rw_trx_hash_pins)
+    {
+      lf_hash_put_pins(trx->rw_trx_hash_pins);
+      trx->rw_trx_hash_pins= 0;
+    }
+  }
+
+
+  /**
+    Finds trx object in lock-free hash with given id.
+
+    Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless
+    the transaction may get committed before this method returns.
+
+    With do_ref_count == false the caller may dereference returned trx
+    pointer only if lock_sys.latch was acquired before calling find().
+
+    With do_ref_count == true caller may dereference trx even if it is not
+    holding lock_sys.latch. Caller is responsible for calling
+    trx->release_reference() when it is done playing with trx.
+
+    Ideally this method should get caller rw_trx_hash_pins along with trx
+    object as a parameter, similar to insert() and erase(). However most
+    callers lose trx early in their call chains and it is not that easy to
+    pass them through.
+
+    So we take a more expensive approach: get trx through
+    current_thd()->ha_data. Some threads don't have trx attached to THD, and
+    at least the server initialisation thread, fts_optimize_thread,
+    srv_master_thread, dict_stats_thread, srv_monitor_thread,
+    btr_defragment_thread don't even have THD at all. For such cases we
+    allocate pins only for the duration of the search and free them
+    immediately.
+
+    This has a negative performance impact and should be fixed eventually
+    (by passing caller_trx as a parameter). Still, a stream of DML is more
+    or less OK.
+
+    @return
+      @retval 0 not found
+      @retval pointer to trx
+  */
+
+  trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count)
+  {
+    /*
+      In MariaDB 10.3, purge will reset DB_TRX_ID to 0
+      when the history is lost. Read/write transactions will
+      always have a nonzero trx_t::id; there the value 0 is
+      reserved for transactions that did not write or lock
+      anything yet.
+
+      The caller should already have handled trx_id==0 specially.
+    */
+    ut_ad(trx_id);
+    ut_ad(!caller_trx || caller_trx->id != trx_id || !do_ref_count);
+
+    trx_t *trx= 0;
+    LF_PINS *pins= caller_trx ? get_pins(caller_trx)
+                              : lf_hash_get_pins(&hash);
+    ut_a(pins);
+
+    rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*>
+      (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id),
+                      sizeof(trx_id_t)));
+    if (element)
+    {
+      /* rw_trx_hash_t::erase() sets element->trx to nullptr under
+      element->mutex protection before removing the element from hash table.
+      If the element was removed before the mutex acquisition, element->trx
+      will be equal to nullptr. */
+      DEBUG_SYNC_C("before_trx_hash_find_element_mutex_enter");
+      element->mutex.wr_lock();
+      /* element_trx can't point to reused object now. If transaction was
+      deregistered before element->mutex acquisition, element->trx is
+      nullptr. It can't be deregistered while element->mutex is held. */
+      trx_t *element_trx = element->trx;
+      lf_hash_search_unpin(pins);
+      /* The *element can be reused now, as element->trx value is stored
+      locally in element_trx. */
+      DEBUG_SYNC_C("after_trx_hash_find_element_mutex_enter");
+      if ((trx= element_trx)) {
+        DBUG_ASSERT(trx_id == trx->id);
+        ut_d(validate_element(trx));
+        if (do_ref_count)
+        {
+          /*
+            We have an early state check here to avoid committer
+            starvation in a wait loop for transaction references,
+            when there's a stream of trx_sys.find() calls from other
+            threads. The trx->state may change to COMMITTED after
+            trx->mutex is released, and it will have to be rechecked
+            by the caller after reacquiring the mutex.
+          */
+          /* trx_t::commit_in_memory() sets the state to
+          TRX_STATE_COMMITTED_IN_MEMORY before deregistering the
+          transaction. It also waits for any implicit-to-explicit lock
+          conversions to cease after deregistering. */
+          if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY)
+            trx= nullptr;
+          else
+            trx->reference();
+        }
+      }
+      /* element's lifetime is equal to the hash lifetime, that's why
+      element->mutex is valid here despite the element is unpinned. In the
+      worst case some thread will wait for element->mutex releasing. */
+      element->mutex.wr_unlock();
+    }
+    if (!caller_trx)
+      lf_hash_put_pins(pins);
+    return trx;
+  }
+
+
+  /**
+    Inserts trx to lock-free hash.
+
+    Object becomes accessible via rw_trx_hash.
+  */
+
+  void insert(trx_t *trx)
+  {
+    ut_d(validate_element(trx));
+    int res= lf_hash_insert(&hash, get_pins(trx),
+                            reinterpret_cast<void*>(trx));
+    ut_a(res == 0);
+  }
+
+
+  /**
+    Removes trx from lock-free hash.
+
+    Object becomes not accessible via rw_trx_hash. But it still can be pinned
+    by concurrent find(), which is supposed to release it immediately after
+    it sees object trx is 0.
+  */
+
+  void erase(trx_t *trx)
+  {
+    ut_d(validate_element(trx));
+    trx->rw_trx_hash_element->mutex.wr_lock();
+    trx->rw_trx_hash_element->trx= nullptr;
+    trx->rw_trx_hash_element->mutex.wr_unlock();
+    int res= lf_hash_delete(&hash, get_pins(trx),
+                            reinterpret_cast<const void*>(&trx->id),
+                            sizeof(trx_id_t));
+    ut_a(res == 0);
+  }
+
+
+  /**
+    Returns the number of elements in the hash.
+
+    The number is exact only if hash is protected against concurrent
+    modifications (e.g. single threaded startup or hash is protected
+    by some mutex). Otherwise the number may be used as a hint only,
+    because it may change even before this method returns.
+  */
+
+  uint32_t size() { return uint32_t(lf_hash_size(&hash)); }
+
+
+  /**
+    Iterates the hash.
+
+    @param caller_trx  used to get/set pins
+    @param action      called for every element in hash
+    @param argument    opaque argument passed to action
+
+    May return the same element multiple times if hash is under contention.
+    If caller doesn't like to see the same transaction multiple times, it has
+    to call iterate_no_dups() instead.
+
+    May return element with committed transaction. If caller doesn't like to
+    see committed transactions, it has to skip those under element mutex:
+
+      element->mutex.wr_lock();
+      if (trx_t *trx= element->trx)
+      {
+        // trx is protected against commit in this branch
+      }
+      element->mutex.wr_unlock();
+
+    May miss concurrently inserted transactions.
+
+    @return
+      @retval 0 iteration completed successfully
+      @retval 1 iteration was interrupted (action returned 1)
+  */
+
+  template <typename T>
+  int iterate(trx_t *caller_trx, walk_action<T> *action, T *argument= nullptr)
+  {
+    LF_PINS *pins= caller_trx ? get_pins(caller_trx)
+                              : lf_hash_get_pins(&hash);
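+    /* Pins keep visited elements from being freed while we iterate; when
+    there is no caller_trx, temporary pins are taken here and released
+    after the walk (see get_pins() above). */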
+    ut_a(pins);
+#ifdef UNIV_DEBUG
+    debug_iterator_arg<T> debug_arg= { action, argument };
+    action= reinterpret_cast<decltype(action)>(debug_iterator<T>);
+    argument= reinterpret_cast<T*>(&debug_arg);
+#endif
+    int res= lf_hash_iterate(&hash, pins,
+                             reinterpret_cast<my_hash_walk_action>(action),
+                             const_cast<void*>(static_cast<const void*>
+                             (argument)));
+    if (!caller_trx)
+      lf_hash_put_pins(pins);
+    return res;
+  }
+
+
+  template <typename T>
+  int iterate(walk_action<T> *action, T *argument= nullptr)
+  {
+    return iterate(current_trx(), action, argument);
+  }
+
+
+  /**
+    Iterates the hash and eliminates duplicate elements.
+
+    @sa iterate()
+  */
+
+  template <typename T>
+  int iterate_no_dups(trx_t *caller_trx, walk_action<T> *action,
+                      T *argument= nullptr)
+  {
+    eliminate_duplicates_arg<T> arg(size() + 32, action, argument);
+    return iterate(caller_trx, eliminate_duplicates<T>, &arg);
+  }
+
+
+  template <typename T>
+  int iterate_no_dups(walk_action<T> *action, T *argument= nullptr)
+  {
+    return iterate_no_dups(current_trx(), action, argument);
+  }
+};
+
+class thread_safe_trx_ilist_t
+{
+public:
+  void create() { mysql_mutex_init(trx_sys_mutex_key, &mutex, nullptr); }
+  void close() { mysql_mutex_destroy(&mutex); }
+
+  bool empty() const
+  {
+    mysql_mutex_lock(&mutex);
+    auto result= trx_list.empty();
+    mysql_mutex_unlock(&mutex);
+    return result;
+  }
+
+  void push_front(trx_t &trx)
+  {
+    mysql_mutex_lock(&mutex);
+    trx_list.push_front(trx);
+    mysql_mutex_unlock(&mutex);
+  }
+
+  void remove(trx_t &trx)
+  {
+    mysql_mutex_lock(&mutex);
+    trx_list.remove(trx);
+    mysql_mutex_unlock(&mutex);
+  }
+
+  template <typename Callable> void for_each(Callable &&callback) const
+  {
+    mysql_mutex_lock(&mutex);
+    for (const auto &trx : trx_list)
+      callback(trx);
+    mysql_mutex_unlock(&mutex);
+  }
+
+  template <typename Callable> void for_each(Callable &&callback)
+  {
+    mysql_mutex_lock(&mutex);
+    for (auto &trx : trx_list)
+      callback(trx);
+    mysql_mutex_unlock(&mutex);
+  }
+
+  void freeze() const { mysql_mutex_lock(&mutex); }
+  void unfreeze() const { mysql_mutex_unlock(&mutex); }
+
+private:
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable mysql_mutex_t mutex;
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) ilist<trx_t> trx_list;
+};
+
+/** The transaction system central memory data structure. */
+class trx_sys_t
+{
+  /**
+    The smallest number not yet assigned as a transaction id or transaction
+    number. Accessed and updated with atomic operations.
+  */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+  Atomic_counter<trx_id_t> m_max_trx_id;
+
+
+  /**
+    Solves race conditions between register_rw() and snapshot_ids() as well
+    as race condition between assign_new_trx_no() and snapshot_ids().
+
+    @sa register_rw()
+    @sa assign_new_trx_no()
+    @sa snapshot_ids()
+  */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+  std::atomic<trx_id_t> m_rw_trx_hash_version;
+
+
+  bool m_initialised;
+
+  /** False if there is no undo log to purge or rollback */
+  bool undo_log_nonempty;
+public:
+  /** List of all transactions. */
+  thread_safe_trx_ilist_t trx_list;
+
+  /** Temporary rollback segments */
+  trx_rseg_t temp_rsegs[TRX_SYS_N_RSEGS];
+
+  /** Persistent rollback segments; space==nullptr if slot not in use */
+  trx_rseg_t rseg_array[TRX_SYS_N_RSEGS];
+
+  /**
+    Lock-free hash of in memory read-write transactions.
+    Works faster when it is on its own cache line (tested).
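+
+    A minimal lookup sketch (caller_trx and id are assumed to be in scope;
+    this follows the find() contract documented above, it is not a verbatim
+    call site):
+
+      if (trx_t *t= trx_sys.rw_trx_hash.find(caller_trx, id, true))
+      {
+        ...  // t may be dereferenced here without holding lock_sys.latch
+        t->release_reference();
+      }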
+  */
+
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) rw_trx_hash_t rw_trx_hash;
+
+
+#ifdef WITH_WSREP
+  /** Latest recovered XID during startup */
+  XID recovered_wsrep_xid;
+#endif
+  /** Latest recovered binlog offset */
+  uint64_t recovered_binlog_offset;
+  /** Latest recovered binlog file name */
+  char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];
+  /** FIL_PAGE_LSN of the page with the latest recovered binlog metadata */
+  lsn_t recovered_binlog_lsn;
+
+
+  /**
+    Constructor.
+
+    Some members may require late initialisation, thus we just mark object
+    as uninitialised. Real initialisation happens in create().
+  */
+
+  trx_sys_t(): m_initialised(false) {}
+
+
+  /**
+    @return TRX_RSEG_HISTORY length (number of committed transactions to
+            purge)
+  */
+  size_t history_size();
+
+
+  /**
+    Check whether history_size() exceeds a specified number.
+    @param threshold  number of committed transactions
+    @return whether TRX_RSEG_HISTORY length exceeds the threshold
+  */
+  bool history_exceeds(size_t threshold);
+
+
+  /**
+    @return approximate history_size(), without latch protection
+  */
+  TPOOL_SUPPRESS_TSAN size_t history_size_approx() const;
+
+
+  /**
+    @return whether history_size() is nonzero (with some race condition)
+  */
+  TPOOL_SUPPRESS_TSAN bool history_exists();
+
+
+  /**
+    Determine if the specified transaction or any older one might be active.
+
+    @param trx  current transaction
+    @param id   transaction identifier
+    @return whether any transaction not newer than id might be active
+  */
+
+  bool find_same_or_older(trx_t *trx, trx_id_t id)
+  {
+    if (trx->max_inactive_id >= id)
+      return false;
+    bool found= rw_trx_hash.iterate(trx, find_same_or_older_callback, &id);
+    if (!found)
+      trx->max_inactive_id= id;
+    return found;
+  }
+
+
+  /**
+    Determines the maximum transaction id.
+
+    @return maximum currently allocated trx id; will be stale after the
+            next call to trx_sys.get_new_trx_id()
+  */
+
+  trx_id_t get_max_trx_id()
+  {
+    return m_max_trx_id;
+  }
+
+
+  /**
+    Allocates a new transaction id.
+    @return new, allocated trx id
+  */
+
+  trx_id_t get_new_trx_id()
+  {
+    trx_id_t id= get_new_trx_id_no_refresh();
+    refresh_rw_trx_hash_version();
+    return id;
+  }
+
+
+  /**
+    Allocates and assigns new transaction serialisation number.
+
+    There's a gap between m_max_trx_id increment and transaction
+    serialisation number becoming visible through rw_trx_hash. While we're
+    in this gap concurrent thread may come and do MVCC snapshot without
+    seeing allocated but not yet assigned serialisation number. Then at some
+    point purge thread may clone this view. As a result it won't see newly
+    allocated serialisation number and may remove "unnecessary" history data
+    of this transaction from rollback segments.
+
+    m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot
+    has to wait until m_max_trx_id == m_rw_trx_hash_version, which
+    effectively means that all transaction serialisation numbers up to
+    m_max_trx_id are available through rw_trx_hash.
+
+    We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier
+    so that m_rw_trx_hash_version increment happens after
+    trx->rw_trx_hash_element->no becomes visible through rw_trx_hash.
+
+    @param trx  transaction
+  */
+  void assign_new_trx_no(trx_t *trx)
+  {
+    trx->rw_trx_hash_element->no= get_new_trx_id_no_refresh();
+    refresh_rw_trx_hash_version();
+  }
+
+
+  /**
+    Takes MVCC snapshot.
+
+    To reduce malloc probability we reserve rw_trx_hash.size() + 32 elements
+    in ids.
+
+    For details about get_rw_trx_hash_version() != get_max_trx_id() spin
+    @sa register_rw() and @sa assign_new_trx_no().
+
+    We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so
+    that loading of m_rw_trx_hash_version happens before accessing
+    rw_trx_hash.
+
+    To optimise snapshot creation rw_trx_hash.iterate() is being used
+    instead of rw_trx_hash.iterate_no_dups(). It means that some transaction
+    identifiers may appear multiple times in ids.
+
+    @param[in,out] caller_trx  used to get access to rw_trx_hash_pins
+    @param[out]    ids         array to store registered transaction
+                               identifiers
+    @param[out]    max_trx_id  variable to store m_max_trx_id value
+    @param[out]    min_trx_no  variable to store min(no) value
+  */
+
+  void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id,
+                    trx_id_t *min_trx_no)
+  {
+    snapshot_ids_arg arg(ids);
+
+    while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
+      ut_delay(1);
+    arg.m_no= arg.m_id;
+
+    ids->clear();
+    ids->reserve(rw_trx_hash.size() + 32);
+    rw_trx_hash.iterate(caller_trx, copy_one_id, &arg);
+
+    *max_trx_id= arg.m_id;
+    *min_trx_no= arg.m_no;
+  }
+
+
+  /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */
+  void init_max_trx_id(trx_id_t value)
+  {
+    m_max_trx_id= value;
+    m_rw_trx_hash_version.store(value, std::memory_order_relaxed);
+  }
+
+
+  bool is_initialised() const { return m_initialised; }
+
+
+  /** Initialise the transaction subsystem. */
+  void create();
+
+  /** Close the transaction subsystem on shutdown. */
+  void close();
+
+  /** @return total number of active (non-prepared) transactions */
+  size_t any_active_transactions(size_t *prepared= nullptr);
+
+
+  /**
+    Determine the rollback segment identifier.
+
+    @param rseg        rollback segment
+    @param persistent  whether the rollback segment is persistent
+    @return the rollback segment identifier
+  */
+  unsigned rseg_id(const trx_rseg_t *rseg, bool persistent) const
+  {
+    const trx_rseg_t *array= persistent ? rseg_array : temp_rsegs;
+    ut_ad(rseg >= array);
+    ut_ad(rseg < &array[TRX_SYS_N_RSEGS]);
+    return static_cast<unsigned>(rseg - array);
+  }
+
+
+  /**
+    Registers read-write transaction.
+
+    Transaction becomes visible to MVCC.
+
+    There's a gap between m_max_trx_id increment and transaction becoming
+    visible through rw_trx_hash. While we're in this gap concurrent thread
+    may come and do MVCC snapshot. As a result concurrent read view will be
+    able to observe records owned by this transaction even before it was
+    committed.
+
+    m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot
+    has to wait until m_max_trx_id == m_rw_trx_hash_version, which
+    effectively means that all transactions up to m_max_trx_id are available
+    through rw_trx_hash.
+
+    We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier
+    so that m_rw_trx_hash_version increment happens after transaction
+    becomes visible through rw_trx_hash.
+  */
+
+  void register_rw(trx_t *trx)
+  {
+    trx->id= get_new_trx_id_no_refresh();
+    rw_trx_hash.insert(trx);
+    refresh_rw_trx_hash_version();
+  }
+
+
+  /**
+    Deregisters read-write transaction.
+
+    Transaction is removed from rw_trx_hash, which releases all implicit
+    locks. MVCC snapshot won't see this transaction anymore.
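+
+    (In the commit path this is expected to run after the state change to
+    TRX_STATE_COMMITTED_IN_MEMORY; see the trx_t::commit_in_memory() note
+    in find() above.)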
+  */
+
+  void deregister_rw(trx_t *trx)
+  {
+    rw_trx_hash.erase(trx);
+  }
+
+
+  bool is_registered(trx_t *caller_trx, trx_id_t id)
+  {
+    return id && find(caller_trx, id, false);
+  }
+
+
+  trx_t *find(trx_t *caller_trx, trx_id_t id, bool do_ref_count= true)
+  {
+    return rw_trx_hash.find(caller_trx, id, do_ref_count);
+  }
+
+
+  /**
+    Registers transaction in trx_sys.
+
+    @param trx  transaction
+  */
+  void register_trx(trx_t *trx)
+  {
+    trx_list.push_front(*trx);
+  }
+
+
+  /**
+    Deregisters transaction in trx_sys.
+
+    @param trx  transaction
+  */
+  void deregister_trx(trx_t *trx)
+  {
+    trx_list.remove(*trx);
+  }
+
+
+  /**
+    Clones the oldest view and stores it in view.
+
+    No need to call ReadView::close(). The caller owns the view that is
+    passed in. This function is called by purge thread to determine whether
+    it should purge the delete marked record or not.
+  */
+  void clone_oldest_view(ReadViewBase *view) const;
+
+
+  /** @return the number of active views */
+  size_t view_count() const
+  {
+    size_t count= 0;
+
+    trx_list.for_each([&count](const trx_t &trx) {
+      if (trx.read_view.is_open())
+        ++count;
+    });
+
+    return count;
+  }
+
+  /** Set the undo log empty value */
+  void set_undo_non_empty(bool val)
+  {
+    if (!undo_log_nonempty)
+      undo_log_nonempty= val;
+  }
+
+  /** Get the undo log empty value */
+  bool is_undo_empty() const { return !undo_log_nonempty; }
+
+  /* Reset the trx_sys page, retaining the doublewrite buffer information
+  and the system rollback segment header page.
+  @return error code */
+  inline dberr_t reset_page(mtr_t *mtr);
+private:
+  static my_bool find_same_or_older_callback(rw_trx_hash_element_t *element,
+                                             trx_id_t *id)
+  {
+    return element->id <= *id;
+  }
+
+
+  struct snapshot_ids_arg
+  {
+    snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {}
+    trx_ids_t *m_ids;
+    trx_id_t m_id;
+    trx_id_t m_no;
+  };
+
+
+  static my_bool copy_one_id(rw_trx_hash_element_t *element,
+                             snapshot_ids_arg *arg)
+  {
+    if (element->id < arg->m_id)
+    {
+      trx_id_t no= element->no;
+      arg->m_ids->push_back(element->id);
+      if (no < arg->m_no)
+        arg->m_no= no;
+    }
+    return 0;
+  }
+
+
+  /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */
+  trx_id_t get_rw_trx_hash_version()
+  {
+    return m_rw_trx_hash_version.load(std::memory_order_acquire);
+  }
+
+
+  /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */
+  void refresh_rw_trx_hash_version()
+  {
+    m_rw_trx_hash_version.fetch_add(1, std::memory_order_release);
+  }
+
+
+  /**
+    Allocates new transaction id without refreshing rw_trx_hash version.
+
+    This method is extracted for exclusive use by register_rw() and
+    assign_new_trx_no() where new id must be allocated atomically with
+    payload of these methods from MVCC snapshot point of view.
+
+    @sa get_new_trx_id()
+    @sa assign_new_trx_no()
+
+    @return new transaction id
+  */
+
+  trx_id_t get_new_trx_id_no_refresh()
+  {
+    return m_max_trx_id++;
+  }
+};
+
+
+/** The transaction system */
+extern trx_sys_t trx_sys;
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
new file mode 100644
index 00000000..3cfbe331
--- /dev/null
+++ b/storage/innobase/include/trx0trx.h
@@ -0,0 +1,1268 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.h
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0trx_h
+#define trx0trx_h
+
+#include "trx0types.h"
+#include "lock0types.h"
+#include "que0types.h"
+#include "mem0mem.h"
+#include "trx0xa.h"
+#include "ut0vec.h"
+#include "fts0fts.h"
+#include "read0types.h"
+#include "ilist.h"
+#include "row0merge.h"
+
+#include <vector>
+
+// Forward declaration
+struct mtr_t;
+struct rw_trx_hash_element_t;
+
+/******************************************************************//**
+Set detailed error message for the transaction. */
+void
+trx_set_detailed_error(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction struct */
+	const char*	msg);	/*!< in: detailed error message */
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+	trx_t*	trx,	/*!< in: transaction struct */
+	FILE*	file);	/*!< in: file to read message from */
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+	const trx_t*	trx);	/*!< in: trx object */
+
+/** @return an allocated transaction */
+trx_t *trx_create();
+
+/** At shutdown, frees a transaction object. */
+void trx_free_at_shutdown(trx_t *trx);
+
+/** Disconnect a prepared transaction from MySQL.
+@param[in,out]	trx	transaction */
+void trx_disconnect_prepared(trx_t *trx);
+
+/** Initialize (resurrect) transactions at startup. */
+dberr_t trx_lists_init_at_db_start();
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+	trx_t*	trx,		/*!< in/out: transaction */
+	bool	read_write);	/*!< in: true if read write transaction */
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_low(
+/*=========================*/
+	trx_t*	trx,		/*!< in/out: transaction */
+	bool	read_write);	/*!< in: true if read write transaction */
+
+/**
+Start a transaction for internal processing.
+@param trx transaction +@param read_write whether writes may be performed */ +void trx_start_internal_low(trx_t *trx, bool read_write); + +#ifdef UNIV_DEBUG +#define trx_start_if_not_started_xa(t, rw) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_xa_low((t), rw); \ + } while (false) + +#define trx_start_if_not_started(t, rw) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_low((t), rw); \ + } while (false) + +#define trx_start_internal(t) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_internal_low(t, true); \ + } while (false) +#define trx_start_internal_read_only(t) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_internal_low(t, false); \ + } while (false) +#else +#define trx_start_if_not_started(t, rw) \ + trx_start_if_not_started_low((t), rw) + +#define trx_start_internal(t) trx_start_internal_low(t, true) +#define trx_start_internal_read_only(t) trx_start_internal_low(t, false) + +#define trx_start_if_not_started_xa(t, rw) \ + trx_start_if_not_started_xa_low((t), (rw)) +#endif /* UNIV_DEBUG */ + +/** Start a transaction for a DDL operation. +@param trx transaction */ +void trx_start_for_ddl_low(trx_t *trx); + +#ifdef UNIV_DEBUG +# define trx_start_for_ddl(t) \ + do { \ + ut_ad((t)->start_file == 0); \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_for_ddl_low(t); \ + } while (0) +#else +# define trx_start_for_ddl(t) trx_start_for_ddl_low(t) +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Does the transaction commit for MySQL. +@return DB_SUCCESS or error number */ +dberr_t +trx_commit_for_mysql( +/*=================*/ + trx_t* trx); /*!< in/out: transaction */ +/** XA PREPARE a transaction. +@param[in,out] trx transaction to prepare */ +void trx_prepare_for_mysql(trx_t* trx); +/**********************************************************************//** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. +@return number of prepared transactions */ +int +trx_recover_for_mysql( +/*==================*/ + XID* xid_list, /*!< in/out: prepared transactions */ + uint len); /*!< in: number of slots in xid_list */ +/** Look up an X/Open distributed transaction in XA PREPARE state. +@param[in] xid X/Open XA transaction identifier +@return transaction on match (the trx_t::xid will be invalidated); +note that the trx may have been committed before the caller acquires +trx_t::mutex +@retval NULL if no match */ +trx_t* trx_get_trx_by_xid(const XID* xid); +/** Durably write log until trx->commit_lsn +(if trx_t::commit_in_memory() was invoked with flush_log_later=true). */ +void trx_commit_complete_for_mysql(trx_t *trx); +/**********************************************************************//** +Marks the latest SQL statement ended. */ +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx); /*!< in: trx handle */ +/****************************************************************//** +Prepares a transaction for commit/rollback. */ +void +trx_commit_or_rollback_prepare( +/*===========================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Creates a commit command node struct. 
+@return own: commit node struct */ +commit_node_t* +trx_commit_node_create( +/*===================*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Performs an execution step for a commit type node in a query graph. +@return query thread to run next, or NULL */ +que_thr_t* +trx_commit_step( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ + +/**********************************************************************//** +Prints info about a transaction. */ +void +trx_print_low( +/*==========*/ + FILE* f, + /*!< in: output stream */ + const trx_t* trx, + /*!< in: transaction */ + ulint max_query_len, + /*!< in: max query length to print, + or 0 to use the default max length */ + ulint n_rec_locks, + /*!< in: trx->lock.n_rec_locks */ + ulint n_trx_locks, + /*!< in: length of trx->lock.trx_locks */ + ulint heap_size); + /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ + +/**********************************************************************//** +Prints info about a transaction. +When possible, use trx_print() instead. */ +void +trx_print_latched( +/*==============*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len); /*!< in: max query length to print, + or 0 to use the default max length */ + +/**********************************************************************//** +Prints info about a transaction. +Acquires and releases lock_sys.latch. */ +void +trx_print( +/*======*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len); /*!< in: max query length to print, + or 0 to use the default max length */ + +/**********************************************************************//** +Determines if a transaction is in the given state. +The caller must hold trx->mutex, or it must be the thread +that is serving a running transaction. +A running RW transaction must be in trx_sys.rw_trx_hash. +@return TRUE if trx->state == state */ +UNIV_INLINE +bool +trx_state_eq( +/*=========*/ + const trx_t* trx, /*!< in: transaction */ + trx_state_t state, /*!< in: state; + if state != TRX_STATE_NOT_STARTED + asserts that + trx->state != TRX_STATE_NOT_STARTED */ + bool relaxed = false) + /*!< in: whether to allow + trx->state == TRX_STATE_NOT_STARTED + after an error has been reported */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/**********************************************************************//** +Determines if the currently running transaction has been interrupted. +@return true if interrupted */ +bool +trx_is_interrupted( +/*===============*/ + const trx_t* trx); /*!< in: transaction */ + +/*******************************************************************//** +Calculates the "weight" of a transaction. The weight of one transaction +is estimated as the number of altered rows + the number of locked rows. +@param t transaction +@return transaction weight */ +#define TRX_WEIGHT(t) ((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks)) + +/** Create the trx_t pool */ +void +trx_pool_init(); + +/** Destroy the trx_t pool */ +void +trx_pool_close(); + +/** +Set the transaction as a read-write transaction if it is not already +tagged as such. +@param[in,out] trx Transaction that needs to be "upgraded" to RW from RO */ +void +trx_set_rw_mode( + trx_t* trx); + +/** +Transactions that aren't started by the MySQL server don't set +the trx_t::mysql_thd field. 
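For context on TRX_WEIGHT above: the weight approximates how expensive a transaction would be to roll back (rows altered, via undo_no, plus locks held), and deadlock resolution prefers the lighter victim. A hedged illustration only; the real victim selection also weighs other criteria (for example Galera thread priority):

    #include <cstdint>

    struct txn_weight_view {
        uint64_t undo_no;  // number of modified/inserted rows so far
        uint64_t n_locks;  // length of the transaction's lock list
    };

    // analogous to TRX_WEIGHT(t)
    static uint64_t weight(const txn_weight_view& t)
    {
        return t.undo_no + t.n_locks;
    }

    // the cheaper transaction to roll back is the preferred victim
    static const txn_weight_view& choose_victim(const txn_weight_view& a,
                                                const txn_weight_view& b)
    {
        return weight(a) <= weight(b) ? a : b;
    }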
For such transactions we set the lock
+wait timeout to 0 instead of the user configured value that comes
+from innodb_lock_wait_timeout via trx_t::mysql_thd.
+@param trx transaction
+@return lock wait timeout in seconds */
+#define trx_lock_wait_timeout_get(t) \
+ ((t)->mysql_thd != NULL \
+ ? thd_lock_wait_timeout((t)->mysql_thd) \
+ : 0)
+
+typedef std::vector<ib_lock_t*, ut_allocator<ib_lock_t*> > lock_list;
+
+/** The locks and state of an active transaction. Protected by
+lock_sys.latch, trx->mutex or both. */
+struct trx_lock_t
+{
+ /** Lock request being waited for.
+ Set to nonnull when holding lock_sys.latch, lock_sys.wait_mutex and
+ trx->mutex, by the thread that is executing the transaction.
+ Set to nullptr when holding lock_sys.wait_mutex. */
+ Atomic_relaxed<lock_t*> wait_lock;
+ /** Transaction being waited for; protected by lock_sys.wait_mutex */
+ trx_t *wait_trx;
+ /** condition variable for !wait_lock; used with lock_sys.wait_mutex */
+ pthread_cond_t cond;
+ /** lock wait start time */
+ Atomic_relaxed<my_hrtime_t> suspend_time;
+
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+ /** 2=high priority WSREP thread has marked this trx to abort;
+ 1=another transaction chose this as a victim in deadlock resolution.
+
+ Other threads than the one that is executing the transaction may set
+ flags in this while holding lock_sys.wait_mutex. */
+ Atomic_relaxed<byte> was_chosen_as_deadlock_victim;
+
+ /** Flag the lock owner as a victim in Galera conflict resolution. */
+ void set_wsrep_victim()
+ {
+# if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ /* There is no 8-bit version of the 80386 BTS instruction.
+ Technically, this is the wrong addressing mode (16-bit), but
+ there are other data members stored after the byte. */
+ __asm__ __volatile__("lock btsw $1, %0"
+ : "+m" (was_chosen_as_deadlock_victim));
+# else
+ was_chosen_as_deadlock_victim.fetch_or(2);
+# endif
+ }
+#else /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+ /** High priority WSREP thread has marked this trx to abort or
+ another transaction chose this as a victim in deadlock resolution.
+
+ Other threads than the one that is executing the transaction may set
+ this while holding lock_sys.wait_mutex. */
+ Atomic_relaxed<bool> was_chosen_as_deadlock_victim;
+
+ /** Flag the lock owner as a victim in Galera conflict resolution. */
+ void set_wsrep_victim() { was_chosen_as_deadlock_victim= true; }
+#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+ /** Next available rec_pool[] entry */
+ byte rec_cached;
+ /** Next available table_pool[] entry */
+ byte table_cached;
+
+ que_thr_t* wait_thr; /*!< query thread belonging to this
+ trx that is in waiting
+ state. For threads suspended in a
+ lock wait, this is protected by
+ lock_sys.latch. Otherwise, this may
+ only be modified by the thread that is
+ serving the running transaction. */
+
+ /** Pre-allocated record locks */
+ struct {
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) ib_lock_t lock;
+ } rec_pool[8];
+
+ /** Pre-allocated table locks */
+ ib_lock_t table_pool[8];
+
+ /** Memory heap for trx_locks. Protected by lock_sys.assert_locked()
+ and lock_sys.is_writer() || trx->mutex_is_owner(). */
+ mem_heap_t *lock_heap;
+
+ /** Locks held by the transaction. Protected by lock_sys.assert_locked()
+ and lock_sys.is_writer() || trx->mutex_is_owner().
+ (If lock_sys.latch is only held in shared mode, then the modification
+ must be protected by trx->mutex.)
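The inline x86 assembly in set_wsrep_victim() above is only a size-optimized atomic OR of bit 1 into the victim flag; the non-x86 branch already spells this out as fetch_or(2). A portable standalone equivalent of what the instruction does:

    #include <atomic>
    #include <cstdint>

    std::atomic<uint8_t> victim_flags{0};

    void mark_wsrep_victim()
    {
        // atomically set bit 1 (value 2): a high-priority thread marked the
        // transaction to abort; bit 0 would be a plain deadlock victim
        victim_flags.fetch_or(2, std::memory_order_relaxed);
    }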
*/
+ trx_lock_list_t trx_locks;
+
+ lock_list table_locks; /*!< All table locks requested by this
+ transaction, including AUTOINC locks */
+
+ /** List of pending trx_t::evict_table() */
+ UT_LIST_BASE_NODE_T(dict_table_t) evicted_tables;
+
+ /** number of record locks; protected by lock_sys.assert_locked(page_id) */
+ ulint n_rec_locks;
+};
+
+/** Logical first modification time of a table in a transaction */
+class trx_mod_table_time_t
+{
+ /** Impossible value for trx_t::undo_no */
+ static constexpr undo_no_t NONE= ~undo_no_t{0};
+ /** Theoretical maximum value for trx_t::undo_no.
+ DB_ROLL_PTR is only 7 bytes, so it cannot point to more than
+ this many undo log records. */
+ static constexpr undo_no_t LIMIT= (undo_no_t{1} << (7 * 8)) - 1;
+
+ /** Flag in 'first' to indicate that subsequent operations are
+ covered by a TRX_UNDO_EMPTY record (for the first statement to
+ insert into an empty table) */
+ static constexpr undo_no_t BULK= 1ULL << 63;
+
+ /** First modification of the table, possibly ORed with BULK */
+ undo_no_t first;
+ /** First modification of a system versioned column
+ (NONE= no versioning, BULK= the table was dropped) */
+ undo_no_t first_versioned= NONE;
+#ifdef UNIV_DEBUG
+ /** Whether the modified table is a FTS auxiliary table */
+ bool fts_aux_table= false;
+#endif /* UNIV_DEBUG */
+
+ /** Buffer to store insert operation */
+ row_merge_bulk_t *bulk_store= nullptr;
+
+ friend struct trx_t;
+public:
+ /** Constructor
+ @param rows number of modified rows so far */
+ trx_mod_table_time_t(undo_no_t rows) : first(rows) { ut_ad(rows < LIMIT); }
+
+#ifdef UNIV_DEBUG
+ /** Validation
+ @param rows number of modified rows so far
+ @return whether the object is valid */
+ bool valid(undo_no_t rows= NONE) const
+ { auto f= first & LIMIT; return f <= first_versioned && f <= rows; }
+#endif /* UNIV_DEBUG */
+ /** @return whether versioned columns were modified */
+ bool is_versioned() const { return (~first_versioned & LIMIT) != 0; }
+ /** @return whether the table was dropped */
+ bool is_dropped() const { return first_versioned == BULK; }
+
+ /** After writing an undo log record, set is_versioned() if needed
+ @param rows number of modified rows so far */
+ void set_versioned(undo_no_t rows)
+ {
+ ut_ad(first_versioned == NONE);
+ first_versioned= rows;
+ ut_ad(valid(rows));
+ }
+
+ /** After writing an undo log record, note that the table will be dropped */
+ void set_dropped()
+ {
+ ut_ad(first_versioned == NONE);
+ first_versioned= BULK;
+ }
+
+ /** Notify the start of a bulk insert operation
+ @param table table to do bulk operation */
+ void start_bulk_insert(dict_table_t *table)
+ {
+ first|= BULK;
+ if (!table->is_temporary())
+ bulk_store= new row_merge_bulk_t(table);
+ }
+
+ /** Notify the end of a bulk insert operation */
+ void end_bulk_insert() { first&= ~BULK; }
+
+ /** @return whether an insert is covered by TRX_UNDO_EMPTY record */
+ bool is_bulk_insert() const { return first & BULK; }
+
+ /** Invoked after partial rollback
+ @param limit number of surviving modified rows (trx_t::undo_no)
+ @return whether this should be erased from trx_t::mod_tables */
+ bool rollback(undo_no_t limit)
+ {
+ ut_ad(valid());
+ if ((LIMIT & first) >= limit)
+ return true;
+ if (first_versioned < limit)
+ first_versioned= NONE;
+ return false;
+ }
+
+#ifdef UNIV_DEBUG
+ void set_aux_table() { fts_aux_table= true; }
+
+ bool is_aux_table() const { return fts_aux_table; }
+#endif /* UNIV_DEBUG */
+
+ /** @return the first undo record that modified the table */
+ undo_no_t get_first()
const
+ {
+ ut_ad(valid());
+ return LIMIT & first;
+ }
+
+ /** Add the tuple to the transaction bulk buffer for the given index.
+ @param entry tuple to be inserted
+ @param index bulk insert for the index
+ @param trx transaction */
+ dberr_t bulk_insert_buffered(const dtuple_t &entry,
+ const dict_index_t &index, trx_t *trx)
+ {
+ return bulk_store->bulk_insert_buffered(entry, index, trx);
+ }
+
+ /** Do bulk insert operation present in the buffered operation
+ @return DB_SUCCESS or error code */
+ dberr_t write_bulk(dict_table_t *table, trx_t *trx);
+
+ /** @return whether the buffer storage exists */
+ bool bulk_buffer_exist() const
+ {
+ return bulk_store && is_bulk_insert();
+ }
+
+ /** Free bulk insert operation */
+ void clear_bulk_buffer()
+ {
+ delete bulk_store;
+ bulk_store= nullptr;
+ }
+};
+
+/** Collection of persistent tables and their first modification
+in a transaction.
+We store pointers to the table objects in memory because
+we know that a table object will not be destroyed while a transaction
+that modified it is running. */
+typedef std::map<
+ dict_table_t*, trx_mod_table_time_t,
+ std::less<dict_table_t*>,
+ ut_allocator<std::pair<dict_table_t* const, trx_mod_table_time_t> > >
+ trx_mod_tables_t;
+
+/** The transaction handle
+
+Normally, there is a 1:1 relationship between a transaction handle
+(trx) and a session (client connection). One session is associated
+with exactly one user transaction. There are some exceptions to this:
+
+* For DDL operations, a subtransaction is allocated that modifies the
+data dictionary tables. Lock waits and deadlocks are prevented by
+acquiring the dict_sys.latch before starting the subtransaction
+and releasing it after committing the subtransaction.
+
+* The purge system uses a special transaction that is not associated
+with any session.
+
+* If the system crashed or it was quickly shut down while there were
+transactions in the ACTIVE or PREPARED state, these transactions would
+no longer be associated with a session when the server is restarted.
+
+A session may be served by at most one thread at a time. The serving
+thread of a session might change in some MySQL implementations.
+Therefore we do not have pthread_self() assertions in the code.
+
+Normally, only the thread that is currently associated with a running
+transaction may access (read and modify) the trx object, and it may do
+so without holding any mutex. The following are exceptions to this:
+
+* trx_rollback_recovered() may access resurrected (connectionless)
+transactions (state == TRX_STATE_ACTIVE && is_recovered)
+while the system is already processing new user transactions (!is_recovered).
+
+* trx_print_low() may access transactions not associated with the current
+thread. The caller must be holding lock_sys.latch.
+
+* When a transaction handle is in the trx_sys.trx_list, some of its fields
+must not be modified without holding trx->mutex.
+
+* The locking code (in particular, lock_deadlock_recursive() and
+lock_rec_convert_impl_to_expl()) will access transactions associated
+to other connections. The locks of transactions are protected by
+lock_sys.latch (insertions also by trx->mutex). */
+
+/** Represents an instance of rollback segment along with its state variables.*/
+struct trx_undo_ptr_t {
+ trx_rseg_t* rseg; /*!< rollback segment assigned to the
+ transaction, or NULL if not assigned
+ yet */
+ trx_undo_t* undo; /*!< pointer to the undo log, or
+ NULL if nothing logged yet */
+};
+
+/** An instance of temporary rollback segment.
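To make the trx_mod_table_time_t encoding above concrete: one 64-bit word stores the first undo number (at most 56 significant bits, since DB_ROLL_PTR is only 7 bytes) and reuses bit 63 as the bulk-insert flag. A standalone check of that packing (illustrative, not patch code):

    #include <cassert>
    #include <cstdint>

    using undo_no_t = uint64_t;
    constexpr undo_no_t LIMIT = (undo_no_t{1} << (7 * 8)) - 1; // 2^56 - 1
    constexpr undo_no_t BULK  = undo_no_t{1} << 63;

    int main()
    {
        undo_no_t first = 42;            // first undo record for the table
        first |= BULK;                   // start_bulk_insert()
        assert(first & BULK);            // is_bulk_insert()
        assert((first & LIMIT) == 42);   // get_first() is still recoverable
        first &= ~BULK;                  // end_bulk_insert()
        assert(first == 42);
    }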
*/
+struct trx_temp_undo_t {
+ /** temporary rollback segment, or NULL if not assigned yet */
+ trx_rseg_t* rseg;
+ /** pointer to the undo log, or NULL if nothing logged yet */
+ trx_undo_t* undo;
+};
+
+/** Rollback segments assigned to a transaction for undo logging. */
+struct trx_rsegs_t {
+ /** undo log ptr holding reference to a rollback segment that resides in
+ system/undo tablespace used for undo logging of tables that need
+ to be recovered on crash. */
+ trx_undo_ptr_t m_redo;
+
+ /** undo log for temporary tables; discarded immediately after
+ transaction commit/rollback */
+ trx_temp_undo_t m_noredo;
+};
+
+struct trx_t : ilist_node<>
+{
+private:
+ /**
+ The least significant 31 bits are the count of references.
+
+ We can't release the locks nor commit the transaction until this reference
+ is 0. We can change the state to TRX_STATE_COMMITTED_IN_MEMORY to signify
+ that it is no longer "active".
+
+ If the most significant bit is set, this transaction should stop inheriting
+ (GAP) locks. Generally set to true during transaction prepare for RC or lower
+ isolation, if requested. Needed for replication replay where
+ we don't want to get blocked on GAP locks taken for protecting
+ concurrent unique insert or replace operation.
+ */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+ Atomic_relaxed<uint32_t> skip_lock_inheritance_and_n_ref;
+
+
+public:
+ /** Transaction identifier (0 if no locks were acquired).
+ Set by trx_sys_t::register_rw() or trx_resurrect() before
+ the transaction is added to trx_sys.rw_trx_hash.
+ Cleared in commit_in_memory() after commit_state(),
+ trx_sys_t::deregister_rw(), release_locks(). */
+ trx_id_t id;
+ /** The largest encountered transaction identifier for which no
+ transaction was observed to be active. This is a cache to speed up
+ trx_sys_t::find_same_or_older(). */
+ trx_id_t max_inactive_id;
+
+private:
+ /** mutex protecting state and some of lock
+ (some are protected by lock_sys.latch) */
+ srw_spin_mutex mutex;
+#ifdef UNIV_DEBUG
+ /** The owner of mutex (0 if none); protected by mutex */
+ std::atomic<pthread_t> mutex_owner{0};
+#endif /* UNIV_DEBUG */
+public:
+ void mutex_init() { mutex.init(); }
+ void mutex_destroy() { mutex.destroy(); }
+
+ /** Acquire the mutex */
+ void mutex_lock()
+ {
+ ut_ad(!mutex_is_owner());
+ mutex.wr_lock();
+ ut_ad(!mutex_owner.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ }
+ /** Release the mutex */
+ void mutex_unlock()
+ {
+ ut_ad(mutex_owner.exchange(0, std::memory_order_relaxed)
+ == pthread_self());
+ mutex.wr_unlock();
+ }
+#ifndef SUX_LOCK_GENERIC
+ bool mutex_is_locked() const noexcept { return mutex.is_locked(); }
+#endif
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread holds the mutex */
+ bool mutex_is_owner() const
+ {
+ return mutex_owner.load(std::memory_order_relaxed) ==
+ pthread_self();
+ }
+#endif /* UNIV_DEBUG */
+
+ /** State of the trx from the point of view of concurrency control
+ and the valid state transitions.
+
+ Possible states:
+
+ TRX_STATE_NOT_STARTED
+ TRX_STATE_ACTIVE
+ TRX_STATE_PREPARED
+ TRX_STATE_PREPARED_RECOVERED (special case of TRX_STATE_PREPARED)
+ TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
+
+ Valid state transitions are:
+
+ Regular transactions:
+ * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
+
+ Auto-commit non-locking read-only:
+ * NOT_STARTED -> ACTIVE -> NOT_STARTED
+
+ XA (2PC):
+ * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
+
+ Recovered XA:
+ * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
+
+ Recovered XA followed by XA ROLLBACK:
+ * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed)
+
+ XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):
+ * NOT_STARTED -> PREPARED -> (freed)
+
+ Disconnected XA PREPARE transaction can become recovered:
+ * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)
+
+ Latching and various transaction lists membership rules:
+
+ XA (2PC) transactions are always treated as non-autocommit.
+
+ Transitions to ACTIVE or NOT_STARTED occur when transaction
+ is not in rw_trx_hash.
+
+ Autocommit non-locking read-only transactions move between states
+ without holding any mutex. They are not in rw_trx_hash.
+
+ All transactions, unless they are determined to be ac-nl-ro,
+ explicitly tagged as read-only or read-write, will first be put
+ on the read-only transaction list. Only when a !read-only transaction
+ in the read-only list tries to acquire an X or IX lock on a table
+ do we remove it from the read-only list and put it on the read-write
+ list. During this switch we assign it a rollback segment.
+
+ When a transaction is NOT_STARTED, it can be in trx_list. It cannot be
+ in rw_trx_hash.
+
+ ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash.
+ The transition ACTIVE->PREPARED is protected by trx->mutex.
+
+ ACTIVE->COMMITTED is possible when the transaction is in
+ rw_trx_hash.
+
+ Transitions to COMMITTED are protected by trx_t::mutex. */
+ Atomic_relaxed<trx_state_t> state;
+
+ /** The locks of the transaction. Protected by lock_sys.latch
+ (insertions also by trx_t::mutex). */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_lock_t lock;
+
+#ifdef WITH_WSREP
+ /** whether wsrep_on(mysql_thd) held at the start of transaction */
+ byte wsrep;
+ bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); }
+ bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep & 2); }
+#else /* WITH_WSREP */
+ bool is_wsrep() const { return false; }
+#endif /* WITH_WSREP */
+
+ /** Consistent read view of the transaction */
+ ReadView read_view;
+
+ /* These fields are not protected by any mutex. */
+
+ /** false=normal transaction, true=recovered (must be rolled back)
+ or disconnected transaction in XA PREPARE STATE.
+
+ This field is accessed by the thread that owns the transaction,
+ without holding any mutex.
+ There is only one foreign-thread access in trx_print_low()
+ and a possible race condition with trx_disconnect_prepared(). */
+ bool is_recovered;
+ const char* op_info; /*!< English text describing the
+ current operation, or an empty
+ string */
+ uint isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ...
*/ + bool check_foreigns; /*!< normally TRUE, but if the user + wants to suppress foreign key checks, + (in table imports, for example) we + set this FALSE */ + /** whether an insert into an empty table is active */ + bool bulk_insert; + /*------------------------------*/ + /* MySQL has a transaction coordinator to coordinate two phase + commit between multiple storage engines and the binary log. When + an engine participates in a transaction, it's responsible for + registering itself using the trans_register_ha() API. */ + bool is_registered; /* This flag is set to true after the + transaction has been registered with + the coordinator using the XA API, and + is set to false after commit or + rollback. */ + /** whether this is holding the prepare mutex */ + bool active_commit_ordered; + /*------------------------------*/ + bool check_unique_secondary; + /*!< normally TRUE, but if the user + wants to speed up inserts by + suppressing unique key checks + for secondary indexes when we decide + if we can use the insert buffer for + them, we set this FALSE */ + bool flush_log_later;/* In 2PC, we hold the + prepare_commit mutex across + both phases. In that case, we + defer flush of the logs to disk + until after we release the + mutex. */ + ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */ + /** whether this modifies InnoDB dictionary tables */ + bool dict_operation; +#ifdef UNIV_DEBUG + /** copy of dict_operation during commit() */ + bool was_dict_operation; +#endif + /** whether dict_sys.latch is held exclusively; protected by + dict_sys.latch */ + bool dict_operation_lock_mode; + + /** wall-clock time of the latest transition to TRX_STATE_ACTIVE; + used for diagnostic purposes only */ + time_t start_time; + /** microsecond_interval_timer() of transaction start */ + ulonglong start_time_micro; + lsn_t commit_lsn; /*!< lsn at the time of the commit */ + /*------------------------------*/ + THD* mysql_thd; /*!< MySQL thread handle corresponding + to this trx, or NULL */ + + const char* mysql_log_file_name; + /*!< if MySQL binlog is used, this field + contains a pointer to the latest file + name; this is NULL if binlog is not + used */ + ulonglong mysql_log_offset; + /*!< if MySQL binlog is used, this + field contains the end offset of the + binlog entry */ + /*------------------------------*/ + ib_uint32_t n_mysql_tables_in_use; /*!< number of Innobase tables + used in the processing of the current + SQL statement in MySQL */ + ib_uint32_t mysql_n_tables_locked; + /*!< how many tables the current SQL + statement uses, except those + in consistent read */ + + /** DB_SUCCESS or error code; usually only the thread that is running + the transaction is allowed to modify this field. The only exception is + when a thread invokes lock_sys_t::cancel() in order to abort a + lock_wait(). That is protected by lock_sys.wait_mutex and lock.wait_lock. 
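The transition diagram documented above can be read as a small table; the following standalone sketch encodes it (illustrative only; the real transitions are additionally guarded by trx->mutex and rw_trx_hash membership, which this ignores):

    enum txn_state { NOT_STARTED, ACTIVE, PREPARED, COMMITTED };

    bool valid_transition(txn_state from, txn_state to)
    {
        switch (from) {
        case NOT_STARTED: // PREPARED is reachable directly only at recovery
            return to == ACTIVE || to == PREPARED;
        case ACTIVE:      // -> NOT_STARTED covers autocommit non-locking reads
            return to == PREPARED || to == COMMITTED || to == NOT_STARTED;
        case PREPARED:    // -> ACTIVE covers XA ROLLBACK of a recovered trx
            return to == COMMITTED || to == ACTIVE;
        case COMMITTED:
            return to == NOT_STARTED;
        }
        return false;
    }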
*/
+ dberr_t error_state;
+
+ const dict_index_t*error_info; /*!< if the error number indicates a
+ duplicate key error, a pointer to
+ the problematic index is stored here */
+ ulint error_key_num; /*!< if index creation fails due to a
+ duplicate key error, the MySQL key
+ number of that index is stored here */
+ que_t* graph; /*!< query currently run in the session,
+ or NULL if none; NOTE that the query
+ belongs to the session, and it can
+ survive over a transaction commit, if
+ it is a stored procedure with a COMMIT
+ WORK statement, for instance */
+ /*------------------------------*/
+ UT_LIST_BASE_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< savepoints set with SAVEPOINT ...,
+ oldest first */
+ /*------------------------------*/
+ undo_no_t undo_no; /*!< next undo log record number to
+ assign; since the undo log is
+ private for a transaction, this
+ is a simple ascending sequence
+ with no gaps; thus it represents
+ the number of modified/inserted
+ rows in a transaction */
+ trx_savept_t last_sql_stat_start;
+ /*!< undo_no when the last sql statement
+ was started: in case of an error, trx
+ is rolled back down to this number */
+ trx_rsegs_t rsegs; /* rollback segments for undo logging */
+ undo_no_t roll_limit; /*!< least undo number to undo during
+ a partial rollback; 0 otherwise */
+ bool in_rollback; /*!< true when the transaction is
+ executing a partial or full rollback */
+ ulint pages_undone; /*!< number of undo log pages undone
+ since the last undo log truncation */
+ /*------------------------------*/
+ ulint n_autoinc_rows; /*!< no. of AUTO-INC rows required for
+ an SQL statement. This is useful for
+ multi-row INSERTs */
+ ib_vector_t* autoinc_locks; /* AUTOINC locks held by this
+ transaction. Note that these are
+ also in the lock list trx_locks. This
+ vector needs to be freed explicitly
+ when the trx instance is destroyed.
+ Protected by lock_sys.latch. */
+ /*------------------------------*/
+ bool read_only; /*!< true if transaction is flagged
+ as a READ-ONLY transaction.
+ if auto_commit && !will_lock
+ then it will be handled as a
+ AC-NL-RO-SELECT (Auto Commit Non-Locking
+ Read Only Select). A read only
+ transaction will not be assigned an
+ UNDO log. */
+ bool auto_commit; /*!< true if it is an autocommit
+ transaction */
+ bool will_lock; /*!< set to inform trx_start_low() that
+ the transaction may acquire locks */
+ /* True if transaction has to read the undo log and
+ log the DML changes for online DDL table */
+ bool apply_online_log = false;
+
+ /*------------------------------*/
+ fts_trx_t* fts_trx; /*!< FTS information, or NULL if
+ transaction hasn't modified tables
+ with FTS indexes (yet). */
+ doc_id_t fts_next_doc_id;/* The document id used for updates */
+ /*------------------------------*/
+ ib_uint32_t flush_tables; /*!< if "covering" a FLUSH TABLES
+ statement, count of tables being
+ flushed. */
+
+ /*------------------------------*/
+#ifdef UNIV_DEBUG
+ unsigned start_line; /*!< Track where it was started from */
+ const char* start_file; /*!< Filename where it was started */
+#endif /* UNIV_DEBUG */
+
+ XID xid; /*!< X/Open XA transaction
+ identification to identify a
+ transaction branch */
+ trx_mod_tables_t mod_tables; /*!< List of tables that were modified
+ by this transaction */
+ /*------------------------------*/
+ char* detailed_error; /*!< detailed error message for last
+ error, or empty.
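The interplay of undo_no and last_sql_stat_start above is what makes statement-level rollback possible: undo_no is a gapless ascending sequence, so snapshotting it at statement start yields the rollback target on error. A toy model of that bookkeeping (hypothetical names, not patch code):

    #include <cstdint>

    using undo_no_t = uint64_t;

    struct txn_undo_counters {
        undo_no_t undo_no    = 0; // next undo record number == rows changed
        undo_no_t stat_start = 0; // copy of undo_no taken at statement start

        void start_statement() { stat_start = undo_no; }
        void log_row_change()  { ++undo_no; }
        // on a statement error, undo everything back down to stat_start
        undo_no_t rollback_target() const { return stat_start; }
    };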
*/
+ rw_trx_hash_element_t *rw_trx_hash_element;
+ LF_PINS *rw_trx_hash_pins;
+ ulint magic_n;
+
+ /** @return whether any persistent undo log has been generated */
+ bool has_logged_persistent() const
+ {
+ return(rsegs.m_redo.undo);
+ }
+
+ /** @return whether any undo log has been generated */
+ bool has_logged() const
+ {
+ return(has_logged_persistent() || rsegs.m_noredo.undo);
+ }
+
+ /** @return rollback segment for modifying temporary tables */
+ trx_rseg_t* get_temp_rseg()
+ {
+ if (trx_rseg_t* rseg = rsegs.m_noredo.rseg) {
+ ut_ad(id != 0);
+ return(rseg);
+ }
+
+ return(assign_temp_rseg());
+ }
+
+ /** Transition to committed state, to release implicit locks. */
+ inline void commit_state();
+
+ /** Release any explicit locks of a committing transaction. */
+ inline void release_locks();
+
+ /** Evict a table definition due to the rollback of ALTER TABLE.
+ @param table_id table identifier
+ @param reset_only whether to only reset dict_table_t::def_trx_id */
+ void evict_table(table_id_t table_id, bool reset_only= false);
+
+ /** Initiate rollback.
+ @param savept savepoint to which to roll back
+ @return error code or DB_SUCCESS */
+ dberr_t rollback(trx_savept_t *savept= nullptr);
+ /** Roll back an active transaction.
+ @param savept savepoint to which to roll back */
+ inline void rollback_low(trx_savept_t *savept= nullptr);
+ /** Finish rollback.
+ @return whether the rollback was completed normally
+ @retval false if the rollback was aborted by shutdown */
+ inline bool rollback_finish();
+private:
+ /** Apply any changes to tables for which online DDL is in progress. */
+ ATTRIBUTE_COLD void apply_log();
+ /** Process tables that were modified by the committing transaction. */
+ inline void commit_tables();
+ /** Mark a transaction committed in the main memory data structures.
+ @param mtr mini-transaction (if there are any persistent modifications) */
+ inline void commit_in_memory(const mtr_t *mtr);
+ /** Write log for committing the transaction. */
+ void commit_persist();
+ /** Clean up the transaction after commit_in_memory() */
+ void commit_cleanup();
+ /** Commit the transaction in a mini-transaction.
+ @param mtr mini-transaction (if there are any persistent modifications) */
+ void commit_low(mtr_t *mtr= nullptr);
+ /** Commit an empty transaction.
+ @param mtr mini-transaction */
+ void commit_empty(mtr_t *mtr);
+ /** Assign the transaction its history serialisation number and write the
+ UNDO log to the assigned rollback segment.
+ @param mtr mini-transaction */
+ inline void write_serialisation_history(mtr_t *mtr);
+public:
+ /** Commit the transaction. */
+ void commit();
+
+ /** Try to drop a persistent table.
+ @param table persistent table
+ @return error code */
+ dberr_t drop_table(const dict_table_t &table);
+ /** Try to drop the foreign key constraints for a persistent table.
+ @param name name of persistent table
+ @return error code */
+ dberr_t drop_table_foreign(const table_name_t &name);
+ /** Try to drop the statistics for a persistent table.
+ @param name name of persistent table
+ @return error code */
+ dberr_t drop_table_statistics(const table_name_t &name);
+ /** Commit the transaction, possibly after drop_table().
+ @param deleted handles of data files that were deleted */
+ void commit(std::vector<pfs_os_file_t> &deleted);
+
+
+ /** Discard all savepoints */
+ void savepoints_discard()
+ { savepoints_discard(UT_LIST_GET_FIRST(trx_savepoints)); }
+
+
+ /** Discard all savepoints starting from a particular savepoint.
+ @param savept first savepoint to discard */
+ void savepoints_discard(trx_named_savept_t *savept);
+
+
+ bool is_referenced() const
+ {
+ return (skip_lock_inheritance_and_n_ref & ~(1U << 31)) > 0;
+ }
+
+
+ void reference()
+ {
+ ut_d(auto old_n_ref =)
+ skip_lock_inheritance_and_n_ref.fetch_add(1);
+ ut_ad(int32_t(old_n_ref << 1) >= 0);
+ }
+
+ void release_reference()
+ {
+ ut_d(auto old_n_ref =)
+ skip_lock_inheritance_and_n_ref.fetch_sub(1);
+ ut_ad(int32_t(old_n_ref << 1) > 0);
+ }
+
+ bool is_not_inheriting_locks() const
+ {
+ return skip_lock_inheritance_and_n_ref >> 31;
+ }
+
+ void set_skip_lock_inheritance()
+ {
+ ut_d(auto old_n_ref=) skip_lock_inheritance_and_n_ref.fetch_add(1U << 31);
+ ut_ad(!(old_n_ref >> 31));
+ }
+
+ void reset_skip_lock_inheritance()
+ {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ __asm__("lock btrl $31, %0" : : "m"(skip_lock_inheritance_and_n_ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ _interlockedbittestandreset(
+ reinterpret_cast<volatile long*>(&skip_lock_inheritance_and_n_ref),
+ 31);
+#else
+ skip_lock_inheritance_and_n_ref.fetch_and(~(1U << 31));
+#endif
+ }
+
+ /** @return whether the table has lock on
+ mysql.innodb_table_stats or mysql.innodb_index_stats */
+ bool has_stats_table_lock() const;
+
+ /** Free the memory to trx_pools */
+ void free();
+
+
+ void assert_freed() const
+ {
+ ut_ad(state == TRX_STATE_NOT_STARTED);
+ ut_ad(!id);
+ ut_ad(!mutex_is_owner());
+ ut_ad(!has_logged());
+ ut_ad(!is_referenced());
+ ut_ad(!is_wsrep());
+ ut_ad(!lock.was_chosen_as_deadlock_victim);
+ ut_ad(mod_tables.empty());
+ ut_ad(!read_view.is_open());
+ ut_ad(!lock.wait_thr);
+ ut_ad(!lock.wait_lock);
+ ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(lock.table_locks.empty());
+ ut_ad(!autoinc_locks || ib_vector_is_empty(autoinc_locks));
+ ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
+ ut_ad(!dict_operation);
+ ut_ad(!apply_online_log);
+ ut_ad(!is_not_inheriting_locks());
+ ut_ad(check_foreigns);
+ ut_ad(check_unique_secondary);
+ }
+
+ /** This has to be invoked on SAVEPOINT or at the end of a statement.
+ Even if a TRX_UNDO_EMPTY record was written for this table to cover an
+ insert into an empty table, subsequent operations will have to be covered
+ by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+ rollback to the start of a statement will work.
+ @param table table on which any preceding bulk insert ended */
+ void end_bulk_insert(const dict_table_t &table)
+ {
+ auto it= mod_tables.find(const_cast<dict_table_t*>(&table));
+ if (it != mod_tables.end())
+ it->second.end_bulk_insert();
+ }
+
+ /** @return whether this is a non-locking autocommit transaction */
+ bool is_autocommit_non_locking() const { return auto_commit && !will_lock; }
+
+ /** This has to be invoked on SAVEPOINT or at the start of a statement.
+ Even if TRX_UNDO_EMPTY records were written for any table to cover an
+ insert into an empty table, subsequent operations will have to be covered
+ by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+ rollback to the start of a statement will work.
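The accessors above all manipulate one atomic word: bits 0..30 count references, bit 31 is the stop-inheriting-gap-locks flag. A simplified standalone model follows (it uses fetch_or to set the flag, where the header uses fetch_add under a debug assertion that the bit was clear):

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    std::atomic<uint32_t> word{0};

    bool is_referenced()     { return (word.load() & ~(1U << 31)) > 0; }
    void reference()         { word.fetch_add(1); }
    void release_reference() { word.fetch_sub(1); }
    bool skips_inheritance() { return word.load() >> 31; }
    void set_skip()          { word.fetch_or(1U << 31); }
    void reset_skip()        { word.fetch_and(~(1U << 31)); }

    int main()
    {
        reference();
        set_skip();
        assert(is_referenced() && skips_inheritance());
        release_reference();
        reset_skip();
        assert(!is_referenced() && !skips_inheritance());
    }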
*/
+ void end_bulk_insert()
+ {
+ for (auto& t : mod_tables)
+ t.second.end_bulk_insert();
+ }
+
+ /** @return whether a bulk insert into empty table is in progress */
+ bool is_bulk_insert() const
+ {
+ if (!bulk_insert || check_unique_secondary || check_foreigns)
+ return false;
+ for (const auto& t : mod_tables)
+ if (t.second.is_bulk_insert())
+ return true;
+ return false;
+ }
+
+ /** @return logical modification time of a table, but only if the
+ transaction holds a bulk insert buffer for it */
+ trx_mod_table_time_t *check_bulk_buffer(dict_table_t *table)
+ {
+ if (UNIV_LIKELY(!bulk_insert))
+ return nullptr;
+ ut_ad(!check_unique_secondary);
+ ut_ad(!check_foreigns);
+ auto it= mod_tables.find(table);
+ if (it == mod_tables.end() || !it->second.bulk_buffer_exist())
+ return nullptr;
+ return &it->second;
+ }
+
+ /** Do the bulk insert for the buffered insert operation
+ for the transaction.
+ @return DB_SUCCESS or error code */
+ dberr_t bulk_insert_apply()
+ {
+ return UNIV_UNLIKELY(bulk_insert) ? bulk_insert_apply_low(): DB_SUCCESS;
+ }
+
+private:
+ /** Apply the buffered bulk inserts. */
+ dberr_t bulk_insert_apply_low();
+
+ /** Assign a rollback segment for modifying temporary tables.
+ @return the assigned rollback segment */
+ trx_rseg_t *assign_temp_rseg();
+};
+
+/**
+Check if a transaction is started.
+@param[in] trx transaction whose state we need to check
+@return true if the transaction is in a started state */
+inline bool trx_is_started(const trx_t* trx)
+{
+ return trx->state != TRX_STATE_NOT_STARTED;
+}
+
+/* Transaction isolation levels (trx->isolation_level) */
+#define TRX_ISO_READ_UNCOMMITTED 0 /* dirty read: non-locking
+ SELECTs are performed so that
+ we do not look at a possible
+ earlier version of a record;
+ thus they are not 'consistent'
+ reads under this isolation
+ level; otherwise like level
+ 2 */
+
+#define TRX_ISO_READ_COMMITTED 1 /* somewhat Oracle-like
+ isolation, except that in
+ range UPDATE and DELETE we
+ must block phantom rows
+ with next-key locks;
+ SELECT ... FOR UPDATE and ...
+ LOCK IN SHARE MODE only lock
+ the index records, NOT the
+ gaps before them, and thus
+ allow free inserting;
+ each consistent read reads its
+ own snapshot */
+
+#define TRX_ISO_REPEATABLE_READ 2 /* this is the default;
+ all consistent reads in the
+ same trx read the same
+ snapshot;
+ full next-key locking used
+ in locking reads to block
+ insertions into gaps */
+
+#define TRX_ISO_SERIALIZABLE 3 /* all plain SELECTs are
+ converted to LOCK IN SHARE
+ MODE reads */
+
+/* Treatment of duplicate values (trx->duplicates; for example, in inserts).
+Multiple flags can be combined with bitwise OR.
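As a one-place summary of the four isolation constants above (an illustrative helper, not patch code):

    const char* iso_level_summary(unsigned level)
    {
        switch (level) {
        case 0: return "READ UNCOMMITTED: non-locking reads, no snapshot";
        case 1: return "READ COMMITTED: fresh snapshot per consistent read";
        case 2: return "REPEATABLE READ: one snapshot for the whole trx";
        case 3: return "SERIALIZABLE: plain SELECT behaves as locking read";
        default: return "unknown";
        }
    }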
*/ +#define TRX_DUP_IGNORE 1U /* duplicate rows are to be updated */ +#define TRX_DUP_REPLACE 2U /* duplicate rows are to be replaced */ + + +/** Commit node states */ +enum commit_node_state { + COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to + the transaction */ + COMMIT_NODE_WAIT /*!< commit signal sent to the transaction, + waiting for completion */ +}; + +/** Commit command node in a query graph */ +struct commit_node_t{ + que_common_t common; /*!< node type: QUE_NODE_COMMIT */ + enum commit_node_state + state; /*!< node execution state */ +}; + + +#include "trx0trx.inl" + +#endif diff --git a/storage/innobase/include/trx0trx.inl b/storage/innobase/include/trx0trx.inl new file mode 100644 index 00000000..b063c920 --- /dev/null +++ b/storage/innobase/include/trx0trx.inl @@ -0,0 +1,86 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0trx.ic +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +/**********************************************************************//** +Determines if a transaction is in the given state. +The caller must hold trx->mutex, or it must be the thread +that is serving a running transaction. +A running RW transaction must be in trx_sys.rw_trx_hash. +@return TRUE if trx->state == state */ +UNIV_INLINE +bool +trx_state_eq( +/*=========*/ + const trx_t* trx, /*!< in: transaction */ + trx_state_t state, /*!< in: state; + if state != TRX_STATE_NOT_STARTED + asserts that + trx->state != TRX_STATE_NOT_STARTED */ + bool relaxed) + /*!< in: whether to allow + trx->state == TRX_STATE_NOT_STARTED + after an error has been reported */ +{ +#ifdef UNIV_DEBUG + switch (trx->state) { + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + ut_ad(!trx->is_autocommit_non_locking()); + return(trx->state == state); + + case TRX_STATE_ACTIVE: + if (trx->is_autocommit_non_locking()) { + ut_ad(!trx->is_recovered); + ut_ad(trx->read_only); + ut_ad(trx->mysql_thd); + } + return(state == trx->state); + + case TRX_STATE_NOT_STARTED: + /* These states are not allowed for running transactions. */ + ut_a(state == TRX_STATE_NOT_STARTED + || (relaxed + && thd_get_error_number(trx->mysql_thd))); + + return(true); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(trx->state == state); +} + +/****************************************************************//** +Retrieves the error_info field from a trx. 
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx) /*!< in: trx object */
+{
+ return(trx->error_info);
+}
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
new file mode 100644
index 00000000..bfa2adc0
--- /dev/null
+++ b/storage/innobase/include/trx0types.h
@@ -0,0 +1,131 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0types.h
+Transaction system global type definitions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+#include "ut0new.h"
+
+#include <vector>
+
+/** printf(3) format used for printing DB_TRX_ID and other system fields */
+#define TRX_ID_FMT IB_ID_FMT
+
+/** maximum length that a formatted trx_t::id could take, not including
+the terminating NUL character.
*/
+static const ulint TRX_ID_MAX_LEN = 17;
+
+/** Space id of the transaction system page (the system tablespace) */
+static constexpr uint32_t TRX_SYS_SPACE= 0;
+
+/** Page number of the transaction system page */
+#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO
+
+/** Random value to check for corruption of trx_t */
+static const ulint TRX_MAGIC_N = 91118598;
+
+constexpr uint innodb_purge_threads_MAX= 32;
+constexpr uint innodb_purge_batch_size_MAX= 5000;
+
+/** Transaction states (trx_t::state) */
+enum trx_state_t {
+ TRX_STATE_NOT_STARTED,
+
+ TRX_STATE_ACTIVE,
+ /** XA PREPARE has been executed; only XA COMMIT or XA ROLLBACK
+ are possible */
+ TRX_STATE_PREPARED,
+ /** XA PREPARE transaction that was returned to ha_recover() */
+ TRX_STATE_PREPARED_RECOVERED,
+ TRX_STATE_COMMITTED_IN_MEMORY
+};
+
+/** Memory objects */
+/* @{ */
+/** Transaction */
+struct trx_t;
+/** The locks and state of an active transaction */
+struct trx_lock_t;
+/** Rollback segment */
+struct trx_rseg_t;
+/** Transaction undo log */
+struct trx_undo_t;
+/** Rollback command node in a query graph */
+struct roll_node_t;
+/** Commit command node in a query graph */
+struct commit_node_t;
+/** SAVEPOINT command node in a query graph */
+struct trx_named_savept_t;
+/* @} */
+
+/** Row identifier (DB_ROW_ID, DATA_ROW_ID) */
+typedef ib_id_t row_id_t;
+/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */
+typedef ib_id_t trx_id_t;
+/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */
+typedef ib_id_t roll_ptr_t;
+/** Undo number */
+typedef ib_id_t undo_no_t;
+
+/** Transaction savepoint */
+struct trx_savept_t{
+ undo_no_t least_undo_no; /*!< least undo number to undo */
+};
+
+/** File objects */
+/* @{ */
+/** Undo segment header */
+typedef byte trx_usegf_t;
+/** Undo log header */
+typedef byte trx_ulogf_t;
+/** Undo log page header */
+typedef byte trx_upagef_t;
+
+/** Undo log record */
+typedef byte trx_undo_rec_t;
+
+/* @} */
+
+/** Info required to purge a record */
+struct trx_purge_rec_t
+{
+ /** Undo log record, or nullptr (roll_ptr!=0 if the log can be skipped) */
+ const trx_undo_rec_t *undo_rec;
+ /** File pointer to undo_rec */
+ roll_ptr_t roll_ptr;
+};
+
+typedef std::vector<trx_id_t, ut_allocator<trx_id_t> > trx_ids_t;
+
+/** Number of std::unordered_map hash buckets expected to be needed
+for table IDs in a purge batch. GNU libstdc++ would default to 1 and
+enlarge and rehash on demand. */
+static constexpr size_t TRX_PURGE_TABLE_BUCKETS= 128;
+
+/** The number of rollback segments; rollback segment id must fit in
+the 7 bits reserved for it in DB_ROLL_PTR. */
+static constexpr unsigned TRX_SYS_N_RSEGS= 128;
+/** Maximum number of undo tablespaces (not counting the system tablespace) */
+static constexpr unsigned TRX_SYS_MAX_UNDO_SPACES= TRX_SYS_N_RSEGS - 1;
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
new file mode 100644
index 00000000..3d22a33e
--- /dev/null
+++ b/storage/innobase/include/trx0undo.h
@@ -0,0 +1,514 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0undo.h +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0undo_h +#define trx0undo_h + +#ifndef UNIV_INNOCHECKSUM +#include "trx0sys.h" + +/** The LSB of the "is insert" flag in DB_ROLL_PTR */ +#define ROLL_PTR_INSERT_FLAG_POS 55 +/** The LSB of the 7-bit trx_rseg_t::id in DB_ROLL_PTR */ +#define ROLL_PTR_RSEG_ID_POS 48 +/** The LSB of the 32-bit undo log page number in DB_ROLL_PTR */ +#define ROLL_PTR_PAGE_POS 16 +/** The LSB of the 16-bit byte offset within an undo log page in DB_ROLL_PTR */ +#define ROLL_PTR_BYTE_POS 0 + +/***********************************************************************//** +Builds a roll pointer. +@return roll pointer */ +UNIV_INLINE +roll_ptr_t +trx_undo_build_roll_ptr( +/*====================*/ + bool is_insert, /*!< in: TRUE if insert undo log */ + ulint rseg_id, /*!< in: rollback segment id */ + uint32_t page_no, /*!< in: page number */ + uint16_t offset); /*!< in: offset of the undo entry within page */ +/***********************************************************************//** +Decodes a roll pointer. */ +UNIV_INLINE +void +trx_undo_decode_roll_ptr( +/*=====================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer */ + bool* is_insert, /*!< out: TRUE if insert undo log */ + ulint* rseg_id, /*!< out: rollback segment id */ + uint32_t* page_no, /*!< out: page number */ + uint16_t* offset); /*!< out: offset of the undo + entry within page */ +/***********************************************************************//** +Determine if DB_ROLL_PTR is of the insert type. +@return true if insert */ +UNIV_INLINE +bool +trx_undo_roll_ptr_is_insert( +/*========================*/ + roll_ptr_t roll_ptr); /*!< in: roll pointer */ +/***********************************************************************//** +Returns true if the record is of the insert type. +@return true if the record was freshly inserted (not updated). */ +UNIV_INLINE +bool +trx_undo_trx_id_is_insert( +/*======================*/ + const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ + MY_ATTRIBUTE((warn_unused_result)); +/** Write DB_ROLL_PTR. +@param[out] ptr buffer +@param[in] roll_ptr DB_ROLL_PTR value */ +inline void trx_write_roll_ptr(byte* ptr, roll_ptr_t roll_ptr) +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + mach_write_to_7(ptr, roll_ptr); +} +/** Read DB_ROLL_PTR. +@param[in] ptr buffer +@return roll ptr */ +inline roll_ptr_t trx_read_roll_ptr(const byte* ptr) +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + return mach_read_from_7(ptr); +} + +/** Get the next record in an undo log. 
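The ROLL_PTR_* positions above define a 56-bit layout inside the 7-byte DB_ROLL_PTR: one insert-flag bit at position 55, 7 rollback-segment-id bits at 48, a 32-bit undo page number at 16, and a 16-bit page offset at 0. A standalone worked example of packing and unpacking (mirroring trx_undo_build_roll_ptr and trx_undo_decode_roll_ptr, which are defined in trx0undo.inl further down):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const uint64_t roll_ptr = uint64_t{1}  << 55  // is_insert flag
                                | uint64_t{5}  << 48  // rseg id (must be < 128)
                                | uint64_t{97} << 16  // undo log page number
                                | 1234;               // byte offset in the page
        assert(roll_ptr >> 55 == 1);             // insert-type record
        assert((roll_ptr >> 48 & 0x7F) == 5);    // rollback segment id
        assert(uint32_t(roll_ptr >> 16) == 97);  // page number
        assert(uint16_t(roll_ptr) == 1234);      // offset
    }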
+@param[in] undo_page undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@return undo log record, the page latched, NULL if none */ +inline trx_undo_rec_t* +trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec, + uint32_t page_no, uint16_t offset); +/** Get the previous record in an undo log. +@param[in,out] block undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH +@param[in,out] mtr mini-transaction +@return undo log record, the page latched, NULL if none */ +trx_undo_rec_t* +trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no, + uint16_t offset, bool shared, mtr_t *mtr); + +/** Get the first undo log record on a page. +@param[in] block undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header page offset +@return pointer to first record +@retval nullptr if none exists */ +trx_undo_rec_t* +trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no, + uint16_t offset); + +/** Initialize an undo log page. +NOTE: This corresponds to a redo log record and must not be changed! +@see mtr_t::undo_create() +@param[in,out] block undo log page */ +void trx_undo_page_init(const buf_block_t &block); + +/** Allocate an undo log page. +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction that does not hold any page latch +@param[out] err error code +@return X-latched block if success +@retval nullptr on failure */ +buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Free the last undo log page. The caller must hold the rseg mutex. +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction that does not hold any undo log page + or that has allocated the undo log page +@return error code */ +dberr_t trx_undo_free_last_page(trx_undo_t *undo, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Try to truncate the undo logs. +@param trx transaction +@return error code */ +dberr_t trx_undo_try_truncate(const trx_t &trx); + +/** Truncate the head of an undo log. +NOTE that only whole pages are freed; the header page is not +freed, but emptied, if all the records there are below the limit. +@param[in,out] rseg rollback segment +@param[in] hdr_page_no header page number +@param[in] hdr_offset header offset on the page +@param[in] limit first undo number to preserve +(everything below the limit will be truncated) +@return error code */ +dberr_t +trx_undo_truncate_start( + trx_rseg_t* rseg, + uint32_t hdr_page_no, + uint16_t hdr_offset, + undo_no_t limit) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Mark that an undo log header belongs to a data dictionary transaction. +@param[in] trx dictionary transaction +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction */ +void trx_undo_mark_as_dict(const trx_t* trx, trx_undo_t* undo, mtr_t* mtr); +/** Assign an undo log for a persistent transaction. +A new undo log is created or a cached undo log reused. 
+@param[in,out] trx transaction
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull));
+/** Assign an undo log for a transaction.
+A new undo log is created or a cached undo log reused.
+@tparam is_temp whether this is temporary undo log
+@param[in,out] trx transaction
+@param[in] rseg rollback segment
+@param[out] undo the undo log
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return the undo log block
+@retval nullptr on error */
+template<bool is_temp>
+buf_block_t*
+trx_undo_assign_low(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo,
+ mtr_t *mtr, dberr_t *err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK.
+@param[in,out] trx transaction
+@param[in,out] undo undo log
+@param[in] rollback false=XA PREPARE, true=XA ROLLBACK
+@param[in,out] mtr mini-transaction */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+ mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** At shutdown, frees the undo logs of a transaction. */
+void
+trx_undo_free_at_shutdown(trx_t *trx);
+
+/** Read an undo log when starting up the database.
+@param[in,out] rseg rollback segment
+@param[in] id rollback segment slot
+@param[in] page_no undo log segment page number
+@return the undo log
+@retval nullptr on error */
+trx_undo_t *
+trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no);
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** the only rollback segment type since MariaDB 10.3.1 */
+constexpr uint16_t TRX_UNDO_UPDATE= 2;
+/* TRX_UNDO_STATE values of an undo log segment */
+/** contains an undo log of an active transaction */
+constexpr uint16_t TRX_UNDO_ACTIVE = 1;
+/** cached for quick reuse */
+constexpr uint16_t TRX_UNDO_CACHED = 2;
+/** can be freed in purge when all undo data in it is removed */
+constexpr uint16_t TRX_UNDO_TO_PURGE = 4;
+/** contains an undo log of a prepared transaction */
+constexpr uint16_t TRX_UNDO_PREPARED = 5;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Transaction undo log memory object; modified by the thread associated
+with the transaction.
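A small illustrative decoder for the TRX_UNDO_STATE values just defined (not patch code):

    const char* undo_state_name(unsigned state)
    {
        switch (state) {
        case 1:  return "TRX_UNDO_ACTIVE";    // undo log of an active trx
        case 2:  return "TRX_UNDO_CACHED";    // cached for quick reuse
        case 4:  return "TRX_UNDO_TO_PURGE";  // freeable once purged
        case 5:  return "TRX_UNDO_PREPARED";  // undo log of an XA PREPARE trx
        default: return "invalid";
        }
    }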
*/ + +struct trx_undo_t { + /*-----------------------------*/ + ulint id; /*!< undo log slot number within the + rollback segment */ + ulint state; /*!< state of the corresponding undo log + segment */ + trx_id_t trx_id; /*!< id of the trx assigned to the undo + log */ + XID xid; /*!< X/Open XA transaction + identification */ + bool dict_operation; /*!< TRUE if a dict operation trx */ + trx_rseg_t* rseg; /*!< rseg where the undo log belongs */ + /*-----------------------------*/ + uint32_t hdr_page_no; /*!< page number of the header page in + the undo log */ + uint32_t last_page_no; /*!< page number of the last page in the + undo log; this may differ from + top_page_no during a rollback */ + uint16_t hdr_offset; /*!< header offset of the undo log on + the page */ + uint32_t size; /*!< current size in pages */ + /*-----------------------------*/ + uint32_t top_page_no; /*!< page number where the latest undo + log record was catenated; during + rollback the page from which the latest + undo record was chosen */ + uint16_t top_offset; /*!< offset of the latest undo record, + i.e., the topmost element in the undo + log if we think of it as a stack */ + undo_no_t top_undo_no; /*!< undo number of the latest record + (IB_ID_MAX if the undo log is empty) */ + buf_block_t* guess_block; /*!< guess for the buffer block where + the top page might reside */ + + /** @return whether the undo log is empty */ + bool empty() const { return top_undo_no == IB_ID_MAX; } + + /*-----------------------------*/ + UT_LIST_NODE_T(trx_undo_t) undo_list; + /*!< undo log objects in the rollback + segment are chained into lists */ +}; + +/** Cache a pointer to an undo record in a latched buffer pool page, +parse the undo log record and store the record type, update vector +and compiler information */ +class UndorecApplier +{ + /** Undo log block page id */ + page_id_t page_id; + /** Pointer to within undo log record */ + const trx_undo_rec_t *undo_rec; + /** Undo log record type */ + byte type; + /** compiler information */ + byte cmpl_info; + /** page_offset(undo_rec) of the start of undo_rec */ + uint16_t offset; + /** Transaction id of the undo log */ + const trx_id_t trx_id; + /** Update vector */ + upd_t *update; + /** memory heap which can be used to build previous version of + the index record and its offsets */ + mem_heap_t *heap; + /** mini-transaction for accessing B-tree pages */ + mtr_t mtr; + +public: + UndorecApplier(page_id_t page_id, trx_id_t trx_id) : + page_id(page_id), trx_id(trx_id), heap(mem_heap_create(100)) + { + } + + /** Assign the next page id */ + void assign_next(const page_id_t next_page_id) + { + page_id= next_page_id; + } + + page_id_t get_page_id() const { return page_id; } + + /** Handle the DML undo log and apply it on online indexes */ + inline void apply_undo_rec(const trx_undo_rec_t *rec); + + ~UndorecApplier() + { + mem_heap_free(heap); + } + +private: + /** Handle the insert undo log and apply it on online indexes + @param tuple row reference from undo log record + @param clust_index clustered index */ + void log_insert(const dtuple_t &tuple, dict_index_t *clust_index); + + /** Handle the update, delete undo log and apply it on online + indexes. + @param tuple row reference from undo log record + @param clust_index clustered index */ + void log_update(const dtuple_t &tuple, dict_index_t *clust_index); + + /** Check whether the given roll pointer is generated by + the current undo log record information stored. 
+ @return true if the roll pointer matches the current undo log info */
+ inline bool is_same(roll_ptr_t roll_ptr) const;
+
+ /** Clear the undo log record information */
+ void clear_undo_rec()
+ {
+ undo_rec= nullptr;
+ cmpl_info= 0;
+ type= 0;
+ update= nullptr;
+ mem_heap_empty(heap);
+ }
+
+ /** Get the correct version of the clustered index record that
+ was modified by the current undo log record, because there could
+ be multiple successive updates of the same record within the
+ same transaction.
+ @param tuple tuple contains primary key value
+ @param index clustered index
+ @param[out] clust_rec current clustered index record
+ @param offsets offsets points to the record
+ @return clustered index record which was changed by
+ the undo log record or nullptr when there is no clustered
+ index record changed by undo log record */
+ const rec_t* get_old_rec(const dtuple_t &tuple, dict_index_t *index,
+ const rec_t **clust_rec, rec_offs **offsets);
+};
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** The offset of the undo log page header on pages of the undo log */
+#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA
+/*-------------------------------------------------------------*/
+/** Transaction undo log page header offsets */
+/* @{ */
+#define TRX_UNDO_PAGE_TYPE 0 /*!< unused; 0 (before MariaDB 10.3.1:
+ 1=TRX_UNDO_INSERT or
+ 2=TRX_UNDO_UPDATE) */
+#define TRX_UNDO_PAGE_START 2 /*!< Byte offset where the undo log
+ records for the LATEST transaction
+ start on this page (remember that
+ in an update undo log, the first page
+ can contain several undo logs) */
+#define TRX_UNDO_PAGE_FREE 4 /*!< On each page of the undo log this
+ field contains the byte offset of the
+ first free byte on the page */
+#define TRX_UNDO_PAGE_NODE 6 /*!< The file list node in the chain
+ of undo log pages */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE)
+ /*!< Size of the transaction undo
+ log page header, in bytes */
+/* @} */
+
+/** An update undo segment with just one page can be reused if it has
+at most this many bytes used; we must leave space at least for one new undo
+log header on the page */
+
+#define TRX_UNDO_PAGE_REUSE_LIMIT (3 << (srv_page_size_shift - 2))
+
+/* An update undo log segment may contain several undo logs on its first page
+if the undo logs took so little space that the segment could be cached and
+reused. All the undo log headers are then on the first page, and the last one
+owns the undo log records on subsequent pages if the segment is bigger than
+one page. If an undo log is stored in a segment, then on the first page it is
+allowed to have zero undo records, but if the segment extends to several
+pages, then all the rest of the pages must contain at least one undo log
+record. */
+
+/** The offset of the undo log segment header on the first page of the undo
+log segment */
+
+#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE)
+/** Undo log segment header */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_STATE 0 /*!< TRX_UNDO_ACTIVE, ...
*/ + +#ifndef UNIV_INNOCHECKSUM + +#define TRX_UNDO_LAST_LOG 2 /*!< Offset of the last undo log header + on the segment header page, 0 if + none */ +#define TRX_UNDO_FSEG_HEADER 4 /*!< Header for the file segment which + the undo log segment occupies */ +#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE) + /*!< Base node for the list of pages in + the undo log segment; defined only on + the undo log segment's first page */ +/*-------------------------------------------------------------*/ +/** Size of the undo log segment header */ +#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE) +/* @} */ + +/** The undo log header. There can be several undo log headers on the first +page of an update undo log segment. */ +/* @{ */ +/*-------------------------------------------------------------*/ +/** Transaction start identifier, or 0 if the undo log segment has been +completely purged and trx_purge_free_segment() has started freeing it */ +#define TRX_UNDO_TRX_ID 0 +/** Transaction end identifier (if the log is in a history list), +or 0 if the transaction has not been committed */ +#define TRX_UNDO_TRX_NO 8 +/** Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of +surviving user records, this used to be called TRX_UNDO_DEL_MARKS. + +This field is redundant; it is only being read by some debug assertions. + +The value 1 indicates that purge needs to process the undo log segment. +The value 0 indicates that all of it has been processed, and +trx_purge_free_segment() has been invoked, so the log is not safe to access. + +Before MariaDB 10.3.1, a log segment may carry the value 0 even before +trx_purge_free_segment() was called, for those undo log records for +which purge would not result in removing delete-marked records. */ +#define TRX_UNDO_NEEDS_PURGE 16 +#define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record + of this log on the header page; purge + may remove undo log record from the + log start, and therefore this is not + necessarily the same as this log + header end offset */ +#define TRX_UNDO_XID_EXISTS 20 /*!< TRUE if undo log header includes + X/Open XA transaction identification + XID */ +#define TRX_UNDO_DICT_TRANS 21 /*!< TRUE if the transaction is a table + create, index create, or drop + transaction: in recovery + the transaction cannot be rolled back + in the usual way: a 'rollback' rather + means dropping the created or dropped + table, if it still exists */ +#define TRX_UNDO_TABLE_ID 22 /*!< Id of the table if the preceding + field is TRUE */ +#define TRX_UNDO_NEXT_LOG 30 /*!< Offset of the next undo log header + on this page, 0 if none */ +#define TRX_UNDO_PREV_LOG 32 /*!< Offset of the previous undo log + header on this page, 0 if none */ +#define TRX_UNDO_HISTORY_NODE 34 /*!< If the log is put to the history + list, the file list node is here */ +/*-------------------------------------------------------------*/ +/** Size of the undo log header without XID information */ +#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE) + +/** X/Open XA Transaction Identification (XID) */ +/* @{ */ +/** xid_t::formatID */ +#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE) +/** xid_t::gtrid_length */ +#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4) +/** xid_t::bqual_length */ +#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4) +/** Distributed transaction identifier data */ +#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4) +/*--------------------------------------------------------------*/ +#define TRX_UNDO_LOG_XA_HDR_SIZE 
(TRX_UNDO_XA_XID + XIDDATASIZE) + /*!< Total size of the undo log header + with the XA XID */ +/* @} */ + +#include "trx0undo.inl" +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/innobase/include/trx0undo.inl b/storage/innobase/include/trx0undo.inl new file mode 100644 index 00000000..9f05989f --- /dev/null +++ b/storage/innobase/include/trx0undo.inl @@ -0,0 +1,129 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0undo.inl +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "data0type.h" +#include "page0page.h" + +/***********************************************************************//** +Builds a roll pointer. +@return roll pointer */ +UNIV_INLINE +roll_ptr_t +trx_undo_build_roll_ptr( +/*====================*/ + bool is_insert, /*!< in: TRUE if insert undo log */ + ulint rseg_id, /*!< in: rollback segment id */ + uint32_t page_no, /*!< in: page number */ + uint16_t offset) /*!< in: offset of the undo entry within page */ +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + + return roll_ptr_t{is_insert} << ROLL_PTR_INSERT_FLAG_POS | + roll_ptr_t{rseg_id} << ROLL_PTR_RSEG_ID_POS | + roll_ptr_t{page_no} << ROLL_PTR_PAGE_POS | offset; +} + +/***********************************************************************//** +Decodes a roll pointer. */ +UNIV_INLINE +void +trx_undo_decode_roll_ptr( +/*=====================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer */ + bool* is_insert, /*!< out: TRUE if insert undo log */ + ulint* rseg_id, /*!< out: rollback segment id */ + uint32_t* page_no, /*!< out: page number */ + uint16_t* offset) /*!< out: offset of the undo + entry within page */ +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + ut_ad(roll_ptr < (1ULL << 56)); + *offset= static_cast<uint16_t>(roll_ptr); + *page_no= static_cast<uint32_t>(roll_ptr >> 16); + *rseg_id= static_cast<ulint>(roll_ptr >> 48 & 0x7F); + *is_insert= static_cast<bool>(roll_ptr >> 55); +} + +/***********************************************************************//** +Determine if DB_ROLL_PTR is of the insert type. +@return true if insert */ +UNIV_INLINE +bool +trx_undo_roll_ptr_is_insert( +/*========================*/ + roll_ptr_t roll_ptr) /*!< in: roll pointer */ +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + ut_ad(roll_ptr < (1ULL << (ROLL_PTR_INSERT_FLAG_POS + 1))); + return static_cast<bool>(roll_ptr >> ROLL_PTR_INSERT_FLAG_POS); +} + +/***********************************************************************//** +Returns true if the record is of the insert type.
+@return true if the record was freshly inserted (not updated). */ +UNIV_INLINE +bool +trx_undo_trx_id_is_insert( +/*======================*/ + const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ +{ + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); + return bool(trx_id[DATA_TRX_ID_LEN] >> 7); +} + +/** Determine the end offset of undo log records of an undo log page. +@param[in] undo_page undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset +@return end offset */ +inline +uint16_t trx_undo_page_get_end(const buf_block_t *undo_page, uint32_t page_no, + uint16_t offset) +{ + if (page_no == undo_page->page.id().page_no()) + if (uint16_t end = mach_read_from_2(TRX_UNDO_NEXT_LOG + offset + + undo_page->page.frame)) + return end; + + return mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_page->page.frame); +} + +/** Get the next record in an undo log. +@param[in] undo_page undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@return undo log record within the latched page, or NULL if none */ +inline trx_undo_rec_t* +trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec, + uint32_t page_no, uint16_t offset) +{ + uint16_t end= trx_undo_page_get_end(undo_page, page_no, offset); + uint16_t next= mach_read_from_2(undo_page->page.frame + rec); + return next == end ? nullptr : undo_page->page.frame + next; +}
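For illustration only (not part of the patched sources): the 7-byte DB_ROLL_PTR packs is_insert (1 bit), rseg_id (7 bits), page_no (32 bits) and offset (16 bits), so a value built by trx_undo_build_roll_ptr() decodes back into the same fields. A minimal sketch using the functions defined above; the field values are arbitrary:

/* Sketch, not InnoDB source: round-trip a roll pointer. */
static void roll_ptr_round_trip_example()
{
	/* insert-type undo log, rollback segment 5, page 42, byte offset 123 */
	const roll_ptr_t ptr = trx_undo_build_roll_ptr(true, 5, 42, 123);

	bool		is_insert;
	ulint		rseg_id;
	uint32_t	page_no;
	uint16_t	offset;
	trx_undo_decode_roll_ptr(ptr, &is_insert, &rseg_id, &page_no, &offset);

	/* 1 + 7 + 32 + 16 bits = 56 bits = DATA_ROLL_PTR_LEN (7) bytes */
	ut_a(is_insert && rseg_id == 5 && page_no == 42 && offset == 123);
	ut_a(trx_undo_roll_ptr_is_insert(ptr));
}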
diff --git a/storage/innobase/include/trx0xa.h b/storage/innobase/include/trx0xa.h new file mode 100644 index 00000000..cb5d67cf --- /dev/null +++ b/storage/innobase/include/trx0xa.h @@ -0,0 +1,61 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/* + * Start of xa.h header + * + * Define a symbol to prevent multiple inclusions of this header file + */ +#ifndef XA_H +#define XA_H + +#include "handler.h" + +/* + * Transaction branch identification: XID and NULLXID: + */ +#ifndef XIDDATASIZE + +/** Sizes of transaction identifier */ +#define XIDDATASIZE 128 /*!< maximum size of a transaction + identifier, in bytes */ +#define MAXGTRIDSIZE 64 /*!< maximum size in bytes of gtrid */ +#define MAXBQUALSIZE 64 /*!< maximum size in bytes of bqual */ + +#endif +/** X/Open XA distributed transaction status codes */ +/* @{ */ +#define XA_OK 0 /*!< normal execution */ +#define XAER_ASYNC -2 /*!< asynchronous operation already + outstanding */ +#define XAER_RMERR -3 /*!< a resource manager error + occurred in the transaction + branch */ +#define XAER_NOTA -4 /*!< the XID is not valid */ +#define XAER_INVAL -5 /*!< invalid arguments were given */ +#define XAER_PROTO -6 /*!< routine invoked in an improper + context */ +#define XAER_RMFAIL -7 /*!< resource manager unavailable */ +#define XAER_DUPID -8 /*!< the XID already exists */ +#define XAER_OUTSIDE -9 /*!< resource manager doing + work outside transaction */ +/* @} */ +#endif /* ifndef XA_H */ +/* + * End of xa.h header + */
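For illustration only (not part of the patched sources): an XID, as defined in handler.h, keeps the global transaction identifier and the branch qualifier back to back in one data array, with gtrid_length and bqual_length giving the split point. A sketch of how the limits above relate to that layout:

/* Sketch, not InnoDB source: where gtrid and bqual live inside an XID. */
static void xid_layout_example(const XID &xid)
{
	ut_a(xid.gtrid_length <= MAXGTRIDSIZE);
	ut_a(xid.bqual_length <= MAXBQUALSIZE);
	ut_a(xid.gtrid_length + xid.bqual_length <= XIDDATASIZE);

	const char*	gtrid = xid.data;			/* first gtrid_length bytes */
	const char*	bqual = xid.data + xid.gtrid_length;	/* next bqual_length bytes */
	(void) gtrid;
	(void) bqual;
}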
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i new file mode 100644 index 00000000..1b4f70b6 --- /dev/null +++ b/storage/innobase/include/univ.i @@ -0,0 +1,503 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***********************************************************************//** +@file include/univ.i +Version control for database, common definitions, and include files + +Created 1/20/1994 Heikki Tuuri +****************************************************************************/ + +#pragma once + +/** How far ahead should we tell the service manager the timeout +(time in seconds) */ +#define INNODB_EXTEND_TIMEOUT_INTERVAL 30 + +#if defined(_WIN32) +# include <windows.h> +#endif /* _WIN32 */ + +/* Include a minimum number of SQL header files so that few changes +made in SQL code cause a complete InnoDB rebuild. These headers are +used throughout InnoDB but do not include too much themselves. They +support cross-platform development and expose commonly used SQL names. */ + +#include <my_global.h> +#include "my_counter.h" +#include "aligned.h" +#include <m_string.h> +#include <mysqld_error.h> + +/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */ +#include <sys/stat.h> + +#ifndef _WIN32 +# include <sched.h> +# include "my_config.h" +#endif + +#include <stdint.h> +#include <inttypes.h> +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif + +#include "my_pthread.h" + +/* Following defines are to enable performance schema +instrumentation in each of five InnoDB modules if +HAVE_PSI_INTERFACE is defined. */ +#ifdef HAVE_PSI_INTERFACE +# define UNIV_PFS_MUTEX +# define UNIV_PFS_RWLOCK +# define UNIV_PFS_IO +# define UNIV_PFS_THREAD + +# include "mysql/psi/psi.h" /* HAVE_PSI_MEMORY_INTERFACE */ +# ifdef HAVE_PSI_MEMORY_INTERFACE +# define UNIV_PFS_MEMORY +# endif /* HAVE_PSI_MEMORY_INTERFACE */ + +#ifdef HAVE_PFS_THREAD_PROVIDER_H +/* For PSI_MUTEX_CALL() and similar. */ +#include "pfs_thread_provider.h" +#endif + +#include "mysql/psi/mysql_thread.h" +/* For PSI_FILE_CALL(). */ +#ifdef HAVE_PFS_FILE_PROVIDER_H +#include "pfs_file_provider.h" +#endif + +#include "mysql/psi/mysql_file.h" + +#endif /* HAVE_PSI_INTERFACE */ + +#ifdef _WIN32 +# define YY_NO_UNISTD_H 1 +/* VC++ tries to optimise for size by default, from V8+. The size of +the pointer to member depends on whether the type is defined before the +compiler sees the type in the translation unit. This default behaviour +can cause the pointer to be a different size in different translation +units, depending on the above rule. We force optimise for size behaviour +for all cases. This is used by ut0lst.h related code. */ +# pragma pointers_to_members(full_generality, multiple_inheritance) +#endif /* _WIN32 */ + +/* DEBUG VERSION CONTROL + ===================== */ + +/* When this macro is defined then additional test functions will be +compiled. These functions live at the end of each relevant source file +and have "test_" prefix. These functions can be called from the end of +innodb_init() or they can be called from gdb after srv_start() has executed +using the call command. */ + +/* +#define UNIV_COMPILE_TEST_FUNCS +#define UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR +#define UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH +#define UNIV_ENABLE_UNIT_TEST_DICT_STATS +#define UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT +*/ + +#ifdef DBUG_OFF +# undef UNIV_DEBUG +#elif !defined UNIV_DEBUG +# define UNIV_DEBUG +#endif + +#if 0 +#define UNIV_DEBUG_PRINT /* Enable the compilation of + some debug print functions */ +#define UNIV_AHI_DEBUG /* Enable adaptive hash index + debugging without UNIV_DEBUG */ +#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column + debugging without UNIV_DEBUG */ +#define UNIV_DEBUG_LOCK_VALIDATE /* Enable + ut_ad(lock_rec_validate_page()) + assertions. */ +#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */ +#define UNIV_HASH_DEBUG /* debug HASH_ macros */ +#define UNIV_IBUF_DEBUG /* debug the insert buffer */ +#define UNIV_PERF_DEBUG /* debug flag that enables + light weight performance + related stuff. */ +#define UNIV_SEARCH_PERF_STAT /* statistics for the + adaptive hash index */ +#define UNIV_BTR_PRINT /* enable functions for + printing B-trees */ +#define UNIV_ZIP_DEBUG /* extensive consistency checks + for compressed pages */ +#define UNIV_ZIP_COPY /* call page_zip_copy_recs() + more often */ +#define UNIV_AIO_DEBUG /* prints info about + submitted and reaped AIO + requests to the log.
*/ +#define UNIV_STATS_DEBUG /* prints various stats + related debug info from + dict0stats.c */ +#define FTS_INTERNAL_DIAG_PRINT /* FTS internal debugging + info output */ +#endif + +// #define UNIV_SQL_DEBUG + +#ifndef MY_ATTRIBUTE +#if defined(__GNUC__) +# define MY_ATTRIBUTE(A) __attribute__(A) +#else +# define MY_ATTRIBUTE(A) +#endif +#endif + +#define UNIV_INLINE static inline + +#define UNIV_WORD_SIZE SIZEOF_SIZE_T + +/** The following alignment is used in memory allocations in memory heap +management to ensure correct alignment for doubles etc. */ +#define UNIV_MEM_ALIGNMENT 8U + +/* + DATABASE VERSION CONTROL + ======================== +*/ + +#if defined (HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32) +#define IF_PUNCH_HOLE(A,B) A +#else +#define IF_PUNCH_HOLE(A,B) B +#endif + +/** log2 of smallest compressed page size (1<<10 == 1024 bytes) +Note: This must never change! */ +#define UNIV_ZIP_SIZE_SHIFT_MIN 10U + +/** log2 of largest compressed page size (1<<14 == 16384 bytes). +A compressed page directory entry reserves 14 bits for the start offset +and 2 bits for flags. This limits the uncompressed page size to 16k. +*/ +#define UNIV_ZIP_SIZE_SHIFT_MAX 14U + +/* Define the Min, Max, Default page sizes. */ +/** Minimum Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_MIN 12U +/** log2 of largest page size (1<<16 == 65536 bytes). */ +/** Maximum Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_MAX 16U +/** log2 of default page size (1<<14 == 16384 bytes). */ +/** Default Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_DEF 14U +/** Original 16k InnoDB Page Size Shift, in case the default changes */ +#define UNIV_PAGE_SIZE_SHIFT_ORIG 14U +/** Original 16k InnoDB Page Size as an ssize (log2 - 9) */ +#define UNIV_PAGE_SSIZE_ORIG (UNIV_PAGE_SIZE_SHIFT_ORIG - 9U) + +/** Minimum page size InnoDB currently supports. */ +#define UNIV_PAGE_SIZE_MIN (1U << UNIV_PAGE_SIZE_SHIFT_MIN) +/** Maximum page size InnoDB currently supports. */ +#define UNIV_PAGE_SIZE_MAX (1U << UNIV_PAGE_SIZE_SHIFT_MAX) +/** Default page size for InnoDB tablespaces. */ +#define UNIV_PAGE_SIZE_DEF (1U << UNIV_PAGE_SIZE_SHIFT_DEF) +/** Original 16k page size for InnoDB tablespaces. */ +#define UNIV_PAGE_SIZE_ORIG (1U << UNIV_PAGE_SIZE_SHIFT_ORIG) + +/** Smallest compressed page size */ +#define UNIV_ZIP_SIZE_MIN (1U << UNIV_ZIP_SIZE_SHIFT_MIN) + +/** Largest compressed page size */ +#define UNIV_ZIP_SIZE_MAX (1U << UNIV_ZIP_SIZE_SHIFT_MAX) + +/** Largest possible ssize for an uncompressed page. +(The convention 'ssize' is used for 'log2 minus 9' or the number of +shifts starting with 512.) +This max number varies depending on srv_page_size. */ +#define UNIV_PAGE_SSIZE_MAX \ + ulint(srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1U) + +/** Smallest possible ssize for an uncompressed page. */ +#define UNIV_PAGE_SSIZE_MIN \ + ulint(UNIV_PAGE_SIZE_SHIFT_MIN - UNIV_ZIP_SIZE_SHIFT_MIN + 1U)
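For illustration only (not part of the patched sources): an 'ssize' is log2(size) - 9, i.e. the number of left shifts starting from 512 bytes. A sketch of the arithmetic for the default 16KiB page:

/* Sketch, not InnoDB source: ssize arithmetic for the 16KiB default page. */
static void page_ssize_example()
{
	ut_a(UNIV_PAGE_SIZE_DEF == 16384);	/* 1U << 14 */
	ut_a(UNIV_PAGE_SSIZE_ORIG == 5);	/* 14 - 9 */
	ut_a((512U << UNIV_PAGE_SSIZE_ORIG) == UNIV_PAGE_SIZE_DEF);
}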
+ +/** Maximum number of parallel threads in a parallelized operation */ +#define UNIV_MAX_PARALLELISM 32 + +/** This is the "mbmaxlen" for my_charset_filename (defined in +strings/ctype-utf8.c), which is used to encode File and Database names. */ +#define FILENAME_CHARSET_MAXNAMLEN 5 + +/** The maximum length of an encoded table name in bytes. The max +table and database names are NAME_CHAR_LEN (64) characters. After the +encoding, the max length would be NAME_CHAR_LEN (64) * +FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a +terminating '\0'. InnoDB can handle longer names internally */ +#define MAX_TABLE_NAME_LEN 320 + +/** The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is +MySQL's NAME_LEN, see check_and_convert_db_name(). */ +#define MAX_DATABASE_NAME_LEN MAX_TABLE_NAME_LEN + +/** MAX_FULL_NAME_LEN defines the full name path including the +database name and table name. In addition, 14 bytes are added for: + 2 for surrounding quotes around table name + 1 for the separating dot (.) + 9 for the #mysql50# prefix */ +#define MAX_FULL_NAME_LEN \ + (MAX_TABLE_NAME_LEN + MAX_DATABASE_NAME_LEN + 14) + +/** Maximum length of the compression algorithm string. Currently we support +only (NONE | ZLIB | LZ4). */ +#define MAX_COMPRESSION_LEN 4 + +/** The maximum length in bytes that a database name can occupy when stored in +UTF8, including the terminating '\0', see dict_fs2utf8(). You must include +mysql_com.h if you are to use this macro. */ +#define MAX_DB_UTF8_LEN (NAME_LEN + 1) + +/** The maximum length in bytes that a table name can occupy when stored in +UTF8, including the terminating '\0', see dict_fs2utf8(). You must include +mysql_com.h if you are to use this macro. */ +#define MAX_TABLE_UTF8_LEN (NAME_LEN + sizeof(srv_mysql50_table_name_prefix)) + +/* + UNIVERSAL TYPE DEFINITIONS + ========================== +*/ + +/** Unsigned octet of bits */ +typedef unsigned char byte; +/** Machine-word-width unsigned integer */ +typedef size_t ulint; +/** Machine-word-width signed integer */ +typedef ssize_t lint; + +/** ulint format for the printf() family of functions */ +#define ULINTPF "%zu" +/** ulint hexadecimal format for the printf() family of functions */ +#define ULINTPFx "%zx" + +#ifdef _WIN32 +/* Use the integer types and formatting strings defined in Visual Studio. */ +# define UINT32PF "%u" +# define UINT64scan "llu" +# define UINT64PFx "%016llx" +#elif defined __APPLE__ +/* Apple prefers to call the 64-bit types 'long long' +in both 32-bit and 64-bit environments. */ +# define UINT32PF "%" PRIu32 +# define UINT64scan "llu" +# define UINT64PFx "%016llx" +#elif defined _AIX +/* Workaround for macro expansion trouble */ +# define UINT32PF "%u" +# define UINT64scan "lu" +# define UINT64PFx "%016lx" +#else +/* Use the integer types and formatting strings defined in the C99 standard. */ +# define UINT32PF "%" PRIu32 +# define INT64PF "%" PRId64 +# define UINT64scan PRIu64 +# define UINT64PFx "%016" PRIx64 +#endif + +typedef int64_t ib_int64_t; +typedef uint64_t ib_uint64_t; +typedef uint32_t ib_uint32_t; + +#define UINT64PF "%" UINT64scan +#define IB_ID_FMT UINT64PF + +/** Log sequence number (also used for redo log byte arithmetic) */ +typedef ib_uint64_t lsn_t;
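For illustration only (not part of the patched sources): the format macros above expand to the right printf() conversion for each type, so they can be concatenated into format strings portably. A sketch, assuming <stdio.h> is available:

/* Sketch, not InnoDB source: using the printf() format macros. */
static void format_macro_example(ulint n_pages, lsn_t lsn)
{
	printf("flushed " ULINTPF " pages up to LSN " UINT64PF "\n",
	       n_pages, lsn);
}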
+ +/** The 'undefined' value for a ulint */ +#define ULINT_UNDEFINED ((ulint)(-1)) + +/** The 'undefined' value for a ib_uint64_t */ +#define UINT64_UNDEFINED ((ib_uint64_t)(-1)) + +/** The bitmask of 32-bit unsigned integer */ +#define ULINT32_MASK 0xFFFFFFFFU +/** The undefined 32-bit unsigned integer */ +#define ULINT32_UNDEFINED ULINT32_MASK + +/** Maximum value for a ulint */ +#define ULINT_MAX ((ulint)(-2)) + +/** Maximum value for ib_uint64_t */ +#define IB_UINT64_MAX ((ib_uint64_t) (~0ULL)) + +/** The generic InnoDB system object identifier data type */ +typedef ib_uint64_t ib_id_t; +#define IB_ID_MAX (~(ib_id_t) 0) +#define IB_ID_FMT UINT64PF + +#ifndef UINTMAX_MAX +#define UINTMAX_MAX IB_UINT64_MAX +#endif +/** This 'ibool' type is used within Innobase. Remember that different included +headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */ +#define ibool ulint + +#ifndef TRUE + +#define TRUE 1 +#define FALSE 0 + +#endif + +#define UNIV_NOTHROW + +/** The following number as the length of a logical field means that the field +has the SQL NULL as its value. NOTE that because we assume that the length +of a field is a 32-bit integer when we store it, for example, to an undo log +on disk, we must have also this number fit in 32 bits, also in 64-bit +computers! */ + +#define UNIV_SQL_NULL ULINT32_UNDEFINED + +/** Lengths which are not UNIV_SQL_NULL, but bigger than the following +number indicate that a field contains a reference to an externally +stored part of the field in the tablespace. The length field then +contains the sum of the following flag and the locally stored len. */ + +#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_DEF) + +#if defined(__GNUC__) +/* Tell the compiler that variable/function is unused. */ +# define UNIV_UNUSED MY_ATTRIBUTE ((unused)) +#else +# define UNIV_UNUSED +#endif /* CHECK FOR GCC VER_GT_2 */ + +/* Some macros to improve branch prediction and reduce cache misses */ +#ifdef __GNUC__ +/* Tell the compiler that 'expr' probably evaluates to 'constant'. */ +# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant) +/* Tell the compiler that a pointer is likely to be NULL */ +# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ptr) != 0, 0) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read. */ +# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read or written. */ +# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3) + +/* Sun Studio includes sun_prefetch.h as of version 5.9 */ +#elif (defined(__SUNPRO_C) || defined(__SUNPRO_CC)) + +# include <sun_prefetch.h> + +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) + +//# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr) + +# elif defined _MSC_VER +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) +# if defined _M_IX86 || defined _M_X64 + // __MM_HINT_T0 - (temporal data) + // prefetch data into all levels of the cache hierarchy. +# define UNIV_PREFETCH_R(addr) _mm_prefetch((char *) addr, _MM_HINT_T0) +# define UNIV_PREFETCH_RW(addr) _mm_prefetch((char *) addr, _MM_HINT_T0) +# elif defined _M_ARM64 +# define UNIV_PREFETCH_R(addr) __prefetch(addr) +# define UNIV_PREFETCH_RW(addr) __prefetch(addr) +# else +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +# endif +#else +/* Dummy versions of the macros */ +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +#endif + +/* Tell the compiler that cond is likely to hold */ +#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE) +/* Tell the compiler that cond is unlikely to hold */ +#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE)
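For illustration only (not part of the patched sources): UNIV_LIKELY/UNIV_UNLIKELY annotate the expected branch, and UNIV_PREFETCH_R warms a cache line before it is read. A sketch with a hypothetical lookup result:

/* Sketch, not InnoDB source: typical use of the branch and prefetch macros. */
static const byte* checked_read_example(const byte* p)
{
	if (UNIV_UNLIKELY(p == NULL)) {
		return(NULL);		/* rare path: predicted not taken */
	}
	UNIV_PREFETCH_R(p);		/* *p is about to be read */
	return(p);
}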
+ +/* Compile-time constant of the given array's size. */ +#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0])) + +#include <stdio.h> +#include "db0err.h" +#include "ut0dbg.h" +#include "ut0lst.h" +#include "ut0ut.h" + +extern uint32_t srv_page_size_shift; +extern ulong srv_page_size; + +/* Dimension of spatial object we support so far. It has its root in +myisam/sp_defs.h. We only support 2-dimensional data */ +#define SPDIMS 2 + +#ifdef HAVE_PSI_INTERFACE +typedef unsigned int mysql_pfs_key_t; + +# ifdef UNIV_PFS_MUTEX +extern mysql_pfs_key_t buf_pool_mutex_key; +extern mysql_pfs_key_t dict_foreign_err_mutex_key; +extern mysql_pfs_key_t fil_system_mutex_key; +extern mysql_pfs_key_t flush_list_mutex_key; +extern mysql_pfs_key_t fts_cache_mutex_key; +extern mysql_pfs_key_t fts_cache_init_mutex_key; +extern mysql_pfs_key_t fts_delete_mutex_key; +extern mysql_pfs_key_t fts_doc_id_mutex_key; +extern mysql_pfs_key_t ibuf_bitmap_mutex_key; +extern mysql_pfs_key_t ibuf_mutex_key; +extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; +extern mysql_pfs_key_t recalc_pool_mutex_key; +extern mysql_pfs_key_t purge_sys_pq_mutex_key; +extern mysql_pfs_key_t recv_sys_mutex_key; +extern mysql_pfs_key_t rtr_active_mutex_key; +extern mysql_pfs_key_t rtr_match_mutex_key; +extern mysql_pfs_key_t rtr_path_mutex_key; +extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +extern mysql_pfs_key_t srv_innodb_monitor_mutex_key; +extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key; +extern mysql_pfs_key_t srv_monitor_file_mutex_key; +extern mysql_pfs_key_t buf_dblwr_mutex_key; +extern mysql_pfs_key_t trx_pool_mutex_key; +extern mysql_pfs_key_t trx_pool_manager_mutex_key; +extern mysql_pfs_key_t lock_wait_mutex_key; +extern mysql_pfs_key_t srv_threads_mutex_key; +# endif /* UNIV_PFS_MUTEX */ + +# ifdef UNIV_PFS_RWLOCK +extern mysql_pfs_key_t dict_operation_lock_key; +extern mysql_pfs_key_t fil_space_latch_key; +extern mysql_pfs_key_t trx_i_s_cache_lock_key; +extern mysql_pfs_key_t trx_purge_latch_key; +extern mysql_pfs_key_t index_tree_rw_lock_key; +extern mysql_pfs_key_t index_online_log_key; +extern mysql_pfs_key_t trx_sys_rw_lock_key; +extern mysql_pfs_key_t lock_latch_key; +extern mysql_pfs_key_t log_latch_key; +extern mysql_pfs_key_t trx_rseg_latch_key; +# endif /* UNIV_PFS_RWLOCK */ +#endif /* HAVE_PSI_INTERFACE */ diff --git a/storage/innobase/include/ut0byte.h b/storage/innobase/include/ut0byte.h new file mode 100644 index 00000000..2b70fac3 --- /dev/null +++ b/storage/innobase/include/ut0byte.h @@ -0,0 +1,107 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0byte.h +Utilities for byte operations + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0byte_h +#define ut0byte_h + +#include "univ.i" + +/*******************************************************//** +Creates a 64-bit integer out of two 32-bit integers. +@return created integer */ +UNIV_INLINE +ib_uint64_t +ut_ull_create( +/*==========*/ + ulint high, /*!< in: high-order 32 bits */ + ulint low) /*!< in: low-order 32 bits */ + MY_ATTRIBUTE((const));
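For illustration only (not part of the patched sources): ut_ull_create() concatenates two 32-bit halves into one 64-bit value, high word first. A sketch:

/* Sketch, not InnoDB source: ut_ull_create() result layout. */
static void ull_create_example()
{
	const ib_uint64_t x = ut_ull_create(0x00000001UL, 0x00000002UL);
	ut_a(x == 0x0000000100000002ULL);	/* high << 32 | low */
}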
+ +/********************************************************//** +Rounds a 64-bit integer downward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_down( +/*=================*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no); /*!< in: align by this number + which must be a power of 2 */ +/********************************************************//** +Rounds ib_uint64_t upward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_up( +/*===============*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no); /*!< in: align by this number + which must be a power of 2 */ +/** Round down a pointer to the nearest aligned address. +@param ptr pointer +@param alignment a power of 2 +@return aligned pointer */ +static inline void *ut_align_down(void *ptr, size_t alignment) +{ + ut_ad(alignment > 0); + ut_ad(ut_is_2pow(alignment)); + ut_ad(ptr); + static_assert(sizeof ptr == sizeof(size_t), "compatibility"); + + return reinterpret_cast<void*>(reinterpret_cast<size_t>(ptr) & + ~(alignment - 1)); +} + +static inline const void *ut_align_down(const void *ptr, size_t alignment) +{ + return ut_align_down(const_cast<void*>(ptr), alignment); +} + +/** Compute the offset of a pointer from the nearest aligned address. +@param ptr pointer +@param alignment a power of 2 +@return distance from aligned pointer */ +inline size_t ut_align_offset(const void *ptr, size_t alignment) +{ + ut_ad(alignment > 0); + ut_ad(ut_is_2pow(alignment)); + ut_ad(ptr); + return reinterpret_cast<size_t>(ptr) & (alignment - 1); +} + +/*****************************************************************//** +Gets the nth bit of a ulint. +@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */ +UNIV_INLINE +ibool +ut_bit_get_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n); /*!< in: nth bit requested */ + +#include "ut0byte.inl" + +#endif diff --git a/storage/innobase/include/ut0byte.inl b/storage/innobase/include/ut0byte.inl new file mode 100644 index 00000000..dfa069c2 --- /dev/null +++ b/storage/innobase/include/ut0byte.inl @@ -0,0 +1,90 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0byte.ic +Utilities for byte operations + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +/*******************************************************//** +Creates a 64-bit integer out of two 32-bit integers. +@return created integer */ +UNIV_INLINE +ib_uint64_t +ut_ull_create( +/*==========*/ + ulint high, /*!< in: high-order 32 bits */ + ulint low) /*!< in: low-order 32 bits */ +{ + ut_ad(high <= ULINT32_MASK); + ut_ad(low <= ULINT32_MASK); + return(((ib_uint64_t) high) << 32 | low); +} + +/********************************************************//** +Rounds a 64-bit integer downward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_down( +/*=================*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no) /*!< in: align by this number + which must be a power of 2 */ +{ + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return(n & ~((ib_uint64_t) align_no - 1)); +} + +/********************************************************//** +Rounds ib_uint64_t upward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_up( +/*===============*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no) /*!< in: align by this number + which must be a power of 2 */ +{ + ib_uint64_t align_1 = (ib_uint64_t) align_no - 1; + + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return((n + align_1) & ~align_1); +} + +/*****************************************************************//** +Gets the nth bit of a ulint. +@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */ +UNIV_INLINE +ibool +ut_bit_get_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n) /*!< in: nth bit requested */ +{ + ut_ad(n < 8 * sizeof(ulint)); + return(1 & (a >> n)); +} diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h new file mode 100644 index 00000000..d6589cc4 --- /dev/null +++ b/storage/innobase/include/ut0counter.h @@ -0,0 +1,123 @@ +/***************************************************************************** + +Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ut0counter.h + +Counter utility class + +Created 2012/04/12 by Sunny Bains +*******************************************************/ + +#ifndef ut0counter_h +#define ut0counter_h + +#include "univ.i" +#include "my_rdtsc.h" + +/** Use the result of my_timer_cycles(), which mainly uses RDTSC for cycles +as a random value. See the comments for my_timer_cycles() */ +/** @return result from RDTSC or similar functions. */ +static inline size_t +get_rnd_value() +{ + size_t c = static_cast<size_t>(my_timer_cycles()); + + if (c != 0) { + return c; + } + + /* We may go here if my_timer_cycles() returns 0, + so we have to have the plan B for the counter. */ +#if !defined(_WIN32) + return (size_t)pthread_self(); +#else + LARGE_INTEGER cnt; + QueryPerformanceCounter(&cnt); + + return static_cast<size_t>(cnt.QuadPart); +#endif /* !_WIN32 */ +} + +/** Atomic which occupies whole CPU cache line. +Note: We rely on the default constructor of std::atomic and +do not explicitly initialize the contents. This works for us, +because ib_counter_t is only intended for usage with global +memory that is allocated from the .bss and thus guaranteed to +be zero-initialized by the run-time environment. +@see srv_stats */ +template <typename Type> +struct ib_atomic_counter_element_t { + alignas(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_relaxed<Type> value; +}; + +template <typename Type> +struct ib_counter_element_t { + alignas(CPU_LEVEL1_DCACHE_LINESIZE) Type value; +}; + + +/** Class for using fuzzy counters. The counter is multi-instance relaxed atomic +so the results are not guaranteed to be 100% accurate but close +enough. */ +template <typename Type, + template <typename T> class Element = ib_atomic_counter_element_t, + int N = 128 > +struct ib_counter_t { + /** Increment the counter by 1. */ + void inc() { add(1); } + ib_counter_t& operator++() { inc(); return *this; } + + /** Increment the counter by 1. + @param[in] index a reasonably thread-unique identifier */ + void inc(size_t index) { add(index, 1); } + + /** Add to the counter. + @param[in] n amount to be added */ + void add(Type n) { add(get_rnd_value(), n); } + + /** Add to the counter. + @param[in] index a reasonably thread-unique identifier + @param[in] n amount to be added */ + TPOOL_SUPPRESS_TSAN void add(size_t index, Type n) { + index = index % N; + + ut_ad(index < UT_ARR_SIZE(m_counter)); + + m_counter[index].value += n; + } + + /* @return total value - not 100% accurate, since it is relaxed atomic*/ + operator Type() const { + Type total = 0; + + for (const auto &counter : m_counter) { + total += counter.value; + } + + return(total); + } + +private: + static_assert(sizeof(Element<Type>) == CPU_LEVEL1_DCACHE_LINESIZE, ""); + /** Array of counter elements */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) Element<Type> m_counter[N]; +}; + +#endif /* ut0counter_h */
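For illustration only (not part of the patched sources): ib_counter_t spreads increments over N cache-line-sized slots indexed by a thread-dependent value, so concurrent writers rarely share a cache line; reading sums all slots and is only approximately ordered. A usage sketch:

/* Sketch, not InnoDB source: a sharded relaxed counter. */
static ib_counter_t<ulint> n_reads_example;

static void count_read()
{
	n_reads_example.inc();		/* adds 1 to one slot */
}

static ulint total_reads()
{
	return(n_reads_example);	/* operator Type(): sum of all slots */
}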
diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h new file mode 100644 index 00000000..85856660 --- /dev/null +++ b/storage/innobase/include/ut0dbg.h @@ -0,0 +1,179 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*****************************************************************//** +@file include/ut0dbg.h +Debug utilities for Innobase + +Created 1/30/1994 Heikki Tuuri +**********************************************************************/ + +#ifndef ut0dbg_h +#define ut0dbg_h + +#ifdef UNIV_INNOCHECKSUM +#define ut_a assert +#define ut_ad assert +#define ut_error assert(0) +#else /* !UNIV_INNOCHECKSUM */ + +/* Do not include univ.i because univ.i includes this. */ + +/*************************************************************//** +Report a failed assertion. */ +ATTRIBUTE_NORETURN ATTRIBUTE_COLD __attribute__((nonnull(2))) +void +ut_dbg_assertion_failed( +/*====================*/ + const char* expr, /*!< in: the failed assertion */ + const char* file, /*!< in: source file containing the assertion */ + unsigned line); /*!< in: line number of the assertion */ + +/** Abort execution if EXPR does not evaluate to nonzero. +@param EXPR assertion expression that should hold */ +#define ut_a(EXPR) do { \ + if (UNIV_UNLIKELY(!(ulint) (EXPR))) { \ + ut_dbg_assertion_failed(#EXPR, \ + __FILE__, __LINE__); \ + } \ +} while (0) + +/** Abort execution. */ +#define ut_error \ + ut_dbg_assertion_failed(0, __FILE__, __LINE__) + +/** Debug assertion */ +#define ut_ad DBUG_SLOW_ASSERT +#if defined(UNIV_DEBUG) || !defined(DBUG_OFF) +/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_d(EXPR) EXPR +#else +/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_d(EXPR) +#endif
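For illustration only (not part of the patched sources): ut_a() is checked in all builds, while ut_ad() and ut_d() compile to nothing in non-debug builds, so debug-only bookkeeping costs nothing in release. A sketch:

/* Sketch, not InnoDB source: release vs. debug assertions. */
static ulint checked_div_example(ulint a, ulint b)
{
	ut_a(b != 0);			/* checked in all builds */
	ut_d(const ulint before = a);	/* exists only in debug builds */
	const ulint q = a / b;
	ut_ad(q * b + a % b == before);	/* debug-only consistency check */
	return(q);
}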
+ +#if defined(HAVE_SYS_TIME_H) && defined(HAVE_SYS_RESOURCE_H) + +#define HAVE_UT_CHRONO_T + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> + +/** A "chronometer" used to clock snippets of code. +Example usage: + ut_chrono_t ch("this loop"); + for (;;) { ... } + ch.show(); +would print the timings of the for() loop, prefixed with "this loop:" */ +class ut_chrono_t { +public: + /** Constructor. + @param[in] name chrono's name, used when showing the values */ + ut_chrono_t( + const char* name) + : + m_name(name), + m_show_from_destructor(true) + { + reset(); + } + + /** Resets the chrono (records the current time in it). */ + void + reset() + { + gettimeofday(&m_tv, NULL); + + getrusage(RUSAGE_SELF, &m_ru); + } + + /** Shows the time elapsed and usage statistics since the last reset. */ + void + show() + { + struct rusage ru_now; + struct timeval tv_now; + struct timeval tv_diff; + + getrusage(RUSAGE_SELF, &ru_now); + + gettimeofday(&tv_now, NULL); + +#ifndef timersub +#define timersub(a, b, r) \ + do { \ + (r)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ + (r)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ + if ((r)->tv_usec < 0) { \ + (r)->tv_sec--; \ + (r)->tv_usec += 1000000; \ + } \ + } while (0) +#endif /* timersub */ + +#define CHRONO_PRINT(type, tvp) \ + fprintf(stderr, "%s: %s% 5ld.%06ld sec\n", \ + m_name, type, \ + static_cast<long>((tvp)->tv_sec), \ + static_cast<long>((tvp)->tv_usec)) + + timersub(&tv_now, &m_tv, &tv_diff); + CHRONO_PRINT("real", &tv_diff); + + timersub(&ru_now.ru_utime, &m_ru.ru_utime, &tv_diff); + CHRONO_PRINT("user", &tv_diff); + + timersub(&ru_now.ru_stime, &m_ru.ru_stime, &tv_diff); + CHRONO_PRINT("sys ", &tv_diff); + } + + /** Cause the timings not to be printed from the destructor. */ + void end() + { + m_show_from_destructor = false; + } + + /** Destructor. */ + ~ut_chrono_t() + { + if (m_show_from_destructor) { + show(); + } + } + +private: + /** Name of this chronometer. */ + const char* m_name; + + /** True if the current timings should be printed by the destructor. */ + bool m_show_from_destructor; + + /** getrusage() result as of the last reset(). */ + struct rusage m_ru; + + /** gettimeofday() result as of the last reset(). */ + struct timeval m_tv; +}; + +#endif /* HAVE_SYS_TIME_H && HAVE_SYS_RESOURCE_H */ + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h new file mode 100644 index 00000000..765f6a2a --- /dev/null +++ b/storage/innobase/include/ut0list.h @@ -0,0 +1,146 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0list.h +A double-linked list + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/*******************************************************************//** +A double-linked list. This differs from the one in ut0lst.h in that in this +one, each list node contains a pointer to the data, whereas the one in +ut0lst.h uses a strategy where the list pointers are embedded in the data +items themselves. + +Use this one when you need to store arbitrary data in the list where you +can't embed the list pointers in the data, if a data item needs to be +stored in multiple lists, etc. + +Note about the memory management: ib_list_t is a fixed-size struct whose +allocation/deallocation is done through ib_list_create/ib_list_free, but the +memory for the list nodes is allocated through a user-given memory heap, +which can either be the same for all nodes or vary per node. Most users will +probably want to create a memory heap to store the item-specific data, and +pass in this same heap to the list node creation functions, thus +automatically freeing the list node when the item's heap is freed. + +************************************************************************/ + +#ifndef IB_LIST_H +#define IB_LIST_H + +#include "mem0mem.h" + +struct ib_list_t; +struct ib_list_node_t;
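For illustration only (not part of the patched sources): nodes of this non-intrusive list are carved from a caller-supplied heap, so freeing that heap releases the nodes, while ib_list_free() releases only the list header. A usage sketch with the functions declared below:

/* Sketch, not InnoDB source: heap-backed ib_list_t usage. */
static void ib_list_example()
{
	ib_list_t*	list = ib_list_create();
	mem_heap_t*	heap = mem_heap_create(100);
	int		item = 42;

	ib_list_add_last(list, &item, heap);	/* node memory from heap */
	ut_a(ib_list_get_first(list)->data == &item);

	mem_heap_free(heap);	/* frees the node */
	ib_list_free(list);	/* frees the list header */
}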
+ +/****************************************************************//** +Create a new list using mem_alloc. Lists created with this function must be +freed with ib_list_free. +@return list */ +ib_list_t* +ib_list_create(void); +/*=================*/ + +/****************************************************************//** +Free a list. */ +void +ib_list_free( +/*=========*/ + ib_list_t* list); /*!< in: list */ + +/****************************************************************//** +Add the data to the end of the list. +@return new list node */ +ib_list_node_t* +ib_list_add_last( +/*=============*/ + ib_list_t* list, /*!< in: list */ + void* data, /*!< in: data */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/****************************************************************//** +Remove the node from the list. */ +void +ib_list_remove( +/*===========*/ + ib_list_t* list, /*!< in: list */ + ib_list_node_t* node); /*!< in: node to remove */ + +/****************************************************************//** +Get the first node in the list. +@return first node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + ib_list_t* list); /*!< in: list */ + +/****************************************************************//** +Get the last node in the list. +@return last node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + ib_list_t* list); /*!< in: list */ + +/******************************************************************** +Check if list is empty. */ +UNIV_INLINE +ibool +ib_list_is_empty( +/*=============*/ + /* out: TRUE if empty else FALSE */ + const ib_list_t* list); /* in: list */ + +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list); /*!< in: list */ + +/* List. */ +struct ib_list_t { + ib_list_node_t* first; /*!< first node */ + ib_list_node_t* last; /*!< last node */ +}; + +/* A list node. */ +struct ib_list_node_t { + ib_list_node_t* prev; /*!< previous node */ + ib_list_node_t* next; /*!< next node */ + void* data; /*!< user data */ +}; + +#include "ut0list.inl" + +#endif diff --git a/storage/innobase/include/ut0list.inl b/storage/innobase/include/ut0list.inl new file mode 100644 --- /dev/null +++ b/storage/innobase/include/ut0list.inl +/****************************************************************//** +Get the first node in the list. +@return first node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + ib_list_t* list) /*!< in: list */ +{ + return(list->first); +} + +/****************************************************************//** +Get the last node in the list. +@return last node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + ib_list_t* list) /*!< in: list */ +{ + return(list->last); +} + +/******************************************************************** +Check if list is empty. */ +UNIV_INLINE +ibool +ib_list_is_empty( +/*=============*/ + /* out: TRUE if empty else FALSE */ + const ib_list_t* list) /* in: list */ +{ + return(!(list->first || list->last)); +} + +/******************************************************************** +Get number of items on list.
+@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list) /*!< in: list */ +{ + ulint len = 0; + ib_list_node_t* node = list->first; + + while(node) { + len++; + node = node->next; + } + + return (len); +} diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h new file mode 100644 index 00000000..7b7ed7b8 --- /dev/null +++ b/storage/innobase/include/ut0lst.h @@ -0,0 +1,563 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0lst.h +List utilities + +Created 9/10/1995 Heikki Tuuri +Rewritten by Sunny Bains Dec 2011. +***********************************************************************/ + +#pragma once + +/* Do not include univ.i because univ.i includes this. */ + +#include "ut0dbg.h" + +/* This module implements the two-way linear list. Note that a single +list node may belong to two or more lists, but is only on one list +at a time. */ + +/*******************************************************************//** +The two way list node. +@param TYPE the list node type name */ +template <typename Type> +struct ut_list_node { + Type* prev; /*!< pointer to the previous + node, NULL if start of list */ + Type* next; /*!< pointer to next node, + NULL if end of list */ + + void reverse() + { + Type* tmp = prev; + prev = next; + next = tmp; + } +}; + +/** Macro used for legacy reasons */ +#define UT_LIST_NODE_T(t) ut_list_node<t> + +/*******************************************************************//** +The two-way list base node. The base node contains pointers to both ends +of the list and a count of nodes in the list (excluding the base node +from the count). We also store a pointer to the member field so that it +doesn't have to be specified when doing list operations. +@param Type the type of the list element +@param NodePtr field member pointer that points to the list node */ +template <typename Type, typename NodePtr> +struct ut_list_base { + typedef Type elem_type; + typedef NodePtr node_ptr; + typedef ut_list_node<Type> node_type; + + ulint count; /*!< count of nodes in list */ + elem_type* start; /*!< pointer to list start, + NULL if empty */ + elem_type* end; /*!< pointer to list end, + NULL if empty */ + node_ptr node; /*!< Pointer to member field + that is used as a link node */ +#ifdef UNIV_DEBUG + ulint init; /*!< UT_LIST_INITIALISED if + the list was initialised with + UT_LIST_INIT() */ +#endif /* UNIV_DEBUG */ + + void reverse() + { + Type* tmp = start; + start = end; + end = tmp; + } +}; + +#define UT_LIST_BASE_NODE_T(t) ut_list_base<t, ut_list_node<t> t::*> + +#ifdef UNIV_DEBUG +# define UT_LIST_INITIALISED 0xCAFE +# define UT_LIST_INITIALISE(b) (b).init = UT_LIST_INITIALISED +# define UT_LIST_IS_INITIALISED(b) ut_a(((b).init == UT_LIST_INITIALISED)) +#else +# define UT_LIST_INITIALISE(b) +# define UT_LIST_IS_INITIALISED(b) +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Note: This is really the list constructor. We should be able to use +placement new here. +Initializes the base node of a two-way list. +@param b the list base node +@param pmf point to member field that will be used as the link node */ +#define UT_LIST_INIT(b, pmf) \ +{ \ + (b).count = 0; \ + (b).start = 0; \ + (b).end = 0; \ + (b).node = pmf; \ + UT_LIST_INITIALISE(b); \ +}
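For illustration only (not part of the patched sources): the link node is embedded in the element itself, so adding an element allocates nothing. A sketch with a hypothetical element type, using UT_LIST_ADD_FIRST/UT_LIST_ADD_LAST and the accessor macros defined further below:

/* Sketch, not InnoDB source: an intrusive two-way list. */
struct example_item_t {
	int				value;
	UT_LIST_NODE_T(example_item_t)	list;	/* embedded link node */
};

static void ut_list_example()
{
	UT_LIST_BASE_NODE_T(example_item_t) base;
	UT_LIST_INIT(base, &example_item_t::list);

	example_item_t	a, b;
	UT_LIST_ADD_LAST(base, &a);
	UT_LIST_ADD_FIRST(base, &b);	/* list is now b, a */
	ut_a(UT_LIST_GET_LEN(base) == 2);
}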
+ +/** Functor for accessing the embedded node within a list element. This is +required because some lists can have the node embedded inside a nested +struct/union. See lock0priv.h (table locks) for an example. It provides a +specialised functor to grant access to the list node. */ +template <typename Type> +struct GenericGetNode { + + typedef ut_list_node<Type> node_type; + + GenericGetNode(node_type Type::* node) : m_node(node) {} + + node_type& operator() (Type& elem) + { + return(elem.*m_node); + } + + node_type Type::*m_node; +}; + +/*******************************************************************//** +Adds the node as the first element in a two-way linked list. +@param list the base node (not a pointer to it) +@param elem the element to add */ +template <typename List> +void +ut_list_prepend( + List& list, + typename List::elem_type* elem) +{ + typename List::node_type& elem_node = elem->*list.node; + + UT_LIST_IS_INITIALISED(list); + + elem_node.prev = 0; + elem_node.next = list.start; + + if (list.start != 0) { + typename List::node_type& base_node = + list.start->*list.node; + + ut_ad(list.start != elem); + + base_node.prev = elem; + } + + list.start = elem; + + if (list.end == 0) { + list.end = elem; + } + + ++list.count; +} + +/*******************************************************************//** +Adds the node as the first element in a two-way linked list. +@param LIST the base node (not a pointer to it) +@param ELEM the element to add */ +#define UT_LIST_ADD_FIRST(LIST, ELEM) ut_list_prepend(LIST, ELEM) + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list.
+@param list list +@param elem the element to add +@param get_node to get the list node for that element */ +template <typename List, typename Functor> +void +ut_list_append( + List& list, + typename List::elem_type* elem, + Functor get_node) +{ + typename List::node_type& node = get_node(*elem); + + UT_LIST_IS_INITIALISED(list); + + node.next = 0; + node.prev = list.end; + + if (list.end != 0) { + typename List::node_type& base_node = get_node(*list.end); + + ut_ad(list.end != elem); + + base_node.next = elem; + } + + list.end = elem; + + if (list.start == 0) { + list.start = elem; + } + + ++list.count; +} + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list. +@param list list +@param elem the element to add */ +template <typename List> +void +ut_list_append( + List& list, + typename List::elem_type* elem) +{ + ut_list_append( + list, elem, + GenericGetNode<typename List::elem_type>(list.node)); +} + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list. +@param LIST list base node (not a pointer to it) +@param ELEM the element to add */ +#define UT_LIST_ADD_LAST(LIST, ELEM) ut_list_append(LIST, ELEM) + +/*******************************************************************//** +Inserts ELEM2 after ELEM1 in a list. +@param list the base node +@param elem1 node after which ELEM2 is inserted +@param elem2 node being inserted after ELEM1 */ +template <typename List> +void +ut_list_insert( + List& list, + typename List::elem_type* elem1, + typename List::elem_type* elem2) +{ + ut_ad(elem1 != elem2); + UT_LIST_IS_INITIALISED(list); + + typename List::node_type& elem1_node = elem1->*list.node; + typename List::node_type& elem2_node = elem2->*list.node; + + elem2_node.prev = elem1; + elem2_node.next = elem1_node.next; + + if (elem1_node.next != NULL) { + typename List::node_type& next_node = + elem1_node.next->*list.node; + + next_node.prev = elem2; + } + + elem1_node.next = elem2; + + if (list.end == elem1) { + list.end = elem2; + } + + ++list.count; +} + +/*******************************************************************//** +Inserts ELEM2 after ELEM1 in a list. +@param LIST list base node (not a pointer to it) +@param ELEM1 node after which ELEM2 is inserted +@param ELEM2 node being inserted after ELEM1 */ +#define UT_LIST_INSERT_AFTER(LIST, ELEM1, ELEM2) \ + ut_list_insert(LIST, ELEM1, ELEM2) + +/*******************************************************************//** +Inserts ELEM2 after ELEM1 in a list. +@param list the base node +@param elem1 node after which ELEM2 is inserted +@param elem2 node being inserted after ELEM1 +@param get_node to get the list node for that element */ + +template <typename List, typename Functor> +void +ut_list_insert( + List& list, + typename List::elem_type* elem1, + typename List::elem_type* elem2, + Functor get_node) +{ + ut_ad(elem1 != elem2); + UT_LIST_IS_INITIALISED(list); + + typename List::node_type& elem1_node = get_node(*elem1); + typename List::node_type& elem2_node = get_node(*elem2); + + elem2_node.prev = elem1; + elem2_node.next = elem1_node.next; + + if (elem1_node.next != NULL) { + typename List::node_type& next_node = + get_node(*elem1_node.next); + + next_node.prev = elem2; + } + + elem1_node.next = elem2; + + if (list.end == elem1) { + list.end = elem2; + } + + ++list.count; + +} +/*******************************************************************//** +Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param node member node within list element that is to be removed
+@param get_node functor to get the list node from elem */
+template <typename List, typename Functor>
+void
+ut_list_remove(
+	List&				list,
+	typename List::node_type&	node,
+	Functor				get_node)
+{
+	ut_a(list.count > 0);
+	UT_LIST_IS_INITIALISED(list);
+
+	if (node.next != NULL) {
+		typename List::node_type&	next_node =
+			get_node(*node.next);
+
+		next_node.prev = node.prev;
+	} else {
+		list.end = node.prev;
+	}
+
+	if (node.prev != NULL) {
+		typename List::node_type&	prev_node =
+			get_node(*node.prev);
+
+		prev_node.next = node.next;
+	} else {
+		list.start = node.next;
+	}
+
+	node.next = 0;
+	node.prev = 0;
+
+	--list.count;
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem element to be removed from the list
+@param get_node functor to get the list node from elem */
+template <typename List, typename Functor>
+void
+ut_list_remove(
+	List&				list,
+	typename List::elem_type*	elem,
+	Functor				get_node)
+{
+	ut_list_remove(list, get_node(*elem), get_node);
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem element to be removed from the list */
+template <typename List>
+void
+ut_list_remove(
+	List&				list,
+	typename List::elem_type*	elem)
+{
+	ut_list_remove(
+		list, elem->*list.node,
+		GenericGetNode<typename List::elem_type>(list.node));
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param LIST the base node (not a pointer to it)
+@param ELEM node to be removed from the list */
+#define UT_LIST_REMOVE(LIST, ELEM)	ut_list_remove(LIST, ELEM)
+
+/********************************************************************//**
+Gets the next node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the successor of N in NAME, or NULL */
+#define UT_LIST_GET_NEXT(NAME, N)	(((N)->NAME).next)
+
+/********************************************************************//**
+Gets the previous node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the predecessor of N in NAME, or NULL */
+#define UT_LIST_GET_PREV(NAME, N)	(((N)->NAME).prev)
+
+/********************************************************************//**
+Alternative macro to get the number of nodes in a two-way list, i.e.,
+its length.
+@param BASE the base node (not a pointer to it).
+@return the number of nodes in the list */
+#define UT_LIST_GET_LEN(BASE)	(BASE).count
+
+/********************************************************************//**
+Gets the first node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return first node, or NULL if the list is empty */
+#define UT_LIST_GET_FIRST(BASE)	(BASE).start
+
+/********************************************************************//**
+Gets the last node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return last node, or NULL if the list is empty */
+#define UT_LIST_GET_LAST(BASE)	(BASE).end
+
+struct NullValidate { void operator()(const void*) const {} };
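+
+/* Typical traversal with the accessor macros, following the same
+hypothetical item_t layout sketched above:
+
+	for (item_t* it = UT_LIST_GET_FIRST(list);
+	     it != NULL;
+	     it = UT_LIST_GET_NEXT(node, it)) {
+		// visit *it
+	}
+*/
+
+/** Iterate over all the elements and call the functor for each element.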
+@param[in]	list	base node (not a pointer to it)
+@param[in,out]	functor	Functor that is called for each element in the list */
+template <typename List, typename Functor>
+inline void ut_list_map(const List& list, Functor& functor)
+{
+	ulint	count = 0;
+
+	UT_LIST_IS_INITIALISED(list);
+
+	for (typename List::elem_type* elem = list.start; elem;
+	     elem = (elem->*list.node).next, ++count) {
+
+		functor(elem);
+	}
+
+	ut_a(count == list.count);
+}
+
+/** Iterate over all the elements and call the functor for each element.
+@param[in]	list	base node (not a pointer to it)
+@param[in]	functor	Functor that is called for each element in the list */
+template <typename List, typename Functor>
+inline void ut_list_map(const List& list, const Functor& functor)
+{
+	ulint	count = 0;
+
+	UT_LIST_IS_INITIALISED(list);
+
+	for (typename List::elem_type* elem = list.start; elem;
+	     elem = (elem->*list.node).next, ++count) {
+
+		functor(elem);
+	}
+
+	ut_a(count == list.count);
+}
+
+/** Check the consistency of a doubly linked list.
+@param[in]	list	base node (not a pointer to it)
+@param[in,out]	functor	Functor that is called for each element in the list */
+template <typename List, typename Functor>
+void ut_list_validate(const List& list, Functor& functor)
+{
+	ut_list_map(list, functor);
+#ifdef UNIV_DEBUG
+	/* Validate the list backwards. */
+	ulint	count = list.count;
+
+	for (typename List::elem_type* elem = list.end;
+	     elem != 0;
+	     elem = (elem->*list.node).prev) {
+		--count;
+	}
+	ut_ad(!count);
+#endif
+}
+
+/** Check the consistency of a doubly linked list.
+@param[in]	list	base node (not a pointer to it)
+@param[in]	functor	Functor that is called for each element in the list */
+template <typename List, typename Functor>
+inline void ut_list_validate(const List& list, const Functor& functor)
+{
+	ut_list_map(list, functor);
+#ifdef UNIV_DEBUG
+	/* Validate the list backwards. */
+	ulint	count = list.count;
+
+	for (typename List::elem_type* elem = list.end;
+	     elem != 0;
+	     elem = (elem->*list.node).prev) {
+		--count;
+	}
+
+	ut_ad(!count);
+#endif
+}
+
+template <typename List>
+inline void ut_list_validate(const List& list)
+{
+	ut_d(ut_list_validate(list, NullValidate()));
+}
+
+#ifdef UNIV_DEBUG
+template <typename List>
+inline void ut_list_reverse(List& list)
+{
+	UT_LIST_IS_INITIALISED(list);
+
+	for (typename List::elem_type* elem = list.start;
+	     elem != 0;
+	     elem = (elem->*list.node).prev) {
+		(elem->*list.node).reverse();
+	}
+
+	list.reverse();
+}
+
+/** Check if the given element exists in the list.
+@param[in,out]	list	the list object
+@param[in]	elem	the element of the list which will be checked */
+template <typename List>
+inline bool ut_list_exists(const List& list, typename List::elem_type* elem)
+{
+	for (typename List::elem_type* e1 = UT_LIST_GET_FIRST(list); e1;
+	     e1 = (e1->*list.node).next) {
+		if (elem == e1) {
+			return true;
+		}
+	}
+	return false;
+}
+#endif
+
+/** Move the given element to the beginning of the list.
+@param[in,out]	list	the list object
+@param[in]	elem	the element of the list which will be moved
+			to the beginning of the list. */
+template <typename List>
+void
+ut_list_move_to_front(
+	List&				list,
+	typename List::elem_type*	elem)
+{
+	ut_ad(ut_list_exists(list, elem));
+
+	if (UT_LIST_GET_FIRST(list) != elem) {
+		ut_list_remove(list, elem);
+		ut_list_prepend(list, elem);
+	}
+}
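+
+/* A functor passed to ut_list_map() or ut_list_validate() only needs an
+operator() taking an element pointer; a hypothetical counting example,
+again using the item_t sketched above:
+
+	struct Counter {
+		ulint	n;
+		Counter() : n(0) {}
+		void operator()(item_t* elem) { ++n; }
+	};
+
+	Counter	c;
+	ut_list_map(list, c);	// afterwards c.n == UT_LIST_GET_LEN(list)
+*/
diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h
new file mode 100644
index 00000000..a5ed72f9
--- /dev/null
+++ b/storage/innobase/include/ut0mem.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.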
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.h
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef ut0mem_h
+#define ut0mem_h
+
+#include "univ.i"
+
+/********************************************************************
+Concatenate 3 strings.*/
+char*
+ut_str3cat(
+/*=======*/
+				/* out, own: concatenated string, must be
+				freed with ut_free() */
+	const char*	s1,	/* in: string 1 */
+	const char*	s2,	/* in: string 2 */
+	const char*	s3);	/* in: string 3 */
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex"; make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+	const void*	raw,		/*!< in: raw data */
+	ulint		raw_size,	/*!< in: "raw" length in bytes */
+	char*		hex,		/*!< out: hex string */
+	ulint		hex_size);	/*!< in: "hex" size in bytes */
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+	const char*	str,		/*!< in: string */
+	ulint		str_len,	/*!< in: string length in bytes */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size);	/*!< in: output buffer size
+					in bytes */
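+
+/* Usage sketches; the buffer sizes are chosen to satisfy the bounds
+documented above:
+
+	const unsigned char	raw[2] = {0xAB, 0xCD};
+	char			hex[2 * sizeof raw + 1];
+	ut_raw_to_hex(raw, sizeof raw, hex, sizeof hex);
+	// hex now holds "ABCD"
+
+	char	quoted[16];
+	ut_str_sql_format("o'hara", 6, quoted, sizeof quoted);
+	// quoted now holds "'o''hara'"
+*/
+
+#include "ut0mem.inl"
+
+#endif
diff --git a/storage/innobase/include/ut0mem.inl b/storage/innobase/include/ut0mem.inl
new file mode 100644
index 00000000..cc95a036
--- /dev/null
+++ b/storage/innobase/include/ut0mem.inl
@@ -0,0 +1,246 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.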
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0mem.ic +Memory primitives + +Created 5/30/1994 Heikki Tuuri +************************************************************************/ + +#include "ut0byte.h" +#include "mach0data.h" + +/**********************************************************************//** +Converts a raw binary data to a NUL-terminated hex string. The output is +truncated if there is not enough space in "hex", make sure "hex_size" is at +least (2 * raw_size + 1) if you do not want this to happen. Returns the +actual number of characters written to "hex" (including the NUL). +@return number of chars written */ +UNIV_INLINE +ulint +ut_raw_to_hex( +/*==========*/ + const void* raw, /*!< in: raw data */ + ulint raw_size, /*!< in: "raw" length in bytes */ + char* hex, /*!< out: hex string */ + ulint hex_size) /*!< in: "hex" size in bytes */ +{ + +#ifdef WORDS_BIGENDIAN + +#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b)) + +#define UINT16_GET_A(u) ((char) ((u) >> 8)) +#define UINT16_GET_B(u) ((char) ((u) & 0xFF)) + +#else /* WORDS_BIGENDIAN */ + +#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a)) + +#define UINT16_GET_A(u) ((char) ((u) & 0xFF)) +#define UINT16_GET_B(u) ((char) ((u) >> 8)) + +#endif /* WORDS_BIGENDIAN */ + +#define MK_ALL_UINT16_WITH_A(a) \ + MK_UINT16(a, '0'), \ + MK_UINT16(a, '1'), \ + MK_UINT16(a, '2'), \ + MK_UINT16(a, '3'), \ + MK_UINT16(a, '4'), \ + MK_UINT16(a, '5'), \ + MK_UINT16(a, '6'), \ + MK_UINT16(a, '7'), \ + MK_UINT16(a, '8'), \ + MK_UINT16(a, '9'), \ + MK_UINT16(a, 'A'), \ + MK_UINT16(a, 'B'), \ + MK_UINT16(a, 'C'), \ + MK_UINT16(a, 'D'), \ + MK_UINT16(a, 'E'), \ + MK_UINT16(a, 'F') + + static const uint16 hex_map[256] = { + MK_ALL_UINT16_WITH_A('0'), + MK_ALL_UINT16_WITH_A('1'), + MK_ALL_UINT16_WITH_A('2'), + MK_ALL_UINT16_WITH_A('3'), + MK_ALL_UINT16_WITH_A('4'), + MK_ALL_UINT16_WITH_A('5'), + MK_ALL_UINT16_WITH_A('6'), + MK_ALL_UINT16_WITH_A('7'), + MK_ALL_UINT16_WITH_A('8'), + MK_ALL_UINT16_WITH_A('9'), + MK_ALL_UINT16_WITH_A('A'), + MK_ALL_UINT16_WITH_A('B'), + MK_ALL_UINT16_WITH_A('C'), + MK_ALL_UINT16_WITH_A('D'), + MK_ALL_UINT16_WITH_A('E'), + MK_ALL_UINT16_WITH_A('F') + }; + const unsigned char* rawc; + ulint read_bytes; + ulint write_bytes; + ulint i; + + rawc = (const unsigned char*) raw; + + if (hex_size == 0) { + + return(0); + } + + if (hex_size <= 2 * raw_size) { + + read_bytes = hex_size / 2; + write_bytes = hex_size; + } else { + + read_bytes = raw_size; + write_bytes = 2 * raw_size + 1; + } + +#define LOOP_READ_BYTES(ASSIGN) \ + for (i = 0; i < read_bytes; i++) { \ + ASSIGN; \ + hex += 2; \ + rawc++; \ + } + + if (ut_align_offset(hex, 2) == 0) { + + LOOP_READ_BYTES( + *(uint16*) hex = hex_map[*rawc] + ); + } else { + + LOOP_READ_BYTES( + *hex = UINT16_GET_A(hex_map[*rawc]); + *(hex + 1) = UINT16_GET_B(hex_map[*rawc]) + ); + } + + if (hex_size <= 2 * raw_size && hex_size % 2 == 0) { + + hex--; + } + + *hex = '\0'; + + return(write_bytes); +} + +/*******************************************************************//** +Adds single quotes to the start and end of string and escapes any quotes +by doubling them. 
Returns the number of bytes that were written to "buf" +(including the terminating NUL). If buf_size is too small then the +trailing bytes from "str" are discarded. +@return number of bytes that were written */ +UNIV_INLINE +ulint +ut_str_sql_format( +/*==============*/ + const char* str, /*!< in: string */ + ulint str_len, /*!< in: string length in bytes */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + ulint str_i; + ulint buf_i; + + buf_i = 0; + + switch (buf_size) { + case 3: + + if (str_len == 0) { + + buf[buf_i] = '\''; + buf_i++; + buf[buf_i] = '\''; + buf_i++; + } + /* FALLTHROUGH */ + case 2: + case 1: + + buf[buf_i] = '\0'; + buf_i++; + /* FALLTHROUGH */ + case 0: + + return(buf_i); + } + + /* buf_size >= 4 */ + + buf[0] = '\''; + buf_i = 1; + + for (str_i = 0; str_i < str_len; str_i++) { + + char ch; + + if (buf_size - buf_i == 2) { + + break; + } + + ch = str[str_i]; + + switch (ch) { + case '\0': + + if (buf_size - buf_i < 4) { + + goto func_exit; + } + buf[buf_i] = '\\'; + buf_i++; + buf[buf_i] = '0'; + buf_i++; + break; + case '\'': + case '\\': + + if (buf_size - buf_i < 4) { + + goto func_exit; + } + buf[buf_i] = ch; + buf_i++; + /* FALLTHROUGH */ + default: + + buf[buf_i] = ch; + buf_i++; + } + } + +func_exit: + + buf[buf_i] = '\''; + buf_i++; + buf[buf_i] = '\0'; + buf_i++; + + return(buf_i); +} diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h new file mode 100644 index 00000000..f4183e4c --- /dev/null +++ b/storage/innobase/include/ut0new.h @@ -0,0 +1,1099 @@ +/***************************************************************************** + +Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file ut/ut0new.h +Instrumented memory allocator. + +Created May 26, 2014 Vasil Dimov +*******************************************************/ + +/** Dynamic memory allocation within InnoDB guidelines. +All dynamic (heap) memory allocations (malloc(3), strdup(3), etc, "new", +various std:: containers that allocate memory internally), that are done +within InnoDB are instrumented. This means that InnoDB uses a custom set +of functions for allocating memory, rather than calling e.g. "new" directly. + +Here follows a cheat sheet on what InnoDB functions to use whenever a +standard one would have been used. 
+
+Creating new objects with "new":
+--------------------------------
+Standard:
+	new expression
+	or
+	new(std::nothrow) expression
+InnoDB, default instrumentation:
+	UT_NEW_NOKEY(expression)
+InnoDB, custom instrumentation, preferred:
+	UT_NEW(expression, key)
+
+Destroying objects, created with "new":
+---------------------------------------
+Standard:
+	delete ptr
+InnoDB:
+	UT_DELETE(ptr)
+
+Creating new arrays with "new[]":
+---------------------------------
+Standard:
+	new type[num]
+	or
+	new(std::nothrow) type[num]
+InnoDB, default instrumentation:
+	UT_NEW_ARRAY_NOKEY(type, num)
+InnoDB, custom instrumentation, preferred:
+	UT_NEW_ARRAY(type, num, key)
+
+Destroying arrays, created with "new[]":
+----------------------------------------
+Standard:
+	delete[] ptr
+InnoDB:
+	UT_DELETE_ARRAY(ptr)
+
+Declaring a type with a std:: container, e.g. std::vector:
+----------------------------------------------------------
+Standard:
+	std::vector<t>
+InnoDB:
+	std::vector<t, ut_allocator<t> >
+
+Declaring objects of some std:: type:
+-------------------------------------
+Standard:
+	std::vector<t> v
+InnoDB, default instrumentation:
+	std::vector<t, ut_allocator<t> > v
+InnoDB, custom instrumentation, preferred:
+	std::vector<t, ut_allocator<t> > v(ut_allocator<t>(key))
+
+Raw block allocation (as usual in C++, consider whether using "new" would
+not be more appropriate):
+-------------------------------------------------------------------------
+Standard:
+	malloc(num)
+InnoDB, default instrumentation:
+	ut_malloc_nokey(num)
+InnoDB, custom instrumentation, preferred:
+	ut_malloc(num, key)
+
+Raw block resize:
+-----------------
+Standard:
+	realloc(ptr, new_size)
+InnoDB:
+	ut_realloc(ptr, new_size)
+
+Raw block deallocation:
+-----------------------
+Standard:
+	free(ptr)
+InnoDB:
+	ut_free(ptr)
+
+Note: the expression passed to UT_NEW() or UT_NEW_NOKEY() must always end
+with (), thus:
+Standard:
+	new int
+InnoDB:
+	UT_NEW_NOKEY(int())
+*/
+
+#ifndef ut0new_h
+#define ut0new_h
+
+#include <limits>		/* std::numeric_limits */
+#include <thread>
+
+#include <stdint.h>
+#include <stdlib.h>		/* malloc() */
+#include <string.h>		/* strlen(), strrchr(), strncmp() */
+
+#include <my_sys.h>		/* my_large_free/malloc() */
+
+#include "my_global.h"	/* needed for headers from mysql/psi/ */
+
+#include "mysql/psi/mysql_memory.h" /* PSI_MEMORY_CALL() */
+
+#include "mysql/psi/psi_memory.h" /* PSI_memory_key, PSI_memory_info */
+
+#include "ut0ut.h" /* ut_strcmp_functor */
+
+#define	OUT_OF_MEMORY_MSG \
+	"Check if you should increase the swap file or ulimits of your" \
+	" operating system. Note that on most 32-bit computers the process" \
+	" memory space is limited to 2 GB or 4 GB."
+
+/** The total amount of memory currently allocated from the operating
+system with allocate_large() */
+extern Atomic_counter<ulint>	os_total_large_mem_allocated;
+
+/** Maximum number of retries to allocate memory. */
+extern const size_t	alloc_max_retries;
+
+constexpr uint32_t INVALID_AUTOEVENT_IDX = 0xFFFFFFFFU;
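+
+/* A container declaration following the cheat sheet above; any key
+registered with PFS (for example one of the mem_key_* variables declared
+below) can be handed to the allocator:
+
+	ut_allocator<int>			alloc(mem_key_buf_buf_pool);
+	std::vector<int, ut_allocator<int> >	v(alloc);
+	v.push_back(42);	// accounted under that key when PFS is enabled
+*/
+
+/** Keys for registering allocations with performance schema.
+Pointers to these variables are supplied to PFS code via the pfs_info[]
+array and the PFS code initializes them via PSI_MEMORY_CALL(register_memory)().
+mem_key_other and mem_key_std are special in the following way (see also
+ut_allocator::get_mem_key()):
+* If the caller has not provided a key and the file name of the caller is
+  unknown, then mem_key_std will be used. This happens only when called from
+  within std::* containers.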
+* If the caller has not provided a key and the file name of the caller is + known, but is not amongst the predefined names (see ut_new_boot()) then + mem_key_other will be used. Generally this should not happen and if it + happens then that means that the list of predefined names must be extended. +Keep this list alphabetically sorted. */ +extern PSI_memory_key mem_key_ahi; +extern PSI_memory_key mem_key_buf_buf_pool; +extern PSI_memory_key mem_key_dict_stats_bg_recalc_pool_t; +extern PSI_memory_key mem_key_dict_stats_index_map_t; +extern PSI_memory_key mem_key_dict_stats_n_diff_on_level; +extern PSI_memory_key mem_key_other; +extern PSI_memory_key mem_key_row_log_buf; +extern PSI_memory_key mem_key_row_merge_sort; +extern PSI_memory_key mem_key_std; + +/** Setup the internal objects needed for UT_NEW() to operate. +This must be called before the first call to UT_NEW(). */ +void +ut_new_boot(); + +#ifdef UNIV_PFS_MEMORY + +/** +Retrieve a memory key (registered with PFS), +given AUTOEVENT_IDX of the caller + +@param[in] autoevent_idx - AUTOEVENT_IDX value of the caller +@return registered memory key or PSI_NOT_INSTRUMENTED */ +PSI_memory_key ut_new_get_key_by_file(uint32_t autoevent_idx); + +#endif /* UNIV_PFS_MEMORY */ + +/** A structure that holds the necessary data for performance schema +accounting. An object of this type is put in front of each allocated block +of memory when allocation is done by ut_allocator::allocate(). This is +because the data is needed even when freeing the memory. Users of +ut_allocator::allocate_large() are responsible for maintaining this +themselves. */ +struct ut_new_pfx_t { + +#ifdef UNIV_PFS_MEMORY + + /** Performance schema key. Assigned to a name at startup via + PSI_MEMORY_CALL(register_memory)() and later used for accounting + allocations and deallocations with + PSI_MEMORY_CALL(memory_alloc)(key, size, owner) and + PSI_MEMORY_CALL(memory_free)(key, size, owner). */ + PSI_memory_key m_key; + + /** + Thread owner. + Instrumented thread that owns the allocated memory. + This state is used by the performance schema to maintain + per thread statistics, + when memory is given from thread A to thread B. + */ + struct PSI_thread *m_owner; + +#endif /* UNIV_PFS_MEMORY */ + + /** Size of the allocated block in bytes, including this prepended + aux structure (for ut_allocator::allocate()). For example if InnoDB + code requests to allocate 100 bytes, and sizeof(ut_new_pfx_t) is 16, + then 116 bytes are allocated in total and m_size will be 116. + ut_allocator::allocate_large() does not prepend this struct to the + allocated block and its users are responsible for maintaining it + and passing it later to ut_allocator::deallocate_large(). */ + size_t m_size; +#if SIZEOF_VOIDP == 4 + /** Pad the header size to a multiple of 64 bits on 32-bit systems, + so that the payload will be aligned to 64 bits. 
*/ + size_t pad; +#endif +}; + +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) +static inline void ut_dontdump(void *ptr, size_t m_size, bool dontdump) +{ + ut_a(ptr != NULL); + + if (dontdump && madvise(ptr, m_size, MADV_DONTDUMP)) { + ib::warn() << "Failed to set memory to " DONTDUMP_STR ": " + << strerror(errno) + << " ptr " << ptr + << " size " << m_size; + } +} + +static inline void ut_dodump(void* ptr, size_t m_size) +{ + if (ptr && madvise(ptr, m_size, MADV_DODUMP)) { + ib::warn() << "Failed to set memory to " DODUMP_STR ": " + << strerror(errno) + << " ptr " << ptr + << " size " << m_size; + } +} +#else +static inline void ut_dontdump(void *, size_t, bool) {} +static inline void ut_dodump(void*, size_t) {} +#endif + +/** Allocator class for allocating memory from inside std::* containers. +@tparam T type of allocated object +@tparam oom_fatal whether to commit suicide when running out of memory */ +template +class ut_allocator { +public: + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + +#ifdef UNIV_PFS_MEMORY + /** Default constructor. */ + explicit + ut_allocator(PSI_memory_key key = PSI_NOT_INSTRUMENTED) + : m_key(key) + { + } +#else + ut_allocator() = default; + ut_allocator(PSI_memory_key) {} +#endif /* UNIV_PFS_MEMORY */ + + /** Constructor from allocator of another type. */ + template + ut_allocator(const ut_allocator& +#ifdef UNIV_PFS_MEMORY + other +#endif + ) + { +#ifdef UNIV_PFS_MEMORY + const PSI_memory_key other_key = other.get_mem_key(); + + m_key = (other_key != mem_key_std) + ? other_key + : PSI_NOT_INSTRUMENTED; +#endif /* UNIV_PFS_MEMORY */ + } + + /** Return the maximum number of objects that can be allocated by + this allocator. */ + size_type + max_size() const + { + const size_type s_max = std::numeric_limits::max(); + +#ifdef UNIV_PFS_MEMORY + return((s_max - sizeof(ut_new_pfx_t)) / sizeof(T)); +#else + return(s_max / sizeof(T)); +#endif /* UNIV_PFS_MEMORY */ + } + + pointer allocate(size_type n) { return allocate(n, NULL, INVALID_AUTOEVENT_IDX); } + + /** Allocate a chunk of memory that can hold 'n_elements' objects of + type 'T' and trace the allocation. + If the allocation fails this method may throw an exception. This + is mandated by the standard and if it returns NULL instead, then + STL containers that use it (e.g. std::vector) may get confused. + After successfull allocation the returned pointer must be passed + to ut_allocator::deallocate() when no longer needed. + @param[in] n_elements number of elements + @param[in] set_to_zero if true, then the returned memory is + initialized with 0x0 bytes. + @param[in] throw_on_error if true, raize exception if too big + @return pointer to the allocated memory */ + pointer + allocate( + size_type n_elements, + const_pointer, + uint32_t +#ifdef UNIV_PFS_MEMORY + autoevent_idx /* AUTOEVENT_IDX of the caller */ +#endif + , + bool set_to_zero = false, + bool throw_on_error = true) + { + if (n_elements == 0) { + return(NULL); + } + + if (n_elements > max_size()) { + if (throw_on_error) { + throw(std::bad_alloc()); + } else { + return(NULL); + } + } + + void* ptr; + size_t total_bytes = n_elements * sizeof(T); + +#ifdef UNIV_PFS_MEMORY + /* The header size must not ruin the 64-bit alignment + on 32-bit systems. Some allocated structures use + 64-bit fields. 
*/ + ut_ad((sizeof(ut_new_pfx_t) & 7) == 0); + total_bytes += sizeof(ut_new_pfx_t); +#endif /* UNIV_PFS_MEMORY */ + + for (size_t retries = 1; ; retries++) { + + if (set_to_zero) { + ptr = calloc(1, total_bytes); + } else { + ptr = malloc(total_bytes); + } + + if (ptr != NULL || retries >= alloc_max_retries) { + break; + } + + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + + if (ptr == NULL) { + ib::fatal_or_error(oom_fatal) + << "Cannot allocate " << total_bytes + << " bytes of memory after " + << alloc_max_retries << " retries over " + << alloc_max_retries << " seconds. OS error: " + << strerror(errno) << " (" << errno << "). " + << OUT_OF_MEMORY_MSG; + if (throw_on_error) { + throw(std::bad_alloc()); + } else { + return(NULL); + } + } + +#ifdef UNIV_PFS_MEMORY + ut_new_pfx_t* pfx = static_cast(ptr); + + allocate_trace(total_bytes, autoevent_idx, pfx); + + return(reinterpret_cast(pfx + 1)); +#else + return(reinterpret_cast(ptr)); +#endif /* UNIV_PFS_MEMORY */ + } + + /** Free a memory allocated by allocate() and trace the deallocation. + @param[in,out] ptr pointer to memory to free */ + void deallocate(pointer ptr, size_type n_elements = 0) + { +#ifdef UNIV_PFS_MEMORY + if (ptr == NULL) { + return; + } + + ut_new_pfx_t* pfx = reinterpret_cast(ptr) - 1; + + deallocate_trace(pfx); + + free(pfx); +#else + free(ptr); +#endif /* UNIV_PFS_MEMORY */ + } + + /** Create an object of type 'T' using the value 'val' over the + memory pointed by 'p'. */ + void + construct( + pointer p, + const T& val) + { + new(p) T(val); + } + + /** Destroy an object pointed by 'p'. */ + void + destroy( + pointer p) + { + p->~T(); + } + + /** Return the address of an object. */ + pointer + address( + reference x) const + { + return(&x); + } + + /** Return the address of a const object. */ + const_pointer + address( + const_reference x) const + { + return(&x); + } + + template + struct rebind { + typedef ut_allocator other; + }; + + /* The following are custom methods, not required by the standard. */ + +#ifdef UNIV_PFS_MEMORY + + /** realloc(3)-like method. + The passed in ptr must have been returned by allocate() and the + pointer returned by this method must be passed to deallocate() when + no longer needed. + @param[in,out] ptr old pointer to reallocate + @param[in] n_elements new number of elements to allocate + @param[in] file file name of the caller + @return newly allocated memory */ + pointer + reallocate( + void* ptr, + size_type n_elements, + uint32_t autoevent_idx) + { + if (n_elements == 0) { + deallocate(static_cast(ptr)); + return(NULL); + } + + if (ptr == NULL) { + return(allocate(n_elements, NULL, autoevent_idx, false, false)); + } + + if (n_elements > max_size()) { + return(NULL); + } + + ut_new_pfx_t* pfx_old; + ut_new_pfx_t* pfx_new; + size_t total_bytes; + + pfx_old = reinterpret_cast(ptr) - 1; + + total_bytes = n_elements * sizeof(T) + sizeof(ut_new_pfx_t); + + for (size_t retries = 1; ; retries++) { + + pfx_new = static_cast( + realloc(pfx_old, total_bytes)); + + if (pfx_new != NULL || retries >= alloc_max_retries) { + break; + } + + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + + if (pfx_new == NULL) { + ib::fatal_or_error(oom_fatal) + << "Cannot reallocate " << total_bytes + << " bytes of memory after " + << alloc_max_retries << " retries over " + << alloc_max_retries << " seconds. OS error: " + << strerror(errno) << " (" << errno << "). 
" + << OUT_OF_MEMORY_MSG; + return(NULL); + } + + /* pfx_new still contains the description of the old block + that was presumably freed by realloc(). */ + deallocate_trace(pfx_new); + + /* pfx_new is set here to describe the new block. */ + allocate_trace(total_bytes, autoevent_idx, pfx_new); + + return(reinterpret_cast(pfx_new + 1)); + } + + /** Allocate, trace the allocation and construct 'n_elements' objects + of type 'T'. If the allocation fails or if some of the constructors + throws an exception, then this method will return NULL. It does not + throw exceptions. After successfull completion the returned pointer + must be passed to delete_array() when no longer needed. + @param[in] n_elements number of elements to allocate + @param[in] file file name of the caller + @return pointer to the first allocated object or NULL */ + pointer + new_array( + size_type n_elements, + uint32_t autoevent_idx + ) + { + T* p = allocate(n_elements, NULL, autoevent_idx, false, false); + + if (p == NULL) { + return(NULL); + } + + T* first = p; + size_type i; + + try { + for (i = 0; i < n_elements; i++) { + new(p) T; + ++p; + } + } catch (...) { + for (size_type j = 0; j < i; j++) { + --p; + p->~T(); + } + + deallocate(first); + + throw; + } + + return(first); + } + + /** Destroy, deallocate and trace the deallocation of an array created + by new_array(). + @param[in,out] ptr pointer to the first object in the array */ + void + delete_array( + T* ptr) + { + if (ptr == NULL) { + return; + } + + const size_type n_elements = n_elements_allocated(ptr); + + T* p = ptr + n_elements - 1; + + for (size_type i = 0; i < n_elements; i++) { + p->~T(); + --p; + } + + deallocate(ptr); + } + +#endif /* UNIV_PFS_MEMORY */ + + /** Allocate a large chunk of memory that can hold 'n_elements' + objects of type 'T' and trace the allocation. + @param[in] n_elements number of elements + @param[in] dontdump if true, advise the OS is not to core + dump this memory. + @param[out] pfx storage for the description of the + allocated memory. The caller must provide space for this one and keep + it until the memory is no longer needed and then pass it to + deallocate_large(). + @return pointer to the allocated memory or NULL */ + pointer + allocate_large( + size_type n_elements, + ut_new_pfx_t* pfx, + bool dontdump = false) + { + if (n_elements == 0 || n_elements > max_size()) { + return(NULL); + } + + ulint n_bytes = n_elements * sizeof(T); + + pointer ptr = reinterpret_cast( + my_large_malloc(&n_bytes, MYF(0))); + + if (ptr == NULL) { + return NULL; + } + + ut_dontdump(ptr, n_bytes, dontdump); + + if (pfx != NULL) { +#ifdef UNIV_PFS_MEMORY + allocate_trace(n_bytes, 0, pfx); +#endif /* UNIV_PFS_MEMORY */ + pfx->m_size = n_bytes; + } + + os_total_large_mem_allocated += n_bytes; + + return(ptr); + } + + pointer + allocate_large_dontdump( + size_type n_elements, + ut_new_pfx_t* pfx) + { + return allocate_large(n_elements, pfx, true); + } + /** Free a memory allocated by allocate_large() and trace the + deallocation. + @param[in,out] ptr pointer to memory to free + @param[in] pfx descriptor of the memory, as returned by + allocate_large(). 
*/ + void + deallocate_large( + pointer ptr, + const ut_new_pfx_t* pfx) + { + size_t size = pfx->m_size; +#ifdef UNIV_PFS_MEMORY + if (pfx) { + deallocate_trace(pfx); + } +#endif /* UNIV_PFS_MEMORY */ + os_total_large_mem_allocated -= size; + + my_large_free(ptr, size); + } + + void + deallocate_large_dodump( + pointer ptr, + const ut_new_pfx_t* pfx) + { + ut_dodump(ptr, pfx->m_size); + deallocate_large(ptr, pfx); + } + +#ifdef UNIV_PFS_MEMORY + /** Get the performance schema key to use for tracing allocations. + @param[in] file file name of the caller or NULL if unknown + @return performance schema key */ + PSI_memory_key + get_mem_key( + uint32_t autoevent_idx = INVALID_AUTOEVENT_IDX) const + { + if (m_key != PSI_NOT_INSTRUMENTED) { + return(m_key); + } + + if (autoevent_idx == INVALID_AUTOEVENT_IDX) { + return(mem_key_std); + } + const PSI_memory_key key = ut_new_get_key_by_file(autoevent_idx); + + if (key != PSI_NOT_INSTRUMENTED) { + return(key); + } + + return(mem_key_other); + } + +private: + + /** Retrieve the size of a memory block allocated by new_array(). + @param[in] ptr pointer returned by new_array(). + @return size of memory block */ + size_type + n_elements_allocated( + const_pointer ptr) + { + const ut_new_pfx_t* pfx + = reinterpret_cast(ptr) - 1; + + const size_type user_bytes + = pfx->m_size - sizeof(ut_new_pfx_t); + + ut_ad(user_bytes % sizeof(T) == 0); + + return(user_bytes / sizeof(T)); + } + + /** Trace a memory allocation. + After the accounting, the data needed for tracing the deallocation + later is written into 'pfx'. + The PFS event name is picked on the following criteria: + 1. If key (!= PSI_NOT_INSTRUMENTED) has been specified when constructing + this ut_allocator object, then the name associated with that key will + be used (this is the recommended approach for new code) + 2. Otherwise, if "file" is NULL, then the name associated with + mem_key_std will be used + 3. Otherwise, if an entry is found by ut_new_get_key_by_file(), that + corresponds to "file", that will be used (see ut_new_boot()) + 4. Otherwise, the name associated with mem_key_other will be used. + @param[in] size number of bytes that were allocated + @param[in] autoevent_idx autoevent_idx of the caller + @param[out] pfx placeholder to store the info which will be + needed when freeing the memory */ + void + allocate_trace( + size_t size, + const uint32_t autoevent_idx, + ut_new_pfx_t* pfx) + { + const PSI_memory_key key = get_mem_key(autoevent_idx); + + pfx->m_key = PSI_MEMORY_CALL(memory_alloc)(key, size, & pfx->m_owner); + pfx->m_size = size; + } + + /** Trace a memory deallocation. + @param[in] pfx info for the deallocation */ + void + deallocate_trace( + const ut_new_pfx_t* pfx) + { + PSI_MEMORY_CALL(memory_free)(pfx->m_key, pfx->m_size, pfx->m_owner); + } + + /** Performance schema key. */ + PSI_memory_key m_key; + +#endif /* UNIV_PFS_MEMORY */ + +private: + + /** Assignment operator, not used, thus disabled (private). */ + template + void + operator=( + const ut_allocator&); +}; + +/** Compare two allocators of the same type. +As long as the type of A1 and A2 is the same, a memory allocated by A1 +could be freed by A2 even if the pfs mem key is different. */ +template +inline +bool +operator==(const ut_allocator&, const ut_allocator&) { return(true); } + +/** Compare two allocators of the same type. */ +template +inline +bool +operator!=( + const ut_allocator& lhs, + const ut_allocator& rhs) +{ + return(!(lhs == rhs)); +} + +#ifdef UNIV_PFS_MEMORY + +/* + constexpr trickery ahead. 
+ + Compute AUTOEVENT_IDX at compile time. + (index in the auto_event_names array, corresponding to basename of __FILE__) + + The tricks are necessary to reduce the cost of lookup the + PSI_memory_key for auto event. +*/ + +static constexpr const char* cexpr_basename_helper(const char* s, const char* last_slash) +{ + return + *s == '\0' ? last_slash : + *s == '/' || *s == '\\' ? cexpr_basename_helper(s + 1, s + 1) : + cexpr_basename_helper(s + 1, last_slash); +} + +static constexpr const char* cexpr_basename(const char* filename) +{ + return cexpr_basename_helper(filename, filename); +} + +static constexpr bool cexpr_strequal_ignore_dot(const char* a, const char* b) +{ + return *a == 0 || *a == '.' ? (*b == 0 || *b == '.') + : *a == *b ? cexpr_strequal_ignore_dot(a + 1, b + 1) : false; +} + +constexpr const char* const auto_event_names[] = +{ + "btr0btr", + "btr0buf", + "btr0bulk", + "btr0cur", + "btr0pcur", + "btr0sea", + "buf0buf", + "buf0dblwr", + "buf0dump", + "buf0lru", + "buf0rea", + "dict0dict", + "dict0mem", + "dict0stats", + "eval0eval", + "fil0crypt", + "fil0fil", + "fsp0file", + "fts0ast", + "fts0blex", + "fts0config", + "fts0file", + "fts0fts", + "fts0opt", + "fts0pars", + "fts0que", + "fts0sql", + "fts0tlex", + "gis0sea", + "ha_innodb", + "handler0alter", + "hash0hash", + "i_s", + "lexyy", + "lock0lock", + "mem0mem", + "os0file", + "pars0lex", + "rem0rec", + "row0ftsort", + "row0import", + "row0log", + "row0merge", + "row0mysql", + "row0sel", + "srv0start", + "trx0i_s", + "trx0i_s", + "trx0roll", + "trx0rseg", + "trx0seg", + "trx0trx", + "trx0undo", + "ut0list", + "ut0mem", + "ut0new", + "ut0pool", + "ut0rbt", + "ut0wqueue", + "xtrabackup", + nullptr +}; + +constexpr uint32_t cexpr_lookup_auto_event_name(const char* name, uint32_t idx = 0) +{ + return !auto_event_names[idx] ? INVALID_AUTOEVENT_IDX : + cexpr_strequal_ignore_dot(name, auto_event_names[idx]) ? idx : + cexpr_lookup_auto_event_name(name, idx + 1); +} + +/* + The AUTOEVENT_IDX macro. + + Note, that there is a static_assert that checks whether + basename of the __FILE is not registered in the auto_event_names array. + If you run into this assert, add the basename to the array. + + Weird looking lambda is used to force the evaluation at the compile time. +*/ +#define AUTOEVENT_IDX []()\ +{\ + constexpr auto idx = cexpr_lookup_auto_event_name(cexpr_basename(__FILE__)); \ + static_assert(idx != INVALID_AUTOEVENT_IDX, "auto_event_names contains no entry for " __FILE__);\ + return idx; \ +}() + + +/** Allocate, trace the allocation and construct an object. +Use this macro instead of 'new' within InnoDB. +For example: instead of + Foo* f = new Foo(args); +use: + Foo* f = UT_NEW(Foo(args), mem_key_some); +Upon failure to allocate the memory, this macro may return NULL. It +will not throw exceptions. After successfull allocation the returned +pointer must be passed to UT_DELETE() when no longer needed. +@param[in] expr any expression that could follow "new" +@param[in] key performance schema memory tracing key +@return pointer to the created object or NULL */ +#define UT_NEW(expr, key) \ + /* Placement new will return NULL and not attempt to construct an + object if the passed in pointer is NULL, e.g. if allocate() has + failed to allocate memory and has returned NULL. */ \ + ::new(ut_allocator(key).allocate( \ + sizeof expr, NULL, AUTOEVENT_IDX, false, false)) expr + +/** Allocate, trace the allocation and construct an object. 
+Use this macro instead of 'new' within InnoDB and instead of UT_NEW()
+when creating a dedicated memory key is not feasible.
+For example: instead of
+	Foo*	f = new Foo(args);
+use:
+	Foo*	f = UT_NEW_NOKEY(Foo(args));
+Upon failure to allocate the memory, this macro may return NULL. It
+will not throw exceptions. After successful allocation the returned
+pointer must be passed to UT_DELETE() when no longer needed.
+@param[in]	expr	any expression that could follow "new"
+@return pointer to the created object or NULL */
+#define UT_NEW_NOKEY(expr)	UT_NEW(expr, PSI_NOT_INSTRUMENTED)
+
+/** Destroy, deallocate and trace the deallocation of an object created by
+UT_NEW() or UT_NEW_NOKEY().
+We can't instantiate ut_allocator without having the type of the object, thus
+we redirect this to a templated function. */
+#define UT_DELETE(ptr)	ut_delete(ptr)
+
+/** Destroy and account object created by UT_NEW() or UT_NEW_NOKEY().
+@param[in,out]	ptr	pointer to the object */
+template <typename T>
+inline
+void
+ut_delete(
+	T*	ptr)
+{
+	if (ptr == NULL) {
+		return;
+	}
+
+	ut_allocator<T>	allocator;
+
+	allocator.destroy(ptr);
+	allocator.deallocate(ptr);
+}
+
+/** Allocate and account 'n_elements' objects of type 'type'.
+Use this macro to allocate memory within InnoDB instead of 'new[]'.
+The returned pointer must be passed to UT_DELETE_ARRAY().
+@param[in]	type		type of objects being created
+@param[in]	n_elements	number of objects to create
+@param[in]	key		performance schema memory tracing key
+@return pointer to the first allocated object or NULL */
+#define UT_NEW_ARRAY(type, n_elements, key)				\
+	ut_allocator<type>(key).new_array(n_elements, AUTOEVENT_IDX)
+
+/** Allocate and account 'n_elements' objects of type 'type'.
+Use this macro to allocate memory within InnoDB instead of 'new[]' and
+instead of UT_NEW_ARRAY() when it is not feasible to create a dedicated key.
+@param[in]	type		type of objects being created
+@param[in]	n_elements	number of objects to create
+@return pointer to the first allocated object or NULL */
+#define UT_NEW_ARRAY_NOKEY(type, n_elements)				\
+	UT_NEW_ARRAY(type, n_elements, PSI_NOT_INSTRUMENTED)
+
+/** Destroy, deallocate and trace the deallocation of an array created by
+UT_NEW_ARRAY() or UT_NEW_ARRAY_NOKEY().
+We can't instantiate ut_allocator without having the type of the object, thus
+we redirect this to a templated function. */
+#define UT_DELETE_ARRAY(ptr)	ut_delete_array(ptr)
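+
+/* Array round trip, mirroring the single-object example above (Foo is
+again a hypothetical default-constructible type):
+
+	Foo*	arr = UT_NEW_ARRAY_NOKEY(Foo, 16);
+	if (arr != NULL) {
+		// ... use arr[0] .. arr[15] ...
+		UT_DELETE_ARRAY(arr);
+	}
+*/
+
+/** Destroy and account objects created by UT_NEW_ARRAY() or
+UT_NEW_ARRAY_NOKEY().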
+@param[in,out]	ptr	pointer to the first object in the array */
+template <typename T>
+inline
+void
+ut_delete_array(
+	T*	ptr)
+{
+	ut_allocator<T>().delete_array(ptr);
+}
+
+#define ut_malloc(n_bytes, key)		static_cast<void*>(		\
+	ut_allocator<byte>(key).allocate(				\
+		n_bytes, NULL, AUTOEVENT_IDX, false, false))
+
+#define ut_malloc_dontdump(n_bytes, key) static_cast<void*>(		\
+	ut_allocator<byte>(key).allocate_large(				\
+		n_bytes, NULL, true))
+
+#define ut_zalloc(n_bytes, key)		static_cast<void*>(		\
+	ut_allocator<byte>(key).allocate(				\
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_malloc_nokey(n_bytes)	static_cast<void*>(		\
+	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate(		\
+		n_bytes, NULL, AUTOEVENT_IDX, false, false))
+
+#define ut_zalloc_nokey(n_bytes)	static_cast<void*>(		\
+	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate(		\
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_zalloc_nokey_nofatal(n_bytes) static_cast<void*>(		\
+	ut_allocator<byte, false>(PSI_NOT_INSTRUMENTED).allocate(	\
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_realloc(ptr, n_bytes)	static_cast<void*>(		\
+	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).reallocate(		\
+		ptr, n_bytes, AUTOEVENT_IDX))
+
+#define ut_free(ptr)	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate( \
+	reinterpret_cast<byte*>(ptr))
+
+#else /* UNIV_PFS_MEMORY */
+
+/* Fallbacks when memory tracing is disabled at compile time. */
+
+#define UT_NEW(expr, key)		::new(std::nothrow) expr
+#define UT_NEW_NOKEY(expr)		::new(std::nothrow) expr
+#define UT_DELETE(ptr)			::delete ptr
+
+#define UT_NEW_ARRAY(type, n_elements, key)				\
+	::new(std::nothrow) type[n_elements]
+
+#define UT_NEW_ARRAY_NOKEY(type, n_elements)				\
+	::new(std::nothrow) type[n_elements]
+
+#define UT_DELETE_ARRAY(ptr)		::delete[] ptr
+
+#define ut_malloc(n_bytes, key)		::malloc(n_bytes)
+
+#define ut_zalloc(n_bytes, key)		::calloc(1, n_bytes)
+
+#define ut_malloc_nokey(n_bytes)	::malloc(n_bytes)
+
+static inline void *ut_malloc_dontdump(size_t n_bytes, ...)
+{
+	void *ptr = my_large_malloc(&n_bytes, MYF(0));
+
+	ut_dontdump(ptr, n_bytes, true);
+
+	if (ptr) {
+		os_total_large_mem_allocated += n_bytes;
+	}
+	return ptr;
+}
+
+#define ut_zalloc_nokey(n_bytes)	::calloc(1, n_bytes)
+
+#define ut_zalloc_nokey_nofatal(n_bytes) ::calloc(1, n_bytes)
+
+#define ut_realloc(ptr, n_bytes)	::realloc(ptr, n_bytes)
+
+#define ut_free(ptr)			::free(ptr)
+
+#endif /* UNIV_PFS_MEMORY */
+
+static inline void ut_free_dodump(void *ptr, size_t size)
+{
+	ut_dodump(ptr, size);
+	os_total_large_mem_allocated -= size;
+	my_large_free(ptr, size);
+}
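+
+/* Raw-block round trip; the pointer rules are the same as with
+malloc()/free(), only the accounting differs between the two build modes
+above:
+
+	void*	p = ut_malloc_nokey(128);
+	if (p != NULL) {
+		ut_free(p);
+	}
+*/
+
+#endif /* ut0new_h */
diff --git a/storage/innobase/include/ut0pool.h b/storage/innobase/include/ut0pool.h
new file mode 100644
index 00000000..aa0cfb9e
--- /dev/null
+++ b/storage/innobase/include/ut0pool.h
@@ -0,0 +1,365 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.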
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0pool.h
+Object pool.
+
+Created 2012-Feb-26 Sunny Bains
+***********************************************************************/
+
+#ifndef ut0pool_h
+#define ut0pool_h
+
+#include <vector>
+#include <queue>
+#include <functional>
+
+#include <my_global.h>
+
+/** Allocate the memory for the object in blocks. We keep the objects sorted
+on pointer so that they are closer together in case they have to be iterated
+over in a list. */
+template <typename Type, typename Factory, typename LockStrategy>
+struct Pool {
+
+	typedef Type value_type;
+
+	struct Element {
+		Pool*		m_pool;
+		value_type	m_type;
+	};
+
+	/** Constructor
+	@param size size of the memory block */
+	Pool(size_t size)
+		:
+		m_end(),
+		m_start(),
+		m_size(size),
+		m_last()
+	{
+		ut_ad(ut_is_2pow(size));
+		ut_a(size >= sizeof(Element));
+		static_assert(!(sizeof(Element) % CPU_LEVEL1_DCACHE_LINESIZE),
+			      "alignment");
+
+		m_lock_strategy.create();
+
+		ut_a(m_start == 0);
+
+		m_start = static_cast<Element*>(
+			aligned_malloc(m_size, CPU_LEVEL1_DCACHE_LINESIZE));
+		memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(
+			m_start, 0, m_size);
+
+		m_last = m_start;
+
+		m_end = &m_start[m_size / sizeof *m_start];
+
+		/* Note: Initialise only a small subset, even though we have
+		allocated all the memory. This is required only because PFS
+		(MTR) results change if we instantiate too many mutexes up
+		front. */
+
+		init(ut_min(size_t(16), size_t(m_end - m_start)));
+
+		ut_ad(m_pqueue.size() <= size_t(m_last - m_start));
+	}
+
+	/** Destructor */
+	~Pool()
+	{
+		m_lock_strategy.destroy();
+
+		for (Element* elem = m_start; elem != m_last; ++elem) {
+
+			ut_ad(elem->m_pool == this);
+			Factory::destroy(&elem->m_type);
+		}
+
+		IF_WIN(_aligned_free,free)(m_start);
+		m_end = m_last = m_start = 0;
+		m_size = 0;
+	}
+
+	/** Get an object from the pool.
+	@return a free instance or NULL if exhausted. */
+	Type*	get()
+	{
+		Element*	elem;
+
+		m_lock_strategy.enter();
+
+		if (!m_pqueue.empty()) {
+
+			elem = m_pqueue.top();
+			m_pqueue.pop();
+
+		} else if (m_last < m_end) {
+
+			/* Initialise the remaining elements. */
+			init(size_t(m_end - m_last));
+
+			ut_ad(!m_pqueue.empty());
+
+			elem = m_pqueue.top();
+			m_pqueue.pop();
+		} else {
+			elem = NULL;
+		}
+
+		m_lock_strategy.exit();
+		return elem ? &elem->m_type : NULL;
+	}
+
+	/** Add the object to the pool.
+	@param ptr object to free */
+	static void mem_free(value_type* ptr)
+	{
+		Element*	elem;
+		byte*		p = reinterpret_cast<byte*>(ptr + 1);
+
+		elem = reinterpret_cast<Element*>(p - sizeof(*elem));
+
+		elem->m_pool->m_lock_strategy.enter();
+
+		elem->m_pool->putl(elem);
+
+		elem->m_pool->m_lock_strategy.exit();
+	}
+
+protected:
+	// Disable copying
+	Pool(const Pool&);
+	Pool& operator=(const Pool&);
+
+private:
+
+	/* We only need to compare on pointer address. */
+	typedef std::priority_queue<
+		Element*,
+		std::vector<Element*, ut_allocator<Element*> >,
+		std::greater<Element*> >	pqueue_t;
+
+	/** Release the object to the free pool
+	@param elem element to free */
+	void putl(Element* elem)
+	{
+		ut_ad(elem >= m_start && elem < m_last);
+		m_pqueue.push(elem);
+	}
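+
+	/* Typical use goes through PoolManager (below); a direct sketch,
+	assuming hypothetical policy classes ItemFactory/ItemPoolLock and a
+	payload type item_t:
+
+		Pool<item_t, ItemFactory, ItemPoolLock>	pool(65536);
+		item_t*	p = pool.get();		// NULL when exhausted
+		// ... use *p ...
+		Pool<item_t, ItemFactory, ItemPoolLock>::mem_free(p);
+	*/
+
+	/** Initialise the elements.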
+	@param n_elems Number of elements to initialise */
+	void init(size_t n_elems)
+	{
+		ut_ad(size_t(m_end - m_last) >= n_elems);
+
+		for (size_t i = 0; i < n_elems; ++i, ++m_last) {
+
+			m_last->m_pool = this;
+			Factory::init(&m_last->m_type);
+			m_pqueue.push(m_last);
+		}
+
+		ut_ad(m_last <= m_end);
+	}
+
+private:
+	/** Pointer to the last element */
+	Element*	m_end;
+
+	/** Pointer to the first element */
+	Element*	m_start;
+
+	/** Size of the block in bytes */
+	size_t		m_size;
+
+	/** Upper limit of used space */
+	Element*	m_last;
+
+	/** Priority queue ordered on the pointer address. */
+	pqueue_t	m_pqueue;
+
+	/** Lock strategy to use */
+	LockStrategy	m_lock_strategy;
+};
+
+template <typename Pool, typename LockStrategy>
+struct PoolManager {
+
+	typedef Pool PoolType;
+	typedef typename PoolType::value_type value_type;
+
+	PoolManager(size_t size)
+		:
+		m_size(size)
+	{
+		create();
+	}
+
+	~PoolManager()
+	{
+		destroy();
+
+		ut_a(m_pools.empty());
+	}
+
+	/** Get an element from one of the pools.
+	@return instance or NULL if pool is empty. */
+	value_type* get()
+	{
+		size_t		index = 0;
+		size_t		delay = 1;
+		value_type*	ptr = NULL;
+
+		do {
+			m_lock_strategy.enter();
+
+			ut_ad(!m_pools.empty());
+
+			size_t	n_pools = m_pools.size();
+
+			PoolType*	pool = m_pools[index % n_pools];
+
+			m_lock_strategy.exit();
+
+			ptr = pool->get();
+
+			if (ptr == 0 && (index / n_pools) > 2) {
+
+				if (!add_pool(n_pools)) {
+
+					ib::error() << "Failed to allocate"
+						" memory for a pool of size "
+						<< m_size << " bytes. Will"
+						" wait for " << delay
+						<< " seconds for a thread to"
+						" free a resource";
+
+					/* There is nothing much we can do
+					except crash and burn; however, let's
+					be a little optimistic and wait for
+					a resource to be freed. */
+					std::this_thread::sleep_for(
+						std::chrono::seconds(delay));
+
+					if (delay < 32) {
+						delay <<= 1;
+					}
+
+				} else {
+					delay = 1;
+				}
+			}
+
+			++index;
+
+		} while (ptr == NULL);
+
+		return(ptr);
+	}
+
+	static void mem_free(value_type* ptr)
+	{
+		PoolType::mem_free(ptr);
+	}
+
+private:
+	/** Add a new pool
+	@param n_pools Number of pools that existed when the add pool was
+	called.
+	@return true on success */
+	bool add_pool(size_t n_pools)
+	{
+		bool	added = false;
+
+		m_lock_strategy.enter();
+
+		if (n_pools < m_pools.size()) {
+			/* Some other thread already added a pool. */
+			added = true;
+		} else {
+			PoolType*	pool;
+
+			ut_ad(n_pools == m_pools.size());
+
+			pool = UT_NEW_NOKEY(PoolType(m_size));
+
+			if (pool != NULL) {
+				m_pools.push_back(pool);
+
+				ib::info() << "Number of transaction pools: "
+					<< m_pools.size();
+
+				added = true;
+			}
+		}
+
+		ut_ad(n_pools < m_pools.size() || !added);
+
+		m_lock_strategy.exit();
+
+		return(added);
+	}
+
+	/** Create the pool manager. */
+	void create()
+	{
+		ut_a(m_size > sizeof(value_type));
+		m_lock_strategy.create();
+
+		add_pool(0);
+	}
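+
+	/* A sizing sketch (type names and the byte count are illustrative
+	only; real users supply their own factory and lock policies):
+
+		typedef Pool<item_t, ItemFactory, ItemPoolLock> item_pool_t;
+
+		PoolManager<item_pool_t, ItemPoolManagerLock>	mgr(65536);
+		item_t*	p = mgr.get();	// retries, adding pools as needed
+		PoolManager<item_pool_t, ItemPoolManagerLock>::mem_free(p);
+	*/
+
+	/** Release the resources.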
*/
+	void destroy()
+	{
+		typename Pools::iterator it;
+		typename Pools::iterator end = m_pools.end();
+
+		for (it = m_pools.begin(); it != end; ++it) {
+			PoolType*	pool = *it;
+
+			UT_DELETE(pool);
+		}
+
+		m_pools.clear();
+
+		m_lock_strategy.destroy();
+	}
+private:
+	// Disable copying
+	PoolManager(const PoolManager&);
+	PoolManager& operator=(const PoolManager&);
+
+	typedef std::vector<PoolType*, ut_allocator<PoolType*> > Pools;
+
+	/** Size of each block */
+	size_t		m_size;
+
+	/** Pools managed by this manager */
+	Pools		m_pools;
+
+	/** Lock strategy to use */
+	LockStrategy	m_lock_strategy;
+};
+
+#endif /* ut0pool_h */
diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h
new file mode 100644
index 00000000..38071165
--- /dev/null
+++ b/storage/innobase/include/ut0rbt.h
@@ -0,0 +1,254 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/******************************************************************//**
+@file include/ut0rbt.h
+Various utilities
+
+Created 2007-03-20 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_UT0RBT_H
+#define INNOBASE_UT0RBT_H
+
+#if !defined(IB_RBT_TESTING)
+#include "ut0mem.h"
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define	ut_malloc	malloc
+#define	ut_free		free
+#define	ulint		unsigned long
+#define	ut_a(c)		assert(c)
+#define	ut_error	assert(0)
+#define	ibool		unsigned int
+#define	TRUE		1
+#define	FALSE		0
+#endif
+
+struct ib_rbt_node_t;
+typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
+typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2);
+
+/** Red black tree color types */
+enum ib_rbt_color_t {
+	IB_RBT_RED,
+	IB_RBT_BLACK
+};
+
+/** Red black tree node */
+struct ib_rbt_node_t {
+	ib_rbt_color_t	color;	/* color of this node */
+
+	ib_rbt_node_t*	left;	/* points to left child */
+	ib_rbt_node_t*	right;	/* points to right child */
+	ib_rbt_node_t*	parent;	/* points to parent node */
+
+	char		value[1];	/* Data value */
+};
+
+/** Red black tree instance.*/
+struct	ib_rbt_t {
+	ib_rbt_node_t*	nil;	/* Black colored node that is
+				used as a sentinel. This is
+				pre-allocated too.*/
+
+	ib_rbt_node_t*	root;	/* Root of the tree, this is
+				pre-allocated and the first
+				data node is the left child.*/
+
+	ulint		n_nodes;	/* Total number of data nodes */
+
+	ib_rbt_compare	compare;	/* Fn. to use for comparison */
+	ib_rbt_arg_compare
+			compare_with_arg;	/* Fn.
to use for comparison + with argument */ + ulint sizeof_value; /* Sizeof the item in bytes */ + void* cmp_arg; /* Compare func argument */ +}; + +/** The result of searching for a key in the tree, this is useful for +a speedy lookup and insert if key doesn't exist.*/ +struct ib_rbt_bound_t { + const ib_rbt_node_t* + last; /* Last node visited */ + + int result; /* Result of comparing with + the last non-nil node that + was visited */ +}; + +/* Size in elements (t is an rb tree instance) */ +#define rbt_size(t) (t->n_nodes) + +/* Check whether the rb tree is empty (t is an rb tree instance) */ +#define rbt_empty(t) (rbt_size(t) == 0) + +/* Get data value (t is the data type, n is an rb tree node instance) */ +#define rbt_value(t, n) ((t*) &n->value[0]) + +/* Compare a key with the node value (t is tree, k is key, n is node)*/ +#define rbt_compare(t, k, n) (t->compare(k, n->value)) + +/**********************************************************************//** +Free an instance of a red black tree */ +void +rbt_free( +/*=====*/ + ib_rbt_t* tree); /*!< in: rb tree to free */ +/**********************************************************************//** +Create an instance of a red black tree +@return rb tree instance */ +ib_rbt_t* +rbt_create( +/*=======*/ + size_t sizeof_value, /*!< in: size in bytes */ + ib_rbt_compare compare); /*!< in: comparator */ +/**********************************************************************//** +Create an instance of a red black tree, whose comparison function takes +an argument +@return rb tree instance */ +ib_rbt_t* +rbt_create_arg_cmp( +/*===============*/ + size_t sizeof_value, /*!< in: size in bytes */ + ib_rbt_arg_compare + compare, /*!< in: comparator */ + void* cmp_arg); /*!< in: compare fn arg */ +/**********************************************************************//** +Delete a node from the red black tree, identified by key */ +ibool +rbt_delete( +/*=======*/ + /* in: TRUE on success */ + ib_rbt_t* tree, /* in: rb tree */ + const void* key); /* in: key to delete */ +/**********************************************************************//** +Remove a node from the red black tree, NOTE: This function will not delete +the node instance, THAT IS THE CALLERS RESPONSIBILITY. +@return the deleted node with the const. */ +ib_rbt_node_t* +rbt_remove_node( +/*============*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* + node); /*!< in: node to delete, this + is a fudge and declared const + because the caller has access + only to const nodes.*/ +/**********************************************************************//** +Add data to the red black tree, identified by key (no dups yet!) +@return inserted node */ +const ib_rbt_node_t* +rbt_insert( +/*=======*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const void* key, /*!< in: key for ordering */ + const void* value); /*!< in: data that will be + copied to the node.*/ +/**********************************************************************//** +Add a new node to the tree, useful for data that is pre-sorted. 
+@return appended node */ +const ib_rbt_node_t* +rbt_add_node( +/*=========*/ + ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: parent */ + const void* value); /*!< in: this value is copied + to the node */ +/**********************************************************************//** +Return the left-most data node in the tree +@return left-most node */ +const ib_rbt_node_t* +rbt_first( +/*======*/ + const ib_rbt_t* tree); /*!< in: rb tree */ +/**********************************************************************//** +Return the right-most data node in the tree +@return right-most node */ +const ib_rbt_node_t* +rbt_last( +/*=====*/ + const ib_rbt_t* tree); /*!< in: rb tree */ +/**********************************************************************//** +Return the next node from current. +@return successor node to current that is passed in. */ +const ib_rbt_node_t* +rbt_next( +/*=====*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* /* in: current node */ + current); +/**********************************************************************//** +Return the previous node from current. +@return predecessor node to current that is passed in */ +const ib_rbt_node_t* +rbt_prev( +/*=====*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* /* in: current node */ + current); +/**********************************************************************//** +Search for the key; a node will be returned in parent.last, whether it +was found or not. If not found, then parent.last will contain the +parent node for the possibly new key, otherwise the matching node. +@return result of last comparison */ +int +rbt_search( +/*=======*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: search bounds */ + const void* key); /*!< in: key to search */ +/**********************************************************************//** +Search for the key; a node will be returned in parent.last, whether it +was found or not. If not found, then parent.last will contain the +parent node for the possibly new key, otherwise the matching node. +@return result of last comparison */ +int +rbt_search_cmp( +/*===========*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: search bounds */ + const void* key, /*!< in: key to search */ + ib_rbt_compare compare, /*!< in: comparator */ + ib_rbt_arg_compare + arg_compare); /*!< in: fn to compare items + with argument */ +/**********************************************************************//** +Merge the records from src into dst; records that already exist in dst +are not copied. Return the number of records merged. +@return no. of recs merged */ +ulint +rbt_merge_uniq( +/*===========*/ + ib_rbt_t* dst, /*!< in: dst rb tree */ + const ib_rbt_t* src); /*!< in: src rb tree */ +#if defined UNIV_DEBUG || defined IB_RBT_TESTING +/**********************************************************************//** +Verify the integrity of the RB tree. For debugging. The check computes +the black height of the tree; 0 means the check failed. +@return TRUE if OK, FALSE if tree invalid.
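A minimal usage sketch of the API declared above; the comparator, the demo function and the int payload are invented for illustration, and the loop assumes rbt_first()/rbt_next() return NULL past the last node, which is the iteration pattern the tree's callers rely on:

	static int int_cmp(const void* p1, const void* p2)
	{
		const int a = *static_cast<const int*>(p1);
		const int b = *static_cast<const int*>(p2);
		return(a < b ? -1 : a > b ? 1 : 0);
	}

	void rbt_demo()
	{
		ib_rbt_t*	tree = rbt_create(sizeof(int), int_cmp);
		ib_rbt_bound_t	parent;

		for (int i = 0; i < 8; i++) {
			rbt_insert(tree, &i, &i); /* key and value coincide */
		}

		int key = 5;
		if (rbt_search(tree, &parent, &key) == 0) {
			int* v = rbt_value(int, parent.last); /* the match */
			(void) v;
		} else {
			/* parent.last is where rbt_add_node() would
			link the new key */
		}

		/* In-order walk, smallest key first. */
		for (const ib_rbt_node_t* n = rbt_first(tree);
		     n != NULL;
		     n = rbt_next(tree, n)) {
			/* rbt_value(int, n) yields each payload */
		}

		rbt_free(tree);
	}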
*/ +ibool +rbt_validate( +/*=========*/ + const ib_rbt_t* tree); /*!< in: tree to validate */ +#endif /* UNIV_DEBUG || IB_RBT_TESTING */ + +#endif /* INNOBASE_UT0RBT_H */ diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h new file mode 100644 index 00000000..511eb21f --- /dev/null +++ b/storage/innobase/include/ut0rnd.h @@ -0,0 +1,128 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0rnd.h +Random numbers and hashing + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0rnd_h +#define ut0rnd_h + +#include "ut0byte.h" +#include + +#ifndef UNIV_INNOCHECKSUM +/** Seed value of ut_rnd_gen() */ +extern std::atomic ut_rnd_current; + +/** @return a pseudo-random 32-bit number */ +inline uint32_t ut_rnd_gen() +{ + /* This is a Galois linear-feedback shift register. + https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Galois_LFSRs + The generating primitive Galois Field polynomial is the Castagnoli + polynomial that was made popular by CRC-32C: + x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+ + x^19+x^18+x^14+x^13+x^11+x^10+x^9+x^8+x^6+1 */ + const uint32_t crc32c= 0x1edc6f41; + + uint32_t rnd= ut_rnd_current.load(std::memory_order_relaxed); + + if (UNIV_UNLIKELY(rnd == 0)) + { + rnd= static_cast(my_interval_timer()); + if (!rnd) rnd= 1; + } + else + { + bool lsb= rnd & 1; + rnd>>= 1; + if (lsb) + rnd^= crc32c; + } + + ut_rnd_current.store(rnd, std::memory_order_relaxed); + return rnd; +} + +/** @return a random number between 0 and n-1, inclusive */ +inline ulint ut_rnd_interval(ulint n) +{ + return n > 1 ? static_cast(ut_rnd_gen() % n) : 0; +} + +/*******************************************************//** +The following function generates a hash value for a ulint integer +to a hash table of size table_size, which should be a prime or some +random number to work reliably. +@return hash value */ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + ulint key, /*!< in: value to be hashed */ + ulint table_size); /*!< in: hash table size */ +/*************************************************************//** +Folds a 64-bit integer. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ull( +/*========*/ + ib_uint64_t d) /*!< in: 64-bit integer */ + MY_ATTRIBUTE((const)); +/***********************************************************//** +Looks for a prime number slightly greater than the given argument. +The prime is chosen so that it is not near any power of 2. 
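Because the polynomial is primitive (per the comment in ut_rnd_gen() above), the register cycles through all 2^32 - 1 nonzero states before repeating; zero is a fixed point, hence the reseed from my_interval_timer(). The generator makes no statistical promises beyond that, so it suits heuristics only. A hypothetical consumer, using nothing beyond the two inline functions shown above:

	ulint pick_random_slot(ulint n_slots)
	{
		/* ut_rnd_interval() reduces ut_rnd_gen() modulo n_slots,
		so the result carries a slight modulo bias; that is fine
		for heuristic choices such as picking a victim slot. */
		return(ut_rnd_interval(n_slots));
	}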
+@return prime */ +ulint +ut_find_prime( +/*==========*/ + ulint n) /*!< in: positive number > 100 */ + MY_ATTRIBUTE((const)); + +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Folds a pair of ulints. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + ulint n1, /*!< in: ulint */ + ulint n2) /*!< in: ulint */ + MY_ATTRIBUTE((const)); +/*************************************************************//** +Folds a binary string. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_binary( +/*===========*/ + const byte* str, /*!< in: string of bytes */ + ulint len) /*!< in: length */ + MY_ATTRIBUTE((pure)); + +#include "ut0rnd.inl" + +#endif diff --git a/storage/innobase/include/ut0rnd.inl b/storage/innobase/include/ut0rnd.inl new file mode 100644 index 00000000..37da323f --- /dev/null +++ b/storage/innobase/include/ut0rnd.inl @@ -0,0 +1,128 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0rnd.ic +Random numbers and hashing + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +#define UT_HASH_RANDOM_MASK 1463735687 +#define UT_HASH_RANDOM_MASK2 1653893711 + +#ifndef UNIV_INNOCHECKSUM + +/*******************************************************//** +The following function generates a hash value for a ulint integer +to a hash table of size table_size, which should be a prime +or some random number for the hash table to work reliably. +@return hash value */ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + ulint key, /*!< in: value to be hashed */ + ulint table_size) /*!< in: hash table size */ +{ + ut_ad(table_size); + key = key ^ UT_HASH_RANDOM_MASK2; + + return(key % table_size); +} + +/*************************************************************//** +Folds a 64-bit integer. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ull( +/*========*/ + ib_uint64_t d) /*!< in: 64-bit integer */ +{ + return(ut_fold_ulint_pair((ulint) d & ULINT32_MASK, + (ulint) (d >> 32))); +} +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Folds a pair of ulints. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + ulint n1, /*!< in: ulint */ + ulint n2) /*!< in: ulint */ +{ + return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1) + ^ UT_HASH_RANDOM_MASK) + n2); +} + +/*************************************************************//** +Folds a binary string. 
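To show how the folding helpers combine, a hypothetical placement function for a hash table whose size was chosen with ut_find_prime() (the function name is invented):

	ulint cell_for_id(ib_uint64_t id, ulint n_cells)
	{
		/* n_cells would come from ut_find_prime(), since
		ut_hash_ulint() relies on a prime (or random) table
		size to spread keys reliably. */
		const ulint fold = ut_fold_ull(id);

		return(ut_hash_ulint(fold, n_cells));
	}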
+@return folded value */ +UNIV_INLINE +ulint +ut_fold_binary( +/*===========*/ + const byte* str, /*!< in: string of bytes */ + ulint len) /*!< in: length */ +{ + ulint fold = 0; + const byte* str_end = str + (len & 0xFFFFFFF8); + + ut_ad(str || !len); + + while (str < str_end) { + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + } + + switch (len & 0x7) { + case 7: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 6: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 5: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 4: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 3: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 2: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 1: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + } + + return(fold); +} diff --git a/storage/innobase/include/ut0sort.h b/storage/innobase/include/ut0sort.h new file mode 100644 index 00000000..4f1d4c04 --- /dev/null +++ b/storage/innobase/include/ut0sort.h @@ -0,0 +1,104 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0sort.h +Sort utility + +Created 11/9/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0sort_h +#define ut0sort_h + +/* This module gives a macro definition of the body of +a standard sort function for an array of elements of any +type. The comparison function is given as a parameter to +the macro. The sort algorithm is mergesort which has logarithmic +worst case. +*/ + +/*******************************************************************//** +This macro expands to the body of a standard sort function. +The sort function uses mergesort and must be defined separately +for each type of array. +Also the comparison function has to be defined individually +for each array cell type. SORT_FUN is the sort function name. +The function takes the array to be sorted (ARR), +the array of auxiliary space (AUX_ARR) of same size, +and the low (LOW), inclusive, and high (HIGH), noninclusive, +limits for the sort interval as arguments. +CMP_FUN is the comparison function name. 
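The macro definition follows just below. As a sketch of how it is meant to be instantiated (function and comparator names are invented; each element type needs its own instantiation):

	static int ulint_cmp(ulint a, ulint b)
	{
		return(a < b ? -1 : a > b ? 1 : 0);
	}

	/* The macro expands to the entire function body, braces included.
	Sorts arr[low..high) using aux as scratch space of equal size. */
	static void
	sort_ulints(ulint* arr, ulint* aux, ulint low, ulint high)
	UT_SORT_FUNCTION_BODY(sort_ulints, arr, aux, low, high, ulint_cmp)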
It takes as arguments +two elements from the array and returns 1, if the first is bigger, +0 if equal, and -1 if the second bigger. */ + +#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\ +{\ + ulint ut_sort_mid77;\ + ulint ut_sort_i77;\ + ulint ut_sort_low77;\ + ulint ut_sort_high77;\ +\ + ut_ad((LOW) < (HIGH));\ + ut_ad(ARR);\ + ut_ad(AUX_ARR);\ +\ + if ((LOW) == (HIGH) - 1) {\ + return;\ + } else if ((LOW) == (HIGH) - 2) {\ + if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\ + (AUX_ARR)[LOW] = (ARR)[LOW];\ + (ARR)[LOW] = (ARR)[(HIGH) - 1];\ + (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\ + }\ + return;\ + }\ +\ + ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\ +\ + SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\ + SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\ +\ + ut_sort_low77 = (LOW);\ + ut_sort_high77 = ut_sort_mid77;\ +\ + for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\ +\ + if (ut_sort_low77 >= ut_sort_mid77) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else if (ut_sort_high77 >= (HIGH)) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + } else if (CMP_FUN((ARR)[ut_sort_low77],\ + (ARR)[ut_sort_high77]) > 0) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + }\ + }\ +\ + memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\ + ((HIGH) - (LOW)) * sizeof *(ARR));\ +}\ + + +#endif + diff --git a/storage/innobase/include/ut0stage.h b/storage/innobase/include/ut0stage.h new file mode 100644 index 00000000..17fbd91b --- /dev/null +++ b/storage/innobase/include/ut0stage.h @@ -0,0 +1,499 @@ +/***************************************************************************** + +Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file ut/ut0stage.h +Supplementary code to performance schema stage instrumentation. + +Created Nov 12, 2014 Vasil Dimov +*******************************************************/ + +#ifndef ut0stage_h +#define ut0stage_h + +#include +#include + +#include "my_global.h" /* needed for headers from mysql/psi/ */ + +#include "mysql/psi/mysql_stage.h" /* mysql_stage_inc_work_completed */ +#include "mysql/psi/psi.h" /* HAVE_PSI_STAGE_INTERFACE, PSI_stage_progress */ + +#include "dict0mem.h" /* dict_index_t */ +#include "row0log.h" /* row_log_estimate_work() */ +#include "srv0srv.h" /* ut_stage_alter_t */ + +#ifdef HAVE_PSI_STAGE_INTERFACE + +/** Class used to report ALTER TABLE progress via performance_schema. 
+The only user of this class is the ALTER TABLE code and it calls the methods +in the following order +constructor +begin_phase_read_pk() + multiple times: + n_pk_recs_inc() // once per record read + inc() // once per page read +end_phase_read_pk() +if any new indexes are being added, for each one: + begin_phase_sort() + multiple times: + inc() // once per record sorted + begin_phase_insert() + multiple times: + inc() // once per record inserted + being_phase_log_index() + multiple times: + inc() // once per log-block applied +begin_phase_log_table() + multiple times: + inc() // once per log-block applied +begin_phase_end() +destructor + +This class knows the specifics of each phase and tries to increment the +progress in an even manner across the entire ALTER TABLE lifetime. */ +class ut_stage_alter_t { +public: + /** Constructor. + @param[in] pk primary key of the old table */ + explicit + ut_stage_alter_t( + const dict_index_t* pk) + : + m_progress(NULL), + m_pk(pk), + m_n_pk_recs(0), + m_n_pk_pages(0), + m_n_recs_processed(0), + m_cur_phase(NOT_STARTED) + { + } + + /** Destructor. */ + ~ut_stage_alter_t(); + + /** Flag an ALTER TABLE start (read primary key phase). + @param[in] n_sort_indexes number of indexes that will be sorted + during ALTER TABLE, used for estimating the total work to be done */ + void + begin_phase_read_pk( + ulint n_sort_indexes); + + /** Increment the number of records in PK (table) with 1. + This is used to get more accurate estimate about the number of + records per page which is needed because some phases work on + per-page basis while some work on per-record basis and we want + to get the progress as even as possible. */ + void + n_pk_recs_inc(); + + /** Flag either one record or one page processed, depending on the + current phase. + @param[in] inc_val flag this many units processed at once */ + void + inc( + ulint inc_val = 1); + + /** Flag the end of reading of the primary key. + Here we know the exact number of pages and records and calculate + the number of records per page and refresh the estimate. */ + void + end_phase_read_pk(); + + /** Flag the beginning of the sort phase. + @param[in] sort_multi_factor since merge sort processes + one page more than once we only update the estimate once per this + many pages processed. */ + void + begin_phase_sort( + double sort_multi_factor); + + /** Flag the beginning of the insert phase. */ + void + begin_phase_insert(); + + /** Flag the beginning of the log index phase. */ + void + begin_phase_log_index(); + + /** Flag the beginning of the log table phase. */ + void + begin_phase_log_table(); + + /** Flag the beginning of the end phase. */ + void + begin_phase_end(); + +private: + + /** Update the estimate of total work to be done. */ + void + reestimate(); + + /** Change the current phase. + @param[in] new_stage pointer to the new stage to change to */ + void + change_phase( + const PSI_stage_info* new_stage); + + /** Performance schema accounting object. */ + PSI_stage_progress* m_progress; + + /** Old table PK. Used for calculating the estimate. */ + const dict_index_t* m_pk; + + /** Number of records in the primary key (table), including delete + marked records. */ + ulint m_n_pk_recs; + + /** Number of leaf pages in the primary key. */ + ulint m_n_pk_pages; + + /** Estimated number of records per page in the primary key. */ + double m_n_recs_per_page; + + /** Number of indexes that are being added. 
*/ + ulint m_n_sort_indexes; + + /** During the sort phase, increment the counter once per this + many pages processed. This is because sort processes one page more + than once. */ + ulint m_sort_multi_factor; + + /** Number of records processed during sort & insert phases. We + need to increment the counter only once page, or once per + recs-per-page records. */ + ulint m_n_recs_processed; + + /** Current phase. */ + enum { + NOT_STARTED = 0, + READ_PK = 1, + SORT = 2, + INSERT = 3, + /* JAN: TODO: MySQL 5.7 vrs. MariaDB sql/log.h + LOG_INDEX = 5, + LOG_TABLE = 6, */ + LOG_INNODB_INDEX = 5, + LOG_INNODB_TABLE = 6, + END = 7, + } m_cur_phase; +}; + +/** Destructor. */ +inline +ut_stage_alter_t::~ut_stage_alter_t() +{ + if (m_progress == NULL) { + return; + } + + /* Set completed = estimated before we quit. */ + mysql_stage_set_work_completed( + m_progress, + mysql_stage_get_work_estimated(m_progress)); + + mysql_end_stage(); +} + +/** Flag an ALTER TABLE start (read primary key phase). +@param[in] n_sort_indexes number of indexes that will be sorted +during ALTER TABLE, used for estimating the total work to be done */ +inline +void +ut_stage_alter_t::begin_phase_read_pk( + ulint n_sort_indexes) +{ + m_n_sort_indexes = n_sort_indexes; + + m_cur_phase = READ_PK; + + m_progress = mysql_set_stage( + srv_stage_alter_table_read_pk_internal_sort.m_key); + + mysql_stage_set_work_completed(m_progress, 0); + reestimate(); +} + +/** Increment the number of records in PK (table) with 1. +This is used to get more accurate estimate about the number of +records per page which is needed because some phases work on +per-page basis while some work on per-record basis and we want +to get the progress as even as possible. */ +inline +void +ut_stage_alter_t::n_pk_recs_inc() +{ + m_n_pk_recs++; +} + +/** Flag either one record or one page processed, depending on the +current phase. */ +inline +void +ut_stage_alter_t::inc(ulint inc_val) +{ + if (m_progress == NULL) { + return; + } + + ulint multi_factor = 1; + bool should_proceed = true; + + switch (m_cur_phase) { + case NOT_STARTED: + ut_error; + case READ_PK: + m_n_pk_pages++; + ut_ad(inc_val == 1); + /* Overall the read pk phase will read all the pages from the + PK and will do work, proportional to the number of added + indexes, thus when this is called once per read page we + increment with 1 + m_n_sort_indexes */ + inc_val = 1 + m_n_sort_indexes; + break; + case SORT: + multi_factor = m_sort_multi_factor; + /* fall through */ + case INSERT: { + /* Increment the progress every nth record. During + sort and insert phases, this method is called once per + record processed. We need fractional point numbers here + because "records per page" is such a number naturally and + to avoid rounding skew we want, for example: if there are + (double) N records per page, then the work_completed + should be incremented on the inc() calls round(k*N), + for k=1,2,3... 
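As a concrete illustration (numbers invented): with N = 4.2 records
	per page and multi_factor = 1, every_nth = 4.2, so the counter
	advances when m_n_recs_processed hits round(k * 4.2) = 4, 8, 13,
	17, 21, ... -- on average once per page's worth of records, which
	keeps the per-record phases in step with the per-page read phase.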
*/ + const double every_nth = m_n_recs_per_page * + static_cast(multi_factor); + + const ulint k = static_cast( + round(static_cast(m_n_recs_processed) / + every_nth)); + + const ulint nth = static_cast( + round(static_cast(k) * every_nth)); + + should_proceed = m_n_recs_processed == nth; + + m_n_recs_processed++; + + break; + } + /* JAN: TODO: MySQL 5.7 + case LOG_INDEX: + break; + case LOG_TABLE: + break; */ + case LOG_INNODB_INDEX: + case LOG_INNODB_TABLE: + break; + case END: + break; + } + + if (should_proceed) { + mysql_stage_inc_work_completed(m_progress, inc_val); + reestimate(); + } +} + +/** Flag the end of reading of the primary key. +Here we know the exact number of pages and records and calculate +the number of records per page and refresh the estimate. */ +inline +void +ut_stage_alter_t::end_phase_read_pk() +{ + reestimate(); + + if (m_n_pk_pages == 0) { + /* The number of pages in the PK could be 0 if the tree is + empty. In this case we set m_n_recs_per_page to 1 to avoid + division by zero later. */ + m_n_recs_per_page = 1.0; + } else { + m_n_recs_per_page = std::max( + static_cast(m_n_pk_recs) + / static_cast(m_n_pk_pages), + 1.0); + } +} + +/** Flag the beginning of the sort phase. +@param[in] sort_multi_factor since merge sort processes +one page more than once we only update the estimate once per this +many pages processed. */ +inline +void +ut_stage_alter_t::begin_phase_sort( + double sort_multi_factor) +{ + if (sort_multi_factor <= 1.0) { + m_sort_multi_factor = 1; + } else { + m_sort_multi_factor = static_cast( + round(sort_multi_factor)); + } + + change_phase(&srv_stage_alter_table_merge_sort); +} + +/** Flag the beginning of the insert phase. */ +inline +void +ut_stage_alter_t::begin_phase_insert() +{ + change_phase(&srv_stage_alter_table_insert); +} + +/** Flag the beginning of the log index phase. */ +inline +void +ut_stage_alter_t::begin_phase_log_index() +{ + change_phase(&srv_stage_alter_table_log_index); +} + +/** Flag the beginning of the log table phase. */ +inline +void +ut_stage_alter_t::begin_phase_log_table() +{ + change_phase(&srv_stage_alter_table_log_table); +} + +/** Flag the beginning of the end phase. */ +inline +void +ut_stage_alter_t::begin_phase_end() +{ + change_phase(&srv_stage_alter_table_end); +} + +/** Update the estimate of total work to be done. */ +inline +void +ut_stage_alter_t::reestimate() +{ + if (m_progress == NULL) { + return; + } + + /* During the log table phase we calculate the estimate as + work done so far + log size remaining. */ + if (m_cur_phase == LOG_INNODB_TABLE) { + mysql_stage_set_work_estimated( + m_progress, + mysql_stage_get_work_completed(m_progress) + + row_log_estimate_work(m_pk)); + return; + } + + /* During the other phases we use a formula, regardless of + how much work has been done so far. */ + + /* For number of pages in the PK - if the PK has not been + read yet, use stat_n_leaf_pages (approximate), otherwise + use the exact number we gathered. */ + const ulint n_pk_pages + = m_cur_phase != READ_PK + ? 
m_n_pk_pages + : m_pk->stat_n_leaf_pages; + + ulonglong estimate __attribute__((unused)) + = n_pk_pages + * (1 /* read PK */ + + m_n_sort_indexes /* row_merge_buf_sort() inside the + read PK per created index */ + + m_n_sort_indexes * 2 /* sort & insert per created index */) + + row_log_estimate_work(m_pk); + + /* Prevent estimate < completed */ + estimate = std::max(estimate, + mysql_stage_get_work_completed(m_progress)); + + mysql_stage_set_work_estimated(m_progress, estimate); +} + +/** Change the current phase. +@param[in] new_stage pointer to the new stage to change to */ +inline +void +ut_stage_alter_t::change_phase( + const PSI_stage_info* new_stage) +{ + if (m_progress == NULL) { + return; + } + + if (new_stage == &srv_stage_alter_table_read_pk_internal_sort) { + m_cur_phase = READ_PK; + } else if (new_stage == &srv_stage_alter_table_merge_sort) { + m_cur_phase = SORT; + } else if (new_stage == &srv_stage_alter_table_insert) { + m_cur_phase = INSERT; + /* JAN: TODO: MySQL 5.7 used LOG_INDEX and LOG_TABLE */ + } else if (new_stage == &srv_stage_alter_table_log_index) { + m_cur_phase = LOG_INNODB_INDEX; + } else if (new_stage == &srv_stage_alter_table_log_table) { + m_cur_phase = LOG_INNODB_TABLE; + } else if (new_stage == &srv_stage_alter_table_end) { + m_cur_phase = END; + } else { + ut_error; + } + + const ulonglong c = mysql_stage_get_work_completed(m_progress); + const ulonglong e = mysql_stage_get_work_estimated(m_progress); + + m_progress = mysql_set_stage(new_stage->m_key); + + mysql_stage_set_work_completed(m_progress, c); + mysql_stage_set_work_estimated(m_progress, e); +} +#else /* HAVE_PSI_STAGE_INTERFACE */ + +class ut_stage_alter_t { +public: + explicit ut_stage_alter_t(const dict_index_t*) {} + + void begin_phase_read_pk(ulint) {} + + void n_pk_recs_inc() {} + + void inc() {} + void inc(ulint) {} + + void end_phase_read_pk() {} + + void begin_phase_sort(double) {} + + void begin_phase_insert() {} + + void begin_phase_log_index() {} + + void begin_phase_log_table() {} + + void begin_phase_end() {} +}; + +#endif /* HAVE_PSI_STAGE_INTERFACE */ + +#endif /* ut0stage_h */ diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h new file mode 100644 index 00000000..fe16ce14 --- /dev/null +++ b/storage/innobase/include/ut0ut.h @@ -0,0 +1,444 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
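A compressed sketch of the call sequence prescribed by the ut_stage_alter_t documentation above (caller and variable names are invented; the per-record and per-page inc() calls are shown as comments):

	void alter_progress_demo(const dict_index_t* pk, ulint n_new_indexes)
	{
		ut_stage_alter_t	stage(pk);

		stage.begin_phase_read_pk(n_new_indexes);
		/* per record read: stage.n_pk_recs_inc();
		per page read: stage.inc(); */
		stage.end_phase_read_pk();

		stage.begin_phase_sort(2.0);	/* inc() per record sorted */
		stage.begin_phase_insert();	/* inc() per record inserted */
		stage.begin_phase_log_index();	/* inc() per log block */
		stage.begin_phase_log_table();	/* inc() per log block */
		stage.begin_phase_end();
	}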
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0ut.h +Various utilities + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0ut_h +#define ut0ut_h + +/* Do not include univ.i because univ.i includes this. */ + +#include +#include +#include + +#ifndef UNIV_INNOCHECKSUM + +#include "db0err.h" + +#include + +#ifndef MYSQL_SERVER +#include +#endif /* MYSQL_SERVER */ + +#include + +#include + +/** Index name prefix in fast index creation, as a string constant */ +#define TEMP_INDEX_PREFIX_STR "\377" + +#define ut_max std::max +#define ut_min std::min + +/** Calculate the minimum of two pairs. +@param[out] min_hi MSB of the minimum pair +@param[out] min_lo LSB of the minimum pair +@param[in] a_hi MSB of the first pair +@param[in] a_lo LSB of the first pair +@param[in] b_hi MSB of the second pair +@param[in] b_lo LSB of the second pair */ +UNIV_INLINE +void +ut_pair_min( + ulint* min_hi, + ulint* min_lo, + ulint a_hi, + ulint a_lo, + ulint b_hi, + ulint b_lo); +/******************************************************//** +Compares two ulints. +@return 1 if a > b, 0 if a == b, -1 if a < b */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + ulint a, /*!< in: ulint */ + ulint b); /*!< in: ulint */ +/** Compare two pairs of integers. +@param[in] a_h more significant part of first pair +@param[in] a_l less significant part of first pair +@param[in] b_h more significant part of second pair +@param[in] b_l less significant part of second pair +@return comparison result of (a_h,a_l) and (b_h,b_l) +@retval -1 if (a_h,a_l) is less than (b_h,b_l) +@retval 0 if (a_h,a_l) is equal to (b_h,b_l) +@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */ +UNIV_INLINE +int +ut_pair_cmp( + ulint a_h, + ulint a_l, + ulint b_h, + ulint b_l) + MY_ATTRIBUTE((warn_unused_result)); + +/*************************************************************//** +Calculates fast the remainder of n/m when m is a power of two. +@param n in: numerator +@param m in: denominator, must be a power of two +@return the remainder of n/m */ +template inline T ut_2pow_remainder(T n, T m){return n & (m - 1);} +/*************************************************************//** +Calculates the biggest multiple of m that is not bigger than n +when m is a power of two. In other words, rounds n down to m * k. +@param n in: number to round down +@param m in: alignment, must be a power of two +@return n rounded down to the biggest possible integer multiple of m */ +template inline T ut_2pow_round(T n, T m) { return n & ~(m - 1); } +/********************************************************//** +Calculates the smallest multiple of m that is not smaller than n +when m is a power of two. In other words, rounds n up to m * k. +@param n in: number to round up +@param m in: alignment, must be a power of two +@return n rounded up to the smallest possible integer multiple of m */ +#define UT_CALC_ALIGN(n, m) ((n + m - 1) & ~(m - 1)) +template inline T ut_calc_align(T n, T m) +{ return static_cast(UT_CALC_ALIGN(n, m)); } + +/*************************************************************//** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. 
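Concretely, for the power-of-two helpers above: ut_2pow_remainder(37, 16) == 5, ut_2pow_round(37, 16) == 32 and ut_calc_align(37, 16) == 48; all three depend on m being a power of two, so that m - 1 is a contiguous mask of the low bits.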
+@return logarithm in the base 2, rounded upward */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + ulint n); /*!< in: number */ +/*************************************************************//** +Calculates 2 to power n. +@return 2 to power n */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + ulint n); /*!< in: number */ + +/**********************************************************//** +Returns the number of milliseconds since some epoch. The +value may wrap around. It should only be used for heuristic +purposes. +@return ms since epoch */ +ulint +ut_time_ms(void); +/*============*/ +#endif /* !UNIV_INNOCHECKSUM */ + +/** Determine how many bytes (groups of 8 bits) are needed to +store the given number of bits. +@param b in: bits +@return number of bytes (octets) needed to represent b */ +#define UT_BITS_IN_BYTES(b) (((b) + 7) >> 3) + +/** Determines if a number is zero or a power of two. +@param[in] n number +@return nonzero if n is zero or a power of two; zero otherwise */ +#define ut_is_2pow(n) (!((n) & ((n) - 1))) + +/** Functor that compares two C strings. Can be used as a comparator for +e.g. std::map that uses char* as keys. */ +struct ut_strcmp_functor +{ + bool operator()( + const char* a, + const char* b) const + { + return(strcmp(a, b) < 0); + } +}; + +/**********************************************************//** +Prints a timestamp to a file. */ +void +ut_print_timestamp( +/*===============*/ + FILE* file) /*!< in: file where to print */ + ATTRIBUTE_COLD __attribute__((nonnull)); + +#ifndef UNIV_INNOCHECKSUM + +/**********************************************************//** +Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */ +void +ut_sprintf_timestamp( +/*=================*/ + char* buf); /*!< in: buffer where to sprintf */ + +/*************************************************************//** +Prints the contents of a memory buffer in hex and ascii. */ +void +ut_print_buf( +/*=========*/ + FILE* file, /*!< in: file where to print */ + const void* buf, /*!< in: memory buffer */ + ulint len); /*!< in: length of the buffer */ + +/*************************************************************//** +Prints the contents of a memory buffer in hex. */ +void +ut_print_buf_hex( +/*=============*/ + std::ostream& o, /*!< in/out: output stream */ + const void* buf, /*!< in: memory buffer */ + ulint len) /*!< in: length of the buffer */ + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Prints the contents of a memory buffer in hex and ascii. */ +void +ut_print_buf( +/*=========*/ + std::ostream& o, /*!< in/out: output stream */ + const void* buf, /*!< in: memory buffer */ + ulint len) /*!< in: length of the buffer */ + MY_ATTRIBUTE((nonnull)); + +/* Forward declaration of transaction handle */ +struct trx_t; + +/** Get a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. + @param [in] trx transaction (NULL=no quotes). + @param [in] name table name. + @retval String quoted as an SQL identifier. +*/ +std::string +ut_get_name( + const trx_t* trx, + const char* name); + +/**********************************************************************//** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. 
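A short sketch of ut_strcmp_functor (defined above) in its intended role, keying a std::map by C-string contents rather than by pointer identity (assumes <map> is included; the names are invented):

	std::map<const char*, ulint, ut_strcmp_functor>	table_ids;

	table_ids["db1/t1"] = 1;
	table_ids["db1/t2"] = 2;

	/* find() compares via strcmp(), so an equal string living in a
	different buffer still locates the entry */
	ulint id = table_ids.find("db1/t2")->second;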
*/ +void +ut_print_name( +/*==========*/ + FILE* ef, /*!< in: stream */ + const trx_t* trx, /*!< in: transaction */ + const char* name); /*!< in: table name to print */ +/** Format a table name, quoted as an SQL identifier. +If the name contains a slash '/', the result will contain two +identifiers separated by a period (.), as in SQL +database_name.table_name. +@see table_name_t +@param[in] name table or index name +@param[out] formatted formatted result, will be NUL-terminated +@param[in] formatted_size size of the buffer in bytes +@return pointer to 'formatted' */ +char* +ut_format_name( + const char* name, + char* formatted, + ulint formatted_size); + +/**********************************************************************//** +Catenate files. */ +void +ut_copy_file( +/*=========*/ + FILE* dest, /*!< in: output file */ + FILE* src); /*!< in: input file to be appended to output */ + +/*************************************************************//** +Convert an error number to a human readable text message. The +returned string is static and should not be freed or modified. +@return string, describing the error */ +const char* +ut_strerr( +/*======*/ + dberr_t num); /*!< in: error number */ + +#endif /* !UNIV_INNOCHECKSUM */ + +namespace ib { + +/** This is a wrapper class, used to print any unsigned integer type +in hexadecimal format. The main purpose of this data type is to +overload the global operator<<, so that we can print the given +wrapper value in hex. */ +struct hex { + explicit hex(uintmax_t t): m_val(t) {} + const uintmax_t m_val; +}; + +/** This is an overload of the global operator<< for the user defined type +ib::hex. The unsigned value held in the ib::hex wrapper class will be printed +into the given output stream in hexadecimal format. +@param[in,out] lhs the output stream into which rhs is written. +@param[in] rhs the object to be written into lhs. +@retval reference to the output stream. */ +inline +std::ostream& +operator<<( + std::ostream& lhs, + const hex& rhs) +{ + std::ios_base::fmtflags ff = lhs.flags(); + lhs << std::showbase << std::hex << rhs.m_val; + lhs.setf(ff); + return(lhs); +} + +/** This is a wrapper class, used to print any number in IEC style */ +struct bytes_iec { + explicit bytes_iec(unsigned long long t): m_val(t) {} + double get_double() const { return static_cast(m_val); } + const unsigned long long m_val; +}; + +/** Like hex operator above, except for bytes_iec */ +std::ostream &operator<<(std::ostream &lhs, const bytes_iec &rhs); + +/** The class logger is the base class of all the error log related classes. +It contains a std::ostringstream object. The main purpose of this class is +to forward operator<< to the underlying std::ostringstream object. Do not +use this class directly, instead use one of the derived classes. */ +class logger +{ +protected: + /* This class must not be used directly */ + ATTRIBUTE_COLD ATTRIBUTE_NOINLINE logger() = default; +public: + template ATTRIBUTE_COLD ATTRIBUTE_NOINLINE + logger& operator<<(const T& rhs) + { + m_oss << rhs; + return *this; + } + + /** Handle a fixed character string in the same way as a pointer to + an unknown-length character string, to reduce object code bloat. */ + template logger& operator<<(const char (&rhs)[N]) + { return *this << static_cast(rhs); } + + /** Output an error code name */ + ATTRIBUTE_COLD logger& operator<<(dberr_t err); + + /** Append a string. 
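A one-line illustration of the ib::hex wrapper defined above (assumes <sstream>; purely illustrative):

	std::ostringstream os;
	os << ib::hex(4096);
	/* os.str() == "0x1000": operator<< prints the wrapped value
	with std::showbase | std::hex */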
+ @param buf string buffer + @param size buffer size + @return the output stream */ + ATTRIBUTE_COLD __attribute__((noinline)) + std::ostream &write(const char *buf, std::streamsize size) + { + return m_oss.write(buf, size); + } + + std::ostream &write(const byte *buf, std::streamsize size) + { return write(reinterpret_cast(buf), size); } + + std::ostringstream m_oss; +}; + +/** The class info is used to emit informational log messages. It is to be +used similar to std::cout. But the log messages will be emitted only when +the dtor is called. The preferred usage of this class is to make use of +unnamed temporaries as follows: + +info() << "The server started successfully."; + +In the above usage, the temporary object will be destroyed at the end of the +statement and hence the log message will be emitted at the end of the +statement. If a named object is created, then the log message will be emitted +only when it goes out of scope or destroyed. */ +class info : public logger { +public: + ATTRIBUTE_COLD + ~info(); +}; + +/** The class warn is used to emit warnings. Refer to the documentation of +class info for further details. */ +class warn : public logger { +public: + ATTRIBUTE_COLD + ~warn(); +}; + +/** The class error is used to emit error messages. Refer to the +documentation of class info for further details. */ +class error : public logger { +public: + ATTRIBUTE_COLD + ~error(); + /** Indicates that error::~error() was invoked. Can be used to + determine if error messages were logged during innodb code execution. + @return true if there were error messages, false otherwise. */ + static bool was_logged() { return logged; } + +private: + /** true if error::~error() was invoked, false otherwise */ + static bool logged; +}; + +/** The class fatal is used to emit an error message and stop the server +by crashing it. Use this class when MySQL server needs to be stopped +immediately. Refer to the documentation of class info for usage details. */ +class fatal : public logger { +public: + ATTRIBUTE_NORETURN + ~fatal(); +}; + +/** Emit an error message if the given predicate is true, otherwise emit a +warning message */ +class error_or_warn : public logger { +public: + ATTRIBUTE_COLD + error_or_warn(bool pred) + : m_error(pred) + {} + + ATTRIBUTE_COLD + ~error_or_warn(); +private: + const bool m_error; +}; + +/** Emit a fatal message if the given predicate is true, otherwise emit a +error message. */ +class fatal_or_error : public logger { +public: + ATTRIBUTE_COLD + fatal_or_error(bool pred) + : m_fatal(pred) + {} + + ATTRIBUTE_COLD + ~fatal_or_error(); +private: + const bool m_fatal; +}; + +} // namespace ib + +#include "ut0ut.inl" + +#endif + diff --git a/storage/innobase/include/ut0ut.inl b/storage/innobase/include/ut0ut.inl new file mode 100644 index 00000000..73feaf82 --- /dev/null +++ b/storage/innobase/include/ut0ut.inl @@ -0,0 +1,143 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0ut.ic +Various utilities + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +#include + +/** Calculate the minimum of two pairs. +@param[out] min_hi MSB of the minimum pair +@param[out] min_lo LSB of the minimum pair +@param[in] a_hi MSB of the first pair +@param[in] a_lo LSB of the first pair +@param[in] b_hi MSB of the second pair +@param[in] b_lo LSB of the second pair */ +UNIV_INLINE +void +ut_pair_min( + ulint* min_hi, + ulint* min_lo, + ulint a_hi, + ulint a_lo, + ulint b_hi, + ulint b_lo) +{ + if (a_hi == b_hi) { + *min_hi = a_hi; + *min_lo = std::min(a_lo, b_lo); + } else if (a_hi < b_hi) { + *min_hi = a_hi; + *min_lo = a_lo; + } else { + *min_hi = b_hi; + *min_lo = b_lo; + } +} + +/******************************************************//** +Compares two ulints. +@return 1 if a > b, 0 if a == b, -1 if a < b */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + ulint a, /*!< in: ulint */ + ulint b) /*!< in: ulint */ +{ + if (a < b) { + return(-1); + } else if (a == b) { + return(0); + } else { + return(1); + } +} + +/** Compare two pairs of integers. +@param[in] a_h more significant part of first pair +@param[in] a_l less significant part of first pair +@param[in] b_h more significant part of second pair +@param[in] b_l less significant part of second pair +@return comparison result of (a_h,a_l) and (b_h,b_l) +@retval -1 if (a_h,a_l) is less than (b_h,b_l) +@retval 0 if (a_h,a_l) is equal to (b_h,b_l) +@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */ +UNIV_INLINE +int +ut_pair_cmp( + ulint a_h, + ulint a_l, + ulint b_h, + ulint b_l) +{ + if (a_h < b_h) { + return(-1); + } + if (a_h > b_h) { + return(1); + } + return(ut_ulint_cmp(a_l, b_l)); +} + +/*************************************************************//** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. +@return logarithm in the base 2, rounded upward */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + ulint n) /*!< in: number != 0 */ +{ + ulint res; + + res = 0; + + ut_ad(n > 0); + + n = n - 1; + + for (;;) { + n = n / 2; + + if (n == 0) { + break; + } + + res++; + } + + return(res + 1); +} + +/*************************************************************//** +Calculates 2 to power n. +@return 2 to power n */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + ulint n) /*!< in: number */ +{ + return((ulint) 1 << n); +} diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h new file mode 100644 index 00000000..f4660f96 --- /dev/null +++ b/storage/innobase/include/ut0vec.h @@ -0,0 +1,285 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0vec.h +A vector of pointers to data items + +Created 4/6/2006 Osku Salerma +************************************************************************/ + +#ifndef IB_VECTOR_H +#define IB_VECTOR_H + +#include "mem0mem.h" + +struct ib_alloc_t; +struct ib_vector_t; + +typedef void* (*ib_mem_alloc_t)( + /* out: Pointer to allocated memory */ + ib_alloc_t* allocator, /* in: Pointer to allocator instance */ + ulint size); /* in: Number of bytes to allocate */ + +typedef void (*ib_mem_free_t)( + ib_alloc_t* allocator, /* in: Pointer to allocator instance */ + void* ptr); /* in: Memory to free */ + +typedef void* (*ib_mem_resize_t)( + /* out: Pointer to resized memory */ + ib_alloc_t* allocator, /* in: Pointer to allocator */ + void* ptr, /* in: Memory to resize */ + ulint old_size, /* in: Old memory size in bytes */ + ulint new_size); /* in: New size in bytes */ + +typedef int (*ib_compare_t)(const void*, const void*); + +/* An automatically resizing vector datatype with the following properties: + + -All memory allocation is done through an allocator, which is responsible for +freeing it when done with the vector. +*/ + +/* This is useful shorthand for elements of type void* */ +#define ib_vector_getp(v, n) (*(void**) ib_vector_get(v, n)) +#define ib_vector_getp_const(v, n) (*(void**) ib_vector_get_const(v, n)) + +#define ib_vector_allocator(v) (v->allocator) + +/******************************************************************** +Create a new vector with the given initial size. */ +ib_vector_t* +ib_vector_create( +/*=============*/ + /* out: vector */ + ib_alloc_t* alloc, /* in: Allocator */ + /* in: size of the data item */ + ulint sizeof_value, + ulint size); /* in: initial size */ + +/******************************************************************** +Destroy the vector. Make sure the vector owns the allocator, e.g., +the heap in the the heap allocator. */ +UNIV_INLINE +void +ib_vector_free( +/*===========*/ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Push a new element to the vector, increasing its size if necessary, +if elem is not NULL then elem is copied to the vector.*/ +UNIV_INLINE +void* +ib_vector_push( +/*===========*/ + /* out: pointer the "new" element */ + ib_vector_t* vec, /* in/out: vector */ + const void* elem); /* in: data element */ + +/******************************************************************** +Pop the last element from the vector.*/ +UNIV_INLINE +void* +ib_vector_pop( +/*==========*/ + /* out: pointer to the "new" element */ + ib_vector_t* vec); /* in/out: vector */ + +/*******************************************************************//** +Remove an element to the vector +@return pointer to the "removed" element */ +UNIV_INLINE +void* +ib_vector_remove( +/*=============*/ + ib_vector_t* vec, /*!< in: vector */ + const void* elem); /*!< in: value to remove */ + +/******************************************************************** +Get the number of elements in the vector. 
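A minimal end-to-end sketch of the mem_heap-backed usage this header is designed around (the demo name is invented; per the ib_vector_free() comment above, the vector must own its allocator's heap):

	void vec_demo()
	{
		mem_heap_t*	heap = mem_heap_create(512);
		ib_alloc_t*	alloc = ib_heap_allocator_create(heap);
		ib_vector_t*	vec = ib_vector_create(alloc,
						       sizeof(ulint), 4);

		ulint	v = 42;
		ib_vector_push(vec, &v);	/* copies the value in */

		ulint*	p = static_cast<ulint*>(ib_vector_get(vec, 0));
		ut_a(*p == 42);

		ib_vector_free(vec);	/* releases the owning heap */
	}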
*/ +UNIV_INLINE +ulint +ib_vector_size( +/*===========*/ + /* out: number of elements in vector */ + const ib_vector_t* vec); /* in: vector */ + +/******************************************************************** +Increase the size of the vector. */ +void +ib_vector_resize( +/*=============*/ + /* out: number of elements in vector */ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Test whether a vector is empty or not. +@return TRUE if empty */ +UNIV_INLINE +ibool +ib_vector_is_empty( +/*===============*/ + const ib_vector_t* vec); /*!< in: vector */ + +/****************************************************************//** +Get the n'th element. +@return n'th element */ +UNIV_INLINE +void* +ib_vector_get( +/*==========*/ + ib_vector_t* vec, /*!< in: vector */ + ulint n); /*!< in: element index to get */ + +/******************************************************************** +Const version of the get n'th element. +@return n'th element */ +UNIV_INLINE +const void* +ib_vector_get_const( +/*================*/ + const ib_vector_t* vec, /* in: vector */ + ulint n); /* in: element index to get */ +/****************************************************************//** +Get last element. The vector must not be empty. +@return last element */ +UNIV_INLINE +void* +ib_vector_get_last( +/*===============*/ + ib_vector_t* vec); /*!< in: vector */ +/****************************************************************//** +Set the n'th element. */ +UNIV_INLINE +void +ib_vector_set( +/*==========*/ + ib_vector_t* vec, /*!< in/out: vector */ + ulint n, /*!< in: element index to set */ + void* elem); /*!< in: data element */ + +/******************************************************************** +Reset the vector size to 0 elements. */ +UNIV_INLINE +void +ib_vector_reset( +/*============*/ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +void* +ib_vector_last( +/*===========*/ + /* out: pointer to last element */ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +const void* +ib_vector_last_const( +/*=================*/ + /* out: pointer to last element */ + const ib_vector_t* vec); /* in: vector */ + +/******************************************************************** +Sort the vector elements. */ +UNIV_INLINE +void +ib_vector_sort( +/*===========*/ + ib_vector_t* vec, /* in/out: vector */ + ib_compare_t compare); /* in: the comparator to use for sort */ + +/******************************************************************** +The default ib_vector_t heap free. Does nothing. */ +UNIV_INLINE +void +ib_heap_free( +/*=========*/ + ib_alloc_t* allocator, /* in: allocator */ + void* ptr); /* in: size in bytes */ + +/******************************************************************** +The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_malloc( +/*===========*/ + /* out: pointer to allocated memory */ + ib_alloc_t* allocator, /* in: allocator */ + ulint size); /* in: size in bytes */ + +/******************************************************************** +The default ib_vector_t heap resize. Since we can't resize the heap +we have to copy the elements from the old ptr to the new ptr. +Uses mem_heap_alloc(). 
*/ +UNIV_INLINE +void* +ib_heap_resize( +/*===========*/ + /* out: pointer to reallocated + memory */ + ib_alloc_t* allocator, /* in: allocator */ + void* old_ptr, /* in: pointer to memory */ + ulint old_size, /* in: old size in bytes */ + ulint new_size); /* in: new size in bytes */ + +/******************************************************************** +Create a heap allocator that uses the passed-in heap. */ +UNIV_INLINE +ib_alloc_t* +ib_heap_allocator_create( +/*=====================*/ + /* out: heap allocator instance */ + mem_heap_t* heap); /* in: heap to use */ + +/******************************************************************** +Free a heap allocator. */ +UNIV_INLINE +void +ib_heap_allocator_free( +/*===================*/ + ib_alloc_t* ib_ut_alloc); /* in: alloc instance to free */ + +/* Allocator used by ib_vector_t. */ +struct ib_alloc_t { + ib_mem_alloc_t mem_malloc; /* For allocating memory */ + ib_mem_free_t mem_release; /* For freeing memory */ + ib_mem_resize_t mem_resize; /* For resizing memory */ + void* arg; /* Currently if not NULL then it + points to the heap instance */ +}; + +/* See comment at beginning of file. */ +struct ib_vector_t { + ib_alloc_t* allocator; /* Allocator, because one size + doesn't fit all */ + void* data; /* data elements */ + ulint used; /* number of elements currently used */ + ulint total; /* number of elements allocated */ + /* Size of a data item */ + ulint sizeof_value; +}; + +#include "ut0vec.inl" + +#endif /* IB_VECTOR_H */ diff --git a/storage/innobase/include/ut0vec.inl b/storage/innobase/include/ut0vec.inl new file mode 100644 index 00000000..531f0f22 --- /dev/null +++ b/storage/innobase/include/ut0vec.inl @@ -0,0 +1,348 @@ +/***************************************************************************** + +Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0vec.ic +A vector of pointers to data items + +Created 4/6/2006 Osku Salerma +************************************************************************/ + +#define IB_VEC_OFFSET(v, i) ((v)->sizeof_value * (i)) + +/******************************************************************** +The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_malloc( +/*===========*/ + ib_alloc_t* allocator, /* in: allocator */ + ulint size) /* in: size in bytes */ +{ + mem_heap_t* heap = (mem_heap_t*) allocator->arg; + + return(mem_heap_alloc(heap, size)); +} + +/******************************************************************** +The default ib_vector_t heap free. Does nothing.
*/ +UNIV_INLINE +void +ib_heap_free( +/*=========*/ + ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */ + void* ptr UNIV_UNUSED) /* in: pointer to memory (ignored) */ +{ + /* We can't free individual elements. */ +} + +/******************************************************************** +The default ib_vector_t heap resize. Since we can't resize the heap +we have to copy the elements from the old ptr to the new ptr. +We always assume new_size >= old_size, so the buffer won't overflow. +Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_resize( +/*===========*/ + ib_alloc_t* allocator, /* in: allocator */ + void* old_ptr, /* in: pointer to memory */ + ulint old_size, /* in: old size in bytes */ + ulint new_size) /* in: new size in bytes */ +{ + void* new_ptr; + mem_heap_t* heap = (mem_heap_t*) allocator->arg; + + ut_a(new_size >= old_size); + new_ptr = mem_heap_alloc(heap, new_size); + memcpy(new_ptr, old_ptr, old_size); + + return(new_ptr); +} + +/******************************************************************** +Create a heap allocator that uses the passed in heap. */ +UNIV_INLINE +ib_alloc_t* +ib_heap_allocator_create( +/*=====================*/ + mem_heap_t* heap) /* in: heap to use */ +{ + ib_alloc_t* heap_alloc; + + heap_alloc = (ib_alloc_t*) mem_heap_alloc(heap, sizeof(*heap_alloc)); + + heap_alloc->arg = heap; + heap_alloc->mem_release = ib_heap_free; + heap_alloc->mem_malloc = ib_heap_malloc; + heap_alloc->mem_resize = ib_heap_resize; + + return(heap_alloc); +} + +/******************************************************************** +Free a heap allocator. */ +UNIV_INLINE +void +ib_heap_allocator_free( +/*===================*/ + ib_alloc_t* ib_ut_alloc) /* in: alloc instance to free */ +{ + mem_heap_free((mem_heap_t*) ib_ut_alloc->arg); +} + +/******************************************************************** +Get number of elements in vector. */ +UNIV_INLINE +ulint +ib_vector_size( +/*===========*/ + /* out: number of elements in vector */ + const ib_vector_t* vec) /* in: vector */ +{ + return(vec->used); +} + +/****************************************************************//** +Get n'th element. */ +UNIV_INLINE +void* +ib_vector_get( +/*==========*/ + ib_vector_t* vec, /*!< in: vector */ + ulint n) /*!< in: element index to get */ +{ + ut_a(n < vec->used); + + return((byte*) vec->data + IB_VEC_OFFSET(vec, n)); +} + +/******************************************************************** +Const version of the get n'th element. +@return n'th element */ +UNIV_INLINE +const void* +ib_vector_get_const( +/*================*/ + const ib_vector_t* vec, /* in: vector */ + ulint n) /* in: element index to get */ +{ + ut_a(n < vec->used); + + return((byte*) vec->data + IB_VEC_OFFSET(vec, n)); +} +/****************************************************************//** +Get last element. The vector must not be empty. +@return last element */ +UNIV_INLINE +void* +ib_vector_get_last( +/*===============*/ + ib_vector_t* vec) /*!< in: vector */ +{ + ut_a(vec->used > 0); + + return((byte*) ib_vector_get(vec, vec->used - 1)); +} + +/****************************************************************//** +Set the n'th element.
*/ +UNIV_INLINE +void +ib_vector_set( +/*==========*/ + ib_vector_t* vec, /*!< in/out: vector */ + ulint n, /*!< in: element index to set */ + void* elem) /*!< in: data element */ +{ + void* slot; + + ut_a(n < vec->used); + + slot = ((byte*) vec->data + IB_VEC_OFFSET(vec, n)); + memcpy(slot, elem, vec->sizeof_value); +} + +/******************************************************************** +Reset the vector size to 0 elements. */ +UNIV_INLINE +void +ib_vector_reset( +/*============*/ + ib_vector_t* vec) /* in/out: vector */ +{ + vec->used = 0; +} + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +void* +ib_vector_last( +/*===========*/ + /* out: pointer to last element */ + ib_vector_t* vec) /* in: vector */ +{ + ut_a(ib_vector_size(vec) > 0); + + return(ib_vector_get(vec, ib_vector_size(vec) - 1)); +} + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +const void* +ib_vector_last_const( +/*=================*/ + /* out: pointer to last element */ + const ib_vector_t* vec) /* in: vector */ +{ + ut_a(ib_vector_size(vec) > 0); + + return(ib_vector_get_const(vec, ib_vector_size(vec) - 1)); +} + +/****************************************************************//** +Remove the last element from the vector. +@return last vector element */ +UNIV_INLINE +void* +ib_vector_pop( +/*==========*/ + /* out: pointer to element */ + ib_vector_t* vec) /* in/out: vector */ +{ + void* elem; + + ut_a(vec->used > 0); + + elem = ib_vector_last(vec); + --vec->used; + + return(elem); +} + +/******************************************************************** +Append an element to the vector, if elem != NULL then copy the data +from elem. */ +UNIV_INLINE +void* +ib_vector_push( +/*===========*/ + /* out: pointer to the "new" element */ + ib_vector_t* vec, /* in/out: vector */ + const void* elem) /* in: element to add (can be NULL) */ +{ + void* last; + + if (vec->used >= vec->total) { + ib_vector_resize(vec); + } + + last = (byte*) vec->data + IB_VEC_OFFSET(vec, vec->used); + +#ifdef UNIV_DEBUG + memset(last, 0, vec->sizeof_value); +#endif + + if (elem) { + memcpy(last, elem, vec->sizeof_value); + } + + ++vec->used; + + return(last); +} + +/*******************************************************************//** +Remove an element from the vector +@return pointer to the "removed" element */ +UNIV_INLINE +void* +ib_vector_remove( +/*=============*/ + ib_vector_t* vec, /*!< in/out: vector */ + const void* elem) /*!< in: value to remove */ +{ + void* current = NULL; + void* next; + ulint i; + ulint old_used_count = vec->used; + + for (i = 0; i < vec->used; i++) { + current = ib_vector_get(vec, i); + + if (*(void**) current == elem) { + if (i == vec->used - 1) { + return(ib_vector_pop(vec)); + } + + next = ib_vector_get(vec, i + 1); + memmove(current, next, vec->sizeof_value + * (vec->used - i - 1)); + --vec->used; + break; + } + } + + return((old_used_count != vec->used) ? current : NULL); +} + +/******************************************************************** +Sort the vector elements. */ +UNIV_INLINE +void +ib_vector_sort( +/*===========*/ + ib_vector_t* vec, /* in/out: vector */ + ib_compare_t compare)/* in: the comparator to use for sort */ +{ + qsort(vec->data, vec->used, vec->sizeof_value, compare); +} + +/******************************************************************** +Destroy the vector. Make sure the vector owns the allocator, e.g., +the heap in the heap allocator.
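+A minimal usage sketch (illustrative only; ib_vector_create() is declared +earlier in this header, and the sizes shown are arbitrary): + + mem_heap_t* heap = mem_heap_create(1024); + ib_alloc_t* alloc = ib_heap_allocator_create(heap); + ib_vector_t* vec = ib_vector_create(alloc, sizeof(ulint), 4); + ulint val = 42; + ib_vector_push(vec, &val); <- copies the bytes of val into the vector + ib_vector_free(vec); <- releases the whole heap at once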
*/ +UNIV_INLINE +void +ib_vector_free( +/*===========*/ + ib_vector_t* vec) /* in, own: vector */ +{ + /* Currently we only support one type of allocator - heap, + when the heap is freed all the elements are freed too. */ + + /* Only the heap allocator uses the arg field. */ + ut_ad(vec->allocator->arg != NULL); + + mem_heap_free((mem_heap_t*) vec->allocator->arg); +} + +/******************************************************************** +Test whether a vector is empty or not. +@return TRUE if empty */ +UNIV_INLINE +ibool +ib_vector_is_empty( +/*===============*/ + const ib_vector_t* vec) /*!< in: vector */ +{ + return(ib_vector_size(vec) == 0); +} diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h new file mode 100644 index 00000000..95c7a248 --- /dev/null +++ b/storage/innobase/include/ut0wqueue.h @@ -0,0 +1,86 @@ +/***************************************************************************** + +Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0wqueue.h +A work queue + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/*******************************************************************//** +A Work queue. Threads can add work items to the queue and other threads can +wait for work items to be available and take them off the queue for +processing. +************************************************************************/ + +#pragma once + +#include "ut0list.h" +#include "mem0mem.h" + +// Forward declaration +struct ib_list_t; + +/** Work queue */ +struct ib_wqueue_t +{ + /** Mutex protecting everything */ + mysql_mutex_t mutex; + /** Work item list */ + ib_list_t *items; + /** ib_list_len(*items) */ + size_t length; +}; + +/****************************************************************//** +Create a new work queue. +@return work queue */ +ib_wqueue_t* +ib_wqueue_create(); +/*===============*/ + +/****************************************************************//** +Free a work queue. */ +void +ib_wqueue_free( +/*===========*/ + ib_wqueue_t* wq); /*!< in: work queue */ + +/** Add a work item to the queue. +@param[in,out] wq work queue +@param[in] item work item +@param[in,out] heap memory heap to use for allocating list node +@param[in] wq_locked work queue mutex locked */ +void +ib_wqueue_add(ib_wqueue_t* wq, void* item, mem_heap_t* heap, + bool wq_locked = false); + +/** Check if queue is empty. 
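+(Typical producer/consumer usage of this queue, as an illustrative sketch +only; item and heap come from the caller: + ib_wqueue_t* wq = ib_wqueue_create(); + ib_wqueue_add(wq, item, heap); <- producer side + void* work = ib_wqueue_nowait(wq); <- consumer side, non-blocking + ib_wqueue_free(wq); <- once the queue is empty +)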
+@param wq wait queue +@return whether the queue is empty */ +bool ib_wqueue_is_empty(ib_wqueue_t* wq); + +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq); /*current_lock = lock; + + if (bit_no != ULINT_UNDEFINED); + else if (lock->is_table()) + bit_no= ULINT_UNDEFINED; + else + { + bit_no= lock_rec_find_set_bit(lock); + ut_ad(bit_no != ULINT_UNDEFINED); + } + + iter->bit_no= bit_no; +} + +/*******************************************************************//** +Gets the previous lock in the lock queue, returns NULL if there are no +more locks (i.e. the current lock is the first one). The iterator is +receded (if not-NULL is returned). +@return previous lock or NULL */ +const lock_t* +lock_queue_iterator_get_prev( +/*=========================*/ + lock_queue_iterator_t* iter) /*!< in/out: iterator */ +{ + lock_sys.assert_locked(*iter->current_lock); + + const lock_t *prev_lock= !iter->current_lock->is_table() + ? lock_rec_get_prev(iter->current_lock, iter->bit_no) + : UT_LIST_GET_PREV(un_member.tab_lock.locks, iter->current_lock); + + if (prev_lock) + iter->current_lock= prev_lock; + + return prev_lock; +} diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc new file mode 100644 index 00000000..df51ceb1 --- /dev/null +++ b/storage/innobase/lock/lock0lock.cc @@ -0,0 +1,6812 @@ +/***************************************************************************** + +Copyright (c) 1996, 2022, Oracle and/or its affiliates. +Copyright (c) 2014, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file lock/lock0lock.cc +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#define LOCK_MODULE_IMPLEMENTATION + +#include "univ.i" + +#include +#include +#include + +#include "lock0lock.h" +#include "lock0priv.h" +#include "dict0mem.h" +#include "trx0purge.h" +#include "trx0sys.h" +#include "ut0vec.h" +#include "btr0cur.h" +#include "row0sel.h" +#include "row0mysql.h" +#include "row0vers.h" +#include "pars0pars.h" +#include "srv0mon.h" +#include "que0que.h" +#include "scope.h" +#include + +#include + +#ifdef WITH_WSREP +#include +#endif /* WITH_WSREP */ + +/** The value of innodb_deadlock_detect */ +my_bool innodb_deadlock_detect; +/** The value of innodb_deadlock_report */ +ulong innodb_deadlock_report; + +#ifdef HAVE_REPLICATION +extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd); +extern "C" int thd_need_wait_reports(const MYSQL_THD thd); +extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd); +extern "C" int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2); +#endif + +/** Functor for accessing the embedded node within a table lock. */ +struct TableLockGetNode +{ + ut_list_node<lock_t> &operator()(lock_t &elem) + { return(elem.un_member.tab_lock.locks); } +}; + +/** Create the hash table. +@param n the lower bound of n_cells */ +void lock_sys_t::hash_table::create(ulint n) +{ + n_cells= ut_find_prime(n); + const size_t size= MY_ALIGN(pad(n_cells) * sizeof *array, + CPU_LEVEL1_DCACHE_LINESIZE); + void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE); + memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(v, 0, size); + array= static_cast<hash_cell_t*>(v); +} + +/** Resize the hash table.
+@param n the lower bound of n_cells */ +void lock_sys_t::hash_table::resize(ulint n) +{ + ut_ad(lock_sys.is_writer()); + ulint new_n_cells= ut_find_prime(n); + const size_t size= MY_ALIGN(pad(new_n_cells) * sizeof *array, + CPU_LEVEL1_DCACHE_LINESIZE); + void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE); + memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(v, 0, size); + hash_cell_t *new_array= static_cast<hash_cell_t*>(v); + + for (auto i= pad(n_cells); i--; ) + { + if (lock_t *lock= static_cast<lock_t*>(array[i].node)) + { + /* all hash_latch must be vacated */ + ut_ad(i % (ELEMENTS_PER_LATCH + LATCH) >= LATCH); + do + { + ut_ad(!lock->is_table()); + hash_cell_t *c= calc_hash(lock->un_member.rec_lock.page_id.fold(), + new_n_cells) + new_array; + lock_t *next= lock->hash; + lock->hash= nullptr; + if (!c->node) + c->node= lock; + else if (!lock->is_waiting()) + { + lock->hash= static_cast<lock_t*>(c->node); + c->node= lock; + } + else + { + lock_t *next= static_cast<lock_t*>(c->node); + while (next->hash) + next= next->hash; + next->hash= lock; + } + lock= next; + } + while (lock); + } + } + + aligned_free(array); + array= new_array; + n_cells= new_n_cells; +} + +#ifdef SUX_LOCK_GENERIC +void lock_sys_t::hash_latch::wait() +{ + pthread_mutex_lock(&lock_sys.hash_mutex); + while (!write_trylock()) + pthread_cond_wait(&lock_sys.hash_cond, &lock_sys.hash_mutex); + pthread_mutex_unlock(&lock_sys.hash_mutex); +} + +void lock_sys_t::hash_latch::release() +{ + pthread_mutex_lock(&lock_sys.hash_mutex); + write_unlock(); + pthread_cond_signal(&lock_sys.hash_cond); + pthread_mutex_unlock(&lock_sys.hash_mutex); +} +#endif + +#ifdef UNIV_DEBUG +/** Assert that a lock shard is exclusively latched by this thread */ +void lock_sys_t::assert_locked(const lock_t &lock) const +{ + ut_ad(this == &lock_sys); + if (is_writer()) + return; + if (lock.is_table()) + assert_locked(*lock.un_member.tab_lock.table); + else + lock_sys.hash_get(lock.type_mode).
+ assert_locked(lock.un_member.rec_lock.page_id); +} + +/** Assert that a table lock shard is exclusively latched by this thread */ +void lock_sys_t::assert_locked(const dict_table_t &table) const +{ + ut_ad(!table.is_temporary()); + if (is_writer()) + return; + ut_ad(readers); + ut_ad(table.lock_mutex_is_owner()); +} + +/** Assert that hash cell for page is exclusively latched by this thread */ +void lock_sys_t::hash_table::assert_locked(const page_id_t id) const +{ + if (lock_sys.is_writer()) + return; + ut_ad(lock_sys.readers); + ut_ad(latch(cell_get(id.fold()))->is_locked()); +} + +/** Assert that a hash table cell is exclusively latched (by some thread) */ +void lock_sys_t::assert_locked(const hash_cell_t &cell) const +{ + if (is_writer()) + return; + ut_ad(lock_sys.readers); + ut_ad(hash_table::latch(const_cast<hash_cell_t*>(&cell))->is_locked()); +} +#endif + +LockGuard::LockGuard(lock_sys_t::hash_table &hash, page_id_t id) +{ + const auto id_fold= id.fold(); + lock_sys.rd_lock(SRW_LOCK_CALL); + cell_= hash.cell_get(id_fold); + hash.latch(cell_)->acquire(); +} + +LockMultiGuard::LockMultiGuard(lock_sys_t::hash_table &hash, + const page_id_t id1, const page_id_t id2) +{ + ut_ad(id1.space() == id2.space()); + const auto id1_fold= id1.fold(), id2_fold= id2.fold(); + lock_sys.rd_lock(SRW_LOCK_CALL); + cell1_= hash.cell_get(id1_fold); + cell2_= hash.cell_get(id2_fold); + + auto latch1= hash.latch(cell1_), latch2= hash.latch(cell2_); + if (latch1 > latch2) + std::swap(latch1, latch2); + latch1->acquire(); + if (latch1 != latch2) + latch2->acquire(); +} + +LockMultiGuard::~LockMultiGuard() +{ + auto latch1= lock_sys_t::hash_table::latch(cell1_), + latch2= lock_sys_t::hash_table::latch(cell2_); + latch1->release(); + if (latch1 != latch2) + latch2->release(); + /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */ + lock_sys.rd_unlock(); +} + +TRANSACTIONAL_TARGET +TMLockGuard::TMLockGuard(lock_sys_t::hash_table &hash, page_id_t id) +{ + const auto id_fold= id.fold(); +#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC + if (xbegin()) + { + if (lock_sys.latch.is_write_locked()) + xabort(); + cell_= hash.cell_get(id_fold); + if (hash.latch(cell_)->is_locked()) + xabort(); + elided= true; + return; + } + elided= false; +#endif + lock_sys.rd_lock(SRW_LOCK_CALL); + cell_= hash.cell_get(id_fold); + hash.latch(cell_)->acquire(); +} + +/** Pretty-print a table lock. +@param[in,out] file output stream +@param[in] lock table lock */ +static void lock_table_print(FILE* file, const lock_t* lock); + +/** Pretty-print a record lock. +@param[in,out] file output stream +@param[in] lock record lock +@param[in,out] mtr mini-transaction for accessing the record */ +static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr); + +namespace Deadlock +{ + /** Whether to_check may be nonempty */ + static Atomic_relaxed<bool> to_be_checked; + /** Transactions to check for deadlock. Protected by lock_sys.wait_mutex. */ + static std::set<trx_t*> to_check; + + MY_ATTRIBUTE((nonnull, warn_unused_result)) + /** Check if a lock request results in a deadlock. + Resolve a deadlock by choosing a transaction that will be rolled back. + @param trx transaction requesting a lock + @param wait_lock the lock being requested + @return the lock that trx is or was waiting for + @retval nullptr if the lock wait was resolved + @retval -1 if trx must report DB_DEADLOCK */ + static lock_t *check_and_resolve(trx_t *trx, lock_t *wait_lock); + + /** Quickly detect a deadlock using Brent's cycle detection algorithm.
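+ (Sketch of the idea, illustrative only: the hare follows one + trx_t::lock.wait_trx edge per step, and whenever the step count reaches + a power of two the tortoise teleports to the hare's position; in a + generic form: + + Node *tortoise = start, *hare = start; + for (unsigned power = 1, l = 1; (hare = hare->next); l++) { + if (tortoise == hare) return hare; meeting point lies inside a cycle + if (l == power) { power <<= 1; l = 0; tortoise = hare; } + } + return nullptr; the walk terminated, so there is no cycle + + Any cycle in the waits-for graph eventually makes the hare lap the + tortoise, which is exactly what the function below implements.)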
+ @param trx transaction that is waiting for another transaction + @return a transaction that is part of a cycle + @retval nullptr if no cycle was found */ + inline trx_t *find_cycle(trx_t *trx) + { + mysql_mutex_assert_owner(&lock_sys.wait_mutex); + trx_t *tortoise= trx, *hare= trx; + for (unsigned power= 1, l= 1; (hare= hare->lock.wait_trx) != nullptr; l++) + { + if (tortoise == hare) + { + ut_ad(l > 1); + lock_sys.deadlocks++; + /* Note: Normally, trx should be part of any deadlock cycle + that is found. However, if innodb_deadlock_detect=OFF had been + in effect in the past, it is possible that trx will be waiting + for a transaction that participates in a pre-existing deadlock + cycle. In that case, our victim will not be trx. */ + return hare; + } + if (l == power) + { + /* The maximum concurrent number of TRX_STATE_ACTIVE transactions + is TRX_RSEG_N_SLOTS * 128, or innodb_page_size / 16 * 128 + (default: 131,072, maximum: 524,288). + Our maximum possible number of iterations should be twice that. */ + power<<= 1; + l= 0; + tortoise= hare; + } + } + return nullptr; + } +}; + +#ifdef UNIV_DEBUG +/** Validate the transactional locks. */ +static void lock_validate(); + +/** Validate the record lock queues on a page. +@param block buffer pool block +@param latched whether the tablespace latch may be held +@return true if ok */ +static bool lock_rec_validate_page(const buf_block_t *block, bool latched) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ + +/* The lock system */ +lock_sys_t lock_sys; + +/** Only created if !srv_read_only_mode. Protected by lock_sys.latch. */ +static FILE *lock_latest_err_file; + +/*********************************************************************//** +Reports that a transaction id is insensible, i.e., in the future. */ +ATTRIBUTE_COLD +void +lock_report_trx_id_insanity( +/*========================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ + trx_id_t max_trx_id) /*!< in: trx_sys.get_max_trx_id() */ +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_is_metadata(rec, *index)); + + ib::error() + << "Transaction id " << ib::hex(trx_id) + << " associated with record" << rec_offsets_print(rec, offsets) + << " in index " << index->name + << " of table " << index->table->name + << " is greater than the global counter " << max_trx_id + << "! The table is corrupted."; +} + +/*********************************************************************//** +Checks that a transaction id is sensible, i.e., not in the future. +@return true if ok */ +bool +lock_check_trx_id_sanity( +/*=====================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets) /*!< in: rec_get_offsets(rec, index) */ +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_is_metadata(rec, *index)); + + trx_id_t max_trx_id= trx_sys.get_max_trx_id(); + ut_ad(max_trx_id || srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN); + + if (UNIV_LIKELY(max_trx_id != 0) && UNIV_UNLIKELY(trx_id >= max_trx_id)) + { + lock_report_trx_id_insanity(trx_id, rec, index, offsets, max_trx_id); + return false; + } + return true; +} + + +/** + Creates the lock system at database start. 
+ + @param[in] n_cells number of slots in lock hash table +*/ +void lock_sys_t::create(ulint n_cells) +{ + ut_ad(this == &lock_sys); + ut_ad(!is_initialised()); + + m_initialised= true; + + latch.SRW_LOCK_INIT(lock_latch_key); +#ifdef __aarch64__ + mysql_mutex_init(lock_wait_mutex_key, &wait_mutex, MY_MUTEX_INIT_FAST); +#else + mysql_mutex_init(lock_wait_mutex_key, &wait_mutex, nullptr); +#endif +#ifdef SUX_LOCK_GENERIC + pthread_mutex_init(&hash_mutex, nullptr); + pthread_cond_init(&hash_cond, nullptr); +#endif + + rec_hash.create(n_cells); + prdt_hash.create(n_cells); + prdt_page_hash.create(n_cells); + + if (!srv_read_only_mode) + { + lock_latest_err_file= os_file_create_tmpfile(); + ut_a(lock_latest_err_file); + } +} + +#ifdef UNIV_PFS_RWLOCK +/** Acquire exclusive lock_sys.latch */ +void lock_sys_t::wr_lock(const char *file, unsigned line) +{ + mysql_mutex_assert_not_owner(&wait_mutex); + latch.wr_lock(file, line); + ut_ad(!writer.exchange(pthread_self(), std::memory_order_relaxed)); +} +/** Release exclusive lock_sys.latch */ +void lock_sys_t::wr_unlock() +{ + ut_ad(writer.exchange(0, std::memory_order_relaxed) == + pthread_self()); + latch.wr_unlock(); +} + +/** Acquire shared lock_sys.latch */ +void lock_sys_t::rd_lock(const char *file, unsigned line) +{ + mysql_mutex_assert_not_owner(&wait_mutex); + latch.rd_lock(file, line); + ut_ad(!writer.load(std::memory_order_relaxed)); + ut_d(readers.fetch_add(1, std::memory_order_relaxed)); +} + +/** Release shared lock_sys.latch */ +void lock_sys_t::rd_unlock() +{ + ut_ad(!writer.load(std::memory_order_relaxed)); + ut_ad(readers.fetch_sub(1, std::memory_order_relaxed)); + latch.rd_unlock(); +} +#endif + +/** + Resize the lock hash table. + + @param[in] n_cells number of slots in lock hash table +*/ +void lock_sys_t::resize(ulint n_cells) +{ + ut_ad(this == &lock_sys); + /* Buffer pool resizing is rarely initiated by the user, and this + would exceed the maximum size of a memory transaction. */ + LockMutexGuard g{SRW_LOCK_CALL}; + rec_hash.resize(n_cells); + prdt_hash.resize(n_cells); + prdt_page_hash.resize(n_cells); +} + +/** Closes the lock system at database shutdown. */ +void lock_sys_t::close() +{ + ut_ad(this == &lock_sys); + + if (!m_initialised) + return; + + if (lock_latest_err_file) + { + my_fclose(lock_latest_err_file, MYF(MY_WME)); + lock_latest_err_file= nullptr; + } + + rec_hash.free(); + prdt_hash.free(); + prdt_page_hash.free(); +#ifdef SUX_LOCK_GENERIC + pthread_mutex_destroy(&hash_mutex); + pthread_cond_destroy(&hash_cond); +#endif + + latch.destroy(); + mysql_mutex_destroy(&wait_mutex); + + Deadlock::to_check.clear(); + Deadlock::to_be_checked= false; + + m_initialised= false; +} + +#ifdef WITH_WSREP +# ifdef UNIV_DEBUG +/** Check if both the transaction holding the conflicting lock and the other +transaction requesting the record lock are brute force (BF). If they are, +check whether this BF-BF wait is correct and, if not, report the BF wait and +assert. + +@param[in] lock_rec other waiting record lock +@param[in] trx trx requesting conflicting record lock +*/ +static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx) +{ + ut_ad(!lock->is_table()); + lock_sys.assert_locked(*lock); + trx_t* lock_trx= lock->trx; + + /* Note that we are holding lock_sys.latch, thus we should + not acquire THD::LOCK_thd_data mutex below to avoid latching + order violation.
*/ + + if (!trx->is_wsrep() || !lock_trx->is_wsrep()) + return; + if (UNIV_LIKELY(!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) + || UNIV_LIKELY(!wsrep_thd_is_BF(lock_trx->mysql_thd, FALSE))) + return; + + ut_ad(trx->state == TRX_STATE_ACTIVE); + + switch (lock_trx->state) { + case TRX_STATE_COMMITTED_IN_MEMORY: + /* The state change is only protected by trx_t::mutex, + which we are not even holding here. */ + case TRX_STATE_PREPARED: + /* Wait for lock->trx to complete the commit + (or XA ROLLBACK) and to release the lock. */ + return; + case TRX_STATE_ACTIVE: + break; + default: + ut_ad("invalid state" == 0); + } + + /* If BF-BF order is honored, i.e. the trx already holding + the record lock is ordered before this new lock request, + we can keep trx waiting for the lock. If the conflicting + transaction is already aborting or rolling back for replaying, + we can also let the new transaction wait. */ + if (wsrep_thd_order_before(lock_trx->mysql_thd, trx->mysql_thd) + || wsrep_thd_is_aborting(lock_trx->mysql_thd)) { + return; + } + + mtr_t mtr; + + ib::error() << "Conflicting lock on table: " + << lock->index->table->name + << " index: " + << lock->index->name() + << " that has lock "; + lock_rec_print(stderr, lock, mtr); + + ib::error() << "WSREP state: "; + + wsrep_report_bf_lock_wait(trx->mysql_thd, + trx->id); + wsrep_report_bf_lock_wait(lock_trx->mysql_thd, + lock_trx->id); + /* BF-BF wait is a bug */ + ut_error; +} +# endif /* UNIV_DEBUG */ + +/** Check if the lock timeout was for a high-priority (BF) thread; +as a side effect, trigger the lock monitor +@param trx transaction owning the lock +@return false for regular lock timeout */ +ATTRIBUTE_NOINLINE static bool wsrep_is_BF_lock_timeout(const trx_t &trx) +{ + ut_ad(trx.is_wsrep()); + + if (trx.error_state == DB_DEADLOCK || !srv_monitor_timer || + !wsrep_thd_is_BF(trx.mysql_thd, false)) + return false; + + ib::info() << "WSREP: BF lock wait long for trx:" << ib::hex(trx.id) + << " query: " << wsrep_thd_query(trx.mysql_thd); + return true; +} +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Checks if a lock request for a new lock has to wait for request lock2. +@return TRUE if new lock has to wait for lock2 to be removed */ +UNIV_INLINE +bool +lock_rec_has_to_wait( +/*=================*/ + const trx_t* trx, /*!< in: trx of new lock */ + unsigned type_mode,/*!< in: precise mode of the new lock + to set: LOCK_S or LOCK_X, possibly + ORed to LOCK_GAP or LOCK_REC_NOT_GAP, + LOCK_INSERT_INTENTION */ + const lock_t* lock2, /*!< in: another record lock; NOTE that + it is assumed that this has a lock bit + set on the same record as in the new + lock we are setting */ + bool lock_is_on_supremum) + /*!< in: TRUE if we are setting the + lock on the 'supremum' record of an + index page: we know then that the lock + request is really for a 'gap' type lock */ +{ + ut_ad(trx); + ut_ad(!lock2->is_table()); + ut_d(lock_sys.hash_get(type_mode).assert_locked( + lock2->un_member.rec_lock.page_id)); + + if (trx == lock2->trx + || lock_mode_compatible( + static_cast<lock_mode>(LOCK_MODE_MASK & type_mode), + lock2->mode())) { + return false; + } + + /* We have somewhat complex rules when gap type record locks + cause waits */ + + if ((lock_is_on_supremum || (type_mode & LOCK_GAP)) + && !(type_mode & LOCK_INSERT_INTENTION)) { + + /* Gap type locks without LOCK_INSERT_INTENTION flag + do not need to wait for anything. This is because + different users can have conflicting lock types + on gaps.
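+ + For example (illustrative SQL, assuming a secondary index on t.a): + + trx 1: SELECT * FROM t WHERE a BETWEEN 10 AND 20 FOR UPDATE; + trx 2: SELECT * FROM t WHERE a BETWEEN 10 AND 20 FOR UPDATE; + + Both transactions may end up holding X gap locks on the same gap and + neither waits; only an insert intention request (say, trx 2 inserting + a = 15 into that gap) has to wait, which the checks below implement.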
*/ + + return false; + } + + if (!(type_mode & LOCK_INSERT_INTENTION) && lock2->is_gap()) { + + /* Record lock (LOCK_ORDINARY or LOCK_REC_NOT_GAP) + does not need to wait for a gap type lock */ + + return false; + } + + if ((type_mode & LOCK_GAP) && lock2->is_record_not_gap()) { + + /* Lock on gap does not need to wait for + a LOCK_REC_NOT_GAP type lock */ + + return false; + } + + if (lock2->is_insert_intention()) { + /* No lock request needs to wait for an insert + intention lock to be removed. This is ok since our + rules allow conflicting locks on gaps. This eliminates + a spurious deadlock caused by a next-key lock waiting + for an insert intention lock; when the insert + intention lock was granted, the insert deadlocked on + the waiting next-key lock. + + Also, insert intention locks do not disturb each + other. */ + + return false; + } + +#ifdef HAVE_REPLICATION + if ((type_mode & LOCK_GAP || lock2->is_gap()) + && !thd_need_ordering_with(trx->mysql_thd, lock2->trx->mysql_thd)) { + /* If the upper server layer has already decided on the + commit order between the transaction requesting the + lock and the transaction owning the lock, we do not + need to wait for gap locks. Such ordering by the upper + server layer happens in parallel replication, where the + commit order is fixed to match the original order on the + master. + + Such gap locks are mainly needed to get serialisability + between transactions, so that they will be binlogged in + the correct order and statement-based replication will + give the correct results. Since the right order + was already determined on the master, we do not need + to enforce it again here. + + Skipping the locks is not essential for correctness, + since in case of deadlock we will just kill the later + transaction and retry it. But it can save some + unnecessary rollbacks and retries. */ + + return false; + } +#endif /* HAVE_REPLICATION */ + +#ifdef WITH_WSREP + /* The new lock request comes from a transaction using a unique key + scan, and that transaction is a wsrep high-priority (brute force) + transaction. If the conflicting transaction is also a wsrep high + priority transaction, we should avoid the lock conflict because + the ordering of these transactions is already decided and the + conflicting transaction will be replayed later. */ + if (trx->is_wsrep_UK_scan() + && wsrep_thd_is_BF(lock2->trx->mysql_thd, false)) { + return false; + } + + /* if BF-BF conflict, we have to look at write set order */ + if (trx->is_wsrep() && + (type_mode & LOCK_MODE_MASK) == LOCK_X && + (lock2->type_mode & LOCK_MODE_MASK) == LOCK_X && + wsrep_thd_order_before(trx->mysql_thd, + lock2->trx->mysql_thd)) { + return false; + } + + /* We can very well let BF wait normally here, as the other + BF will be replayed in case of conflict. For debug + builds we do additional sanity checks to catch any + unsupported BF wait. */ + ut_d(wsrep_assert_no_bf_bf_wait(lock2, trx)); +#endif /* WITH_WSREP */ + + return true; +} + +/*********************************************************************//** +Checks if a lock request lock1 has to wait for request lock2.
+@return TRUE if lock1 has to wait for lock2 to be removed */ +bool +lock_has_to_wait( +/*=============*/ + const lock_t* lock1, /*!< in: waiting lock */ + const lock_t* lock2) /*!< in: another lock; NOTE that it is + assumed that this has a lock bit set + on the same record as in lock1 if the + locks are record locks */ +{ + ut_ad(lock1 && lock2); + + if (lock1->trx == lock2->trx + || lock_mode_compatible(lock1->mode(), lock2->mode())) { + return false; + } + + if (lock1->is_table()) { + return true; + } + + ut_ad(!lock2->is_table()); + + if (lock1->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)) { + return lock_prdt_has_to_wait(lock1->trx, lock1->type_mode, + lock_get_prdt_from_lock(lock1), + lock2); + } + + return lock_rec_has_to_wait( + lock1->trx, lock1->type_mode, lock2, + lock_rec_get_nth_bit(lock1, PAGE_HEAP_NO_SUPREMUM)); +} + +/*============== RECORD LOCK BASIC FUNCTIONS ============================*/ + +/**********************************************************************//** +Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED +if none is found. +@return bit index == heap number of the record, or ULINT_UNDEFINED if +none found */ +ulint +lock_rec_find_set_bit( +/*==================*/ + const lock_t* lock) /*!< in: record lock with at least one bit set */ +{ + for (ulint i = 0; i < lock_rec_get_n_bits(lock); ++i) { + + if (lock_rec_get_nth_bit(lock, i)) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/*********************************************************************//** +Resets the record lock bitmap to zero. NOTE: does not touch the wait_lock +pointer in the transaction! This function is used in lock object creation +and resetting. */ +static +void +lock_rec_bitmap_reset( +/*==================*/ + lock_t* lock) /*!< in: record lock */ +{ + ulint n_bytes; + + ut_ad(!lock->is_table()); + + /* Reset to zero the bitmap which resides immediately after the lock + struct */ + + n_bytes = lock_rec_get_n_bits(lock) / 8; + + ut_ad((lock_rec_get_n_bits(lock) % 8) == 0); + + memset(reinterpret_cast<void*>(&lock[1]), 0, n_bytes); +} + +/*********************************************************************//** +Copies a record lock to heap. +@return copy of lock */ +static +lock_t* +lock_rec_copy( +/*==========*/ + const lock_t* lock, /*!< in: record lock */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint size; + + ut_ad(!lock->is_table()); + + size = sizeof(lock_t) + lock_rec_get_n_bits(lock) / 8; + + return(static_cast<lock_t*>(mem_heap_dup(heap, lock, size))); +} + +/*********************************************************************//** +Gets the previous record lock set on a record. +@return previous lock on the same record, NULL if none exists */ +const lock_t* +lock_rec_get_prev( +/*==============*/ + const lock_t* in_lock,/*!< in: record lock */ + ulint heap_no)/*!< in: heap number of the record */ +{ + ut_ad(!in_lock->is_table()); + const page_id_t id{in_lock->un_member.rec_lock.page_id}; + hash_cell_t *cell= lock_sys.hash_get(in_lock->type_mode).cell_get(id.fold()); + + for (lock_t *lock= lock_sys_t::get_first(*cell, id); lock != in_lock; + lock= lock_rec_get_next_on_page(lock)) + if (lock_rec_get_nth_bit(lock, heap_no)) + return lock; + + return nullptr; +} + +/*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/ + +/*********************************************************************//** +Checks if a transaction has a GRANTED explicit lock on rec stronger than or +equal to precise_mode.
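+(For example, illustrative only: if trx holds a granted next-key LOCK_X on +the record, then a query such as + lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP, cell, id, heap_no, trx) +returns that lock, since LOCK_X is stronger than LOCK_S and a next-key lock +covers the record itself.)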
+@return lock or NULL */ +UNIV_INLINE +lock_t* +lock_rec_has_expl( +/*==============*/ + ulint precise_mode,/*!< in: LOCK_S or LOCK_X + possibly ORed to LOCK_GAP or + LOCK_REC_NOT_GAP, for a + supremum record we regard this + always as a gap type request */ + const hash_cell_t& cell, /*!< in: lock hash table cell */ + const page_id_t id, /*!< in: page identifier */ + ulint heap_no,/*!< in: heap number of the record */ + const trx_t* trx) /*!< in: transaction */ +{ + ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S + || (precise_mode & LOCK_MODE_MASK) == LOCK_X); + ut_ad(!(precise_mode & LOCK_INSERT_INTENTION)); + + for (lock_t *lock= lock_sys_t::get_first(cell, id, heap_no); lock; + lock= lock_rec_get_next(heap_no, lock)) + if (lock->trx == trx && + !(lock->type_mode & (LOCK_WAIT | LOCK_INSERT_INTENTION)) && + (!((LOCK_REC_NOT_GAP | LOCK_GAP) & lock->type_mode) || + heap_no == PAGE_HEAP_NO_SUPREMUM || + ((LOCK_REC_NOT_GAP | LOCK_GAP) & precise_mode & lock->type_mode)) && + lock_mode_stronger_or_eq(lock->mode(), static_cast<lock_mode> + (precise_mode & LOCK_MODE_MASK))) + return lock; + + return nullptr; +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Checks if some other transaction has a lock request in the queue. +@return lock or NULL */ +static +lock_t* +lock_rec_other_has_expl_req( +/*========================*/ + lock_mode mode, /*!< in: LOCK_S or LOCK_X */ + const hash_cell_t& cell, /*!< in: lock hash table cell */ + const page_id_t id, /*!< in: page identifier */ + bool wait, /*!< in: whether also waiting locks + are taken into account */ + ulint heap_no,/*!< in: heap number of the record */ + const trx_t* trx) /*!< in: transaction, or NULL if + requests by all transactions + are taken into account */ +{ + ut_ad(mode == LOCK_X || mode == LOCK_S); + + /* Only GAP lock can be on SUPREMUM, and we are not looking for + GAP lock */ + if (heap_no == PAGE_HEAP_NO_SUPREMUM) { + return(NULL); + } + + for (lock_t* lock = lock_sys_t::get_first(cell, id, heap_no); + lock; lock = lock_rec_get_next(heap_no, lock)) { + if (lock->trx != trx + && !lock->is_gap() + && (!lock->is_waiting() || wait) + && lock_mode_stronger_or_eq(lock->mode(), mode)) { + + return(lock); + } + } + + return(NULL); +} +#endif /* UNIV_DEBUG */ + +#ifdef WITH_WSREP +void lock_wait_wsrep_kill(trx_t *bf_trx, ulong thd_id, trx_id_t trx_id); + +#ifdef UNIV_DEBUG +void wsrep_report_error(const lock_t* victim_lock, const trx_t *bf_trx) +{ + // We have a conflicting BF-BF case; these threads + // should not execute concurrently + mtr_t mtr; + WSREP_ERROR("BF request is not compatible with victim"); + WSREP_ERROR("BF requesting lock: "); + lock_rec_print(stderr, bf_trx->lock.wait_lock, mtr); + WSREP_ERROR("victim holding lock: "); + lock_rec_print(stderr, victim_lock, mtr); + wsrep_assert_no_bf_bf_wait(victim_lock, bf_trx); +} +#endif /* UNIV_DEBUG */ + +/** Kill the holders of conflicting locks.
+@param trx brute-force applier transaction running in the current thread */ +ATTRIBUTE_COLD ATTRIBUTE_NOINLINE +static void lock_wait_wsrep(trx_t *trx) +{ + DBUG_ASSERT(wsrep_on(trx->mysql_thd)); + if (!wsrep_thd_is_BF(trx->mysql_thd, false)) + return; + + std::set<trx_t*> victims; + + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + + const lock_t *wait_lock= trx->lock.wait_lock; + if (!wait_lock) + { +func_exit: + lock_sys.wr_unlock(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + return; + } + + if (wait_lock->is_table()) + { + dict_table_t *table= wait_lock->un_member.tab_lock.table; + for (lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock; + lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) + { + /* if victim has also BF status, but has earlier seqno, we have to wait */ + if (lock->trx != trx && + !(wsrep_thd_is_BF(lock->trx->mysql_thd, false) && + wsrep_thd_order_before(lock->trx->mysql_thd, trx->mysql_thd))) + { + if (wsrep_thd_is_BF(lock->trx->mysql_thd, false)) + { + // There is no need to kill victim with compatible lock + if (!lock_has_to_wait(trx->lock.wait_lock, lock)) + continue; + +#ifdef UNIV_DEBUG + wsrep_report_error(lock, trx); +#endif + } + + victims.emplace(lock->trx); + } + } + } + else + { + const page_id_t id{wait_lock->un_member.rec_lock.page_id}; + hash_cell_t &cell= *(wait_lock->type_mode & LOCK_PREDICATE + ? lock_sys.prdt_hash : lock_sys.rec_hash).cell_get + (id.fold()); + if (lock_t *lock= lock_sys_t::get_first(cell, id)) + { + const ulint heap_no= lock_rec_find_set_bit(wait_lock); + if (!lock_rec_get_nth_bit(lock, heap_no)) + lock= lock_rec_get_next(heap_no, lock); + do + { + /* if victim has also BF status, but has earlier seqno, we have to wait */ + if (lock->trx != trx && + !(wsrep_thd_is_BF(lock->trx->mysql_thd, false) && + wsrep_thd_order_before(lock->trx->mysql_thd, trx->mysql_thd))) + { + if (wsrep_thd_is_BF(lock->trx->mysql_thd, false)) + { + // There is no need to kill victim with compatible lock + if (!lock_has_to_wait(trx->lock.wait_lock, lock)) + continue; + +#ifdef UNIV_DEBUG + wsrep_report_error(lock, trx); +#endif + } + + victims.emplace(lock->trx); + } + } while ((lock= lock_rec_get_next(heap_no, lock))); + } + } + + if (victims.empty()) + goto func_exit; + + std::vector<std::pair<ulong, trx_id_t>> victim_id; + for (trx_t *v : victims) + victim_id.emplace_back(std::pair<ulong, trx_id_t> + {thd_get_thread_id(v->mysql_thd), v->id}); + + DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort", + { + const char act[]= + "now SIGNAL sync.before_wsrep_thd_abort_reached " + "WAIT_FOR signal.before_wsrep_thd_abort"; + DBUG_ASSERT(!debug_sync_set_action(trx->mysql_thd, + STRING_WITH_LEN(act))); + };); + + lock_sys.wr_unlock(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + + for (const auto &v : victim_id) + lock_wait_wsrep_kill(trx, v.first, v.second); +} +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Checks if some other transaction has a conflicting explicit lock request +in the queue, so that we have to wait.
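+(For example, illustrative only: if another transaction holds a granted +LOCK_X | LOCK_REC_NOT_GAP on heap_no and we pass mode == LOCK_S | +LOCK_REC_NOT_GAP, this function returns that lock, and the caller then +enqueues a waiting request via lock_rec_enqueue_waiting().)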
+@param[in] mode LOCK_S or LOCK_X, possibly ORed to LOCK_GAP or LOCK_REC_NOT_GAP, +LOCK_INSERT_INTENTION +@param[in] cell lock hash table cell +@param[in] id page identifier +@param[in] heap_no heap number of the record +@param[in] trx our transaction +@return the first conflicting lock found, or NULL if there is none */ +static lock_t *lock_rec_other_has_conflicting(unsigned mode, + const hash_cell_t &cell, + const page_id_t id, + ulint heap_no, const trx_t *trx) +{ + bool is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM); + + for (lock_t* lock = lock_sys_t::get_first(cell, id, heap_no); + lock; lock = lock_rec_get_next(heap_no, lock)) { + if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) { + return(lock); + } + } + + return(NULL); +} + +/*********************************************************************//** +Checks if some transaction has an implicit x-lock on a record in a secondary +index. +@return the transaction which has the x-lock, or NULL; +NOTE that this function can return false positives but never false +negatives. The caller must confirm all positive results by calling +trx_is_active(). */ +static +trx_t* +lock_sec_rec_some_has_impl( +/*=======================*/ + trx_t* caller_trx,/*!id == 0 in a locking read + if caller_trx has not modified any persistent tables. */ + if (!trx_sys.find_same_or_older(caller_trx, max_trx_id) || + !lock_check_trx_id_sanity(max_trx_id, rec, index, offsets)) + return nullptr; + + /* We checked above that some active (or XA PREPARE) transaction exists + that is older than PAGE_MAX_TRX_ID. That is, some transaction may be + holding an implicit lock on the record. We have to look up the + clustered index record to find if it is (or was) the case. */ + return row_vers_impl_x_locked(caller_trx, rec, index, offsets); +} + +/*********************************************************************//** +Return the number of table locks for a transaction. +The caller must be holding lock_sys.latch. */ +ulint +lock_number_of_tables_locked( +/*=========================*/ + const trx_lock_t* trx_lock) /*!< in: transaction locks */ +{ + const lock_t* lock; + ulint n_tables = 0; + + lock_sys.assert_locked(); + + for (lock = UT_LIST_GET_FIRST(trx_lock->trx_locks); + lock != NULL; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + + if (lock->is_table()) { + n_tables++; + } + } + + return(n_tables); +} + +/*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/ + +/** Reset the wait status of a lock. +@param[in,out] lock lock that was possibly being waited for */ +static void lock_reset_lock_and_trx_wait(lock_t *lock) +{ + lock_sys.assert_locked(*lock); + mysql_mutex_assert_owner(&lock_sys.wait_mutex); + trx_t *trx= lock->trx; + ut_ad(lock->is_waiting()); + ut_ad(!trx->lock.wait_lock || trx->lock.wait_lock == lock); + if (trx_t *wait_trx= trx->lock.wait_trx) + Deadlock::to_check.erase(wait_trx); + trx->lock.wait_lock= nullptr; + trx->lock.wait_trx= nullptr; + lock->type_mode&= ~LOCK_WAIT; +} + +#ifdef UNIV_DEBUG +/** Check transaction state */ +static void check_trx_state(const trx_t *trx) +{ + ut_ad(!trx->auto_commit || trx->will_lock); + const auto state= trx->state; + ut_ad(state == TRX_STATE_ACTIVE || + state == TRX_STATE_PREPARED_RECOVERED || + state == TRX_STATE_PREPARED || + state == TRX_STATE_COMMITTED_IN_MEMORY); +} +#endif + +/** Create a new record lock and insert it into the lock queue, +without checking for deadlocks or conflicts.
+@param[in] c_lock conflicting lock +@param[in] type_mode lock mode and wait flag +@param[in] page_id index page number +@param[in] page R-tree index page, or NULL +@param[in] heap_no record heap number in the index page +@param[in] index the index tree +@param[in,out] trx transaction +@param[in] holds_trx_mutex whether the caller holds trx->mutex +@return created lock */ +lock_t* +lock_rec_create_low( + lock_t* c_lock, + unsigned type_mode, + const page_id_t page_id, + const page_t* page, + ulint heap_no, + dict_index_t* index, + trx_t* trx, + bool holds_trx_mutex) +{ + lock_t* lock; + ulint n_bytes; + + ut_d(lock_sys.hash_get(type_mode).assert_locked(page_id)); + ut_ad(xtest() || holds_trx_mutex == trx->mutex_is_owner()); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + ut_ad(!(type_mode & LOCK_TABLE)); + ut_ad(trx->state != TRX_STATE_NOT_STARTED); + ut_ad(!trx->is_autocommit_non_locking()); + + /* If rec is the supremum record, then we reset the gap and + LOCK_REC_NOT_GAP bits, as all locks on the supremum are + automatically of the gap type */ + + if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); + type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP); + } + + if (UNIV_LIKELY(!(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)))) { + n_bytes = (page_dir_get_n_heap(page) + 7) / 8; + } else { + ut_ad(heap_no == PRDT_HEAPNO); + + /* The lock is always on PAGE_HEAP_NO_INFIMUM (0), so + we only need 1 bit (which rounds up to 1 byte) for + lock bit setting */ + n_bytes = 1; + + if (type_mode & LOCK_PREDICATE) { + ulint tmp = UNIV_WORD_SIZE - 1; + + /* We will attach the predicate structure after the + lock. Make sure the memory is aligned on 8 bytes, + the mem_heap_alloc will align it with + MEM_SPACE_NEEDED anyway.
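+ Worked example (assuming a 64-bit build, so UNIV_WORD_SIZE == 8 + and tmp == 7): with n_bytes == 1 the expression below computes + (1 + sizeof(lock_prdt_t) + 7) & ~7, which equals + sizeof(lock_prdt_t) + 8 whenever sizeof(lock_prdt_t) is a + multiple of 8; the assertion that follows checks exactly this.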
*/ + n_bytes = (n_bytes + sizeof(lock_prdt_t) + tmp) & ~tmp; + ut_ad(n_bytes == sizeof(lock_prdt_t) + UNIV_WORD_SIZE); + } + } + + if (!holds_trx_mutex) { + trx->mutex_lock(); + } + ut_ad(trx->mutex_is_owner()); + ut_ad(trx->state != TRX_STATE_NOT_STARTED); + + if (trx->lock.rec_cached >= UT_ARR_SIZE(trx->lock.rec_pool) + || sizeof *lock + n_bytes > sizeof *trx->lock.rec_pool) { + lock = static_cast<lock_t*>( + mem_heap_alloc(trx->lock.lock_heap, + sizeof *lock + n_bytes)); + } else { + lock = &trx->lock.rec_pool[trx->lock.rec_cached++].lock; + } + + lock->trx = trx; + lock->type_mode = type_mode; + lock->index = index; + lock->un_member.rec_lock.page_id = page_id; + + if (UNIV_LIKELY(!(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)))) { + lock->un_member.rec_lock.n_bits = uint32_t(n_bytes * 8); + } else { + /* Predicate lock always on INFIMUM (0) */ + lock->un_member.rec_lock.n_bits = 8; + } + lock_rec_bitmap_reset(lock); + lock_rec_set_nth_bit(lock, heap_no); + index->table->n_rec_locks++; + ut_ad(index->table->get_ref_count() || !index->table->can_be_evicted); + + const auto lock_hash = &lock_sys.hash_get(type_mode); + lock_hash->cell_get(page_id.fold())->append(*lock, &lock_t::hash); + + if (type_mode & LOCK_WAIT) { + if (trx->lock.wait_trx) { + ut_ad(!c_lock || trx->lock.wait_trx == c_lock->trx); + ut_ad(trx->lock.wait_lock); + ut_ad((*trx->lock.wait_lock).trx == trx); + } else { + ut_ad(c_lock); + trx->lock.wait_trx = c_lock->trx; + ut_ad(!trx->lock.wait_lock); + } + trx->lock.wait_lock = lock; + } + UT_LIST_ADD_LAST(trx->lock.trx_locks, lock); + if (!holds_trx_mutex) { + trx->mutex_unlock(); + } + MONITOR_INC(MONITOR_RECLOCK_CREATED); + MONITOR_INC(MONITOR_NUM_RECLOCK); + + return lock; +} + +/** Enqueue a waiting request for a lock which cannot be granted immediately. +Check for deadlocks. +@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X) + possibly ORed with LOCK_GAP or + LOCK_REC_NOT_GAP, ORed with + LOCK_INSERT_INTENTION if this + waiting lock request is set + when performing an insert of + an index record +@param[in] id page identifier +@param[in] page leaf page in the index +@param[in] heap_no record heap number in the block +@param[in] index index tree +@param[in,out] thr query thread +@param[in] prdt minimum bounding box (spatial index) +@retval DB_LOCK_WAIT if the waiting lock was enqueued +@retval DB_DEADLOCK if this transaction was chosen as the victim */ +dberr_t +lock_rec_enqueue_waiting( + lock_t* c_lock, + unsigned type_mode, + const page_id_t id, + const page_t* page, + ulint heap_no, + dict_index_t* index, + que_thr_t* thr, + lock_prdt_t* prdt) +{ + ut_d(lock_sys.hash_get(type_mode).assert_locked(id)); + ut_ad(!srv_read_only_mode); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + + trx_t* trx = thr_get_trx(thr); + ut_ad(xtest() || trx->mutex_is_owner()); + ut_ad(!trx->dict_operation_lock_mode); + /* Apart from Galera, only transactions that have a waiting lock can be + chosen as a deadlock victim. Only one lock can be waited for at a time, + and a transaction is associated with a single thread. That is why there + must not be waiting lock requests if the transaction is a deadlock victim + and it is not WSREP. Galera transaction abort can be invoked from MDL + acquisition code when the transaction does not have a waiting record + lock; that's why we check only the deadlock victim bit here.
*/ + ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1)); + + if (trx->mysql_thd && thd_lock_wait_timeout(trx->mysql_thd) == 0) { + trx->error_state = DB_LOCK_WAIT_TIMEOUT; + return DB_LOCK_WAIT_TIMEOUT; + } + + /* Enqueue the lock request that will wait to be granted, note that + we already own the trx mutex. */ + lock_t* lock = lock_rec_create_low( + c_lock, + type_mode | LOCK_WAIT, id, page, heap_no, index, trx, true); + + if (prdt && type_mode & LOCK_PREDICATE) { + lock_prdt_set_prdt(lock, prdt); + } + + trx->lock.wait_thr = thr; + + DBUG_LOG("ib_lock", "trx " << ib::hex(trx->id) + << " waits for lock in index " << index->name + << " of table " << index->table->name); + + MONITOR_INC(MONITOR_LOCKREC_WAIT); + + return DB_LOCK_WAIT; +} + +/*********************************************************************//** +Looks for a suitable record lock struct of the same type by the same trx on +the same page. This can be used to save space when a new record lock should +be set on a page: no new struct is needed if a suitable old one is found. +@return lock or NULL */ +static inline +lock_t* +lock_rec_find_similar_on_page( + ulint type_mode, /*!< in: lock type_mode field */ + ulint heap_no, /*!< in: heap number of the record */ + lock_t* lock, /*!< in: lock_sys.get_first() */ + const trx_t* trx) /*!< in: transaction */ +{ + lock_sys.rec_hash.assert_locked(lock->un_member.rec_lock.page_id); + + for (/* No op */; + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { + + if (lock->trx == trx + && lock->type_mode == type_mode + && lock_rec_get_n_bits(lock) > heap_no) { + + return(lock); + } + } + + return(NULL); +} + +/*********************************************************************//** +Adds a record lock request to the record queue. The request is normally +added as the last in the queue, but if there are no waiting lock requests +on the record, and the request to be added is not a waiting request, we +can reuse a suitable record lock object already existing on the same page, +just setting the appropriate bit in its bitmap. This is a low-level function +which does NOT check for deadlocks or lock compatibility! +@param[in] type_mode lock mode, wait, gap etc. flags +@param[in,out] cell first hash table cell +@param[in] id page identifier +@param[in] page buffer block containing the record +@param[in] heap_no heap number of the record +@param[in] index index of record +@param[in,out] trx transaction +@param[in] caller_owns_trx_mutex TRUE if caller owns the transaction mutex */ +TRANSACTIONAL_TARGET +static void lock_rec_add_to_queue(unsigned type_mode, const hash_cell_t &cell, + const page_id_t id, const page_t *page, + ulint heap_no, dict_index_t *index, + trx_t *trx, bool caller_owns_trx_mutex) +{ + ut_d(lock_sys.hash_get(type_mode).assert_locked(id)); + ut_ad(xtest() || caller_owns_trx_mutex == trx->mutex_is_owner()); + ut_ad(index->is_primary() + || dict_index_get_online_status(index) != ONLINE_INDEX_CREATION); + ut_ad(!(type_mode & LOCK_TABLE)); +#ifdef UNIV_DEBUG + switch (type_mode & LOCK_MODE_MASK) { + case LOCK_X: + case LOCK_S: + break; + default: + ut_error; + } + + if (!(type_mode & (LOCK_WAIT | LOCK_GAP))) { + lock_mode mode = (type_mode & LOCK_MODE_MASK) == LOCK_S + ? LOCK_X + : LOCK_S; + const lock_t* other_lock + = lock_rec_other_has_expl_req( + mode, cell, id, false, heap_no, trx); +#ifdef WITH_WSREP + if (UNIV_LIKELY_NULL(other_lock) && trx->is_wsrep()) { + /* Only a BF transaction may be granted a lock + ahead of other conflicting lock requests.
*/ + if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE) + && !wsrep_thd_is_BF(other_lock->trx->mysql_thd, FALSE)) { + /* If it is not BF, this case is a bug. */ + wsrep_report_bf_lock_wait(trx->mysql_thd, trx->id); + wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id); + ut_error; + } + } else +#endif /* WITH_WSREP */ + ut_ad(!other_lock); + } +#endif /* UNIV_DEBUG */ + + /* If rec is the supremum record, then we can reset the gap bit, as + all locks on the supremum are automatically of the gap type, and we + try to avoid unnecessary memory consumption of a new record lock + struct for a gap type lock */ + + if (heap_no == PAGE_HEAP_NO_SUPREMUM) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); + + /* There should never be LOCK_REC_NOT_GAP on a supremum + record, but let us play safe */ + + type_mode &= ~(LOCK_GAP | LOCK_REC_NOT_GAP); + } + + if (type_mode & LOCK_WAIT) { + goto create; + } else if (lock_t *first_lock = lock_sys_t::get_first(cell, id)) { + for (lock_t* lock = first_lock;;) { + if (lock->is_waiting() + && lock_rec_get_nth_bit(lock, heap_no)) { + goto create; + } + if (!(lock = lock_rec_get_next_on_page(lock))) { + break; + } + } + + /* Look for a similar record lock on the same page: + if one is found and there are no waiting lock requests, + we can just set the bit */ + if (lock_t* lock = lock_rec_find_similar_on_page( + type_mode, heap_no, first_lock, trx)) { + trx_t* lock_trx = lock->trx; + if (caller_owns_trx_mutex) { + trx->mutex_unlock(); + } + { + TMTrxGuard tg{*lock_trx}; + lock_rec_set_nth_bit(lock, heap_no); + } + + if (caller_owns_trx_mutex) { + trx->mutex_lock(); + } + return; + } + } + +create: + /* Note: We will not pass any conflicting lock to lock_rec_create(), + because we should be moving an existing waiting lock request. */ + ut_ad(!(type_mode & LOCK_WAIT) || trx->lock.wait_trx); + + lock_rec_create_low(nullptr, + type_mode, id, page, heap_no, index, trx, + caller_owns_trx_mutex); +} + +/** A helper function for lock_rec_lock(), which grants a Next Key Lock +(either LOCK_X or LOCK_S as specified by `mode`) on <`block`,`heap_no`> in the +`index` to the `trx`, assuming that it already has a granted `held_lock`, which +is at least as strong as mode|LOCK_REC_NOT_GAP. It does so by either reusing the +lock if it already covers the gap, or by ensuring a separate GAP Lock, which in +combination with the Record Lock satisfies the request. +@param[in] held_lock a lock granted to `trx` which is at least as strong + as mode|LOCK_REC_NOT_GAP +@param[in] mode requested lock mode: LOCK_X or LOCK_S +@param[in] cell lock hash table cell +@param[in] id page identifier +@param[in] page buffer block containing the record +@param[in] heap_no heap number of the record to be locked +@param[in] index index of record to be locked +@param[in] trx the transaction requesting the Next Key Lock */ +static void lock_reuse_for_next_key_lock(const lock_t *held_lock, + unsigned mode, + const hash_cell_t &cell, + const page_id_t id, + const page_t *page, ulint heap_no, + dict_index_t *index, trx_t *trx) +{ + ut_ad(trx->mutex_is_owner()); + ut_ad(mode == LOCK_S || mode == LOCK_X); + ut_ad(lock_mode_is_next_key_lock(mode)); + + if (!held_lock->is_record_not_gap()) + { + ut_ad(held_lock->is_next_key_lock()); + return; + } + + /* We have a Record Lock granted, so we only need a GAP Lock. We assume + that GAP Locks do not conflict with anything.
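+ (In lock_rec_has_to_wait() terms, illustratively: any request with + LOCK_GAP set and LOCK_INSERT_INTENTION clear is granted without waiting, + regardless of what else is queued on the gap.)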
Therefore a GAP Lock + could be granted to us right now had we requested it: */ + mode|= LOCK_GAP; + ut_ad(nullptr == + lock_rec_other_has_conflicting(mode, cell, id, heap_no, trx)); + + /* It might be the case we already have one, so we first check that. */ + if (lock_rec_has_expl(mode, cell, id, heap_no, trx) == nullptr) + lock_rec_add_to_queue(mode, cell, id, page, heap_no, index, trx, true); +} + + +/*********************************************************************//** +Tries to lock the specified record in the mode requested. If not immediately +possible, enqueues a waiting lock request. This is a low-level function +which does NOT look at implicit locks! Checks lock compatibility within +explicit locks. This function sets a normal next-key lock, or in the case +of a page supremum record, a gap type lock. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */ +static +dberr_t +lock_rec_lock( +/*==========*/ + bool impl, /*!< in: if true, no lock is set + if no wait is necessary: we + assume that the caller will + set an implicit lock */ + unsigned mode, /*!< in: lock mode: LOCK_X or + LOCK_S possibly ORed to either + LOCK_GAP or LOCK_REC_NOT_GAP */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of record */ + dict_index_t* index, /*!< in: index of record */ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t *trx= thr_get_trx(thr); + /* There must not be lock requests for reads or updates if the transaction + was chosen as a deadlock victim. Apart from Galera, only transactions that + have a waiting lock may be chosen as deadlock victims. Only one lock can be + waited for at a time, and a transaction is associated with a single thread. + A Galera transaction abort can be invoked from MDL acquisition code when the + transaction does not have a waiting lock; that is why we check only the + deadlock victim bit here. */ + ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1)); + ut_ad(!srv_read_only_mode); + ut_ad(((LOCK_MODE_MASK | LOCK_TABLE) & mode) == LOCK_S || + ((LOCK_MODE_MASK | LOCK_TABLE) & mode) == LOCK_X); + ut_ad(~mode & (LOCK_GAP | LOCK_REC_NOT_GAP)); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + DBUG_EXECUTE_IF("innodb_report_deadlock", return DB_DEADLOCK;); +#ifdef ENABLED_DEBUG_SYNC + if (trx->mysql_thd) + DEBUG_SYNC_C("lock_rec"); +#endif + + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S || + lock_table_has(trx, index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X || + lock_table_has(trx, index->table, LOCK_IX)); + + if (lock_table_has(trx, index->table, + static_cast<lock_mode>(LOCK_MODE_MASK & mode))) + return DB_SUCCESS; + + /* During CREATE TABLE, we will write to newly created FTS_*_CONFIG + on which no lock has been created yet. */ + ut_ad(!trx->dict_operation_lock_mode || + (strstr(index->table->name.m_name, "/FTS_") && + strstr(index->table->name.m_name, "_CONFIG") + sizeof("_CONFIG") == + index->table->name.m_name + strlen(index->table->name.m_name) + 1)); + MONITOR_ATOMIC_INC(MONITOR_NUM_RECLOCK_REQ); + const page_id_t id{block->page.id()}; + LockGuard g{lock_sys.rec_hash, id}; + + if (lock_t *lock= lock_sys_t::get_first(g.cell(), id)) + { + dberr_t err= DB_SUCCESS; + trx->mutex_lock(); + if (lock_rec_get_next_on_page(lock) || + lock->trx != trx || + lock->type_mode != mode || + lock_rec_get_n_bits(lock) <= heap_no) + { + + unsigned checked_mode= (heap_no != PAGE_HEAP_NO_SUPREMUM && + lock_mode_is_next_key_lock(mode)) + ?
mode | LOCK_REC_NOT_GAP + : mode; + + const lock_t *held_lock= + lock_rec_has_expl(checked_mode, g.cell(), id, heap_no, trx); + + /* Do nothing if the trx already has a strong enough lock on rec */ + if (!held_lock) + { + if (lock_t *c_lock= lock_rec_other_has_conflicting(mode, g.cell(), id, + heap_no, trx)) + /* + If another transaction has a non-gap conflicting + request in the queue, as this transaction does not + have a lock strong enough already granted on the + record, we have to wait. + */ + err= lock_rec_enqueue_waiting(c_lock, mode, id, block->page.frame, + heap_no, index, thr, nullptr); + else if (!impl) + { + /* Set the requested lock on the record. */ + lock_rec_add_to_queue(mode, g.cell(), id, block->page.frame, heap_no, + index, trx, true); + err= DB_SUCCESS_LOCKED_REC; + } + } + /* If checked_mode == mode, trx already has a strong enough lock on rec */ + else if (checked_mode != mode) + { + /* As checked_mode != mode, the mode is a Next Key Lock, which cannot be + emulated by an implicit lock (implicit locks are LOCK_REC_NOT_GAP only). */ + ut_ad(!impl); + + lock_reuse_for_next_key_lock(held_lock, mode, g.cell(), id, + block->page.frame, heap_no, index, trx); + } + } + else if (!impl) + { + /* + If the nth bit of the record lock is already set then we do not set + a new lock bit, otherwise we set it. + */ + if (!lock_rec_get_nth_bit(lock, heap_no)) + { + lock_rec_set_nth_bit(lock, heap_no); + err= DB_SUCCESS_LOCKED_REC; + } + } + trx->mutex_unlock(); + return err; + } + + /* Simplified and faster path for the most common cases */ + if (!impl) + lock_rec_create_low(nullptr, mode, id, block->page.frame, heap_no, index, + trx, false); + + return DB_SUCCESS_LOCKED_REC; +} + +/*********************************************************************//** +Checks if a waiting record lock request still has to wait in a queue. +@return lock that is causing the wait */ +static +const lock_t* +lock_rec_has_to_wait_in_queue(const hash_cell_t &cell, const lock_t *wait_lock) +{ + const lock_t* lock; + ulint heap_no; + ulint bit_mask; + ulint bit_offset; + + ut_ad(wait_lock->is_waiting()); + ut_ad(!wait_lock->is_table()); + + heap_no = lock_rec_find_set_bit(wait_lock); + + bit_offset = heap_no / 8; + bit_mask = static_cast<ulint>(1) << (heap_no % 8); + + for (lock = lock_sys_t::get_first( + cell, wait_lock->un_member.rec_lock.page_id); + lock != wait_lock; + lock = lock_rec_get_next_on_page_const(lock)) { + const byte* p = (const byte*) &lock[1]; + + if (heap_no < lock_rec_get_n_bits(lock) + && (p[bit_offset] & bit_mask) + && lock_has_to_wait(wait_lock, lock)) { +#ifdef WITH_WSREP + if (lock->trx->is_wsrep() && + wsrep_thd_order_before(wait_lock->trx->mysql_thd, + lock->trx->mysql_thd)) { + /* don't wait for another BF lock */ + continue; + } +#endif + return(lock); + } + } + + return(NULL); +} + +/** Note that a record lock wait started */ +inline void lock_sys_t::wait_start() +{ + mysql_mutex_assert_owner(&wait_mutex); + wait_count+= WAIT_COUNT_STEP + 1; + /* The maximum number of concurrently waiting transactions is one less + than the maximum number of concurrent transactions.
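+ The single wait_count word packs two counters: the number of pending waits, incremented by 1 here and decremented in wait_resume(), and the cumulative number of waits, incremented in multiples of WAIT_COUNT_STEP; see get_wait_pending() and get_wait_cumulative().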
*/ + static_assert(WAIT_COUNT_STEP == UNIV_PAGE_SIZE_MAX / 16 * TRX_SYS_N_RSEGS, + "compatibility"); +} + +/** Note that a record lock wait resumed */ +inline +void lock_sys_t::wait_resume(THD *thd, my_hrtime_t start, my_hrtime_t now) +{ + mysql_mutex_assert_owner(&wait_mutex); + ut_ad(get_wait_pending()); + ut_ad(get_wait_cumulative()); + wait_count--; + if (now.val >= start.val) + { + const uint64_t diff_time= + static_cast<uint64_t>((now.val - start.val) / 1000); + wait_time+= diff_time; + + if (diff_time > wait_time_max) + wait_time_max= diff_time; + + thd_storage_lock_wait(thd, diff_time); + } +} + +#ifdef HAVE_REPLICATION +ATTRIBUTE_NOINLINE MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Report lock waits to parallel replication. Sets +trx->error_state= DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was +set when lock_sys.wait_mutex was unlocked. +@param trx transaction that may be waiting for a lock +@return lock being waited for (may have been replaced by an equivalent one) +@retval nullptr if no lock is being waited for */ +static lock_t *lock_wait_rpl_report(trx_t *trx) +{ + mysql_mutex_assert_owner(&lock_sys.wait_mutex); + ut_ad(trx->state == TRX_STATE_ACTIVE); + THD *const thd= trx->mysql_thd; + ut_ad(thd); + lock_t *wait_lock= trx->lock.wait_lock; + if (!wait_lock) + return nullptr; + /* This would likely be too large to attempt to use a memory transaction, + even for wait_lock->is_table(). */ + const bool nowait= lock_sys.wr_lock_try(); + if (!nowait) + { + mysql_mutex_unlock(&lock_sys.wait_mutex); + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + wait_lock= trx->lock.wait_lock; + if (!wait_lock) + { +func_exit: + lock_sys.wr_unlock(); + /* trx->lock.was_chosen_as_deadlock_victim can be set when + lock_sys.wait_mutex was unlocked, let's check it. */ + if (!nowait && trx->lock.was_chosen_as_deadlock_victim) + trx->error_state= DB_DEADLOCK; + return wait_lock; + } + ut_ad(wait_lock->is_waiting()); + } + else if (!wait_lock->is_waiting()) + { + wait_lock= trx->lock.wait_lock; + if (!wait_lock) + goto func_exit; + if (!wait_lock->is_waiting()) + { + wait_lock= nullptr; + goto func_exit; + } + } + + if (wait_lock->is_table()) + { + dict_table_t *table= wait_lock->un_member.tab_lock.table; + for (lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock; + lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) + if (lock->trx != trx) + thd_rpl_deadlock_check(thd, lock->trx->mysql_thd); + } + else + { + const page_id_t id{wait_lock->un_member.rec_lock.page_id}; + hash_cell_t &cell= *(wait_lock->type_mode & LOCK_PREDICATE + ? lock_sys.prdt_hash : lock_sys.rec_hash).cell_get + (id.fold()); + if (lock_t *lock= lock_sys_t::get_first(cell, id)) + { + const ulint heap_no= lock_rec_find_set_bit(wait_lock); + if (!lock_rec_get_nth_bit(lock, heap_no)) + lock= lock_rec_get_next(heap_no, lock); + do + if (lock->trx->mysql_thd != thd) + thd_rpl_deadlock_check(thd, lock->trx->mysql_thd); + while ((lock= lock_rec_get_next(heap_no, lock))); + } + } + + goto func_exit; +} +#endif /* HAVE_REPLICATION */ + +/** Wait for a lock to be released.
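+The calling thread blocks on trx->lock.cond until the lock is granted, the wait is cancelled, or a timeout or deadlock is detected.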
+@retval DB_DEADLOCK if this transaction was chosen as the deadlock victim +@retval DB_INTERRUPTED if the execution was interrupted by the user +@retval DB_LOCK_WAIT_TIMEOUT if the lock wait timed out +@retval DB_SUCCESS if the lock was granted */ +dberr_t lock_wait(que_thr_t *thr) +{ + trx_t *trx= thr_get_trx(thr); + +#ifdef ENABLED_DEBUG_SYNC + if (trx->mysql_thd) + DEBUG_SYNC_C("lock_wait_start"); + + /* Create the sync point for any quit from the function. */ + SCOPE_EXIT([trx]() { + if (trx->mysql_thd) + DEBUG_SYNC_C("lock_wait_end"); + }); +#endif + + /* InnoDB system transactions may use the global value of + innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */ + const ulong innodb_lock_wait_timeout= trx_lock_wait_timeout_get(trx); + const my_hrtime_t suspend_time= my_hrtime_coarse(); + ut_ad(!trx->dict_operation_lock_mode); + + /* The wait_lock can be cleared by another thread in lock_grant(), + lock_rec_cancel(), lock_cancel_waiting_and_release(), which could be + invoked from the high-level function lock_sys_t::cancel(). + But, a wait can only be initiated by the current thread which owns + the transaction. + + Even if trx->lock.wait_lock were changed, the object that it used to + point to will remain valid memory (it remains allocated from + trx->lock.lock_heap). If trx->lock.wait_lock was set to nullptr, the + original object could be transformed to a granted lock. On a page + split or merge, we would change trx->lock.wait_lock to point to + another waiting lock request object, and the old object would be + logically discarded. + + In any case, it is safe to read the memory that wait_lock points to, + even though we are not holding any mutex. We are only reading + wait_lock->type_mode & (LOCK_TABLE | LOCK_AUTO_INC), which will be + unaffected by any page split or merge operation. (Furthermore, + table lock objects will never be cloned or moved.) */ + lock_t *wait_lock= trx->lock.wait_lock; + + if (!wait_lock) + { + /* The lock has already been released or this transaction + was chosen as a deadlock victim: no need to wait */ + if (trx->lock.was_chosen_as_deadlock_victim) + trx->error_state= DB_DEADLOCK; + else if (trx->error_state == DB_LOCK_WAIT) + trx->error_state= DB_SUCCESS; + return trx->error_state; + } + + /* Because we are not holding exclusive lock_sys.latch, the + wait_lock may be changed by other threads during a page split or + merge in case it is a record lock. + + Because at this point we are not holding lock_sys.wait_mutex either, + another thread may set trx->lock.wait_lock == nullptr at any time. */ + + trx->lock.suspend_time= suspend_time; + + ut_ad(!trx->dict_operation_lock_mode); + + IF_WSREP(if (trx->is_wsrep()) lock_wait_wsrep(trx),); + + const auto type_mode= wait_lock->type_mode; +#ifdef HAVE_REPLICATION + /* Even though lock_wait_rpl_report() has nothing to do with + deadlock detection, it was always disabled by innodb_deadlock_detect=OFF. + We will keep it that way, because unfortunately + thd_need_wait_reports() will hold even if parallel (or any) replication + is not being used. We want to allow the user to skip + lock_wait_rpl_report().
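+ (Hence the innodb_deadlock_detect term in the condition below.)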
*/ + const bool rpl= trx->mysql_thd && innodb_deadlock_detect && + thd_need_wait_reports(trx->mysql_thd); +#endif + const bool row_lock_wait= thr->lock_state == QUE_THR_LOCK_ROW; + timespec abstime; + set_timespec_time_nsec(abstime, suspend_time.val * 1000); + abstime.MY_tv_sec+= innodb_lock_wait_timeout; + /* Dictionary transactions must be immune to lock wait timeouts + for locks on data dictionary tables. Here we check only for + SYS_TABLES, SYS_COLUMNS, SYS_INDEXES, SYS_FIELDS. Locks on further + tables SYS_FOREIGN, SYS_FOREIGN_COLS, SYS_VIRTUAL will only be + acquired while holding an exclusive lock on one of the 4 tables. */ + const bool no_timeout= innodb_lock_wait_timeout >= 100000000 || + ((type_mode & LOCK_TABLE) && + wait_lock->un_member.tab_lock.table->id <= DICT_FIELDS_ID); + thd_wait_begin(trx->mysql_thd, (type_mode & LOCK_TABLE) + ? THD_WAIT_TABLE_LOCK : THD_WAIT_ROW_LOCK); + + mysql_mutex_lock(&lock_sys.wait_mutex); + /* Now that we are holding lock_sys.wait_mutex, we must reload + trx->lock.wait_lock. It cannot be cleared as long as we are holding + lock_sys.wait_mutex, but as long as we do not hold exclusive + lock_sys.latch, a waiting record lock can be replaced with an + equivalent waiting record lock during a page split or merge by + another thread. See lock_sys_t::cancel(). */ + wait_lock= trx->lock.wait_lock; + + if (wait_lock) + { + /* Dictionary transactions must ignore KILL, because they could + be executed as part of a multi-transaction DDL operation, + such as rollback_inplace_alter_table() or ha_innobase::delete_table(). */ + if (!trx->dict_operation && trx_is_interrupted(trx)) + { + /* innobase_kill_query() can only set trx->error_state=DB_INTERRUPTED + for any transaction that is attached to a connection. + + Furthermore, innobase_kill_query() could have been invoked before + this thread entered a lock wait. The thd_kill_level() or thd::killed + is only being checked every now and then. */ + trx->error_state= DB_INTERRUPTED; + goto abort_wait; + } + + wait_lock= Deadlock::check_and_resolve(trx, wait_lock); + + if (wait_lock == reinterpret_cast<lock_t*>(-1)) + { + trx->error_state= DB_DEADLOCK; + goto end_wait; + } + } + else + { + /* trx->lock.was_chosen_as_deadlock_victim can be changed before + lock_sys.wait_mutex is acquired, so let's check it once more.
*/ + if (trx->lock.was_chosen_as_deadlock_victim) + trx->error_state= DB_DEADLOCK; + else if (trx->error_state == DB_LOCK_WAIT) + trx->error_state= DB_SUCCESS; + goto end_wait; + } + if (row_lock_wait) + lock_sys.wait_start(); + +#ifdef HAVE_REPLICATION + if (rpl) + wait_lock= lock_wait_rpl_report(trx); +#endif + + switch (trx->error_state) { + case DB_SUCCESS: + break; + case DB_LOCK_WAIT: + trx->error_state= DB_SUCCESS; + break; + default: +#ifdef UNIV_DEBUG + ut_ad("invalid state" == 0); + break; + case DB_DEADLOCK: + case DB_INTERRUPTED: +#endif + goto end_loop; + } + + while (wait_lock) + { + int err; + ut_ad(trx->lock.wait_lock); + + DEBUG_SYNC_C("lock_wait_before_suspend"); + + if (no_timeout) + { + my_cond_wait(&trx->lock.cond, &lock_sys.wait_mutex.m_mutex); + err= 0; + } + else + err= my_cond_timedwait(&trx->lock.cond, &lock_sys.wait_mutex.m_mutex, + &abstime); + + wait_lock= trx->lock.wait_lock; + + switch (trx->error_state) { + case DB_DEADLOCK: + case DB_INTERRUPTED: + break; +#ifdef UNIV_DEBUG + case DB_LOCK_WAIT_TIMEOUT: + case DB_LOCK_WAIT: + ut_ad("invalid state" == 0); + break; +#endif + default: + /* Dictionary transactions must ignore KILL, because they could + be executed as part of a multi-transaction DDL operation, + such as rollback_inplace_alter_table() or ha_innobase::delete_table(). */ + if (!trx->dict_operation && trx_is_interrupted(trx)) + /* innobase_kill_query() can only set trx->error_state=DB_INTERRUPTED + for any transaction that is attached to a connection. */ + trx->error_state= DB_INTERRUPTED; + else if (!err) + continue; +#ifdef WITH_WSREP + else if (trx->is_wsrep() && wsrep_is_BF_lock_timeout(*trx)); +#endif + else + { + trx->error_state= DB_LOCK_WAIT_TIMEOUT; + lock_sys.timeouts++; + } + } + break; + } + +end_loop: + if (row_lock_wait) + lock_sys.wait_resume(trx->mysql_thd, suspend_time, my_hrtime_coarse()); + + ut_ad(!wait_lock == !trx->lock.wait_lock); + + if (wait_lock) + { + abort_wait: + lock_sys_t::cancel(trx, wait_lock); + lock_sys.deadlock_check(); + } + +end_wait: + mysql_mutex_unlock(&lock_sys.wait_mutex); + DBUG_EXECUTE_IF("small_sleep_after_lock_wait", + { + if (!(type_mode & LOCK_TABLE) && + (type_mode & LOCK_MODE_MASK) == LOCK_X && + trx->error_state != DB_DEADLOCK && !trx_is_interrupted(trx)) { + my_sleep(20000); + } + }); + thd_wait_end(trx->mysql_thd); + +#ifdef UNIV_DEBUG + switch (trx->error_state) { + case DB_SUCCESS: + case DB_DEADLOCK: + case DB_INTERRUPTED: + case DB_LOCK_WAIT_TIMEOUT: + break; + default: + ut_ad("invalid state" == 0); + } +#endif + + return trx->error_state; +} + + +/** Resume a lock wait */ +template <bool from_deadlock= false> +void lock_wait_end(trx_t *trx) +{ + mysql_mutex_assert_owner(&lock_sys.wait_mutex); + ut_ad(trx->mutex_is_owner()); + ut_d(const auto state= trx->state); + ut_ad(state == TRX_STATE_COMMITTED_IN_MEMORY || state == TRX_STATE_ACTIVE || + state == TRX_STATE_PREPARED); + /* lock_wait() checks trx->lock.was_chosen_as_deadlock_victim flag before + requesting lock_sys.wait_mutex, and if the flag is set, it returns an error, + which causes a transaction rollback that can reset trx->lock.wait_thr before + deadlock resolution starts cancelling the victim's waiting lock. That is why + we do not check trx->lock.wait_thr here when the function is called during + deadlock resolution.
*/ + ut_ad(from_deadlock || trx->lock.wait_thr); + + if (trx->lock.was_chosen_as_deadlock_victim) + { + ut_ad(from_deadlock || state == TRX_STATE_ACTIVE); + trx->error_state= DB_DEADLOCK; + } + + trx->lock.wait_thr= nullptr; + pthread_cond_signal(&trx->lock.cond); +} + +/** Grant a waiting lock request and release the waiting transaction. */ +static void lock_grant(lock_t *lock) +{ + lock_reset_lock_and_trx_wait(lock); + trx_t *trx= lock->trx; + trx->mutex_lock(); + if (lock->mode() == LOCK_AUTO_INC) + { + dict_table_t *table= lock->un_member.tab_lock.table; + ut_ad(!table->autoinc_trx); + table->autoinc_trx= trx; + ib_vector_push(trx->autoinc_locks, &lock); + } + + DBUG_PRINT("ib_lock", ("wait for trx " TRX_ID_FMT " ends", trx->id)); + + /* If we are resolving a deadlock by choosing another transaction as + a victim, then our original transaction may not be waiting anymore */ + + if (trx->lock.wait_thr) + lock_wait_end(trx); + + trx->mutex_unlock(); +} + +/*************************************************************//** +Cancels a waiting record lock request and releases the waiting transaction +that requested it. NOTE: does NOT check if waiting lock requests behind this +one can now be granted! */ +static void lock_rec_cancel(lock_t *lock) +{ + trx_t *trx= lock->trx; + mysql_mutex_lock(&lock_sys.wait_mutex); + trx->mutex_lock(); + + ut_d(lock_sys.hash_get(lock->type_mode). + assert_locked(lock->un_member.rec_lock.page_id)); + /* Reset the bit (there can be only one set bit) in the lock bitmap */ + lock_rec_reset_nth_bit(lock, lock_rec_find_set_bit(lock)); + + /* Reset the wait flag and the back pointer to lock in trx */ + lock_reset_lock_and_trx_wait(lock); + + /* The following releases the trx from lock wait */ + lock_wait_end(trx); + mysql_mutex_unlock(&lock_sys.wait_mutex); + trx->mutex_unlock(); +} + +/** Remove a record lock request, waiting or granted, from the queue and +grant locks to other transactions in the queue if they now are entitled +to a lock. NOTE: all record locks contained in in_lock are removed. +@param[in,out] in_lock record lock +@param[in] owns_wait_mutex whether lock_sys.wait_mutex is held */ +static void lock_rec_dequeue_from_page(lock_t *in_lock, bool owns_wait_mutex) +{ +#ifdef SAFE_MUTEX + ut_ad(owns_wait_mutex == mysql_mutex_is_owner(&lock_sys.wait_mutex)); +#endif /* SAFE_MUTEX */ + ut_ad(!in_lock->is_table()); + + const page_id_t page_id{in_lock->un_member.rec_lock.page_id}; + auto& lock_hash = lock_sys.hash_get(in_lock->type_mode); + ut_ad(lock_sys.is_writer() || in_lock->trx->mutex_is_owner()); + + ut_d(auto old_n_locks=) + in_lock->index->table->n_rec_locks--; + ut_ad(old_n_locks); + + const ulint rec_fold = page_id.fold(); + hash_cell_t &cell = *lock_hash.cell_get(rec_fold); + lock_sys.assert_locked(cell); + + HASH_DELETE(lock_t, hash, &lock_hash, rec_fold, in_lock); + ut_ad(lock_sys.is_writer() || in_lock->trx->mutex_is_owner()); + UT_LIST_REMOVE(in_lock->trx->lock.trx_locks, in_lock); + + MONITOR_INC(MONITOR_RECLOCK_REMOVED); + MONITOR_DEC(MONITOR_NUM_RECLOCK); + + bool acquired = false; + + /* Check if waiting locks in the queue can now be granted: + grant locks if there are no conflicting locks ahead. Stop at + the first X lock that is waiting or has been granted. 
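+ For a request that still has to wait, only its wait_trx edge is updated, so that the deadlock checker can see the new dependency.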
*/ + + for (lock_t* lock = lock_sys_t::get_first(cell, page_id); + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { + + if (!lock->is_waiting()) { + continue; + } + + if (!owns_wait_mutex) { + mysql_mutex_lock(&lock_sys.wait_mutex); + acquired = owns_wait_mutex = true; + } + + ut_ad(lock->trx->lock.wait_trx); + ut_ad(lock->trx->lock.wait_lock); + + if (const lock_t* c = lock_rec_has_to_wait_in_queue( + cell, lock)) { + trx_t* c_trx = c->trx; + lock->trx->lock.wait_trx = c_trx; + if (c_trx->lock.wait_trx + && innodb_deadlock_detect + && Deadlock::to_check.emplace(c_trx).second) { + Deadlock::to_be_checked = true; + } + } else { + /* Grant the lock */ + ut_ad(lock->trx != in_lock->trx); + lock_grant(lock); + } + } + + if (acquired) { + mysql_mutex_unlock(&lock_sys.wait_mutex); + } +} + +/** Remove a record lock request, waiting or granted, on a discarded page +@param lock_hash hash table +@param in_lock lock object */ +TRANSACTIONAL_TARGET +void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock) +{ + ut_ad(!in_lock->is_table()); + lock_hash.assert_locked(in_lock->un_member.rec_lock.page_id); + + HASH_DELETE(lock_t, hash, &lock_hash, + in_lock->un_member.rec_lock.page_id.fold(), in_lock); + ut_d(uint32_t old_locks); + { + trx_t *trx= in_lock->trx; + TMTrxGuard tg{*trx}; + ut_d(old_locks=) + in_lock->index->table->n_rec_locks--; + UT_LIST_REMOVE(trx->lock.trx_locks, in_lock); + } + ut_ad(old_locks); + MONITOR_INC(MONITOR_RECLOCK_REMOVED); + MONITOR_DEC(MONITOR_NUM_RECLOCK); +} + +/*************************************************************//** +Removes record lock objects set on an index page which is discarded. This +function neither moves locks nor checks for waiting locks; therefore the +lock bitmaps must already be reset when this function is called. */ +static void +lock_rec_free_all_from_discard_page(page_id_t id, const hash_cell_t &cell, + lock_sys_t::hash_table &lock_hash) +{ + for (lock_t *lock= lock_sys_t::get_first(cell, id); lock; ) + { + ut_ad(&lock_hash != &lock_sys.rec_hash || + lock_rec_find_set_bit(lock) == ULINT_UNDEFINED); + ut_ad(!lock->is_waiting()); + lock_t *next_lock= lock_rec_get_next_on_page(lock); + lock_rec_discard(lock_hash, lock); + lock= next_lock; + } +} + +/** Discard locks for an index when purging DELETE FROM SYS_INDEXES +after an aborted CREATE INDEX operation. +@param index a stale index on which ADD INDEX operation was aborted */ +ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index) +{ + ut_ad(!index.is_committed()); + /* This is very rarely executed code, and the size of the hash array + would exceed the maximum size of a memory transaction. */ + LockMutexGuard g{SRW_LOCK_CALL}; + const ulint n= lock_sys.rec_hash.pad(lock_sys.rec_hash.n_cells); + for (ulint i= 0; i < n; i++) + { + for (lock_t *lock= static_cast<lock_t*>(lock_sys.rec_hash.array[i].node); + lock; ) + { + ut_ad(!lock->is_table()); + if (lock->index == &index) + { + ut_ad(!lock->is_waiting()); + lock_rec_discard(lock_sys.rec_hash, lock); + lock= static_cast<lock_t*>(lock_sys.rec_hash.array[i].node); + } + else + lock= lock->hash; + } + } +} + +/*============= RECORD LOCK MOVING AND INHERITING ===================*/ + +/*************************************************************//** +Resets the lock bits for a single record. Releases transactions waiting for +lock requests here.
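+Waiting requests are cancelled via lock_rec_cancel(); for granted locks only the bit for heap_no is cleared.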
*/ +TRANSACTIONAL_TARGET +static +void +lock_rec_reset_and_release_wait(const hash_cell_t &cell, const page_id_t id, + ulint heap_no) +{ + for (lock_t *lock= lock_sys.get_first(cell, id, heap_no); lock; + lock= lock_rec_get_next(heap_no, lock)) + { + if (lock->is_waiting()) + lock_rec_cancel(lock); + else + { + TMTrxGuard tg{*lock->trx}; + lock_rec_reset_nth_bit(lock, heap_no); + } + } +} + +/** Makes a record inherit the locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of +the other record. Also waiting lock requests on rec are inherited as +GRANTED gap locks. +@param heir_cell heir hash table cell +@param heir page containing the record which inherits +@param donor_cell donor hash table cell +@param donor page containing the record from which inherited; does NOT + reset the locks on this record +@param heir_page heir page frame +@param heir_heap_no heap_no of the inheriting record +@param heap_no heap_no of the donating record +@tparam from_split true if the function is invoked from + lock_update_split_(left|right)(); in this case not-gap + locks are not inherited to the supremum if the transaction + isolation level is less than or equal to READ COMMITTED */ +template <bool from_split= false> +static void +lock_rec_inherit_to_gap(hash_cell_t &heir_cell, const page_id_t heir, + const hash_cell_t &donor_cell, const page_id_t donor, + const page_t *heir_page, ulint heir_heap_no, + ulint heap_no) +{ + ut_ad(!from_split || heir_heap_no == PAGE_HEAP_NO_SUPREMUM); + + /* At READ UNCOMMITTED or READ COMMITTED isolation level, + we do not want locks set + by an UPDATE or a DELETE to be inherited as gap type locks. But we + DO want S-locks/X-locks (taken for replace) set by a consistency + constraint to be inherited also then. */ + + for (lock_t *lock= lock_sys_t::get_first(donor_cell, donor, heap_no); lock; + lock= lock_rec_get_next(heap_no, lock)) + { + trx_t *lock_trx= lock->trx; + if (!lock->trx->is_not_inheriting_locks() && + !lock->is_insert_intention() && + (lock_trx->isolation_level > TRX_ISO_READ_COMMITTED || + /* When we are in a page split (not purge), then we don't set a lock + on supremum if the donor lock type is LOCK_REC_NOT_GAP. That is, do + not create bogus gap locks for non-gap locks for READ UNCOMMITTED and + READ COMMITTED isolation levels. LOCK_ORDINARY and + LOCK_GAP require a gap before the record to be locked, that is why + setting a lock on the supremum is necessary. */ + ((!from_split || !lock->is_record_not_gap()) && + lock->mode() != (lock_trx->duplicates ? LOCK_S : LOCK_X)))) + { + lock_rec_add_to_queue(LOCK_GAP | lock->mode(), heir_cell, heir, + heir_page, heir_heap_no, lock->index, lock_trx, + false); + } + } +} + +/*************************************************************//** +Makes a record inherit the gap locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of the +other record. Also waiting lock requests are inherited as GRANTED gap locks.
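+Locks on the supremum are inherited unconditionally, because a lock on the supremum is by definition a gap lock.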
*/ +static +void +lock_rec_inherit_to_gap_if_gap_lock( +/*================================*/ + const buf_block_t* block, /*!< in: buffer block */ + ulint heir_heap_no, /*!< in: heap_no of + record which inherits */ + ulint heap_no) /*!< in: heap_no of record + from which inherited; + does NOT reset the locks + on this record */ +{ + const page_id_t id{block->page.id()}; + LockGuard g{lock_sys.rec_hash, id}; + + for (lock_t *lock= lock_sys_t::get_first(g.cell(), id, heap_no); lock; + lock= lock_rec_get_next(heap_no, lock)) + if (!lock->trx->is_not_inheriting_locks() && + !lock->is_insert_intention() && (heap_no == PAGE_HEAP_NO_SUPREMUM || + !lock->is_record_not_gap()) && + !lock_table_has(lock->trx, lock->index->table, LOCK_X)) + lock_rec_add_to_queue(LOCK_GAP | lock->mode(), + g.cell(), id, block->page.frame, + heir_heap_no, lock->index, lock->trx, false); +} + +/*************************************************************//** +Moves the locks of a record to another record and resets the lock bits of +the donating record. */ +TRANSACTIONAL_TARGET +static +void +lock_rec_move( + hash_cell_t& receiver_cell, /*!< in: hash table cell */ + const buf_block_t& receiver, /*!< in: buffer block containing + the receiving record */ + const page_id_t receiver_id, /*!< in: page identifier */ + const hash_cell_t& donator_cell, /*!< in: hash table cell */ + const page_id_t donator_id, /*!< in: page identifier of + the donating record */ + ulint receiver_heap_no,/*!< in: heap_no of the record + which gets the locks; there + must be no lock requests + on it! */ + ulint donator_heap_no)/*!< in: heap_no of the record + which gives the locks */ +{ + ut_ad(!lock_sys_t::get_first(receiver_cell, + receiver_id, receiver_heap_no)); + + for (lock_t *lock = lock_sys_t::get_first(donator_cell, donator_id, + donator_heap_no); + lock != NULL; + lock = lock_rec_get_next(donator_heap_no, lock)) { + const auto type_mode = lock->type_mode; + if (type_mode & LOCK_WAIT) { + ut_ad(lock->trx->lock.wait_lock == lock); + lock->type_mode &= ~LOCK_WAIT; + } + + trx_t* lock_trx = lock->trx; + lock_trx->mutex_lock(); + lock_rec_reset_nth_bit(lock, donator_heap_no); + + /* Note that we FIRST reset the bit, and then set the lock: + the function works also if donator_id == receiver_id */ + + lock_rec_add_to_queue(type_mode, receiver_cell, + receiver_id, receiver.page.frame, + receiver_heap_no, + lock->index, lock_trx, true); + lock_trx->mutex_unlock(); + } + + ut_ad(!lock_sys_t::get_first(donator_cell, donator_id, + donator_heap_no)); +} + +/** Move all the granted locks to the front of the given lock list. +All the waiting locks will be at the end of the list. +@param[in,out] lock_list the given lock list. */ +static +void +lock_move_granted_locks_to_front( + UT_LIST_BASE_NODE_T(lock_t)& lock_list) +{ + lock_t* lock; + + bool seen_waiting_lock = false; + + for (lock = UT_LIST_GET_FIRST(lock_list); lock; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + + if (!seen_waiting_lock) { + if (lock->is_waiting()) { + seen_waiting_lock = true; + } + continue; + } + + ut_ad(seen_waiting_lock); + + if (!lock->is_waiting()) { + lock_t* prev = UT_LIST_GET_PREV(trx_locks, lock); + ut_a(prev); + ut_list_move_to_front(lock_list, lock); + lock = prev; + } + } +} + +/*************************************************************//** +Updates the lock table when we have reorganized a page. 
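+Because record heap numbers may change during a reorganization, the lock bits are rebuilt by walking the old and the new page in parallel.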
NOTE: we copy +also the locks set on the infimum of the page; the infimum may carry +locks if an update of a record is occurring on the page, and its locks +were temporarily stored on the infimum. */ +TRANSACTIONAL_TARGET +void +lock_move_reorganize_page( +/*======================*/ + const buf_block_t* block, /*!< in: old index page, now + reorganized */ + const buf_block_t* oblock) /*!< in: copy of the old, not + reorganized page */ +{ + mem_heap_t *heap; + + { + UT_LIST_BASE_NODE_T(lock_t) old_locks; + UT_LIST_INIT(old_locks, &lock_t::trx_locks); + + const page_id_t id{block->page.id()}; + const auto id_fold= id.fold(); + { + TMLockGuard g{lock_sys.rec_hash, id}; + if (!lock_sys_t::get_first(g.cell(), id)) + return; + } + + /* We will modify arbitrary trx->lock.trx_locks. + Do not bother with a memory transaction; we are going + to allocate memory and copy a lot of data. */ + LockMutexGuard g{SRW_LOCK_CALL}; + hash_cell_t &cell= *lock_sys.rec_hash.cell_get(id_fold); + + /* Note: Predicate locks for SPATIAL INDEX are not affected by + page reorganize, because they do not refer to individual record + heap numbers. */ + lock_t *lock= lock_sys_t::get_first(cell, id); + + if (!lock) + return; + + heap= mem_heap_create(256); + + /* Copy first all the locks on the page to heap and reset the + bitmaps in the original locks; chain the copies of the locks + using the trx_locks field in them. */ + + do + { + /* Make a copy of the lock */ + lock_t *old_lock= lock_rec_copy(lock, heap); + + UT_LIST_ADD_LAST(old_locks, old_lock); + + /* Reset bitmap of lock */ + lock_rec_bitmap_reset(lock); + + if (lock->is_waiting()) + { + ut_ad(lock->trx->lock.wait_lock == lock); + lock->type_mode&= ~LOCK_WAIT; + } + + lock= lock_rec_get_next_on_page(lock); + } + while (lock); + + const ulint comp= page_is_comp(block->page.frame); + ut_ad(comp == page_is_comp(oblock->page.frame)); + + lock_move_granted_locks_to_front(old_locks); + + DBUG_EXECUTE_IF("do_lock_reverse_page_reorganize", + ut_list_reverse(old_locks);); + + for (lock= UT_LIST_GET_FIRST(old_locks); lock; + lock= UT_LIST_GET_NEXT(trx_locks, lock)) + { + /* NOTE: we copy also the locks set on the infimum and + supremum of the page; the infimum may carry locks if an + update of a record is occurring on the page, and its locks + were temporarily stored on the infimum */ + const rec_t *rec1= page_get_infimum_rec(block->page.frame); + const rec_t *rec2= page_get_infimum_rec(oblock->page.frame); + + /* Set locks according to old locks */ + for (;;) + { + ulint old_heap_no; + ulint new_heap_no; + ut_d(const rec_t* const orec= rec1); + ut_ad(page_rec_is_metadata(rec1) == page_rec_is_metadata(rec2)); + + if (comp) + { + old_heap_no= rec_get_heap_no_new(rec2); + new_heap_no= rec_get_heap_no_new(rec1); + + rec1= page_rec_get_next_low(rec1, TRUE); + rec2= page_rec_get_next_low(rec2, TRUE); + } + else + { + old_heap_no= rec_get_heap_no_old(rec2); + new_heap_no= rec_get_heap_no_old(rec1); + ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); + + rec1= page_rec_get_next_low(rec1, FALSE); + rec2= page_rec_get_next_low(rec2, FALSE); + } + + trx_t *lock_trx= lock->trx; + lock_trx->mutex_lock(); + + /* Clear the bit in old_lock. */ + if (old_heap_no < lock->un_member.rec_lock.n_bits && + lock_rec_reset_nth_bit(lock, old_heap_no)) + { + ut_ad(!page_rec_is_metadata(orec)); + + /* NOTE that the old lock bitmap could be too + small for the new heap number! 
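+ (In that case lock_rec_add_to_queue() creates a new lock struct with a sufficiently large bitmap instead of reusing this one.)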
*/ + lock_rec_add_to_queue(lock->type_mode, cell, id, block->page.frame, + new_heap_no, lock->index, lock_trx, true); + } + + lock_trx->mutex_unlock(); + + if (!rec1 || !rec2) + { + ut_ad(!rec1 == !rec2); + ut_ad(new_heap_no == PAGE_HEAP_NO_SUPREMUM); + ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM); + break; + } + } + + ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED); + } + } + + mem_heap_free(heap); + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + if (fil_space_t *space= fil_space_t::get(id.space())) + { + ut_ad(lock_rec_validate_page(block, space->is_latched())); + space->release(); + } +#endif +} + +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list end is moved to another page. */ +TRANSACTIONAL_TARGET +void +lock_move_rec_list_end( +/*===================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec) /*!< in: record on page: this + is the first record moved */ +{ + const ulint comp= page_rec_is_comp(rec); + + ut_ad(block->page.frame == page_align(rec)); + ut_ad(comp == page_is_comp(new_block->page.frame)); + + const page_id_t id{block->page.id()}; + const page_id_t new_id{new_block->page.id()}; + { + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, id, new_id}; + + /* Note: when we move locks from record to record, waiting locks + and possible granted gap type locks behind them are enqueued in + the original order, because new elements are inserted to a hash + table to the end of the hash chain, and lock_rec_add_to_queue + does not reuse locks if there are waiters in the queue. */ + for (lock_t *lock= lock_sys_t::get_first(g.cell1(), id); lock; + lock= lock_rec_get_next_on_page(lock)) + { + const rec_t *rec1= rec; + const rec_t *rec2; + const auto type_mode= lock->type_mode; + + if (comp) + { + if (page_offset(rec1) == PAGE_NEW_INFIMUM) + rec1= page_rec_get_next_low(rec1, TRUE); + rec2= page_rec_get_next_low(new_block->page.frame + PAGE_NEW_INFIMUM, + TRUE); + } + else + { + if (page_offset(rec1) == PAGE_OLD_INFIMUM) + rec1= page_rec_get_next_low(rec1, FALSE); + rec2= page_rec_get_next_low(new_block->page.frame + PAGE_OLD_INFIMUM, + FALSE); + } + + if (UNIV_UNLIKELY(!rec1 || !rec2)) + { + ut_ad("corrupted page" == 0); + return; + } + + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ + for (;;) + { + ut_ad(page_rec_is_metadata(rec1) == page_rec_is_metadata(rec2)); + ut_d(const rec_t* const orec= rec1); + + ulint rec1_heap_no; + ulint rec2_heap_no; + + if (comp) + { + rec1_heap_no= rec_get_heap_no_new(rec1); + if (!(rec1= page_rec_get_next_low(rec1, TRUE))) + { + ut_ad(rec1_heap_no == PAGE_HEAP_NO_SUPREMUM); + break; + } + rec2_heap_no= rec_get_heap_no_new(rec2); + rec2= page_rec_get_next_low(rec2, TRUE); + } + else + { + ut_d(const rec_t *old1= rec1); + rec1_heap_no= rec_get_heap_no_old(rec1); + if (!(rec1= page_rec_get_next_low(rec1, FALSE))) + { + ut_ad(rec1_heap_no == PAGE_HEAP_NO_SUPREMUM); + break; + } + + ut_ad(rec_get_data_size_old(old1) == rec_get_data_size_old(rec2)); + ut_ad(!memcmp(old1, rec2, rec_get_data_size_old(old1))); + + rec2_heap_no= rec_get_heap_no_old(rec2); + rec2= page_rec_get_next_low(rec2, FALSE); + } + + if (UNIV_UNLIKELY(!rec2)) + { + ut_ad("corrupted page" == 0); + return; + } + + trx_t *lock_trx= lock->trx; + lock_trx->mutex_lock(); + + if (rec1_heap_no < lock->un_member.rec_lock.n_bits && + 
lock_rec_reset_nth_bit(lock, rec1_heap_no)) + { + ut_ad(!page_rec_is_metadata(orec)); + + if (type_mode & LOCK_WAIT) + { + ut_ad(lock_trx->lock.wait_lock == lock); + lock->type_mode&= ~LOCK_WAIT; + } + + lock_rec_add_to_queue(type_mode, g.cell2(), new_id, + new_block->page.frame, + rec2_heap_no, lock->index, lock_trx, true); + } + + lock_trx->mutex_unlock(); + } + } + } + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + if (fil_space_t *space= fil_space_t::get(id.space())) + { + const bool is_latched{space->is_latched()}; + ut_ad(lock_rec_validate_page(block, is_latched)); + ut_ad(lock_rec_validate_page(new_block, is_latched)); + space->release(); + } +#endif +} + +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. */ +TRANSACTIONAL_TARGET +void +lock_move_rec_list_start( +/*=====================*/ + const buf_block_t* new_block, /*!< in: index page to + move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec, /*!< in: record on page: + this is the first + record NOT copied */ + const rec_t* old_end) /*!< in: old + previous-to-last + record on new_page + before the records + were copied */ +{ + const ulint comp= page_rec_is_comp(rec); + + ut_ad(block->page.frame == page_align(rec)); + ut_ad(comp == page_is_comp(new_block->page.frame)); + ut_ad(new_block->page.frame == page_align(old_end)); + ut_ad(!page_rec_is_metadata(rec)); + const page_id_t id{block->page.id()}; + const page_id_t new_id{new_block->page.id()}; + + { + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, id, new_id}; + + for (lock_t *lock= lock_sys_t::get_first(g.cell1(), id); lock; + lock= lock_rec_get_next_on_page(lock)) + { + const rec_t *rec1; + const rec_t *rec2; + const auto type_mode= lock->type_mode; + + if (comp) + { + rec1= page_rec_get_next_low(block->page.frame + PAGE_NEW_INFIMUM, + TRUE); + rec2= page_rec_get_next_low(old_end, TRUE); + } + else + { + rec1= page_rec_get_next_low(block->page.frame + PAGE_OLD_INFIMUM, + FALSE); + rec2= page_rec_get_next_low(old_end, FALSE); + } + + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ + + while (rec1 != rec) + { + if (UNIV_UNLIKELY(!rec1 || !rec2)) + { + ut_ad("corrupted page" == 0); + return; + } + + ut_ad(page_rec_is_metadata(rec1) == page_rec_is_metadata(rec2)); + ut_d(const rec_t* const prev= rec1); + + ulint rec1_heap_no; + ulint rec2_heap_no; + + if (comp) + { + rec1_heap_no= rec_get_heap_no_new(rec1); + rec2_heap_no= rec_get_heap_no_new(rec2); + + rec1= page_rec_get_next_low(rec1, TRUE); + rec2= page_rec_get_next_low(rec2, TRUE); + } + else + { + rec1_heap_no= rec_get_heap_no_old(rec1); + rec2_heap_no= rec_get_heap_no_old(rec2); + + ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); + + rec1= page_rec_get_next_low(rec1, FALSE); + rec2= page_rec_get_next_low(rec2, FALSE); + } + + trx_t *lock_trx= lock->trx; + lock_trx->mutex_lock(); + + if (rec1_heap_no < lock->un_member.rec_lock.n_bits && + lock_rec_reset_nth_bit(lock, rec1_heap_no)) + { + ut_ad(!page_rec_is_metadata(prev)); + + if (type_mode & LOCK_WAIT) + { + ut_ad(lock_trx->lock.wait_lock == lock); + lock->type_mode&= ~LOCK_WAIT; + } + + lock_rec_add_to_queue(type_mode, g.cell2(), new_id, + new_block->page.frame, + rec2_heap_no, lock->index, lock_trx, true); + } + + lock_trx->mutex_unlock(); + } + +#ifdef UNIV_DEBUG + if (page_rec_is_supremum(rec)) + for (auto i= 
lock_rec_get_n_bits(lock); --i > PAGE_HEAP_NO_USER_LOW; ) + ut_ad(!lock_rec_get_nth_bit(lock, i)); +#endif /* UNIV_DEBUG */ + } + } + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + ut_ad(lock_rec_validate_page(block)); +#endif +} + +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. */ +TRANSACTIONAL_TARGET +void +lock_rtr_move_rec_list( +/*===================*/ + const buf_block_t* new_block, /*!< in: index page to + move to */ + const buf_block_t* block, /*!< in: index page */ + rtr_rec_move_t* rec_move, /*!< in: recording records + moved */ + ulint num_move) /*!< in: num of rec to move */ +{ + if (!num_move) + return; + + const ulint comp= page_rec_is_comp(rec_move[0].old_rec); + + ut_ad(block->page.frame == page_align(rec_move[0].old_rec)); + ut_ad(new_block->page.frame == page_align(rec_move[0].new_rec)); + ut_ad(comp == page_rec_is_comp(rec_move[0].new_rec)); + const page_id_t id{block->page.id()}; + const page_id_t new_id{new_block->page.id()}; + + { + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, id, new_id}; + + for (lock_t *lock= lock_sys_t::get_first(g.cell1(), id); lock; + lock= lock_rec_get_next_on_page(lock)) + { + const rec_t *rec1; + const rec_t *rec2; + const auto type_mode= lock->type_mode; + + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ + + for (ulint moved= 0; moved < num_move; moved++) + { + ulint rec1_heap_no; + ulint rec2_heap_no; + + rec1= rec_move[moved].old_rec; + rec2= rec_move[moved].new_rec; + ut_ad(!page_rec_is_metadata(rec1)); + ut_ad(!page_rec_is_metadata(rec2)); + + if (comp) + { + rec1_heap_no= rec_get_heap_no_new(rec1); + rec2_heap_no= rec_get_heap_no_new(rec2); + } + else + { + rec1_heap_no= rec_get_heap_no_old(rec1); + rec2_heap_no= rec_get_heap_no_old(rec2); + + ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); + } + + trx_t *lock_trx= lock->trx; + lock_trx->mutex_lock(); + + if (rec1_heap_no < lock->un_member.rec_lock.n_bits && + lock_rec_reset_nth_bit(lock, rec1_heap_no)) + { + if (type_mode & LOCK_WAIT) + { + ut_ad(lock_trx->lock.wait_lock == lock); + lock->type_mode&= ~LOCK_WAIT; + } + + lock_rec_add_to_queue(type_mode, g.cell2(), new_id, + new_block->page.frame, + rec2_heap_no, lock->index, lock_trx, true); + + rec_move[moved].moved= true; + } + + lock_trx->mutex_unlock(); + } + } + } + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + ut_ad(lock_rec_validate_page(block)); +#endif +} +/*************************************************************//** +Updates the lock table when a page is split to the right. */ +void +lock_update_split_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block) /*!< in: left page */ +{ + const ulint h= lock_get_min_heap_no(right_block); + const page_id_t l{left_block->page.id()}; + const page_id_t r{right_block->page.id()}; + + /* This would likely be too large for a memory transaction. 
*/ + LockMultiGuard g{lock_sys.rec_hash, l, r}; + + /* Move the locks on the supremum of the left page to the supremum + of the right page */ + + lock_rec_move(g.cell2(), *right_block, r, g.cell1(), l, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + + /* Inherit the locks to the supremum of left page from the successor + of the infimum on right page */ + lock_rec_inherit_to_gap<true>(g.cell1(), l, g.cell2(), r, + left_block->page.frame, PAGE_HEAP_NO_SUPREMUM, + h); +} + +void lock_update_node_pointer(const buf_block_t *left_block, + const buf_block_t *right_block) +{ + const ulint h= lock_get_min_heap_no(right_block); + const page_id_t l{left_block->page.id()}; + const page_id_t r{right_block->page.id()}; + LockMultiGuard g{lock_sys.rec_hash, l, r}; + + lock_rec_inherit_to_gap(g.cell2(), r, g.cell1(), l, right_block->page.frame, + h, PAGE_HEAP_NO_SUPREMUM); +} + +#ifdef UNIV_DEBUG +static void lock_assert_no_spatial(const page_id_t id) +{ + const auto id_fold= id.fold(); + auto cell= lock_sys.prdt_page_hash.cell_get(id_fold); + auto latch= lock_sys_t::hash_table::latch(cell); + latch->acquire(); + /* there should exist no page lock on the left page, + otherwise, it will be blocked from merge */ + ut_ad(!lock_sys_t::get_first(*cell, id)); + latch->release(); + cell= lock_sys.prdt_hash.cell_get(id_fold); + latch= lock_sys_t::hash_table::latch(cell); + latch->acquire(); + ut_ad(!lock_sys_t::get_first(*cell, id)); + latch->release(); +} +#endif + +/*************************************************************//** +Updates the lock table when a page is merged to the right. */ +void +lock_update_merge_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page to + which merged */ + const rec_t* orig_succ, /*!< in: original + successor of infimum + on the right page + before merge */ + const buf_block_t* left_block) /*!< in: merged index + page which will be + discarded */ +{ + ut_ad(!page_rec_is_metadata(orig_succ)); + + const page_id_t l{left_block->page.id()}; + const page_id_t r{right_block->page.id()}; + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, l, r}; + + /* Inherit the locks from the supremum of the left page to the + original successor of infimum on the right page, to which the left + page was merged */ + lock_rec_inherit_to_gap(g.cell2(), r, g.cell1(), l, right_block->page.frame, + page_rec_get_heap_no(orig_succ), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, releasing + waiting transactions */ + lock_rec_reset_and_release_wait(g.cell1(), l, PAGE_HEAP_NO_SUPREMUM); + lock_rec_free_all_from_discard_page(l, g.cell1(), lock_sys.rec_hash); + + ut_d(lock_assert_no_spatial(l)); +} + +/** Update locks when the root page is copied to another in +btr_root_raise_and_insert(). Note that we leave lock structs on the +root page, even though they do not make sense on other than leaf +pages: the reason is that in a pessimistic update the infimum record +of the root page will act as a dummy carrier of the locks of the record +to be updated. */ +void lock_update_root_raise(const buf_block_t &block, const page_id_t root) +{ + const page_id_t id{block.page.id()}; + /* This would likely be too large for a memory transaction.
*/ + LockMultiGuard g{lock_sys.rec_hash, id, root}; + /* Move the locks on the supremum of the root to the supremum of block */ + lock_rec_move(g.cell1(), block, id, g.cell2(), root, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); +} + +/** Update the lock table when a page is copied to another. +@param new_block the target page +@param old old page (not index root page) */ +void lock_update_copy_and_discard(const buf_block_t &new_block, page_id_t old) +{ + const page_id_t id{new_block.page.id()}; + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, id, old}; + /* Move the locks on the supremum of the old page to the supremum of new */ + lock_rec_move(g.cell1(), new_block, id, g.cell2(), old, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + lock_rec_free_all_from_discard_page(old, g.cell2(), lock_sys.rec_hash); +} + +/*************************************************************//** +Updates the lock table when a page is split to the left. */ +void +lock_update_split_left( +/*===================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block) /*!< in: left page */ +{ + ulint h= lock_get_min_heap_no(right_block); + const page_id_t l{left_block->page.id()}; + const page_id_t r{right_block->page.id()}; + LockMultiGuard g{lock_sys.rec_hash, l, r}; + /* Inherit the locks to the supremum of the left page from the + successor of the infimum on the right page */ + lock_rec_inherit_to_gap<true>(g.cell1(), l, g.cell2(), r, + left_block->page.frame, PAGE_HEAP_NO_SUPREMUM, + h); +} + +/** Update the lock table when a page is merged to the left. +@param left left page +@param orig_pred original predecessor of supremum on the left page before merge +@param right merged, to-be-discarded right page */ +void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred, + const page_id_t right) +{ + ut_ad(left.page.frame == page_align(orig_pred)); + + const page_id_t l{left.page.id()}; + const rec_t *left_next_rec= page_rec_get_next_const(orig_pred); + if (UNIV_UNLIKELY(!left_next_rec)) + { + ut_ad("corrupted page" == 0); + return; + } + + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, l, right}; + if (!page_rec_is_supremum(left_next_rec)) + { + /* Inherit the locks on the supremum of the left page to the + first record which was moved from the right page */ + lock_rec_inherit_to_gap(g.cell1(), l, g.cell1(), l, left.page.frame, + page_rec_get_heap_no(left_next_rec), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, + releasing waiting transactions */ + lock_rec_reset_and_release_wait(g.cell1(), l, PAGE_HEAP_NO_SUPREMUM); + } + + /* Move the locks from the supremum of right page to the supremum + of the left page */ + lock_rec_move(g.cell1(), left, l, g.cell2(), right, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + lock_rec_free_all_from_discard_page(right, g.cell2(), lock_sys.rec_hash); + + /* there should exist no page lock on the right page, + otherwise, it will be blocked from merge */ + ut_d(lock_assert_no_spatial(right)); +} + +/*************************************************************//** +Resets the original locks on heir and replaces them with gap type locks +inherited from rec.
*/ +void +lock_rec_reset_and_inherit_gap_locks( +/*=================================*/ + const buf_block_t& heir_block, /*!< in: block containing the + record which inherits */ + const page_id_t donor, /*!< in: page containing the + record from which inherited; + does NOT reset the locks on + this record */ + ulint heir_heap_no, /*!< in: heap_no of the + inheriting record */ + ulint heap_no) /*!< in: heap_no of the + donating record */ +{ + const page_id_t heir{heir_block.page.id()}; + /* This is a rare operation and likely too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, heir, donor}; + lock_rec_reset_and_release_wait(g.cell1(), heir, heir_heap_no); + lock_rec_inherit_to_gap(g.cell1(), heir, g.cell2(), donor, + heir_block.page.frame, heir_heap_no, heap_no); +} + +/*************************************************************//** +Updates the lock table when a page is discarded. */ +void +lock_update_discard( +/*================*/ + const buf_block_t* heir_block, /*!< in: index page + which will inherit the locks */ + ulint heir_heap_no, /*!< in: heap_no of the record + which will inherit the locks */ + const buf_block_t* block) /*!< in: index page + which will be discarded */ +{ + const page_t* page = block->page.frame; + const rec_t* rec; + ulint heap_no; + const page_id_t heir(heir_block->page.id()); + const page_id_t page_id(block->page.id()); + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, heir, page_id}; + + if (lock_sys_t::get_first(g.cell2(), page_id)) { + ut_d(lock_assert_no_spatial(page_id)); + /* Inherit all the locks on the page to the record and + reset all the locks on the page */ + + if (page_is_comp(page)) { + rec = page + PAGE_NEW_INFIMUM; + + do { + heap_no = rec_get_heap_no_new(rec); + + lock_rec_inherit_to_gap(g.cell1(), heir, + g.cell2(), page_id, + heir_block->page.frame, + heir_heap_no, heap_no); + + lock_rec_reset_and_release_wait( + g.cell2(), page_id, heap_no); + + rec = page + rec_get_next_offs(rec, TRUE); + } while (heap_no != PAGE_HEAP_NO_SUPREMUM); + } else { + rec = page + PAGE_OLD_INFIMUM; + + do { + heap_no = rec_get_heap_no_old(rec); + + lock_rec_inherit_to_gap(g.cell1(), heir, + g.cell2(), page_id, + heir_block->page.frame, + heir_heap_no, heap_no); + + lock_rec_reset_and_release_wait( + g.cell2(), page_id, heap_no); + + rec = page + rec_get_next_offs(rec, FALSE); + } while (heap_no != PAGE_HEAP_NO_SUPREMUM); + } + + lock_rec_free_all_from_discard_page(page_id, g.cell2(), + lock_sys.rec_hash); + } else { + const auto fold = page_id.fold(); + auto cell = lock_sys.prdt_hash.cell_get(fold); + auto latch = lock_sys_t::hash_table::latch(cell); + latch->acquire(); + lock_rec_free_all_from_discard_page(page_id, *cell, + lock_sys.prdt_hash); + latch->release(); + cell = lock_sys.prdt_page_hash.cell_get(fold); + latch = lock_sys_t::hash_table::latch(cell); + latch->acquire(); + lock_rec_free_all_from_discard_page(page_id, *cell, + lock_sys.prdt_page_hash); + latch->release(); + } +} + +/*************************************************************//** +Updates the lock table when a new user record is inserted. 
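+The inserted record inherits, in gap mode, the gap-type locks that the next record carried on the gap into which the insert was made.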
*/ +void +lock_update_insert( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec) /*!< in: the inserted record */ +{ + ulint receiver_heap_no; + ulint donator_heap_no; + + ut_ad(block->page.frame == page_align(rec)); + ut_ad(!page_rec_is_metadata(rec)); + + /* Inherit the gap-locking locks for rec, in gap mode, from the next + record */ + + if (page_rec_is_comp(rec)) { + receiver_heap_no = rec_get_heap_no_new(rec); + rec = page_rec_get_next_low(rec, TRUE); + if (UNIV_UNLIKELY(!rec)) { + return; + } + donator_heap_no = rec_get_heap_no_new(rec); + } else { + receiver_heap_no = rec_get_heap_no_old(rec); + rec = page_rec_get_next_low(rec, FALSE); + if (UNIV_UNLIKELY(!rec)) { + return; + } + donator_heap_no = rec_get_heap_no_old(rec); + } + + lock_rec_inherit_to_gap_if_gap_lock( + block, receiver_heap_no, donator_heap_no); +} + +/*************************************************************//** +Updates the lock table when a record is removed. */ +void +lock_update_delete( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec) /*!< in: the record to be removed */ +{ + const page_t* page = block->page.frame; + ulint heap_no; + ulint next_heap_no; + + ut_ad(page == page_align(rec)); + ut_ad(!page_rec_is_metadata(rec)); + + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new(rec); + next_heap_no = rec_get_heap_no_new(page + + rec_get_next_offs(rec, + TRUE)); + } else { + heap_no = rec_get_heap_no_old(rec); + next_heap_no = rec_get_heap_no_old(page + + rec_get_next_offs(rec, + FALSE)); + } + + const page_id_t id{block->page.id()}; + LockGuard g{lock_sys.rec_hash, id}; + + /* Let the next record inherit the locks from rec, in gap mode */ + + lock_rec_inherit_to_gap(g.cell(), id, g.cell(), id, block->page.frame, + next_heap_no, heap_no); + + /* Reset the lock bits on rec and release waiting transactions */ + lock_rec_reset_and_release_wait(g.cell(), id, heap_no); +} + +/*********************************************************************//** +Stores on the page infimum record the explicit locks of another record. +This function is used to store the lock state of a record when it is +updated and the size of the record changes in the update. The record +is moved in such an update, perhaps to another page. The infimum record +acts as a dummy carrier record, taking care of lock releases while the +actual record is being moved. */ +void +lock_rec_store_on_page_infimum( +/*===========================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec) /*!< in: record whose lock state + is stored on the infimum + record of the same page; lock + bits are reset on the + record */ +{ + const ulint heap_no= page_rec_get_heap_no(rec); + + ut_ad(block->page.frame == page_align(rec)); + const page_id_t id{block->page.id()}; +#ifdef ENABLED_DEBUG_SYNC + SCOPE_EXIT([]() { DEBUG_SYNC_C("lock_rec_store_on_page_infimum_end"); }); +#endif + + LockGuard g{lock_sys.rec_hash, id}; + lock_rec_move(g.cell(), *block, id, g.cell(), id, + PAGE_HEAP_NO_INFIMUM, heap_no); +} + +/** Restore the explicit lock requests on a single record, where the +state was stored on the infimum of a page. 
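+This is the inverse of lock_rec_store_on_page_infimum().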
+@param block buffer block containing rec
+@param rec record whose lock state is restored
+@param donator page (rec is not necessarily on this page)
+whose infimum stored the lock state; lock bits are reset on the infimum */
+void lock_rec_restore_from_page_infimum(const buf_block_t &block,
+                                        const rec_t *rec, page_id_t donator)
+{
+  const ulint heap_no= page_rec_get_heap_no(rec);
+  const page_id_t id{block.page.id()};
+  LockMultiGuard g{lock_sys.rec_hash, id, donator};
+  lock_rec_move(g.cell1(), block, id, g.cell2(), donator, heap_no,
+                PAGE_HEAP_NO_INFIMUM);
+}
+
+/*========================= TABLE LOCKS ==============================*/
+
+/**
+Create a table lock, without checking for deadlocks or lock compatibility.
+@param table table on which the lock is created
+@param type_mode lock type and mode
+@param trx transaction
+@param c_lock conflicting lock
+@return the created lock object */
+lock_t *lock_table_create(dict_table_t *table, unsigned type_mode, trx_t *trx,
+                          lock_t *c_lock)
+{
+	lock_t*	lock;
+
+	lock_sys.assert_locked(*table);
+	ut_ad(trx->mutex_is_owner());
+	ut_ad(!trx->is_wsrep() || lock_sys.is_writer());
+	ut_ad(trx->state == TRX_STATE_ACTIVE || trx->is_recovered);
+	ut_ad(!trx->is_autocommit_non_locking());
+	/* During CREATE TABLE, we will write to newly created FTS_*_CONFIG
+	on which no lock has been created yet. */
+	ut_ad(!trx->dict_operation_lock_mode
+	      || (strstr(table->name.m_name, "/FTS_")
+		  && strstr(table->name.m_name, "_CONFIG") + sizeof("_CONFIG")
+		  == table->name.m_name + strlen(table->name.m_name) + 1));
+
+	switch (LOCK_MODE_MASK & type_mode) {
+	case LOCK_AUTO_INC:
+		++table->n_waiting_or_granted_auto_inc_locks;
+		/* For AUTOINC locking we reuse the lock instance only if
+		there is no wait involved else we allocate the waiting lock
+		from the transaction lock heap. */
+		if (type_mode == LOCK_AUTO_INC) {
+			lock = table->autoinc_lock;
+
+			ut_ad(!table->autoinc_trx);
+			table->autoinc_trx = trx;
+
+			ib_vector_push(trx->autoinc_locks, &lock);
+			goto allocated;
+		}
+
+		break;
+	case LOCK_X:
+	case LOCK_S:
+		++table->n_lock_x_or_s;
+		break;
+	}
+
+	lock = trx->lock.table_cached < array_elements(trx->lock.table_pool)
+		? &trx->lock.table_pool[trx->lock.table_cached++]
+		: static_cast<lock_t*>(
+			mem_heap_alloc(trx->lock.lock_heap, sizeof *lock));
+
+allocated:
+	lock->type_mode = ib_uint32_t(type_mode | LOCK_TABLE);
+	lock->trx = trx;
+
+	lock->un_member.tab_lock.table = table;
+
+	ut_ad(table->get_ref_count() > 0 || !table->can_be_evicted);
+
+	UT_LIST_ADD_LAST(trx->lock.trx_locks, lock);
+
+	ut_list_append(table->locks, lock, TableLockGetNode());
+
+	if (type_mode & LOCK_WAIT) {
+		if (trx->lock.wait_trx) {
+			ut_ad(!c_lock || trx->lock.wait_trx == c_lock->trx);
+			ut_ad(trx->lock.wait_lock);
+			ut_ad((*trx->lock.wait_lock).trx == trx);
+		} else {
+			ut_ad(c_lock);
+			trx->lock.wait_trx = c_lock->trx;
+			ut_ad(!trx->lock.wait_lock);
+		}
+		trx->lock.wait_lock = lock;
+	}
+
+	lock->trx->lock.table_locks.push_back(lock);
+
+	MONITOR_INC(MONITOR_TABLELOCK_CREATED);
+	MONITOR_INC(MONITOR_NUM_TABLELOCK);
+
+	return(lock);
+}
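[Editor's note: illustrative sketch.] lock_table_create() above prefers a small
per-transaction pool of preallocated lock objects and falls back to the
transaction's lock heap only when the pool is exhausted. The standalone C++
sketch below models that allocation pattern under stated assumptions; the
names Trx, TableLock and kPoolSize are hypothetical and not InnoDB API.

#include <array>
#include <deque>
#include <cstdio>

struct TableLock { unsigned type_mode= 0; };

struct Trx
{
  static constexpr size_t kPoolSize= 8;     // plays the role of trx_t::lock.table_pool
  std::array<TableLock, kPoolSize> pool;    // cached, reusable lock objects
  size_t cached= 0;                         // plays the role of trx_t::lock.table_cached
  std::deque<TableLock> heap;               // stands in for the transaction lock heap

  TableLock *alloc_table_lock()
  {
    if (cached < kPoolSize)
      return &pool[cached++];               // fast path: no allocation
    heap.emplace_back();                    // slow path: allocate from the heap
    return &heap.back();                    // deque growth keeps pointers stable
  }
};

int main()
{
  Trx trx;
  for (int i= 0; i < 10; i++)
    std::printf("lock %d at %p\n", i, (void*) trx.alloc_table_lock());
}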
+
+/*************************************************************//**
+Pops autoinc lock requests from the transaction's autoinc_locks. We
+handle the case where there are gaps in the array and they need to
+be popped off the stack. */
+UNIV_INLINE
+void
+lock_table_pop_autoinc_locks(
+/*=========================*/
+	trx_t*	trx)	/*!< in/out: transaction that owns the AUTOINC locks */
+{
+	ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
+
+	/* Skip any gaps, gaps are NULL lock entries in the
+	trx->autoinc_locks vector. */
+
+	do {
+		ib_vector_pop(trx->autoinc_locks);
+
+		if (ib_vector_is_empty(trx->autoinc_locks)) {
+			return;
+		}
+
+	} while (*(lock_t**) ib_vector_get_last(trx->autoinc_locks) == NULL);
+}
+
+/*************************************************************//**
+Removes an autoinc lock request from the transaction's autoinc_locks. */
+UNIV_INLINE
+void
+lock_table_remove_autoinc_lock(
+/*===========================*/
+	lock_t*	lock,	/*!< in: table lock */
+	trx_t*	trx)	/*!< in/out: transaction that owns the lock */
+{
+	ut_ad(lock->type_mode == (LOCK_AUTO_INC | LOCK_TABLE));
+	lock_sys.assert_locked(*lock->un_member.tab_lock.table);
+	ut_ad(trx->mutex_is_owner());
+
+	auto s = ib_vector_size(trx->autoinc_locks);
+	ut_ad(s);
+
+	/* With stored functions and procedures the user may drop
+	a table within the same "statement". This special case has
+	to be handled by deleting only those AUTOINC locks that were
+	held by the table being dropped. */
+
+	lock_t*	autoinc_lock = *static_cast<lock_t**>(
+		ib_vector_get(trx->autoinc_locks, --s));
+
+	/* This is the default fast case. */
+
+	if (autoinc_lock == lock) {
+		lock_table_pop_autoinc_locks(trx);
+	} else {
+		/* The last element should never be NULL */
+		ut_a(autoinc_lock != NULL);
+
+		/* Handle freeing the locks from within the stack. */
+
+		while (s) {
+			autoinc_lock = *static_cast<lock_t**>(
+				ib_vector_get(trx->autoinc_locks, --s));
+
+			if (autoinc_lock == lock) {
+				void*	null_var = NULL;
+				ib_vector_set(trx->autoinc_locks, s, &null_var);
+				return;
+			}
+		}
+
+		/* Must find the autoinc lock. */
+		ut_error;
+	}
+}
+
+/*************************************************************//**
+Removes a table lock request from the queue and the trx list of locks;
+this is a low-level function which does NOT check if waiting requests
+can now be granted. */
+UNIV_INLINE
+const dict_table_t*
+lock_table_remove_low(
+/*==================*/
+	lock_t*	lock)	/*!< in/out: table lock */
+{
+	ut_ad(lock->is_table());
+
+	trx_t*		trx;
+	dict_table_t*	table;
+
+	ut_ad(lock->is_table());
+	trx = lock->trx;
+	table = lock->un_member.tab_lock.table;
+	lock_sys.assert_locked(*table);
+	ut_ad(trx->mutex_is_owner());
+
+	/* Remove the table from the transaction's AUTOINC vector, if
+	the lock that is being released is an AUTOINC lock. */
+	switch (lock->mode()) {
+	case LOCK_AUTO_INC:
+		ut_ad((table->autoinc_trx == trx) == !lock->is_waiting());
+
+		if (table->autoinc_trx == trx) {
+			table->autoinc_trx = NULL;
+			/* The locks must be freed in the reverse order from
+			the one in which they were acquired. This is to avoid
+			traversing the AUTOINC lock vector unnecessarily.
+
+			We only store locks that were granted in the
+			trx->autoinc_locks vector (see lock_table_create()
+			and lock_grant()).
*/ + lock_table_remove_autoinc_lock(lock, trx); + } + + ut_ad(table->n_waiting_or_granted_auto_inc_locks); + --table->n_waiting_or_granted_auto_inc_locks; + break; + case LOCK_X: + case LOCK_S: + ut_ad(table->n_lock_x_or_s); + --table->n_lock_x_or_s; + break; + default: + break; + } + + UT_LIST_REMOVE(trx->lock.trx_locks, lock); + ut_list_remove(table->locks, lock, TableLockGetNode()); + + MONITOR_INC(MONITOR_TABLELOCK_REMOVED); + MONITOR_DEC(MONITOR_NUM_TABLELOCK); + return table; +} + +/*********************************************************************//** +Enqueues a waiting request for a table lock which cannot be granted +immediately. Checks for deadlocks. +@retval DB_LOCK_WAIT if the waiting lock was enqueued +@retval DB_DEADLOCK if this transaction was chosen as the victim */ +static +dberr_t +lock_table_enqueue_waiting( +/*=======================*/ + unsigned mode, /*!< in: lock mode this transaction is + requesting */ + dict_table_t* table, /*!< in/out: table */ + que_thr_t* thr, /*!< in: query thread */ + lock_t* c_lock) /*!< in: conflicting lock or NULL */ +{ + lock_sys.assert_locked(*table); + ut_ad(!srv_read_only_mode); + + trx_t* trx = thr_get_trx(thr); + ut_ad(trx->mutex_is_owner()); + ut_ad(!trx->dict_operation_lock_mode); + + /* Enqueue the lock request that will wait to be granted */ + lock_table_create(table, mode | LOCK_WAIT, trx, c_lock); + + trx->lock.wait_thr = thr; + /* Apart from Galera, only transactions that have waiting lock + may be chosen as deadlock victims. Only one lock can be waited for at a + time, and a transaction is associated with a single thread. That is why + there must not be waiting lock requests if the transaction is deadlock + victim and it is not WSREP. Galera transaction abort can be invoked + from MDL acquisition code when the transaction does not have waiting + lock, that's why we check only deadlock victim bit here. */ + ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1)); + + MONITOR_INC(MONITOR_TABLELOCK_WAIT); + return(DB_LOCK_WAIT); +} + +/*********************************************************************//** +Checks if other transactions have an incompatible mode lock request in +the lock queue. 
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_table_other_has_incompatible(
+/*==============================*/
+	const trx_t*		trx,	/*!< in: transaction, or NULL if all
+					transactions should be included */
+	ulint			wait,	/*!< in: LOCK_WAIT if also
+					waiting locks are taken into
+					account, or 0 if not */
+	const dict_table_t*	table,	/*!< in: table */
+	lock_mode		mode)	/*!< in: lock mode */
+{
+	lock_sys.assert_locked(*table);
+
+	static_assert(LOCK_IS == 0, "compatibility");
+	static_assert(LOCK_IX == 1, "compatibility");
+
+	if (UNIV_LIKELY(mode <= LOCK_IX && !table->n_lock_x_or_s)) {
+		return(NULL);
+	}
+
+	for (lock_t* lock = UT_LIST_GET_LAST(table->locks);
+	     lock;
+	     lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock)) {
+
+		trx_t* lock_trx = lock->trx;
+
+		if (lock_trx != trx
+		    && !lock_mode_compatible(lock->mode(), mode)
+		    && (wait || !lock->is_waiting())) {
+			return(lock);
+		}
+	}
+
+	return(NULL);
+}
+
+/** Acquire or enqueue a table lock */
+static dberr_t lock_table_low(dict_table_t *table, lock_mode mode,
+                              que_thr_t *thr, trx_t *trx)
+{
+  DBUG_EXECUTE_IF("innodb_table_deadlock", return DB_DEADLOCK;);
+  lock_t *wait_for=
+    lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode);
+  dberr_t err= DB_SUCCESS;
+
+  trx->mutex_lock();
+
+  if (wait_for)
+    err= lock_table_enqueue_waiting(mode, table, thr, wait_for);
+  else
+    lock_table_create(table, mode, trx, nullptr);
+
+  trx->mutex_unlock();
+
+  return err;
+}
+
+#ifdef WITH_WSREP
+/** Acquire or enqueue a table lock in Galera replication mode. */
+ATTRIBUTE_NOINLINE
+static dberr_t lock_table_wsrep(dict_table_t *table, lock_mode mode,
+                                que_thr_t *thr, trx_t *trx)
+{
+  LockMutexGuard g{SRW_LOCK_CALL};
+  return lock_table_low(table, mode, thr, trx);
+}
+#endif
+
+/** Acquire a table lock.
+@param table table to be locked
+@param fktable pointer to table, in case of a FOREIGN key check
+@param mode lock mode
+@param thr SQL execution thread
+@retval DB_SUCCESS if the lock was acquired
+@retval DB_DEADLOCK if a deadlock occurred, or fktable && *fktable != table
+@retval DB_LOCK_WAIT if lock_wait() must be invoked */
+dberr_t lock_table(dict_table_t *table, dict_table_t *const*fktable,
+                   lock_mode mode, que_thr_t *thr)
+{
+  ut_ad(table);
+
+  if (!fktable && table->is_temporary())
+    return DB_SUCCESS;
+
+  ut_ad(fktable || table->get_ref_count() || !table->can_be_evicted);
+
+  trx_t *trx= thr_get_trx(thr);
+
+  /* Look for equal or stronger locks the same trx already has on the
+  table. No need to acquire LockMutexGuard here because only the
+  thread that is executing a transaction can access trx_t::table_locks. */
+  if (lock_table_has(trx, table, mode) || srv_read_only_mode)
+    return DB_SUCCESS;
+
+  if ((mode == LOCK_IX || mode == LOCK_X) &&
+      !trx->read_only && !trx->rsegs.m_redo.rseg)
+    trx_set_rw_mode(trx);
+
+#ifdef WITH_WSREP
+  if (trx->is_wsrep())
+    return lock_table_wsrep(table, mode, thr, trx);
+#endif
+  lock_sys.rd_lock(SRW_LOCK_CALL);
+  dberr_t err;
+  if (fktable != nullptr && *fktable != table)
+    err= DB_DEADLOCK;
+  else
+  {
+    table->lock_mutex_lock();
+    err= lock_table_low(table, mode, thr, trx);
+    table->lock_mutex_unlock();
+  }
+  lock_sys.rd_unlock();
+
+  return err;
+}
+
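[Editor's note: illustrative sketch.] lock_table() above takes the global
lock_sys latch in shared (read) mode and serializes only on the per-table
lock_mutex of the queue it modifies, so unrelated tables can be locked
concurrently. A minimal standalone C++ model of that latching order follows;
LockSys and Table are hypothetical stand-ins, not InnoDB types.

#include <mutex>
#include <shared_mutex>

struct LockSys { std::shared_mutex latch; };    // global lock_sys latch
struct Table   { std::mutex lock_mutex; };      // per-table queue mutex

static LockSys lock_sys_model;

void lock_table_sketch(Table &table)
{
  // Latching order: global latch (shared) first, then the table mutex.
  std::shared_lock<std::shared_mutex> g(lock_sys_model.latch);
  std::lock_guard<std::mutex> t(table.lock_mutex);
  // ... inspect and modify this table's lock queue here ...
}

int main() { Table t; lock_table_sketch(t); }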
+/** Create a table lock object for a resurrected transaction.
+@param table table to be X-locked
+@param trx transaction
+@param mode LOCK_X or LOCK_IX */
+void lock_table_resurrect(dict_table_t *table, trx_t *trx, lock_mode mode)
+{
+  ut_ad(trx->is_recovered);
+  ut_ad(mode == LOCK_X || mode == LOCK_IX);
+
+  if (lock_table_has(trx, table, mode))
+    return;
+
+  {
+    /* This is executed at server startup while no connections
+    are allowed. Do not bother with lock elision. */
+    LockMutexGuard g{SRW_LOCK_CALL};
+    ut_ad(!lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode));
+
+    trx->mutex_lock();
+    lock_table_create(table, mode, trx);
+  }
+  trx->mutex_unlock();
+}
+
+/** Find a lock that a waiting table lock request still has to wait for. */
+static const lock_t *lock_table_has_to_wait_in_queue(const lock_t *wait_lock)
+{
+  ut_ad(wait_lock->is_waiting());
+  ut_ad(wait_lock->is_table());
+
+  dict_table_t *table= wait_lock->un_member.tab_lock.table;
+  lock_sys.assert_locked(*table);
+
+  static_assert(LOCK_IS == 0, "compatibility");
+  static_assert(LOCK_IX == 1, "compatibility");
+
+  if (UNIV_LIKELY(wait_lock->mode() <= LOCK_IX && !table->n_lock_x_or_s))
+    return nullptr;
+
+  for (const lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock != wait_lock;
+       lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+    if (lock_has_to_wait(wait_lock, lock))
+      return lock;
+
+  return nullptr;
+}
+
+/*************************************************************//**
+Removes a table lock request, waiting or granted, from the queue and grants
+locks to other transactions in the queue, if they now are entitled to a
+lock.
+@param[in,out]	in_lock		table lock
+@param[in]	owns_wait_mutex	whether lock_sys.wait_mutex is held */
+static void lock_table_dequeue(lock_t *in_lock, bool owns_wait_mutex)
+{
+#ifdef SAFE_MUTEX
+	ut_ad(owns_wait_mutex == mysql_mutex_is_owner(&lock_sys.wait_mutex));
+#endif
+	ut_ad(in_lock->trx->mutex_is_owner());
+	lock_t*	lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock);
+
+	const dict_table_t* table = lock_table_remove_low(in_lock);
+
+	static_assert(LOCK_IS == 0, "compatibility");
+	static_assert(LOCK_IX == 1, "compatibility");
+
+	if (UNIV_LIKELY(in_lock->mode() <= LOCK_IX && !table->n_lock_x_or_s)) {
+		return;
+	}
+
+	bool acquired = false;
+
+	/* Check if waiting locks in the queue can now be granted: grant
+	locks if there are no conflicting locks ahead. */
+
+	for (/* No op */;
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+		if (!lock->is_waiting()) {
+			continue;
+		}
+
+		if (!owns_wait_mutex) {
+			mysql_mutex_lock(&lock_sys.wait_mutex);
+			acquired = owns_wait_mutex = true;
+		}
+
+		ut_ad(lock->trx->lock.wait_trx);
+		ut_ad(lock->trx->lock.wait_lock);
+
+		if (const lock_t* c = lock_table_has_to_wait_in_queue(lock)) {
+			trx_t* c_trx = c->trx;
+			lock->trx->lock.wait_trx = c_trx;
+			if (c_trx->lock.wait_trx
+			    && innodb_deadlock_detect
+			    && Deadlock::to_check.emplace(c_trx).second) {
+				Deadlock::to_be_checked = true;
+			}
+		} else {
+			/* Grant the lock */
+			ut_ad(in_lock->trx != lock->trx);
+			in_lock->trx->mutex_unlock();
+			lock_grant(lock);
+			in_lock->trx->mutex_lock();
+		}
+	}
+
+	if (acquired) {
+		mysql_mutex_unlock(&lock_sys.wait_mutex);
+	}
+}
+
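[Editor's note: illustrative sketch.] After removing a table lock,
lock_table_dequeue() above walks the remaining queue and grants each waiting
request that no longer conflicts with any lock ahead of it. The toy below
models that re-grant scan with a simplified S/X compatibility check; all
names are hypothetical, not InnoDB API.

#include <vector>
#include <cstdio>

enum Mode { S, X };
struct Lk { Mode mode; bool waiting; };

static bool compatible(Mode a, Mode b) { return a == S && b == S; }

// Grant every waiter that has no conflicting lock earlier in the queue.
static void regrant(std::vector<Lk> &queue)
{
  for (size_t i= 0; i < queue.size(); i++)
  {
    if (!queue[i].waiting) continue;
    bool conflict= false;
    for (size_t j= 0; j < i && !conflict; j++)
      conflict= !compatible(queue[j].mode, queue[i].mode);
    if (!conflict)
      queue[i].waiting= false;              // grant the lock
  }
}

int main()
{
  std::vector<Lk> q{{S, false}, {S, true}, {X, true}};
  regrant(q);   // the S waiter is granted; the X waiter keeps waiting
  for (const Lk &l : q)
    std::printf("%s%s\n", l.mode == S ? "S" : "X",
                l.waiting ? " (waiting)" : "");
}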
+/** Sets a lock on a table based on the given mode.
+@param table table to lock
+@param trx transaction
+@param mode LOCK_X or LOCK_S
+@param no_wait whether to skip handling DB_LOCK_WAIT
+@return error code */
+dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode,
+                           bool no_wait)
+{
+  mem_heap_t *heap= mem_heap_create(512);
+  sel_node_t *node= sel_node_create(heap);
+  que_thr_t *thr= pars_complete_graph_for_exec(node, trx, heap, nullptr);
+  thr->graph->state= QUE_FORK_ACTIVE;
+
+  thr= static_cast<que_thr_t*>
+    (que_fork_get_first_thr(static_cast<que_fork_t*>
+                            (que_node_get_parent(thr))));
+
+run_again:
+  thr->run_node= thr;
+  thr->prev_node= thr->common.parent;
+  dberr_t err= lock_table(table, nullptr, mode, thr);
+
+  switch (err) {
+  case DB_SUCCESS:
+    break;
+  case DB_LOCK_WAIT:
+    if (no_wait)
+    {
+      lock_sys.cancel_lock_wait_for_trx(trx);
+      break;
+    }
+    /* fall through */
+  default:
+    trx->error_state= err;
+    if (row_mysql_handle_errors(&err, trx, thr, nullptr))
+      goto run_again;
+  }
+
+  que_graph_free(thr->graph);
+  trx->op_info= "";
+
+  return err;
+}
+
+/** Exclusively lock the data dictionary tables.
+@param trx dictionary transaction
+@return error code
+@retval DB_SUCCESS on success */
+dberr_t lock_sys_tables(trx_t *trx)
+{
+  dberr_t err;
+  if (!(err= lock_table_for_trx(dict_sys.sys_tables, trx, LOCK_X)) &&
+      !(err= lock_table_for_trx(dict_sys.sys_columns, trx, LOCK_X)) &&
+      !(err= lock_table_for_trx(dict_sys.sys_indexes, trx, LOCK_X)) &&
+      !(err= lock_table_for_trx(dict_sys.sys_fields, trx, LOCK_X)))
+  {
+    if (dict_sys.sys_foreign)
+      err= lock_table_for_trx(dict_sys.sys_foreign, trx, LOCK_X);
+    if (!err && dict_sys.sys_foreign_cols)
+      err= lock_table_for_trx(dict_sys.sys_foreign_cols, trx, LOCK_X);
+    if (!err && dict_sys.sys_virtual)
+      err= lock_table_for_trx(dict_sys.sys_virtual, trx, LOCK_X);
+  }
+  return err;
+}
+
+/** Rebuild waiting queue after first_lock for heap_no. The queue is rebuilt
+close to the way lock_rec_dequeue_from_page() does it.
+@param trx transaction that has set a lock, which caused the queue
+           rebuild
+@param cell rec hash cell of first_lock
+@param first_lock the lock after which waiting queue will be rebuilt
+@param heap_no heap no of the record for which waiting queue to rebuild */
+static void lock_rec_rebuild_waiting_queue(
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+  trx_t *trx,
+#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+  hash_cell_t &cell, lock_t *first_lock, ulint heap_no)
+{
+  lock_sys.assert_locked(cell);
+
+  for (lock_t *lock= first_lock; lock != NULL;
+       lock= lock_rec_get_next(heap_no, lock))
+  {
+    if (!lock->is_waiting())
+      continue;
+    mysql_mutex_lock(&lock_sys.wait_mutex);
+    ut_ad(lock->trx->lock.wait_trx);
+    ut_ad(lock->trx->lock.wait_lock);
+
+    if (const lock_t *c= lock_rec_has_to_wait_in_queue(cell, lock))
+      lock->trx->lock.wait_trx= c->trx;
+    else
+    {
+      /* Grant the lock */
+      ut_ad(trx != lock->trx);
+      lock_grant(lock);
+    }
+    mysql_mutex_unlock(&lock_sys.wait_mutex);
+  }
+}
+
+/*=========================== LOCK RELEASE ==============================*/
+
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock.
*/ +TRANSACTIONAL_TARGET +void +lock_rec_unlock( +/*============*/ + trx_t* trx, /*!< in/out: transaction that has + set a record lock */ + const page_id_t id, /*!< in: page containing rec */ + const rec_t* rec, /*!< in: record */ + lock_mode lock_mode)/*!< in: LOCK_S or LOCK_X */ +{ + lock_t* first_lock; + lock_t* lock; + ulint heap_no; + + ut_ad(trx); + ut_ad(rec); + ut_ad(!trx->lock.wait_lock); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(!page_rec_is_metadata(rec)); + + heap_no = page_rec_get_heap_no(rec); + + LockGuard g{lock_sys.rec_hash, id}; + + first_lock = lock_sys_t::get_first(g.cell(), id, heap_no); + + /* Find the last lock with the same lock_mode and transaction + on the record. */ + + for (lock = first_lock; lock != NULL; + lock = lock_rec_get_next(heap_no, lock)) { + if (lock->trx == trx && lock->mode() == lock_mode) { + goto released; + } + } + + { + ib::error err; + err << "Unlock row could not find a " << lock_mode + << " mode lock on the record. Current statement: "; + size_t stmt_len; + if (const char* stmt = innobase_get_stmt_unsafe( + trx->mysql_thd, &stmt_len)) { + err.write(stmt, stmt_len); + } + } + + return; + +released: + ut_a(!lock->is_waiting()); + { + TMTrxGuard tg{*trx}; + lock_rec_reset_nth_bit(lock, heap_no); + } + + /* Check if we can now grant waiting lock requests */ + lock_rec_rebuild_waiting_queue( +#if defined(UNIV_DEBUG) || !defined(DBUG_OFF) + trx, +#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */ + g.cell(), first_lock, heap_no); +} + +/** Release the explicit locks of a committing transaction, +and release possible other transactions waiting because of these locks. +@return whether the operation succeeded */ +TRANSACTIONAL_TARGET static bool lock_release_try(trx_t *trx) +{ + /* At this point, trx->lock.trx_locks cannot be modified by other + threads, because our transaction has been committed. + See the checks and assertions in lock_rec_create_low() and + lock_rec_add_to_queue(). + + The function lock_table_create() should never be invoked on behalf + of a transaction running in another thread. Also there, we will + assert that the current transaction be active. */ + DBUG_ASSERT(trx->state == TRX_STATE_COMMITTED_IN_MEMORY); + DBUG_ASSERT(!trx->is_referenced()); + + bool all_released= true; +restart: + ulint count= 1000; + /* We will not attempt hardware lock elision (memory transaction) + here. Both lock_rec_dequeue_from_page() and lock_table_dequeue() + would likely lead to a memory transaction due to a system call, to + wake up a waiting transaction. */ + lock_sys.rd_lock(SRW_LOCK_CALL); + trx->mutex_lock(); + + /* Note: Anywhere else, trx->mutex is not held while acquiring + a lock table latch, but here we are following the opposite order. + To avoid deadlocks, we only try to acquire the lock table latches + but not keep waiting for them. 
*/
+
+  for (lock_t *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock; )
+  {
+    ut_ad(lock->trx == trx);
+    lock_t *prev= UT_LIST_GET_PREV(trx_locks, lock);
+    if (!lock->is_table())
+    {
+      ut_ad(!lock->index->table->is_temporary());
+      ut_ad(lock->mode() != LOCK_X ||
+            lock->index->table->id >= DICT_HDR_FIRST_ID ||
+            trx->dict_operation || trx->was_dict_operation);
+      auto &lock_hash= lock_sys.hash_get(lock->type_mode);
+      auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
+      auto latch= lock_sys_t::hash_table::latch(cell);
+      if (!latch->try_acquire())
+        all_released= false;
+      else
+      {
+        lock_rec_dequeue_from_page(lock, false);
+        latch->release();
+      }
+    }
+    else
+    {
+      dict_table_t *table= lock->un_member.tab_lock.table;
+      ut_ad(!table->is_temporary());
+      ut_ad(table->id >= DICT_HDR_FIRST_ID ||
+            (lock->mode() != LOCK_IX && lock->mode() != LOCK_X) ||
+            trx->dict_operation || trx->was_dict_operation);
+      if (!table->lock_mutex_trylock())
+        all_released= false;
+      else
+      {
+        lock_table_dequeue(lock, false);
+        table->lock_mutex_unlock();
+      }
+    }
+
+    lock= all_released ? UT_LIST_GET_LAST(trx->lock.trx_locks) : prev;
+    if (!--count)
+      break;
+  }
+
+  lock_sys.rd_unlock();
+  trx->mutex_unlock();
+  if (all_released && !count)
+    goto restart;
+  return all_released;
+}
+
+/** Release the explicit locks of a committing transaction,
+and release possible other transactions waiting because of these locks. */
+void lock_release(trx_t *trx)
+{
+#ifdef UNIV_DEBUG
+  std::set<table_id_t> to_evict;
+  if (innodb_evict_tables_on_commit_debug &&
+      !trx->is_recovered && !dict_sys.locked())
+    for (const auto& p : trx->mod_tables)
+      if (!p.first->is_temporary())
+        to_evict.emplace(p.first->id);
+#endif
+  ulint count;
+
+  for (count= 5; count--; )
+    if (lock_release_try(trx))
+      goto released;
+
+  /* Fall back to acquiring lock_sys.latch in exclusive mode */
+restart:
+  count= 1000;
+  /* There is probably no point to try lock elision here;
+  in lock_release_try() it is different. */
+  lock_sys.wr_lock(SRW_LOCK_CALL);
+  trx->mutex_lock();
+
+  while (lock_t *lock= UT_LIST_GET_LAST(trx->lock.trx_locks))
+  {
+    ut_ad(lock->trx == trx);
+    if (!lock->is_table())
+    {
+      ut_ad(!lock->index->table->is_temporary());
+      ut_ad(lock->mode() != LOCK_X ||
+            lock->index->table->id >= DICT_HDR_FIRST_ID ||
+            trx->dict_operation || trx->was_dict_operation);
+      lock_rec_dequeue_from_page(lock, false);
+    }
+    else
+    {
+      ut_d(dict_table_t *table= lock->un_member.tab_lock.table);
+      ut_ad(!table->is_temporary());
+      ut_ad(table->id >= DICT_HDR_FIRST_ID ||
+            (lock->mode() != LOCK_IX && lock->mode() != LOCK_X) ||
+            trx->dict_operation || trx->was_dict_operation);
+      lock_table_dequeue(lock, false);
+    }
+
+    if (!--count)
+      break;
+  }
+
+  lock_sys.wr_unlock();
+  trx->mutex_unlock();
+  if (!count)
+    goto restart;
+
+released:
+  if (UNIV_UNLIKELY(Deadlock::to_be_checked))
+  {
+    mysql_mutex_lock(&lock_sys.wait_mutex);
+    lock_sys.deadlock_check();
+    mysql_mutex_unlock(&lock_sys.wait_mutex);
+  }
+
+  trx->lock.n_rec_locks= 0;
+
+#ifdef UNIV_DEBUG
+  if (to_evict.empty())
+    return;
+  dict_sys.lock(SRW_LOCK_CALL);
+  LockMutexGuard g{SRW_LOCK_CALL};
+  for (const table_id_t id : to_evict)
+    if (dict_table_t *table= dict_sys.find_table(id))
+      if (!table->get_ref_count() && !UT_LIST_GET_LEN(table->locks))
+        dict_sys.remove(table, true);
+  dict_sys.unlock();
+#endif
+}
+
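[Editor's note: illustrative sketch.] lock_release() above first makes up to
five optimistic passes that only try-acquire the per-queue latches and bound
each pass at 1000 locks; only if that fails does it retry under the exclusive
lock_sys latch, where nothing can be skipped. A compact standalone C++ model
of this try-then-fallback policy follows, with hypothetical names throughout.

#include <mutex>
#include <vector>
#include <cstdio>

// One optimistic pass: release whatever can be latched without waiting.
static bool release_try(std::vector<std::mutex*> &latches)
{
  bool all_released= true;
  for (size_t i= 0; i < latches.size(); )
  {
    if (latches[i]->try_lock())
    {
      latches[i]->unlock();                  // "release the lock"
      latches.erase(latches.begin() + i);
    }
    else
    {
      all_released= false;                   // skip; revisit in a later pass
      ++i;
    }
  }
  return all_released;
}

static void release(std::vector<std::mutex*> &latches)
{
  for (int pass= 0; pass < 5; pass++)
    if (release_try(latches))
      return;
  // Fallback: block on each latch (models the exclusive lock_sys latch).
  for (std::mutex *m : latches) { m->lock(); m->unlock(); }
  latches.clear();
}

int main()
{
  std::mutex a, b;
  std::vector<std::mutex*> v{&a, &b};
  release(v);
  std::printf("remaining: %zu\n", v.size());
}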
+/** Release the explicit locks of a committing transaction while
+dict_sys.latch is exclusively locked,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_drop(trx_t *trx)
+{
+  ut_ad(lock_sys.is_writer());
+  ut_ad(trx->mutex_is_owner());
+  ut_ad(trx->dict_operation);
+
+  while (lock_t *lock= UT_LIST_GET_LAST(trx->lock.trx_locks))
+  {
+    ut_ad(lock->trx == trx);
+    if (!lock->is_table())
+    {
+      ut_ad(!lock->index->table->is_temporary());
+      ut_ad(lock->mode() != LOCK_X ||
+            lock->index->table->id >= DICT_HDR_FIRST_ID ||
+            trx->dict_operation);
+      lock_rec_dequeue_from_page(lock, false);
+    }
+    else
+    {
+      ut_d(dict_table_t *table= lock->un_member.tab_lock.table);
+      ut_ad(!table->is_temporary());
+      ut_ad(table->id >= DICT_HDR_FIRST_ID ||
+            (lock->mode() != LOCK_IX && lock->mode() != LOCK_X) ||
+            trx->dict_operation);
+      lock_table_dequeue(lock, false);
+    }
+  }
+}
+
+/** Reset lock bit for supremum and rebuild waiting queue.
+@param cell rec hash cell of in_lock
+@param lock the lock with supremum bit set */
+static void lock_rec_unlock_supremum(hash_cell_t &cell, lock_t *lock)
+{
+  ut_ad(lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM));
+#ifdef SAFE_MUTEX
+  ut_ad(!mysql_mutex_is_owner(&lock_sys.wait_mutex));
+#endif /* SAFE_MUTEX */
+  ut_ad(!lock->is_table());
+  ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+
+  lock_rec_reset_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM);
+
+  lock_t *first_lock= lock_sys_t::get_first(
+    cell, lock->un_member.rec_lock.page_id, PAGE_HEAP_NO_SUPREMUM);
+
+  lock_rec_rebuild_waiting_queue(
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+    lock->trx,
+#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+    cell, first_lock, PAGE_HEAP_NO_SUPREMUM);
+}
+
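[Editor's note: illustrative sketch.] A record lock structure covers a whole
page and marks individual records by heap number in a bitmap; releasing the
supremum lock above amounts to clearing bit PAGE_HEAP_NO_SUPREMUM (1) and
re-scanning the queue. A minimal standalone bitmap model, with hypothetical
names:

#include <cstdint>
#include <vector>
#include <cstdio>

constexpr unsigned PAGE_HEAP_NO_INFIMUM= 0;
constexpr unsigned PAGE_HEAP_NO_SUPREMUM= 1;

struct RecLockBitmap
{
  std::vector<uint8_t> bits;
  explicit RecLockBitmap(unsigned n_bits) : bits((n_bits + 7) / 8) {}
  bool get(unsigned i) const { return bits[i / 8] >> (i % 8) & 1; }
  void set(unsigned i) { bits[i / 8] |= uint8_t(1 << (i % 8)); }
  void reset(unsigned i) { bits[i / 8] &= uint8_t(~(1 << (i % 8))); }
};

int main()
{
  RecLockBitmap lock(64);
  lock.set(PAGE_HEAP_NO_SUPREMUM);          // lock the supremum
  lock.set(2);                              // and one user record
  lock.reset(PAGE_HEAP_NO_SUPREMUM);        // models lock_rec_unlock_supremum()
  std::printf("supremum=%d rec2=%d\n",
              lock.get(PAGE_HEAP_NO_SUPREMUM), lock.get(2));
}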
+/** Release non-exclusive locks on XA PREPARE,
+and wake up possible other transactions waiting because of these locks.
+@param trx transaction in XA PREPARE state
+@return whether all locks were released */
+static bool lock_release_on_prepare_try(trx_t *trx)
+{
+  /* At this point, trx->lock.trx_locks can still be modified by other
+  threads to convert implicit exclusive locks into explicit ones.
+
+  The function lock_table_create() should never be invoked on behalf
+  of a transaction that is running in another thread. Also there, we
+  will assert that the current transaction be active. */
+  DBUG_ASSERT(trx->state == TRX_STATE_PREPARED);
+
+  bool all_released= true;
+  lock_sys.rd_lock(SRW_LOCK_CALL);
+  trx->mutex_lock();
+
+  /* Note: Normally, trx->mutex is not held while acquiring
+  a lock table latch, but here we are following the opposite order.
+  To avoid deadlocks, we only try to acquire the lock table latches
+  but not keep waiting for them. */
+
+  for (lock_t *prev, *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock;
+       lock= prev)
+  {
+    ut_ad(lock->trx == trx);
+    prev= UT_LIST_GET_PREV(trx_locks, lock);
+    if (!lock->is_table())
+    {
+      ut_ad(!lock->index->table->is_temporary());
+      bool supremum_bit = lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM);
+      bool rec_granted_exclusive_not_gap =
+        lock->is_rec_granted_exclusive_not_gap();
+      if (!supremum_bit && rec_granted_exclusive_not_gap)
+        continue;
+      auto &lock_hash= lock_sys.hash_get(lock->type_mode);
+      auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
+      auto latch= lock_sys_t::hash_table::latch(cell);
+      if (latch->try_acquire())
+      {
+        if (!rec_granted_exclusive_not_gap)
+          lock_rec_dequeue_from_page(lock, false);
+        else if (supremum_bit)
+          lock_rec_unlock_supremum(*cell, lock);
+        latch->release();
+      }
+      else
+        all_released= false;
+    }
+    else
+    {
+      dict_table_t *table= lock->un_member.tab_lock.table;
+      ut_ad(!table->is_temporary());
+      switch (lock->mode()) {
+      case LOCK_IS:
+      case LOCK_S:
+        if (table->lock_mutex_trylock())
+        {
+          lock_table_dequeue(lock, false);
+          table->lock_mutex_unlock();
+        }
+        else
+          all_released= false;
+        break;
+      case LOCK_IX:
+      case LOCK_X:
+        ut_ad(table->id >= DICT_HDR_FIRST_ID || trx->dict_operation);
+        /* fall through */
+      default:
+        break;
+      }
+    }
+  }
+
+  lock_sys.rd_unlock();
+  trx->mutex_unlock();
+  return all_released;
+}
+
+/** Release non-exclusive locks on XA PREPARE,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_prepare(trx_t *trx)
+{
+  trx->set_skip_lock_inheritance();
+
+  for (ulint count= 5; count--; )
+    if (lock_release_on_prepare_try(trx))
+      return;
+
+  LockMutexGuard g{SRW_LOCK_CALL};
+  trx->mutex_lock();
+
+  for (lock_t *prev, *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock;
+       lock= prev)
+  {
+    ut_ad(lock->trx == trx);
+    prev= UT_LIST_GET_PREV(trx_locks, lock);
+    if (!lock->is_table())
+    {
+      ut_ad(!lock->index->table->is_temporary());
+      if (!lock->is_rec_granted_exclusive_not_gap())
+        lock_rec_dequeue_from_page(lock, false);
+      else if (lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM))
+      {
+        auto &lock_hash= lock_sys.hash_get(lock->type_mode);
+        auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
+        lock_rec_unlock_supremum(*cell, lock);
+      }
+      else
+        ut_ad(lock->trx->isolation_level > TRX_ISO_READ_COMMITTED ||
+              /* Insert-intention lock is valid for supremum for isolation
+              level > TRX_ISO_READ_COMMITTED */
+              lock->mode() == LOCK_X ||
+              !lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM));
+    }
+    else
+    {
+      ut_d(dict_table_t *table= lock->un_member.tab_lock.table);
+      ut_ad(!table->is_temporary());
+      switch (lock->mode()) {
+      case LOCK_IS:
+      case LOCK_S:
+        lock_table_dequeue(lock, false);
+        break;
+      case LOCK_IX:
+      case LOCK_X:
+        ut_ad(table->id >= DICT_HDR_FIRST_ID || trx->dict_operation);
+        /* fall through */
+      default:
+        break;
+      }
+    }
+  }
+
+  trx->mutex_unlock();
+}
+
+/** Release locks on a table whose creation is being rolled back */
+ATTRIBUTE_COLD
+void lock_release_on_rollback(trx_t *trx, dict_table_t *table)
+{
+  trx->mod_tables.erase(table);
+
+  /* This is very rarely executed code, in the rare case that a
+  CREATE TABLE operation is being rolled back. Theoretically,
+  we might try to remove the locks in multiple memory transactions.
*/ + lock_sys.wr_lock(SRW_LOCK_CALL); + trx->mutex_lock(); + + for (lock_t *next, *lock= UT_LIST_GET_FIRST(table->locks); lock; lock= next) + { + next= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock); + ut_ad(lock->trx == trx); + UT_LIST_REMOVE(trx->lock.trx_locks, lock); + ut_list_remove(table->locks, lock, TableLockGetNode()); + } + + for (lock_t *p, *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock; lock= p) + { + p= UT_LIST_GET_PREV(trx_locks, lock); + ut_ad(lock->trx == trx); + if (lock->is_table()) + ut_ad(lock->un_member.tab_lock.table != table); + else if (lock->index->table == table) + lock_rec_dequeue_from_page(lock, false); + } + + lock_sys.wr_unlock(); + trx->mutex_unlock(); +} + +/*********************************************************************//** +Removes table locks of the transaction on a table to be dropped. */ +static +void +lock_trx_table_locks_remove( +/*========================*/ + const lock_t* lock_to_remove) /*!< in: lock to remove */ +{ + trx_t* trx = lock_to_remove->trx; + + ut_ad(lock_to_remove->is_table()); + lock_sys.assert_locked(*lock_to_remove->un_member.tab_lock.table); + ut_ad(trx->mutex_is_owner()); + + for (lock_list::iterator it = trx->lock.table_locks.begin(), + end = trx->lock.table_locks.end(); it != end; ++it) { + const lock_t* lock = *it; + + ut_ad(!lock || trx == lock->trx); + ut_ad(!lock || lock->is_table()); + ut_ad(!lock || lock->un_member.tab_lock.table); + + if (lock == lock_to_remove) { + *it = NULL; + return; + } + } + + /* Lock must exist in the vector. */ + ut_error; +} + +/*===================== VALIDATION AND DEBUGGING ====================*/ + +/** Print info of a table lock. +@param[in,out] file output stream +@param[in] lock table lock */ +static +void +lock_table_print(FILE* file, const lock_t* lock) +{ + lock_sys.assert_locked(); + ut_a(lock->is_table()); + + fputs("TABLE LOCK table ", file); + ut_print_name(file, lock->trx, + lock->un_member.tab_lock.table->name.m_name); + fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id); + + switch (auto mode = lock->mode()) { + case LOCK_S: + fputs(" lock mode S", file); + break; + case LOCK_X: + ut_ad(lock->trx->id != 0); + fputs(" lock mode X", file); + break; + case LOCK_IS: + fputs(" lock mode IS", file); + break; + case LOCK_IX: + ut_ad(lock->trx->id != 0); + fputs(" lock mode IX", file); + break; + case LOCK_AUTO_INC: + fputs(" lock mode AUTO-INC", file); + break; + default: + fprintf(file, " unknown lock mode %u", mode); + } + + if (lock->is_waiting()) { + fputs(" waiting", file); + } + + putc('\n', file); +} + +/** Pretty-print a record lock. 
+@param[in,out]	file	output stream
+@param[in]	lock	record lock
+@param[in,out]	mtr	mini-transaction for accessing the record */
+static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr)
+{
+	ut_ad(!lock->is_table());
+
+	const page_id_t page_id{lock->un_member.rec_lock.page_id};
+	ut_d(lock_sys.hash_get(lock->type_mode).assert_locked(page_id));
+
+	fprintf(file, "RECORD LOCKS space id %u page no %u n bits " ULINTPF
+		" index %s of table ",
+		page_id.space(), page_id.page_no(),
+		lock_rec_get_n_bits(lock),
+		lock->index->name());
+	ut_print_name(file, lock->trx, lock->index->table->name.m_name);
+	fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id);
+
+	switch (lock->mode()) {
+	case LOCK_S:
+		fputs(" lock mode S", file);
+		break;
+	case LOCK_X:
+		fputs(" lock_mode X", file);
+		break;
+	default:
+		ut_error;
+	}
+
+	if (lock->is_gap()) {
+		fputs(" locks gap before rec", file);
+	}
+
+	if (lock->is_record_not_gap()) {
+		fputs(" locks rec but not gap", file);
+	}
+
+	if (lock->is_insert_intention()) {
+		fputs(" insert intention", file);
+	}
+
+	if (lock->is_waiting()) {
+		fputs(" waiting", file);
+	}
+
+	putc('\n', file);
+
+	mem_heap_t*	heap = NULL;
+	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
+	rec_offs*	offsets = offsets_;
+	rec_offs_init(offsets_);
+
+	mtr.start();
+	const buf_block_t* block = buf_page_try_get(page_id, &mtr);
+
+	for (ulint i = 0; i < lock_rec_get_n_bits(lock); ++i) {
+
+		if (!lock_rec_get_nth_bit(lock, i)) {
+			continue;
+		}
+
+		fprintf(file, "Record lock, heap no %lu", (ulong) i);
+
+		if (block) {
+			ut_ad(page_is_leaf(block->page.frame));
+			const rec_t*	rec;
+
+			rec = page_find_rec_with_heap_no(
+				buf_block_get_frame(block), i);
+			ut_ad(!page_rec_is_metadata(rec));
+
+			offsets = rec_get_offsets(
+				rec, lock->index, offsets,
+				lock->index->n_core_fields,
+				ULINT_UNDEFINED, &heap);
+
+			putc(' ', file);
+			rec_print_new(file, rec, offsets);
+		}
+
+		putc('\n', file);
+	}
+
+	mtr.commit();
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+#ifdef UNIV_DEBUG
+/* Print the number of lock structs from lock_print_info_summary() only
+in non-production builds for performance reasons, see
+http://bugs.mysql.com/36942 */
+#define PRINT_NUM_OF_LOCK_STRUCTS
+#endif /* UNIV_DEBUG */
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+/*********************************************************************//**
+Calculates the number of record lock structs in the record lock hash table.
+@return number of record locks */
+TRANSACTIONAL_TARGET
+static ulint lock_get_n_rec_locks()
+{
+	ulint	n_locks	= 0;
+	ulint	i;
+
+	lock_sys.assert_locked();
+
+	for (i = 0; i < lock_sys.rec_hash.n_cells; i++) {
+		const lock_t*	lock;
+
+		for (lock = static_cast<const lock_t*>(
+			     HASH_GET_FIRST(&lock_sys.rec_hash, i));
+		     lock != 0;
+		     lock = static_cast<const lock_t*>(
+			     HASH_GET_NEXT(hash, lock))) {
+
+			n_locks++;
+		}
+	}
+
+	return(n_locks);
+}
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if not able to acquire lock_sys.latch (and display info) */
+ibool
+lock_print_info_summary(
+/*====================*/
+	FILE*	file,	/*!< in: file where to print */
+	ibool	nowait)	/*!< in: whether to wait for lock_sys.latch */
+{
+	/* Here, lock elision does not make sense, because
+	for the output we are going to invoke system calls,
+	which would interrupt a memory transaction.
*/ + if (!nowait) { + lock_sys.wr_lock(SRW_LOCK_CALL); + } else if (!lock_sys.wr_lock_try()) { + fputs("FAIL TO OBTAIN LOCK MUTEX," + " SKIP LOCK INFO PRINTING\n", file); + return(FALSE); + } + + if (lock_sys.deadlocks) { + fputs("------------------------\n" + "LATEST DETECTED DEADLOCK\n" + "------------------------\n", file); + + if (!srv_read_only_mode) { + ut_copy_file(file, lock_latest_err_file); + } + } + + fputs("------------\n" + "TRANSACTIONS\n" + "------------\n", file); + + fprintf(file, "Trx id counter " TRX_ID_FMT "\n", + trx_sys.get_max_trx_id()); + + fprintf(file, + "Purge done for trx's n:o < " TRX_ID_FMT + " undo n:o < " TRX_ID_FMT " state: %s\n" + "History list length %zu\n", + purge_sys.tail.trx_no, + purge_sys.tail.undo_no, + purge_sys.enabled() + ? (purge_sys.running() ? "running" + : purge_sys.paused() ? "stopped" : "running but idle") + : "disabled", + trx_sys.history_size_approx()); + +#ifdef PRINT_NUM_OF_LOCK_STRUCTS + fprintf(file, + "Total number of lock structs in row lock hash table %lu\n", + (ulong) lock_get_n_rec_locks()); +#endif /* PRINT_NUM_OF_LOCK_STRUCTS */ + return(TRUE); +} + +/** Prints transaction lock wait and MVCC state. +@param[in,out] file file where to print +@param[in] trx transaction +@param[in] now current my_hrtime_coarse() */ +void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx, + my_hrtime_t now) +{ + fprintf(file, "---"); + + trx_print_latched(file, trx, 600); + trx->read_view.print_limits(file); + + if (const lock_t* wait_lock = trx->lock.wait_lock) { + const my_hrtime_t suspend_time= trx->lock.suspend_time; + fprintf(file, + "------- TRX HAS BEEN WAITING %llu ns" + " FOR THIS LOCK TO BE GRANTED:\n", + now.val - suspend_time.val); + + if (!wait_lock->is_table()) { + mtr_t mtr; + lock_rec_print(file, wait_lock, mtr); + } else { + lock_table_print(file, wait_lock); + } + + fprintf(file, "------------------\n"); + } +} + +/*********************************************************************//** +Prints info of locks for a transaction. */ +static +void +lock_trx_print_locks( +/*=================*/ + FILE* file, /*!< in/out: File to write */ + const trx_t* trx) /*!< in: current transaction */ +{ + mtr_t mtr; + uint32_t i= 0; + /* Iterate over the transaction's locks. */ + lock_sys.assert_locked(); + for (lock_t *lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); + lock != NULL; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + if (!lock->is_table()) { + lock_rec_print(file, lock, mtr); + } else { + lock_table_print(file, lock); + } + + if (++i == 10) { + + fprintf(file, + "10 LOCKS PRINTED FOR THIS TRX:" + " SUPPRESSING FURTHER PRINTS\n"); + + break; + } + } +} + +/** Functor to display all transactions */ +struct lock_print_info +{ + lock_print_info(FILE* file, my_hrtime_t now) : + file(file), now(now), + purge_trx(purge_sys.query ? purge_sys.query->trx : nullptr) + {} + + void operator()(const trx_t &trx) const + { + if (UNIV_UNLIKELY(&trx == purge_trx)) + return; + lock_trx_print_wait_and_mvcc_state(file, &trx, now); + + if (trx.will_lock && srv_print_innodb_lock_monitor) + lock_trx_print_locks(file, &trx); + } + + FILE* const file; + const my_hrtime_t now; + const trx_t* const purge_trx; +}; + +/*********************************************************************//** +Prints info of locks for each transaction. This function will release +lock_sys.latch, which the caller must be holding in exclusive mode. 
*/
+void
+lock_print_info_all_transactions(
+/*=============================*/
+	FILE*	file)	/*!< in/out: file where to print */
+{
+	fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
+
+	trx_sys.trx_list.for_each(lock_print_info(file, my_hrtime_coarse()));
+	lock_sys.wr_unlock();
+
+	ut_d(lock_validate());
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Find the lock in the trx_t::trx_lock_t::table_locks vector.
+@return true if found */
+static
+bool
+lock_trx_table_locks_find(
+/*======================*/
+	trx_t*		trx,		/*!< in: trx to validate */
+	const lock_t*	find_lock)	/*!< in: lock to find */
+{
+	bool	found = false;
+
+	ut_ad(trx->mutex_is_owner());
+
+	for (lock_list::const_iterator it = trx->lock.table_locks.begin(),
+	     end = trx->lock.table_locks.end(); it != end; ++it) {
+
+		const lock_t*	lock = *it;
+
+		if (lock == NULL) {
+
+			continue;
+
+		} else if (lock == find_lock) {
+
+			/* Can't be duplicates. */
+			ut_a(!found);
+			found = true;
+		}
+
+		ut_a(trx == lock->trx);
+		ut_a(lock->is_table());
+		ut_a(lock->un_member.tab_lock.table != NULL);
+	}
+
+	return(found);
+}
+
+/*********************************************************************//**
+Validates the lock queue on a table.
+@return TRUE if ok */
+static
+ibool
+lock_table_queue_validate(
+/*======================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	const lock_t*	lock;
+
+	lock_sys.assert_locked(*table);
+
+	for (lock = UT_LIST_GET_FIRST(table->locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+
+		/* lock->trx->state cannot change from or to NOT_STARTED
+		while we are holding the lock_sys.latch. It may change
+		from ACTIVE or PREPARED to PREPARED or COMMITTED. */
+		lock->trx->mutex_lock();
+		check_trx_state(lock->trx);
+
+		if (lock->trx->state == TRX_STATE_COMMITTED_IN_MEMORY) {
+		} else if (!lock->is_waiting()) {
+			ut_a(!lock_table_other_has_incompatible(
+				     lock->trx, 0, table,
+				     lock->mode()));
+		} else {
+			ut_a(lock_table_has_to_wait_in_queue(lock));
+		}
+
+		ut_a(lock_trx_table_locks_find(lock->trx, lock));
+		lock->trx->mutex_unlock();
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the lock queue on a single record.
+@return TRUE if ok */
+static
+bool
+lock_rec_queue_validate(
+/*====================*/
+	bool			locked_lock_trx_sys,
+					/*!< in: if the caller holds
+					both the lock_sys.latch and
+					trx_sys_t->lock.
*/ + const page_id_t id, /*!< in: page identifier */ + const rec_t* rec, /*!< in: record to look at */ + const dict_index_t* index, /*!< in: index, or NULL if not known */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + const lock_t* lock; + ulint heap_no; + + ut_a(rec); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!index || dict_index_is_clust(index) + || !dict_index_is_online_ddl(index)); + + heap_no = page_rec_get_heap_no(rec); + + if (!locked_lock_trx_sys) { + lock_sys.wr_lock(SRW_LOCK_CALL); + } + + hash_cell_t &cell= *lock_sys.rec_hash.cell_get(id.fold()); + lock_sys.assert_locked(cell); + + if (!page_rec_is_user_rec(rec)) { + + for (lock = lock_sys_t::get_first(cell, id, heap_no); + lock != NULL; + lock = lock_rec_get_next_const(heap_no, lock)) { + + ut_ad(!index || lock->index == index); + + lock->trx->mutex_lock(); + ut_ad(!lock->trx->read_only + || !lock->trx->is_autocommit_non_locking()); + ut_ad(trx_state_eq(lock->trx, + TRX_STATE_COMMITTED_IN_MEMORY) + || !lock->is_waiting() + || lock_rec_has_to_wait_in_queue(cell, lock)); + lock->trx->mutex_unlock(); + } + +func_exit: + if (!locked_lock_trx_sys) { + lock_sys.wr_unlock(); + } + + return true; + } + + ut_ad(page_rec_is_leaf(rec)); + + const trx_id_t impl_trx_id = index && index->is_primary() + ? lock_clust_rec_some_has_impl(rec, index, offsets) + : 0; + + if (trx_t *impl_trx = impl_trx_id + ? trx_sys.find(current_trx(), impl_trx_id, false) + : 0) { + /* impl_trx could have been committed before we + acquire its mutex, but not thereafter. */ + + impl_trx->mutex_lock(); + ut_ad(impl_trx->state != TRX_STATE_NOT_STARTED); + if (impl_trx->state == TRX_STATE_COMMITTED_IN_MEMORY) { + } else if (const lock_t* other_lock + = lock_rec_other_has_expl_req( + LOCK_S, cell, id, true, heap_no, + impl_trx)) { + /* The impl_trx is holding an implicit lock on the + given record 'rec'. So there cannot be another + explicit granted lock. Also, there can be another + explicit waiting lock only if the impl_trx has an + explicit granted lock. */ + +#ifdef WITH_WSREP + /** Galera record locking rules: + * If there is no other record lock to the same record, we may grant + the lock request. + * If there is other record lock but this requested record lock is + compatible, we may grant the lock request. + * If there is other record lock and it is not compatible with + requested lock, all normal transactions must wait. + * BF (brute force) additional exceptions : + ** If BF already holds record lock for requested record, we may + grant new record lock even if there is conflicting record lock(s) + waiting on a queue. + ** If conflicting transaction holds requested record lock, + we will cancel this record lock and select conflicting transaction + for BF abort or kill victim. + ** If conflicting transaction is waiting for requested record lock + we will cancel this wait and select conflicting transaction + for BF abort or kill victim. 
+ ** There should not be two BF transactions waiting for same record lock + */ + if (other_lock->trx->is_wsrep() && !other_lock->is_waiting()) { + wsrep_report_bf_lock_wait(impl_trx->mysql_thd, impl_trx->id); + wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id); + + if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + cell, id, heap_no, + impl_trx)) { + ib::info() << "WSREP impl BF lock conflict"; + } + } else +#endif /* WITH_WSREP */ + { + ut_ad(other_lock->is_waiting()); + ut_ad(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + cell, id, heap_no, + impl_trx)); + } + } + + impl_trx->mutex_unlock(); + } + + for (lock = lock_sys_t::get_first(cell, id, heap_no); + lock != NULL; + lock = lock_rec_get_next_const(heap_no, lock)) { + ut_ad(!lock->trx->read_only + || !lock->trx->is_autocommit_non_locking()); + ut_ad(!page_rec_is_metadata(rec)); + + if (index) { + ut_a(lock->index == index); + } + + if (lock->is_waiting()) { + ut_a(lock->is_gap() + || lock_rec_has_to_wait_in_queue(cell, lock)); + } else if (!lock->is_gap()) { + const lock_mode mode = lock->mode() == LOCK_S + ? LOCK_X : LOCK_S; + + const lock_t* other_lock + = lock_rec_other_has_expl_req( + mode, cell, id, false, heap_no, + lock->trx); +#ifdef WITH_WSREP + if (UNIV_UNLIKELY(other_lock && lock->trx->is_wsrep())) { + /* Only BF transaction may be granted + lock before other conflicting lock + request. */ + if (!wsrep_thd_is_BF(lock->trx->mysql_thd, FALSE) + && !wsrep_thd_is_BF(other_lock->trx->mysql_thd, FALSE)) { + /* If no BF, this case is a bug. */ + wsrep_report_bf_lock_wait(lock->trx->mysql_thd, lock->trx->id); + wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id); + ut_error; + } + } else +#endif /* WITH_WSREP */ + ut_ad(!other_lock); + } + } + + goto func_exit; +} + +/** Validate the record lock queues on a page. +@param block buffer pool block +@param latched whether the tablespace latch may be held +@return true if ok */ +static bool lock_rec_validate_page(const buf_block_t *block, bool latched) +{ + const lock_t* lock; + const rec_t* rec; + ulint nth_lock = 0; + ulint nth_bit = 0; + ulint i; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + const page_id_t id{block->page.id()}; + + LockGuard g{lock_sys.rec_hash, id}; +loop: + lock = lock_sys_t::get_first(g.cell(), id); + + if (!lock) { + goto function_exit; + } + + DBUG_ASSERT(!block->page.is_freed()); + + for (i = 0; i < nth_lock; i++) { + + lock = lock_rec_get_next_on_page_const(lock); + + if (!lock) { + goto function_exit; + } + } + + ut_ad(!lock->trx->read_only + || !lock->trx->is_autocommit_non_locking()); + + /* Only validate the record queues when this thread is not + holding a tablespace latch. */ + if (!latched) + for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) { + bool locked = lock_rec_get_nth_bit(lock, i); + if (locked || i == PAGE_HEAP_NO_SUPREMUM) { + + rec = page_find_rec_with_heap_no(block->page.frame, i); + ut_a(rec); + ut_ad(!locked || page_rec_is_leaf(rec)); + + /* If this thread is holding the file space + latch (fil_space_t::latch), the following + check WILL break the latching order and may + cause a deadlock of threads. 
*/
+
+			if (locked) {
+				offsets = rec_get_offsets(rec, lock->index,
+					offsets, lock->index->n_core_fields,
+					ULINT_UNDEFINED, &heap);
+				lock_rec_queue_validate(true, id, rec,
+					lock->index, offsets);
+			}
+
+			nth_bit = i + 1;
+
+			goto loop;
+		}
+	}
+
+	nth_bit = 0;
+	nth_lock++;
+
+	goto loop;
+
+function_exit:
+	if (heap != NULL) {
+		mem_heap_free(heap);
+	}
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Validate record locks up to a limit.
+@return lock at limit or NULL if no more locks in the hash bucket */
+static MY_ATTRIBUTE((warn_unused_result))
+const lock_t*
+lock_rec_validate(
+/*==============*/
+	ulint		start,	/*!< in: lock_sys.rec_hash
+				bucket */
+	page_id_t*	limit)	/*!< in/out: upper limit of
+				(space, page_no) */
+{
+	lock_sys.assert_locked();
+
+	for (const lock_t* lock = static_cast<const lock_t*>(
+		     HASH_GET_FIRST(&lock_sys.rec_hash, start));
+	     lock != NULL;
+	     lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))) {
+
+		ut_ad(!lock->trx->read_only
+		      || !lock->trx->is_autocommit_non_locking());
+		ut_ad(!lock->is_table());
+
+		page_id_t current(lock->un_member.rec_lock.page_id);
+
+		if (current > *limit) {
+			*limit = current + 1;
+			return(lock);
+		}
+	}
+
+	return(0);
+}
+
+/*********************************************************************//**
+Validate a record lock's block */
+static void lock_rec_block_validate(const page_id_t page_id)
+{
+	/* The lock and the block that it is referring to may be freed at
+	this point. */
+
+	buf_block_t*	block;
+	mtr_t		mtr;
+
+	/* Transactional locks should never refer to dropped
+	tablespaces, because all DDL operations that would drop or
+	discard or rebuild a tablespace do hold an exclusive table
+	lock, which would conflict with any locks referring to the
+	tablespace from other transactions. */
+	if (fil_space_t* space = fil_space_t::get(page_id.space())) {
+		dberr_t err = DB_SUCCESS;
+		mtr_start(&mtr);
+
+		block = buf_page_get_gen(
+			page_id,
+			space->zip_size(),
+			RW_S_LATCH, NULL,
+			BUF_GET_POSSIBLY_FREED,
+			&mtr, &err);
+
+		ut_ad(!block
+		      || lock_rec_validate_page(block, space->is_latched()));
+
+		mtr_commit(&mtr);
+
+		space->release();
+	}
+}
+
+static my_bool lock_validate_table_locks(rw_trx_hash_element_t *element, void*)
+{
+  lock_sys.assert_locked();
+  element->mutex.wr_lock();
+  if (element->trx)
+  {
+    check_trx_state(element->trx);
+    for (const lock_t *lock= UT_LIST_GET_FIRST(element->trx->lock.trx_locks);
+         lock != NULL;
+         lock= UT_LIST_GET_NEXT(trx_locks, lock))
+      if (lock->is_table())
+        lock_table_queue_validate(lock->un_member.tab_lock.table);
+  }
+  element->mutex.wr_unlock();
+  return 0;
+}
+
+
+/** Validate the transactional locks. */
+static void lock_validate()
+{
+  std::set<page_id_t> pages;
+  {
+    LockMutexGuard g{SRW_LOCK_CALL};
+    /* Validate table locks */
+    trx_sys.rw_trx_hash.iterate(lock_validate_table_locks);
+
+    for (ulint i= 0; i < lock_sys.rec_hash.n_cells; i++)
+    {
+      page_id_t limit{0, 0};
+      while (const lock_t *lock= lock_rec_validate(i, &limit))
+      {
+        if (lock_rec_find_set_bit(lock) == ULINT_UNDEFINED)
+          /* The lock bitmap is empty; ignore it. */
+          continue;
+        pages.insert(lock->un_member.rec_lock.page_id);
+      }
+    }
+  }
+
+  for (page_id_t page_id : pages)
+    lock_rec_block_validate(page_id);
+}
+#endif /* UNIV_DEBUG */
+/*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record.
If they do, first tests if the query thread should anyway +be suspended for some reason; if not, then puts the transaction and +the query thread to the lock wait state and inserts a waiting request +for a gap x-lock to the lock queue. +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +TRANSACTIONAL_TARGET +dberr_t +lock_rec_insert_check_and_lock( +/*===========================*/ + const rec_t* rec, /*!< in: record after which to insert */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + dict_index_t* index, /*!< in: index */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + bool* inherit)/*!< out: set to true if the new + inserted record maybe should inherit + LOCK_GAP type locks from the successor + record */ +{ + ut_ad(block->page.frame == page_align(rec)); + ut_ad(mtr->is_named_space(index->table->space)); + ut_ad(page_is_leaf(block->page.frame)); + ut_ad(!index->table->is_temporary()); + + const rec_t *next_rec= page_rec_get_next_const(rec); + if (UNIV_UNLIKELY(!next_rec || rec_is_metadata(next_rec, *index))) + return DB_CORRUPTION; + + dberr_t err= DB_SUCCESS; + bool inherit_in= *inherit; + trx_t *trx= thr_get_trx(thr); + ulint heap_no= page_rec_get_heap_no(next_rec); + const page_id_t id{block->page.id()}; + + { + LockGuard g{lock_sys.rec_hash, id}; + /* Because this code is invoked for a running transaction by + the thread that is serving the transaction, it is not necessary + to hold trx->mutex here. */ + + /* When inserting a record into an index, the table must be at + least IX-locked. When we are building an index, we would pass + BTR_NO_LOCKING_FLAG and skip the locking altogether. */ + ut_ad(lock_table_has(trx, index->table, LOCK_IX)); + + *inherit= lock_sys_t::get_first(g.cell(), id, heap_no); + + if (*inherit) + { + /* Spatial index does not use GAP lock protection. It uses + "predicate lock" to protect the "range" */ + if (index->is_spatial()) + return DB_SUCCESS; + + /* If another transaction has an explicit lock request which locks + the gap, waiting or granted, on the successor, the insert has to wait. + + An exception is the case where the lock by the another transaction + is a gap type lock which it placed to wait for its turn to insert. We + do not consider that kind of a lock conflicting with our insert. This + eliminates an unnecessary deadlock which resulted when 2 transactions + had to wait for their insert. Both had waiting gap type lock requests + on the successor, which produced an unnecessary deadlock. */ + const unsigned type_mode= LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION; + + if (lock_t *c_lock= lock_rec_other_has_conflicting(type_mode, + g.cell(), id, + heap_no, trx)) + { + trx->mutex_lock(); + err= lock_rec_enqueue_waiting(c_lock, type_mode, id, block->page.frame, + heap_no, index, thr, nullptr); + trx->mutex_unlock(); + } + } + } + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + if (!inherit_in || index->is_clust()) + break; + /* Update the page max trx id field */ + page_update_max_trx_id(block, buf_block_get_page_zip(block), trx->id, mtr); + default: + /* We only care about the two return values. 
*/ + break; + } + +#ifdef UNIV_DEBUG + { + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + const rec_offs *offsets; + rec_offs_init(offsets_); + + offsets= rec_get_offsets(next_rec, index, offsets_, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + ut_ad(lock_rec_queue_validate(false, id, next_rec, index, offsets)); + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + } +#endif /* UNIV_DEBUG */ + + return err; +} + +/*********************************************************************//** +Creates an explicit record lock for a running transaction that currently only +has an implicit lock on the record. The transaction instance must have a +reference count > 0 so that it can't be committed and freed before this +function has completed. */ +static +bool +lock_rec_convert_impl_to_expl_for_trx( +/*==================================*/ + trx_t* trx, /*!< in/out: active transaction */ + const page_id_t id, /*!< in: page identifier */ + const rec_t* rec, /*!< in: user record on page */ + dict_index_t* index) /*!< in: index of record */ +{ + if (!trx) + return false; + + ut_ad(trx->is_referenced()); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!rec_is_metadata(rec, *index)); + + DEBUG_SYNC_C("before_lock_rec_convert_impl_to_expl_for_trx"); + ulint heap_no= page_rec_get_heap_no(rec); + + { + LockGuard g{lock_sys.rec_hash, id}; + trx->mutex_lock(); + ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED)); + + if (!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) && + !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id, heap_no, + trx)) + lock_rec_add_to_queue(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id, + page_align(rec), heap_no, index, trx, true); + } + + trx->mutex_unlock(); + trx->release_reference(); + + DEBUG_SYNC_C("after_lock_rec_convert_impl_to_expl_for_trx"); + return false; +} + + +#ifdef UNIV_DEBUG +struct lock_rec_other_trx_holds_expl_arg +{ + const ulint heap_no; + const hash_cell_t &cell; + const page_id_t id; + const trx_t &impl_trx; +}; + + +static my_bool lock_rec_other_trx_holds_expl_callback( + rw_trx_hash_element_t *element, + lock_rec_other_trx_holds_expl_arg *arg) +{ + element->mutex.wr_lock(); + if (element->trx) + { + element->trx->mutex_lock(); + ut_ad(element->trx->state != TRX_STATE_NOT_STARTED); + lock_t *expl_lock= element->trx->state == TRX_STATE_COMMITTED_IN_MEMORY + ? nullptr + : lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP, + arg->cell, arg->id, arg->heap_no, element->trx); + /* + An explicit lock is held by trx other than the trx holding the implicit + lock. + */ + ut_ad(!expl_lock || expl_lock->trx == &arg->impl_trx); + element->trx->mutex_unlock(); + } + element->mutex.wr_unlock(); + return 0; +} + + +/** + Checks if some transaction, other than given trx_id, has an explicit + lock on the given rec. + + FIXME: if the current transaction holds implicit lock from INSERT, a + subsequent locking read should not convert it to explicit. See also + MDEV-11215. + + @param caller_trx trx of current thread + @param[in] trx trx holding implicit lock on rec + @param[in] rec user record + @param[in] id page identifier +*/ +static void lock_rec_other_trx_holds_expl(trx_t *caller_trx, trx_t *trx, + const rec_t *rec, + const page_id_t id) +{ + if (trx) + { + ut_ad(!page_rec_is_metadata(rec)); + LockGuard g{lock_sys.rec_hash, id}; + ut_ad(trx->is_referenced()); + const trx_state_t state{trx->state}; + ut_ad(state != TRX_STATE_NOT_STARTED); + if (state == TRX_STATE_COMMITTED_IN_MEMORY) + /* The transaction was committed before we acquired LockGuard. 
*/
+ return;
+ lock_rec_other_trx_holds_expl_arg arg=
+ { page_rec_get_heap_no(rec), g.cell(), id, *trx };
+ trx_sys.rw_trx_hash.iterate(caller_trx,
+ lock_rec_other_trx_holds_expl_callback, &arg);
+ }
+}
+#endif /* UNIV_DEBUG */
+
+/** If an implicit x-lock exists on a record, convert it to an explicit one.
+
+Often, this is called by a transaction that is about to enter a lock wait
+due to a lock conflict. Two explicit locks would be created: first the
+exclusive lock on behalf of the lock-holder transaction in this function,
+and then a wait request on behalf of caller_trx, in the calling function.
+
+This may also be called by the same transaction that is already holding
+an implicit exclusive lock on the record. In this case, no explicit lock
+should be created.
+
+@tparam is_primary whether the index is the primary key
+@param[in,out] caller_trx current transaction
+@param[in] id index tree leaf page identifier
+@param[in] rec record on the leaf page
+@param[in] index the index of the record
+@param[in] offsets rec_get_offsets(rec,index)
+@return whether caller_trx already holds an exclusive lock on rec */
+template<bool is_primary>
+static
+bool
+lock_rec_convert_impl_to_expl(
+ trx_t* caller_trx,
+ page_id_t id,
+ const rec_t* rec,
+ dict_index_t* index,
+ const rec_offs* offsets)
+{
+ trx_t* trx;
+
+ lock_sys.assert_unlocked();
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
+ ut_ad(index->is_primary() == is_primary);
+
+ if (is_primary) {
+ trx_id_t trx_id;
+
+ trx_id = lock_clust_rec_some_has_impl(rec, index, offsets);
+
+ if (trx_id == 0) {
+ return false;
+ }
+ if (UNIV_UNLIKELY(trx_id == caller_trx->id)) {
+ return true;
+ }
+
+ trx = trx_sys.find(caller_trx, trx_id);
+ } else {
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ trx = lock_sec_rec_some_has_impl(caller_trx, rec, index,
+ offsets);
+ if (trx == caller_trx) {
+ trx->release_reference();
+ return true;
+ }
+
+ ut_d(lock_rec_other_trx_holds_expl(caller_trx, trx, rec, id));
+ }
+
+ return lock_rec_convert_impl_to_expl_for_trx(trx, id, rec, index);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(block->page.frame == page_align(rec));
+
+ ut_ad(!rec_is_metadata(rec, *index));
+ ut_ad(!index->table->is_temporary());
+
+ heap_no = rec_offs_comp(offsets)
+ ?
rec_get_heap_no_new(rec)
+ : rec_get_heap_no_old(rec);
+
+ /* If a transaction has no explicit x-lock set on the record, set one
+ for it */
+
+ if (lock_rec_convert_impl_to_expl<true>(thr_get_trx(thr),
+ block->page.id(),
+ rec, index, offsets)) {
+ /* We already hold an implicit exclusive lock. */
+ return DB_SUCCESS;
+ }
+
+ err = lock_rec_lock(true, LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, thr);
+
+ ut_ad(lock_rec_queue_validate(false, block->page.id(),
+ rec, index, offsets));
+
+ if (err == DB_SUCCESS_LOCKED_REC) {
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (delete
+mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified; NOTE: as this is a secondary
+ index, we always have to modify the
+ clustered index record first: see the
+ comment below */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr, /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_online_ddl(index) || (flags & BTR_CREATE_FLAG));
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(mtr->is_named_space(index->table->space));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+ ut_ad(!index->table->is_temporary());
+
+ heap_no = page_rec_get_heap_no(rec);
+
+#ifdef WITH_WSREP
+ trx_t *trx= thr_get_trx(thr);
+ /* If a transaction scanning a unique secondary key is a wsrep
+ high priority (brute force) thread, the scan may involve
+ GAP-locking in the index. As this locking also happens when
+ applying replication events in high priority applier threads,
+ there is a chance of lock conflicts between two wsrep
+ high priority threads. To avoid this GAP-locking, we mark here
+ that this transaction is using a unique key scan. */
+ if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
+ trx->wsrep = 3;
+#endif /* WITH_WSREP */
+
+ /* Another transaction cannot have an implicit lock on the record,
+ because when we come here, we already have modified the clustered
+ index record, and this would not have been possible if another active
+ transaction had modified this secondary index record.
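+ Hence there is no implicit lock to convert here; we can request the
+ explicit lock directly.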
*/
+
+ err = lock_rec_lock(true, LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, thr);
+
+#ifdef WITH_WSREP
+ if (trx->wsrep == 3) trx->wsrep = 1;
+#endif /* WITH_WSREP */
+
+#ifdef UNIV_DEBUG
+ {
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ const rec_offs* offsets;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(lock_rec_queue_validate(
+ false, block->page.id(), rec, index, offsets));
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) {
+ /* Update the page max trx id field */
+ /* It might not be necessary to do this if
+ err == DB_SUCCESS (no new lock created),
+ but it should not cost too much performance. */
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ thr_get_trx(thr)->id, mtr);
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ if ((flags & BTR_NO_LOCKING_FLAG)
+ || srv_read_only_mode
+ || index->table->is_temporary()) {
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ trx_t *trx = thr_get_trx(thr);
+
+ if (lock_table_has(trx, index->table, mode)) {
+ return DB_SUCCESS;
+ }
+
+ if (!page_rec_is_supremum(rec)
+ && lock_rec_convert_impl_to_expl<false>(
+ trx, block->page.id(), rec, index, offsets)
+ && gap_mode == LOCK_REC_NOT_GAP) {
+ /* We already hold an implicit exclusive lock. */
+ return DB_SUCCESS;
+ }
+
+#ifdef WITH_WSREP
+ /* If a transaction scanning a unique secondary key is a wsrep
+ high priority (brute force) thread, the scan may involve
+ GAP-locking in the index. As this locking also happens when
+ applying replication events in high priority applier threads,
+ there is a chance of lock conflicts between two wsrep
+ high priority threads. To avoid this GAP-locking, we mark here
+ that this transaction is using a unique key scan.
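+ The flag is restored from 3 back to 1 right after the lock request below.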
*/
+ if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
+ trx->wsrep = 3;
+#endif /* WITH_WSREP */
+
+ err = lock_rec_lock(false, gap_mode | mode,
+ block, page_rec_get_heap_no(rec), index, thr);
+
+#ifdef WITH_WSREP
+ if (trx->wsrep == 3) trx->wsrep = 1;
+#endif /* WITH_WSREP */
+
+ ut_ad(lock_rec_queue_validate(false, block->page.id(),
+ rec, index, offsets));
+
+ DEBUG_SYNC_C("lock_sec_rec_read_check_and_lock_has_locked");
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+ ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP
+ || gap_mode == LOCK_REC_NOT_GAP);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ if ((flags & BTR_NO_LOCKING_FLAG)
+ || srv_read_only_mode
+ || index->table->is_temporary()) {
+
+ return(DB_SUCCESS);
+ }
+
+ const page_id_t id{block->page.id()};
+
+ ulint heap_no = page_rec_get_heap_no(rec);
+
+ trx_t *trx = thr_get_trx(thr);
+ if (!lock_table_has(trx, index->table, LOCK_X)
+ && heap_no != PAGE_HEAP_NO_SUPREMUM
+ && lock_rec_convert_impl_to_expl<true>(trx, id,
+ rec, index, offsets)
+ && gap_mode == LOCK_REC_NOT_GAP) {
+ /* We already hold an implicit exclusive lock. */
+ return DB_SUCCESS;
+ }
+
+ dberr_t err = lock_rec_lock(false, gap_mode | mode,
+ block, heap_no, index, thr);
+
+ ut_ad(lock_rec_queue_validate(false, id, rec, index, offsets));
+
+ DEBUG_SYNC_C("after_lock_clust_rec_read_check_and_lock");
+
+ return(err);
+}
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
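+The offsets are computed within this function, at some extra cost.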
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* tmp_heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ dberr_t err;
+ rec_offs_init(offsets_);
+
+ ut_ad(page_rec_is_leaf(rec));
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &tmp_heap);
+ err = lock_clust_rec_read_check_and_lock(flags, block, rec, index,
+ offsets, mode, gap_mode, thr);
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (err == DB_SUCCESS_LOCKED_REC) {
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*******************************************************************//**
+Check if a transaction holds any autoinc locks.
+@return TRUE if the transaction holds any AUTOINC locks. */
+static
+ibool
+lock_trx_holds_autoinc_locks(
+/*=========================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ ut_a(trx->autoinc_locks != NULL);
+
+ return(!ib_vector_is_empty(trx->autoinc_locks));
+}
+
+/** Release all AUTO_INCREMENT locks of the transaction. */
+static void lock_release_autoinc_locks(trx_t *trx)
+{
+ {
+ LockMutexGuard g{SRW_LOCK_CALL};
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ trx->mutex_lock();
+ auto autoinc_locks= trx->autoinc_locks;
+ ut_a(autoinc_locks);
+
+ /* We release the locks in the reverse order. This is to avoid
+ searching the vector for the element to delete at the lower level.
+ See (lock_table_remove_low()) for details. */
+ while (ulint size= ib_vector_size(autoinc_locks))
+ {
+ lock_t *lock= *static_cast<lock_t**>
+ (ib_vector_get(autoinc_locks, size - 1));
+ ut_ad(lock->type_mode == (LOCK_AUTO_INC | LOCK_TABLE));
+ lock_table_dequeue(lock, true);
+ lock_trx_table_locks_remove(lock);
+ }
+ }
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ trx->mutex_unlock();
+}
+
+/** Cancel a waiting lock request and release possibly waiting transactions */
+template<bool inner_trx_lock= true>
+void lock_cancel_waiting_and_release(lock_t *lock)
+{
+ lock_sys.assert_locked(*lock);
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ trx_t *trx= lock->trx;
+ if (inner_trx_lock)
+ trx->mutex_lock();
+ ut_d(const auto trx_state= trx->state);
+ ut_ad(trx_state == TRX_STATE_COMMITTED_IN_MEMORY ||
+ trx_state == TRX_STATE_ACTIVE);
+
+ if (!lock->is_table())
+ lock_rec_dequeue_from_page(lock, true);
+ else
+ {
+ if (lock->type_mode == (LOCK_AUTO_INC | LOCK_TABLE))
+ {
+ ut_ad(trx->autoinc_locks);
+ ib_vector_remove(trx->autoinc_locks, lock);
+ }
+ lock_table_dequeue(lock, true);
+ /* Remove the lock from table lock vector too. */
+ lock_trx_table_locks_remove(lock);
+ }
+
+ /* Reset the wait flag and the back pointer to lock in trx.
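+ After this, the lock is no longer marked as waiting, and
+ trx->lock.wait_lock no longer points to it.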
*/
+ lock_reset_lock_and_trx_wait(lock);
+
+ lock_wait_end(trx);
+
+ if (inner_trx_lock)
+ trx->mutex_unlock();
+}
+
+void lock_sys_t::cancel_lock_wait_for_trx(trx_t *trx)
+{
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ if (lock_t *lock= trx->lock.wait_lock)
+ {
+ /* check if victim is still waiting */
+ if (lock->is_waiting())
+ lock_cancel_waiting_and_release(lock);
+ }
+ lock_sys.wr_unlock();
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+}
+
+#ifdef WITH_WSREP
+void lock_sys_t::cancel_lock_wait_for_wsrep_bf_abort(trx_t *trx)
+{
+ lock_sys.assert_locked();
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ ut_ad(trx->mutex_is_owner());
+ ut_ad(trx->state == TRX_STATE_ACTIVE || trx->state == TRX_STATE_PREPARED);
+ trx->lock.set_wsrep_victim();
+ if (lock_t *lock= trx->lock.wait_lock)
+ lock_cancel_waiting_and_release<false>(lock);
+}
+#endif /* WITH_WSREP */
+
+/** Cancel a waiting lock request.
+@tparam check_victim whether to check for DB_DEADLOCK
+@param trx active transaction
+@param lock waiting lock request
+@retval DB_SUCCESS if no lock existed
+@retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
+@retval DB_LOCK_WAIT if the lock was canceled */
+template<bool check_victim>
+dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
+{
+ DEBUG_SYNC_C("lock_sys_t_cancel_enter");
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ /* trx->lock.wait_lock may be changed by other threads as long as
+ we are not holding lock_sys.latch.
+
+ So, trx->lock.wait_lock==lock does not necessarily hold, but both
+ pointers should be valid, because other threads cannot assign
+ trx->lock.wait_lock=nullptr (or invalidate *lock) while we are
+ holding lock_sys.wait_mutex. Also, the type of trx->lock.wait_lock
+ (record or table lock) cannot be changed by other threads. So, it is
+ safe to call lock->is_table() while not holding lock_sys.latch. If
+ we have to release and reacquire lock_sys.wait_mutex, we must reread
+ trx->lock.wait_lock. We must also reread trx->lock.wait_lock after
+ acquiring lock_sys.latch, as it can be changed to non-null by lock
+ moving functions even while we hold lock_sys.wait_mutex. */
+ dberr_t err= DB_SUCCESS;
+ /* This would be too large for a memory transaction, except in the
+ DB_DEADLOCK case, which was already tested in lock_trx_handle_wait(). */
+ if (lock->is_table())
+ {
+ if (!lock_sys.rd_lock_try())
+ {
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ lock_sys.rd_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ lock= trx->lock.wait_lock;
+ /* Even if the waiting lock was cancelled while lock_sys.wait_mutex
+ was unlocked, we must return a deadlock error if the transaction was
+ chosen as a deadlock victim, so that it will be rolled back. */
+ if (check_victim && trx->lock.was_chosen_as_deadlock_victim)
+ err= DB_DEADLOCK;
+ else if (lock)
+ goto resolve_table_lock;
+ }
+ else
+ {
+ /* This function is invoked from the thread which executes the
+ transaction. Table locks are requested before record locks. Some other
+ transaction can't change trx->lock.wait_lock from table to record for the
+ current transaction at this point, because the current transaction has not
+ requested record locks yet. There is no need to move any table locks by
+ other threads. And trx->lock.wait_lock can't be set to null while we are
+ holding lock_sys.wait_mutex. That's why there is no need to reload
+ trx->lock.wait_lock here.
*/
+ ut_ad(lock == trx->lock.wait_lock);
+resolve_table_lock:
+ dict_table_t *table= lock->un_member.tab_lock.table;
+ if (!table->lock_mutex_trylock())
+ {
+ /* The correct latching order is:
+ lock_sys.latch, table->lock_mutex_lock(), lock_sys.wait_mutex.
+ Thus, we must release lock_sys.wait_mutex for a blocking wait. */
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ table->lock_mutex_lock();
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ /* Cache trx->lock.wait_lock under the corresponding latches. */
+ lock= trx->lock.wait_lock;
+ if (!lock)
+ goto retreat;
+ else if (check_victim && trx->lock.was_chosen_as_deadlock_victim)
+ {
+ err= DB_DEADLOCK;
+ goto retreat;
+ }
+ }
+ else
+ /* Cache trx->lock.wait_lock under the corresponding latches if
+ it was not cached yet */
+ lock= trx->lock.wait_lock;
+ if (lock->is_waiting())
+ lock_cancel_waiting_and_release(lock);
+ /* Even if lock->is_waiting() did not hold above, we must return
+ DB_LOCK_WAIT, or otherwise optimistic parallel replication could
+ occasionally hang. Potentially affected tests:
+ rpl.rpl_parallel_optimistic
+ rpl.rpl_parallel_optimistic_nobinlog
+ rpl.rpl_parallel_optimistic_xa_lsu_off */
+ err= DB_LOCK_WAIT;
+retreat:
+ table->lock_mutex_unlock();
+ }
+ lock_sys.rd_unlock();
+ }
+ else
+ {
+ /* To prevent the record lock from being moved between pages
+ during a page split or merge, we must hold exclusive lock_sys.latch. */
+ if (!lock_sys.wr_lock_try())
+ {
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ /* Cache trx->lock.wait_lock under the corresponding latches. */
+ lock= trx->lock.wait_lock;
+ /* Even if the waiting lock was cancelled while lock_sys.wait_mutex
+ was unlocked, we must return a deadlock error if the transaction was
+ chosen as a deadlock victim, so that it will be rolled back. */
+ if (check_victim && trx->lock.was_chosen_as_deadlock_victim)
+ err= DB_DEADLOCK;
+ else if (lock)
+ goto resolve_record_lock;
+ }
+ else
+ {
+ /* Cache trx->lock.wait_lock under the corresponding latches if
+ it was not cached yet */
+ lock= trx->lock.wait_lock;
+resolve_record_lock:
+ if (lock->is_waiting())
+ lock_cancel_waiting_and_release(lock);
+ /* Even if lock->is_waiting() did not hold above, we must return
+ DB_LOCK_WAIT, or otherwise optimistic parallel replication could
+ occasionally hang. Potentially affected tests:
+ rpl.rpl_parallel_optimistic
+ rpl.rpl_parallel_optimistic_nobinlog
+ rpl.rpl_parallel_optimistic_xa_lsu_off */
+ err= DB_LOCK_WAIT;
+ }
+ lock_sys.wr_unlock();
+ }
+
+ return err;
+}
+
+template dberr_t lock_sys_t::cancel<false>(trx_t *, lock_t *);
+
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+void
+lock_unlock_table_autoinc(
+/*======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ lock_sys.assert_unlocked();
+ ut_ad(!trx->mutex_is_owner());
+ ut_ad(!trx->lock.wait_lock);
+
+ /* This can be invoked on NOT_STARTED, ACTIVE, PREPARED,
+ but not COMMITTED transactions. */
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)
+ || !trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+
+ /* This function is invoked for a running transaction by the
+ thread that is serving the transaction. Therefore it is not
+ necessary to hold trx->mutex here.
*/
+
+ if (lock_trx_holds_autoinc_locks(trx)) {
+ lock_release_autoinc_locks(trx);
+ }
+}
+
+/** Handle a pending lock wait (DB_LOCK_WAIT) in a semi-consistent read
+while holding a clustered index leaf page latch.
+
+@param trx transaction that is or was waiting for a lock
+@retval DB_SUCCESS if the lock was granted
+@retval DB_DEADLOCK if the transaction must be aborted due to a deadlock
+@retval DB_LOCK_WAIT if a lock wait would be necessary; the pending
+ lock request was released */
+dberr_t lock_trx_handle_wait(trx_t *trx)
+{
+ DEBUG_SYNC_C("lock_trx_handle_wait_enter");
+ if (trx->lock.was_chosen_as_deadlock_victim)
+ return DB_DEADLOCK;
+ DEBUG_SYNC_C("lock_trx_handle_wait_before_unlocked_wait_lock_check");
+ /* trx->lock.was_chosen_as_deadlock_victim must always be set before
+ trx->lock.wait_lock is reset if the transaction was chosen as a deadlock
+ victim; the function must not return DB_SUCCESS if
+ trx->lock.was_chosen_as_deadlock_victim is set. */
+ if (!trx->lock.wait_lock)
+ return trx->lock.was_chosen_as_deadlock_victim ? DB_DEADLOCK : DB_SUCCESS;
+ dberr_t err= DB_SUCCESS;
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ if (trx->lock.was_chosen_as_deadlock_victim)
+ err= DB_DEADLOCK;
+ /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
+ else if (lock_t *wait_lock= trx->lock.wait_lock)
+ err= lock_sys_t::cancel<true>(trx, wait_lock);
+ lock_sys.deadlock_check();
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ return err;
+}
+
+#ifdef UNIV_DEBUG
+/**
+ Do an exhaustive check for any locks (table or rec) against the table.
+
+ @param[in] table check if there are any locks held on records in this table
+ or on the table itself
+*/
+
+static my_bool lock_table_locks_lookup(rw_trx_hash_element_t *element,
+ const dict_table_t *table)
+{
+ lock_sys.assert_locked();
+ element->mutex.wr_lock();
+ if (element->trx)
+ {
+ element->trx->mutex_lock();
+ check_trx_state(element->trx);
+ if (element->trx->state != TRX_STATE_COMMITTED_IN_MEMORY)
+ {
+ for (const lock_t *lock= UT_LIST_GET_FIRST(element->trx->lock.trx_locks);
+ lock != NULL;
+ lock= UT_LIST_GET_NEXT(trx_locks, lock))
+ {
+ ut_ad(lock->trx == element->trx);
+ if (!lock->is_table())
+ {
+ ut_ad(lock->index->online_status != ONLINE_INDEX_CREATION ||
+ lock->index->is_primary());
+ ut_ad(lock->index->table != table);
+ }
+ else
+ ut_ad(lock->un_member.tab_lock.table != table);
+ }
+ }
+ element->trx->mutex_unlock();
+ }
+ element->mutex.wr_unlock();
+ return 0;
+}
+#endif /* UNIV_DEBUG */
+
+/** Check if there are any locks on a table.
+@return true if table has either table or record locks. */
+TRANSACTIONAL_TARGET
+bool lock_table_has_locks(dict_table_t *table)
+{
+ if (table->n_rec_locks)
+ return true;
+ ulint len;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (table->lock_mutex_is_locked())
+ xabort();
+ len= UT_LIST_GET_LEN(table->locks);
+ xend();
+ }
+ else
+#endif
+ {
+ table->lock_mutex_lock();
+ len= UT_LIST_GET_LEN(table->locks);
+ table->lock_mutex_unlock();
+ }
+ if (len)
+ return true;
+#ifdef UNIV_DEBUG
+ {
+ LockMutexGuard g{SRW_LOCK_CALL};
+ trx_sys.rw_trx_hash.iterate(lock_table_locks_lookup,
+ const_cast<dict_table_t*>(table));
+ }
+#endif /* UNIV_DEBUG */
+ return false;
+}
+
+/*******************************************************************//**
+Initialise the table lock list.
*/ +void +lock_table_lock_list_init( +/*======================*/ + table_lock_list_t* lock_list) /*!< List to initialise */ +{ + UT_LIST_INIT(*lock_list, &lock_table_t::locks); +} + +#ifdef UNIV_DEBUG +/*******************************************************************//** +Check if the transaction holds any locks on the sys tables +or its records. +@return the strongest lock found on any sys table or 0 for none */ +const lock_t* +lock_trx_has_sys_table_locks( +/*=========================*/ + const trx_t* trx) /*!< in: transaction to check */ +{ + const lock_t* strongest_lock = 0; + lock_mode strongest = LOCK_NONE; + + LockMutexGuard g{SRW_LOCK_CALL}; + + const lock_list::const_iterator end = trx->lock.table_locks.end(); + lock_list::const_iterator it = trx->lock.table_locks.begin(); + + /* Find a valid mode. Note: ib_vector_size() can be 0. */ + + for (/* No op */; it != end; ++it) { + const lock_t* lock = *it; + + if (lock != NULL + && dict_is_sys_table(lock->un_member.tab_lock.table->id)) { + + strongest = lock->mode(); + ut_ad(strongest != LOCK_NONE); + strongest_lock = lock; + break; + } + } + + if (strongest == LOCK_NONE) { + return(NULL); + } + + for (/* No op */; it != end; ++it) { + const lock_t* lock = *it; + + if (lock == NULL) { + continue; + } + + ut_ad(trx == lock->trx); + ut_ad(lock->is_table()); + ut_ad(lock->un_member.tab_lock.table); + + lock_mode mode = lock->mode(); + + if (dict_is_sys_table(lock->un_member.tab_lock.table->id) + && lock_mode_stronger_or_eq(mode, strongest)) { + + strongest = mode; + strongest_lock = lock; + } + } + + return(strongest_lock); +} + +/** Check if the transaction holds an explicit exclusive lock on a record. +@param[in] trx transaction +@param[in] table table +@param[in] id leaf page identifier +@param[in] heap_no heap number identifying the record +@return whether an explicit X-lock is held */ +bool lock_trx_has_expl_x_lock(const trx_t &trx, const dict_table_t &table, + page_id_t id, ulint heap_no) +{ + ut_ad(heap_no > PAGE_HEAP_NO_SUPREMUM); + ut_ad(lock_table_has(&trx, &table, LOCK_IX)); + if (!lock_table_has(&trx, &table, LOCK_X)) + { + LockGuard g{lock_sys.rec_hash, id}; + ut_ad(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + g.cell(), id, heap_no, &trx)); + } + return true; +} +#endif /* UNIV_DEBUG */ + +namespace Deadlock +{ + /** rewind(3) the file used for storing the latest detected deadlock and + print a heading message to stderr if printing of all deadlocks to stderr + is enabled. */ + static void start_print() + { + lock_sys.assert_locked(); + + rewind(lock_latest_err_file); + ut_print_timestamp(lock_latest_err_file); + + if (srv_print_all_deadlocks) + ib::info() << "Transactions deadlock detected," + " dumping detailed information."; + } + + /** Print a message to the deadlock file and possibly to stderr. + @param msg message to print */ + static void print(const char *msg) + { + fputs(msg, lock_latest_err_file); + if (srv_print_all_deadlocks) + ib::info() << msg; + } + + /** Print transaction data to the deadlock file and possibly to stderr. 
+ @param trx transaction */ + static void print(const trx_t &trx) + { + lock_sys.assert_locked(); + + ulint n_rec_locks= trx.lock.n_rec_locks; + ulint n_trx_locks= UT_LIST_GET_LEN(trx.lock.trx_locks); + ulint heap_size= mem_heap_get_size(trx.lock.lock_heap); + + trx_print_low(lock_latest_err_file, &trx, 3000, + n_rec_locks, n_trx_locks, heap_size); + + if (srv_print_all_deadlocks) + trx_print_low(stderr, &trx, 3000, n_rec_locks, n_trx_locks, heap_size); + } + + /** Print lock data to the deadlock file and possibly to stderr. + @param lock record or table type lock */ + static void print(const lock_t &lock) + { + lock_sys.assert_locked(); + + if (!lock.is_table()) + { + mtr_t mtr; + lock_rec_print(lock_latest_err_file, &lock, mtr); + + if (srv_print_all_deadlocks) + lock_rec_print(stderr, &lock, mtr); + } + else + { + lock_table_print(lock_latest_err_file, &lock); + + if (srv_print_all_deadlocks) + lock_table_print(stderr, &lock); + } + } + + ATTRIBUTE_COLD + /** Calculate a number used to compare deadlock victim candidates. +Bit 62 is used to prefer transaction that did not modified non-transactional +tables. Bits 1-61 are set to TRX_WEIGHT to prefer transactions with less locks +and less modified rows. Bit 0 is used to prefer orig_trx in case of a tie. + @param trx Transaction + @return a 64-bit unsigned, the lower the more preferred TRX is as a deadlock + victim */ + static undo_no_t calc_victim_weight(trx_t *trx, const trx_t *orig_trx) + { + const undo_no_t trx_weight= (trx != orig_trx) | (TRX_WEIGHT(trx) << 1) | + (trx->mysql_thd && +#ifdef WITH_WSREP + (thd_has_edited_nontrans_tables(trx->mysql_thd) || + (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))) +#else + thd_has_edited_nontrans_tables(trx->mysql_thd) +#endif /* WITH_WSREP */ + ? 1ULL << 62 : 0); + return trx_weight; + } + + ATTRIBUTE_COLD + /** Report a deadlock (cycle in the waits-for graph). + @param trx transaction waiting for a lock in this thread + @param current_trx whether trx belongs to the current thread + @return the transaction to be rolled back (unless one was committed already) + @return nullptr if no deadlock */ + static trx_t *report(trx_t *const trx, bool current_trx) + { + mysql_mutex_assert_owner(&lock_sys.wait_mutex); + ut_ad(xtest() || lock_sys.is_writer() == !current_trx); + + /* Normally, trx should be a direct part of the deadlock + cycle. However, if innodb_deadlock_detect had been OFF in the + past, or if current_trx=false, trx may be waiting for a lock that + is held by a participant of a pre-existing deadlock, without being + part of the deadlock itself. That is, the path to the deadlock may be + P-shaped instead of O-shaped, with trx being at the foot of the P. + + We will process the entire path leading to a cycle, and we will + choose the victim (to be aborted) among the cycle. */ + + static const char rollback_msg[]= "*** WE ROLL BACK TRANSACTION (%u)\n"; + char buf[9 + sizeof rollback_msg]; + trx_t *victim= nullptr; + + /* Here, lock elision does not make sense, because + for the output we are going to invoke system calls, + which would interrupt a memory transaction. */ + if (current_trx && !lock_sys.wr_lock_try()) + { + mysql_mutex_unlock(&lock_sys.wait_mutex); + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + } + + { + unsigned l= 1; + /* Now that we are holding lock_sys.wait_mutex again, check + whether a cycle still exists. */ + trx_t *cycle= find_cycle(trx); + if (!cycle) + goto func_exit; /* One of the transactions was already aborted. 
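+ The waits-for cycle was broken by that abort; there is no victim
+ to choose.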
*/
+
+ victim= cycle;
+ undo_no_t victim_weight= calc_victim_weight(victim, trx);
+ unsigned victim_pos= l;
+ for (trx_t *next= cycle;;)
+ {
+ next= next->lock.wait_trx;
+ l++;
+ const undo_no_t next_weight= calc_victim_weight(next, trx);
+#ifdef HAVE_REPLICATION
+ const int pref=
+ thd_deadlock_victim_preference(victim->mysql_thd, next->mysql_thd);
+ /* Set bit 63 for any non-preferred victim to make such preference take
+ priority in the weight comparison.
+ -1 means victim is preferred. 1 means next is preferred. */
+ undo_no_t victim_not_pref= (1ULL << 63) & (undo_no_t)(int64_t)(-pref);
+ undo_no_t next_not_pref= (1ULL << 63) & (undo_no_t)(int64_t)pref;
+#else
+ undo_no_t victim_not_pref= 0;
+ undo_no_t next_not_pref= 0;
+#endif
+ /* Single comparison to decide which of two transactions is preferred
+ as a deadlock victim.
+ - If thd_deadlock_victim_preference() returned non-zero, bit 63
+ comparison will decide the preferred one.
+ - Else if exactly one of them modified non-transactional tables,
+ bit 62 will decide.
+ - Else the TRX_WEIGHT in bits 1-61 will decide, if not equal.
+ - Else, if one of them is the original trx, bit 0 will decide.
+ - If all is equal, previous victim will arbitrarily be chosen. */
+ if ((next_weight|next_not_pref) < (victim_weight|victim_not_pref))
+ {
+ victim_weight= next_weight;
+ victim= next;
+ victim_pos= l;
+ }
+ if (next == cycle)
+ break;
+ }
+
+ /* Finally, display the deadlock */
+ switch (const auto r= static_cast<enum report>(innodb_deadlock_report)) {
+ case REPORT_OFF:
+ break;
+ case REPORT_BASIC:
+ case REPORT_FULL:
+ start_print();
+ l= 0;
+
+ for (trx_t *next= cycle;;)
+ {
+ next= next->lock.wait_trx;
+ ut_ad(next);
+ ut_ad(next->state == TRX_STATE_ACTIVE);
+ const lock_t *wait_lock= next->lock.wait_lock;
+ ut_ad(wait_lock);
+ snprintf(buf, sizeof buf, "\n*** (%u) TRANSACTION:\n", ++l);
+ print(buf);
+ print(*next);
+ print("*** WAITING FOR THIS LOCK TO BE GRANTED:\n");
+ print(*wait_lock);
+ if (r == REPORT_BASIC);
+ else if (wait_lock->is_table())
+ {
+ if (const lock_t *lock=
+ UT_LIST_GET_FIRST(wait_lock->un_member.tab_lock.table->locks))
+ {
+ ut_ad(!lock->is_waiting());
+ print("*** CONFLICTING WITH:\n");
+ do
+ print(*lock);
+ while ((lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) &&
+ !lock->is_waiting());
+ }
+ else
+ ut_ad("no conflicting table lock found" == 0);
+ }
+ else
+ {
+ const page_id_t id{wait_lock->un_member.rec_lock.page_id};
+ hash_cell_t &cell= *(wait_lock->type_mode & LOCK_PREDICATE
+ ? lock_sys.prdt_hash : lock_sys.rec_hash).
+ cell_get(id.fold());
+ if (const lock_t *lock= lock_sys_t::get_first(cell, id))
+ {
+ const ulint heap_no= lock_rec_find_set_bit(wait_lock);
+ if (!lock_rec_get_nth_bit(lock, heap_no))
+ lock= lock_rec_get_next_const(heap_no, lock);
+ ut_ad(!lock->is_waiting());
+ print("*** CONFLICTING WITH:\n");
+ do
+ print(*lock);
+ while ((lock= lock_rec_get_next_const(heap_no, lock)) &&
+ !lock->is_waiting());
+ }
+ else
+ ut_ad("no conflicting record lock found" == 0);
+ }
+ if (next == cycle)
+ break;
+ }
+ snprintf(buf, sizeof buf, rollback_msg, victim_pos);
+ print(buf);
+ }
+
+ ut_ad(victim->state == TRX_STATE_ACTIVE);
+
+ /* victim->lock.was_chosen_as_deadlock_victim must always be set before
+ releasing waiting locks and resetting trx->lock.wait_lock */
+ victim->lock.was_chosen_as_deadlock_victim= true;
+ DEBUG_SYNC_C("deadlock_report_before_lock_releasing");
+ lock_cancel_waiting_and_release(victim->lock.wait_lock);
+#ifdef WITH_WSREP
+ if (victim->is_wsrep() && wsrep_thd_is_SR(victim->mysql_thd))
+ wsrep_handle_SR_rollback(trx->mysql_thd, victim->mysql_thd);
+#endif
+ }
+
+func_exit:
+ if (current_trx)
+ lock_sys.wr_unlock();
+ return victim;
+ }
+}
+
+/** Check if a lock request results in a deadlock.
+Resolve a deadlock by choosing a transaction that will be rolled back.
+@param trx transaction requesting a lock
+@param wait_lock the lock being requested
+@return the lock that trx is or was waiting for
+@retval nullptr if the lock wait was resolved
+@retval -1 if trx must report DB_DEADLOCK */
+static lock_t *Deadlock::check_and_resolve(trx_t *trx, lock_t *wait_lock)
+{
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+
+ ut_ad(!trx->mutex_is_owner());
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ ut_ad(!srv_read_only_mode);
+ ut_ad(wait_lock);
+
+ if (!innodb_deadlock_detect)
+ return wait_lock;
+
+ if (UNIV_LIKELY_NULL(find_cycle(trx)))
+ {
+ if (report(trx, true) == trx)
+ return reinterpret_cast<lock_t*>(-1);
+ /* Because report() released and reacquired lock_sys.wait_mutex,
+ another thread may have cleared trx->lock.wait_lock meanwhile. */
+ wait_lock= trx->lock.wait_lock;
+ }
+
+ if (UNIV_LIKELY(!trx->lock.was_chosen_as_deadlock_victim))
+ return wait_lock;
+
+ if (wait_lock)
+ lock_sys_t::cancel<false>(trx, wait_lock);
+
+ lock_sys.deadlock_check();
+ return reinterpret_cast<lock_t*>(-1);
+}
+
+/** Check for deadlocks while holding only lock_sys.wait_mutex. */
+TRANSACTIONAL_TARGET
+void lock_sys_t::deadlock_check()
+{
+ ut_ad(!is_writer());
+ mysql_mutex_assert_owner(&wait_mutex);
+ bool acquired= false;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool elided= false;
+#endif
+
+ if (Deadlock::to_be_checked)
+ {
+ for (;;)
+ {
+ auto i= Deadlock::to_check.begin();
+ if (i == Deadlock::to_check.end())
+ break;
+ if (acquired);
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ else if (xbegin())
+ {
+ if (latch.is_locked_or_waiting())
+ xabort();
+ acquired= elided= true;
+ }
+#endif
+ else
+ {
+ acquired= wr_lock_try();
+ if (!acquired)
+ {
+ acquired= true;
+ mysql_mutex_unlock(&wait_mutex);
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&wait_mutex);
+ continue;
+ }
+ }
+ trx_t *trx= *i;
+ Deadlock::to_check.erase(i);
+ if (Deadlock::find_cycle(trx))
+ Deadlock::report(trx, false);
+ }
+ Deadlock::to_be_checked= false;
+ }
+ ut_ad(Deadlock::to_check.empty());
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (elided)
+ return;
+#endif
+ if (acquired)
+ wr_unlock();
+}
+
+/** Update the locks when a page is split and merged to two pages,
+in defragmentation.
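+First the record that was moved from the right page inherits the gap locks
+of the left page's supremum; the supremum locks are then released and
+re-inherited from the first user record of the right page.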
*/ +void lock_update_split_and_merge( + const buf_block_t* left_block, /*!< in: left page to which merged */ + const rec_t* orig_pred, /*!< in: original predecessor of + supremum on the left page before merge*/ + const buf_block_t* right_block) /*!< in: right page from which merged */ +{ + ut_ad(page_is_leaf(left_block->page.frame)); + ut_ad(page_is_leaf(right_block->page.frame)); + ut_ad(page_align(orig_pred) == left_block->page.frame); + + const page_id_t l{left_block->page.id()}; + const page_id_t r{right_block->page.id()}; + const rec_t *left_next_rec= page_rec_get_next_const(orig_pred); + if (UNIV_UNLIKELY(!left_next_rec)) + { + ut_ad("corrupted page" == 0); + return; + } + ut_ad(!page_rec_is_metadata(left_next_rec)); + + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, l, r}; + + /* Inherit the locks on the supremum of the left page to the + first record which was moved from the right page */ + lock_rec_inherit_to_gap(g.cell1(), l, g.cell1(), l, left_block->page.frame, + page_rec_get_heap_no(left_next_rec), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, + releasing waiting transactions */ + lock_rec_reset_and_release_wait(g.cell1(), l, PAGE_HEAP_NO_SUPREMUM); + + /* Inherit the locks to the supremum of the left page from the + successor of the infimum on the right page */ + lock_rec_inherit_to_gap(g.cell1(), l, g.cell2(), r, left_block->page.frame, + PAGE_HEAP_NO_SUPREMUM, + lock_get_min_heap_no(right_block)); +} diff --git a/storage/innobase/lock/lock0prdt.cc b/storage/innobase/lock/lock0prdt.cc new file mode 100644 index 00000000..29756591 --- /dev/null +++ b/storage/innobase/lock/lock0prdt.cc @@ -0,0 +1,928 @@ +/***************************************************************************** + +Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0prdt.cc
+The transaction lock system
+
+Created 9/7/2013 Jimmy Yang
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "lock0lock.h"
+#include "lock0priv.h"
+#include "lock0prdt.h"
+#include "dict0mem.h"
+#include "que0que.h"
+
+/*********************************************************************//**
+Get a minimum bounding box from a Predicate
+@return the minimum bounding box */
+UNIV_INLINE
+rtr_mbr_t*
+prdt_get_mbr_from_prdt(
+/*===================*/
+ const lock_prdt_t* prdt) /*!< in: the lock predicate */
+{
+ rtr_mbr_t* mbr_loc = reinterpret_cast<rtr_mbr_t*>(prdt->data);
+
+ return(mbr_loc);
+}
+
+/*********************************************************************//**
+Get a predicate from a lock
+@return the predicate */
+lock_prdt_t*
+lock_get_prdt_from_lock(
+/*====================*/
+ const lock_t* lock) /*!< in: the lock */
+{
+ lock_prdt_t* prdt = reinterpret_cast<lock_prdt_t*>(
+ &((reinterpret_cast<byte*>(
+ const_cast<lock_t*>(&lock[1])))[
+ UNIV_WORD_SIZE]));
+
+ return(prdt);
+}
+
+/*********************************************************************//**
+Get a minimum bounding box directly from a lock
+@return the minimum bounding box*/
+UNIV_INLINE
+rtr_mbr_t*
+lock_prdt_get_mbr_from_lock(
+/*========================*/
+ const lock_t* lock) /*!< in: the lock */
+{
+ ut_ad(lock->type_mode & LOCK_PREDICATE);
+
+ lock_prdt_t* prdt = lock_get_prdt_from_lock(lock);
+
+ rtr_mbr_t* mbr_loc = prdt_get_mbr_from_prdt(prdt);
+
+ return(mbr_loc);
+}
+
+/*********************************************************************//**
+Append a predicate to the lock */
+void
+lock_prdt_set_prdt(
+/*===============*/
+ lock_t* lock, /*!< in: lock */
+ const lock_prdt_t* prdt) /*!< in: Predicate */
+{
+ ut_ad(lock->type_mode & LOCK_PREDICATE);
+
+ memcpy(&(((byte*) &lock[1])[UNIV_WORD_SIZE]), prdt, sizeof *prdt);
+}
+
+
+/** Check whether two predicate locks are compatible with each other
+@param[in] prdt1 first predicate lock
+@param[in] prdt2 second predicate lock
+@param[in] op predicate comparison operator
+@return true if consistent */
+static
+bool
+lock_prdt_consistent(
+ lock_prdt_t* prdt1,
+ lock_prdt_t* prdt2,
+ ulint op)
+{
+ bool ret = false;
+ rtr_mbr_t* mbr1 = prdt_get_mbr_from_prdt(prdt1);
+ rtr_mbr_t* mbr2 = prdt_get_mbr_from_prdt(prdt2);
+ ulint action;
+
+ if (op) {
+ action = op;
+ } else {
+ if (prdt2->op != 0 && (prdt1->op != prdt2->op)) {
+ return(false);
+ }
+
+ action = prdt1->op;
+ }
+
+ switch (action) {
+ case PAGE_CUR_CONTAIN:
+ ret = MBR_CONTAIN_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_DISJOINT:
+ ret = MBR_DISJOINT_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_MBR_EQUAL:
+ ret = MBR_EQUAL_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_INTERSECT:
+ ret = MBR_INTERSECT_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_WITHIN:
+ ret = MBR_WITHIN_CMP(mbr1, mbr2);
+ break;
+ default:
+ ib::error() << "invalid operator " << action;
+ ut_error;
+ }
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+another lock.
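+Unlike ordinary next-key locks, a conflict is decided by comparing the
+minimum bounding rectangles (MBRs) of the two predicates.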
+@return true if new lock has to wait for lock2 to be released */
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+ LOCK_INSERT_INTENTION */
+ lock_prdt_t* prdt, /*!< in: lock predicate to check */
+ const lock_t* lock2) /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+{
+ lock_prdt_t* cur_prdt = lock_get_prdt_from_lock(lock2);
+
+ ut_ad(trx && lock2);
+ ut_ad((lock2->type_mode & LOCK_PREDICATE && type_mode & LOCK_PREDICATE)
+ || (lock2->type_mode & LOCK_PRDT_PAGE
+ && type_mode & LOCK_PRDT_PAGE));
+
+ ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
+
+ if (trx != lock2->trx
+ && !lock_mode_compatible(static_cast<lock_mode>(
+ LOCK_MODE_MASK & type_mode),
+ lock2->mode())) {
+
+ /* If it is a page lock, then return true (conflict) */
+ if (type_mode & LOCK_PRDT_PAGE) {
+ ut_ad(lock2->type_mode & LOCK_PRDT_PAGE);
+
+ return(true);
+ }
+
+ /* A predicate lock does not conflict with a
+ non-predicate lock */
+ if (!(lock2->type_mode & LOCK_PREDICATE)) {
+ return(FALSE);
+ }
+
+ ut_ad(lock2->type_mode & LOCK_PREDICATE);
+
+ if (!(type_mode & LOCK_INSERT_INTENTION)) {
+ /* PREDICATE locks without LOCK_INSERT_INTENTION flag
+ do not need to wait for anything. This is because
+ different users can have conflicting lock types
+ on predicates. */
+
+ return(FALSE);
+ }
+
+ if (lock2->type_mode & LOCK_INSERT_INTENTION) {
+
+ /* No lock request needs to wait for an insert
+ intention lock to be removed. This makes it similar
+ to GAP lock, that allows conflicting insert intention
+ locks */
+ return(FALSE);
+ }
+
+ if (!lock_prdt_consistent(cur_prdt, prdt, 0)) {
+ return(false);
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if a transaction has a GRANTED stronger or equal predicate lock
+on the page
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_prdt_has_lock(
+/*===============*/
+ ulint precise_mode, /*!< in: LOCK_S or LOCK_X */
+ hash_cell_t& cell, /*!< hash table cell of id */
+ const page_id_t id, /*!< in: page identifier */
+ lock_prdt_t* prdt, /*!< in: The predicate to be
+ attached to the new lock */
+ const trx_t* trx) /*!< in: transaction */
+{
+ ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
+ || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
+ ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
+
+ for (lock_t* lock = lock_sys_t::get_first(cell, id, PRDT_HEAPNO);
+ lock;
+ lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {
+ ut_ad(lock->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
+
+ if (lock->trx == trx
+ && !(lock->type_mode & (LOCK_INSERT_INTENTION | LOCK_WAIT))
+ && lock_mode_stronger_or_eq(
+ lock->mode(),
+ static_cast<lock_mode>(
+ precise_mode & LOCK_MODE_MASK))) {
+ if (lock->type_mode & LOCK_PRDT_PAGE) {
+ return(lock);
+ }
+
+ lock_prdt_t* cur_prdt = lock_get_prdt_from_lock(
+ lock);
+
+ /* if the lock predicate operator is the same
+ as the one to look for, and the predicate test is
+ successful, then we have found a lock */
+ if (cur_prdt->op == prdt->op
+ && lock_prdt_consistent(cur_prdt, prdt, 0)) {
+
+ return(lock);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Checks if some other transaction has a conflicting predicate
+lock request in the queue, so that we have to wait.
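+All predicate locks on a page are attached to the pseudo heap number
+PRDT_HEAPNO, so only that lock queue needs to be scanned.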
+@return lock or NULL */ +static +lock_t* +lock_prdt_other_has_conflicting( +/*============================*/ + unsigned mode, /*!< in: LOCK_S or LOCK_X, + possibly ORed to LOCK_PREDICATE or + LOCK_PRDT_PAGE, LOCK_INSERT_INTENTION */ + const hash_cell_t& cell, /*!< in: hash table cell */ + const page_id_t id, /*!< in: page identifier */ + lock_prdt_t* prdt, /*!< in: Predicates (currently) + the Minimum Bounding Rectangle) + the new lock will be on */ + const trx_t* trx) /*!< in: our transaction */ +{ + for (lock_t* lock = lock_sys_t::get_first(cell, id, PRDT_HEAPNO); + lock != NULL; + lock = lock_rec_get_next(PRDT_HEAPNO, lock)) { + + if (lock->trx == trx) { + continue; + } + + if (lock_prdt_has_to_wait(trx, mode, prdt, lock)) { + return(lock); + } + } + + return(NULL); +} + +/*********************************************************************//** +Reset the Minimum Bounding Rectangle (to a large area) */ +static +void +lock_prdt_enlarge_mbr( +/*==================*/ + const lock_t* lock, /*!< in/out: lock to modify */ + rtr_mbr_t* mbr) /*!< in: Minimum Bounding Rectangle */ +{ + rtr_mbr_t* cur_mbr = lock_prdt_get_mbr_from_lock(lock); + + if (cur_mbr->xmin > mbr->xmin) { + cur_mbr->xmin = mbr->xmin; + } + + if (cur_mbr->ymin > mbr->ymin) { + cur_mbr->ymin = mbr->ymin; + } + + if (cur_mbr->xmax < mbr->xmax) { + cur_mbr->xmax = mbr->xmax; + } + + if (cur_mbr->ymax < mbr->ymax) { + cur_mbr->ymax = mbr->ymax; + } +} + +/*********************************************************************//** +Reset the predicates to a "covering" (larger) predicates */ +static +void +lock_prdt_enlarge_prdt( +/*===================*/ + lock_t* lock, /*!< in/out: lock to modify */ + lock_prdt_t* prdt) /*!< in: predicate */ +{ + rtr_mbr_t* mbr = prdt_get_mbr_from_prdt(prdt); + + lock_prdt_enlarge_mbr(lock, mbr); +} + +/*********************************************************************//** +Check two predicates' MBRs are the same +@return true if they are the same */ +static +bool +lock_prdt_is_same( +/*==============*/ + lock_prdt_t* prdt1, /*!< in: MBR with the lock */ + lock_prdt_t* prdt2) /*!< in: MBR with the lock */ +{ + rtr_mbr_t* mbr1 = prdt_get_mbr_from_prdt(prdt1); + rtr_mbr_t* mbr2 = prdt_get_mbr_from_prdt(prdt2); + + if (prdt1->op == prdt2->op && MBR_EQUAL_CMP(mbr1, mbr2)) { + return(true); + } + + return(false); +} + +/*********************************************************************//** +Looks for a similar predicate lock struct by the same trx on the same page. +This can be used to save space when a new record lock should be set on a page: +no new struct is needed, if a suitable old one is found. 
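+For a LOCK_PREDICATE lock the stored predicate must match as well;
+for a LOCK_PRDT_PAGE lock, a matching type_mode is sufficient.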
+@return lock or NULL */ +static +lock_t* +lock_prdt_find_on_page( +/*===================*/ + unsigned type_mode, /*!< in: lock type_mode field */ + const buf_block_t* block, /*!< in: buffer block */ + lock_prdt_t* prdt, /*!< in: MBR with the lock */ + const trx_t* trx) /*!< in: transaction */ +{ + const page_id_t id{block->page.id()}; + hash_cell_t& cell = *lock_sys.hash_get(type_mode).cell_get(id.fold()); + + for (lock_t *lock = lock_sys_t::get_first(cell, id); + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { + + if (lock->trx == trx + && lock->type_mode == type_mode) { + if (lock->type_mode & LOCK_PRDT_PAGE) { + return(lock); + } + + ut_ad(lock->type_mode & LOCK_PREDICATE); + + if (lock_prdt_is_same(lock_get_prdt_from_lock(lock), + prdt)) { + return(lock); + } + } + } + + return(NULL); +} + +/*********************************************************************//** +Adds a predicate lock request in the predicate lock queue. +@return lock where the bit was set */ +static +lock_t* +lock_prdt_add_to_queue( +/*===================*/ + unsigned type_mode,/*!< in: lock mode, wait, predicate + etc. flags */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + dict_index_t* index, /*!< in: index of record */ + trx_t* trx, /*!< in/out: transaction */ + lock_prdt_t* prdt, /*!< in: Minimum Bounding Rectangle + the new lock will be on */ + bool caller_owns_trx_mutex) + /*!< in: TRUE if caller owns the + transaction mutex */ +{ + ut_ad(caller_owns_trx_mutex == trx->mutex_is_owner()); + ut_ad(index->is_spatial()); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)); + +#ifdef UNIV_DEBUG + switch (type_mode & LOCK_MODE_MASK) { + case LOCK_X: + case LOCK_S: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ + + /* Try to extend a similar non-waiting lock on the same page */ + if (!(type_mode & LOCK_WAIT)) { + const page_id_t id{block->page.id()}; + hash_cell_t& cell = *lock_sys.hash_get(type_mode). + cell_get(id.fold()); + + for (lock_t* lock = lock_sys_t::get_first(cell, id); + lock; lock = lock_rec_get_next_on_page(lock)) { + if (lock->is_waiting() + && lock->type_mode + & (LOCK_PREDICATE | LOCK_PRDT_PAGE) + && lock_rec_get_nth_bit(lock, PRDT_HEAPNO)) { + goto create; + } + } + + if (lock_t* lock = lock_prdt_find_on_page(type_mode, block, + prdt, trx)) { + if (lock->type_mode & LOCK_PREDICATE) { + lock_prdt_enlarge_prdt(lock, prdt); + } + + return lock; + } + } + +create: + /* Note: We will not pass any conflicting lock to lock_rec_create(), + because we should be moving an existing waiting lock request. */ + ut_ad(!(type_mode & LOCK_WAIT) || trx->lock.wait_trx); + + lock_t* lock = lock_rec_create(nullptr, + type_mode, block, PRDT_HEAPNO, index, + trx, caller_owns_trx_mutex); + + if (lock->type_mode & LOCK_PREDICATE) { + lock_prdt_set_prdt(lock, prdt); + } + + return lock; +} + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate insert of +a predicate record. 
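+Only lock_sys.prdt_hash needs to be checked, because spatial indexes
+use predicate locks instead of next-key (gap) locks.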
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_prdt_insert_check_and_lock( +/*============================*/ + const rec_t* rec, /*!< in: record after which to insert */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + dict_index_t* index, /*!< in: index */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + lock_prdt_t* prdt) /*!< in: Predicates with Minimum Bound + Rectangle */ +{ + ut_ad(block->page.frame == page_align(rec)); + ut_ad(!index->table->is_temporary()); + ut_ad(index->is_spatial()); + + trx_t *trx= thr_get_trx(thr); + const page_id_t id{block->page.id()}; + dberr_t err= DB_SUCCESS; + + { + LockGuard g{lock_sys.prdt_hash, id}; + /* Because this code is invoked for a running transaction by + the thread that is serving the transaction, it is not necessary + to hold trx->mutex here. */ + ut_ad(lock_table_has(trx, index->table, LOCK_IX)); + + /* Only need to check locks on prdt_hash */ + if (ut_d(lock_t *lock=) lock_sys_t::get_first(g.cell(), id, PRDT_HEAPNO)) + { + ut_ad(lock->type_mode & LOCK_PREDICATE); + + /* If another transaction has an explicit lock request which locks + the predicate, waiting or granted, on the successor, the insert + has to wait. + + Similar to GAP lock, we do not consider lock from inserts conflicts + with each other */ + + const ulint mode= LOCK_X | LOCK_PREDICATE | LOCK_INSERT_INTENTION; + lock_t *c_lock= lock_prdt_other_has_conflicting(mode, g.cell(), id, + prdt, trx); + + if (c_lock) + { + rtr_mbr_t *mbr= prdt_get_mbr_from_prdt(prdt); + trx->mutex_lock(); + /* Allocate MBR on the lock heap */ + lock_init_prdt_from_mbr(prdt, mbr, 0, trx->lock.lock_heap); + err= lock_rec_enqueue_waiting(c_lock, mode, id, block->page.frame, + PRDT_HEAPNO, index, thr, prdt); + trx->mutex_unlock(); + } + } + } + + if (err == DB_SUCCESS) + /* Update the page max trx id field */ + page_update_max_trx_id(block, buf_block_get_page_zip(block), trx->id, mtr); + + return err; +} + +/**************************************************************//** +Check whether any predicate lock in parent needs to propagate to +child page after split. 
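+A parent lock is copied to a child page when its MBR is not disjoint
+from that child's MBR and no matching lock exists on the child yet.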
*/ +void +lock_prdt_update_parent( +/*====================*/ + buf_block_t* left_block, /*!< in/out: page to be split */ + buf_block_t* right_block, /*!< in/out: the new half page */ + lock_prdt_t* left_prdt, /*!< in: MBR on the old page */ + lock_prdt_t* right_prdt, /*!< in: MBR on the new page */ + const page_id_t page_id) /*!< in: parent page */ +{ + auto fold= page_id.fold(); + LockMutexGuard g{SRW_LOCK_CALL}; + hash_cell_t& cell = *lock_sys.prdt_hash.cell_get(fold); + + /* Get all locks in parent */ + for (lock_t *lock = lock_sys_t::get_first(cell, page_id); + lock; + lock = lock_rec_get_next_on_page(lock)) { + lock_prdt_t* lock_prdt; + ulint op = PAGE_CUR_DISJOINT; + + ut_ad(lock); + + if (!(lock->type_mode & LOCK_PREDICATE) + || (lock->type_mode & LOCK_MODE_MASK) == LOCK_X) { + continue; + } + + lock_prdt = lock_get_prdt_from_lock(lock); + + /* Check each lock in parent to see if it intersects with + left or right child */ + if (!lock_prdt_consistent(lock_prdt, left_prdt, op) + && !lock_prdt_find_on_page(lock->type_mode, left_block, + lock_prdt, lock->trx)) { + lock_prdt_add_to_queue(lock->type_mode, + left_block, lock->index, + lock->trx, lock_prdt, + false); + } + + if (!lock_prdt_consistent(lock_prdt, right_prdt, op) + && !lock_prdt_find_on_page(lock->type_mode, right_block, + lock_prdt, lock->trx)) { + lock_prdt_add_to_queue(lock->type_mode, right_block, + lock->index, lock->trx, + lock_prdt, false); + } + } +} + +/**************************************************************//** +Update predicate lock when page splits */ +static +void +lock_prdt_update_split_low( +/*=======================*/ + buf_block_t* new_block, /*!< in/out: the new half page */ + lock_prdt_t* prdt, /*!< in: MBR on the old page */ + lock_prdt_t* new_prdt, /*!< in: MBR on the new page */ + const page_id_t id, /*!< in: page number */ + unsigned type_mode) /*!< in: LOCK_PREDICATE or + LOCK_PRDT_PAGE */ +{ + hash_cell_t& cell = *lock_sys.hash_get(type_mode).cell_get(id.fold()); + + for (lock_t* lock = lock_sys_t::get_first(cell, id); + lock; + lock = lock_rec_get_next_on_page(lock)) { + /* First dealing with Page Lock */ + if (lock->type_mode & LOCK_PRDT_PAGE) { + /* Duplicate the lock to new page */ + lock_prdt_add_to_queue(lock->type_mode, + new_block, + lock->index, + lock->trx, nullptr, false); + continue; + } + + /* Now dealing with Predicate Lock */ + lock_prdt_t* lock_prdt; + ulint op = PAGE_CUR_DISJOINT; + + ut_ad(lock->type_mode & LOCK_PREDICATE); + + /* No need to duplicate waiting X locks */ + if ((lock->type_mode & LOCK_MODE_MASK) == LOCK_X) { + continue; + } + + lock_prdt = lock_get_prdt_from_lock(lock); + + if (!lock_prdt_consistent(lock_prdt, new_prdt, op)) { + /* Move the lock to new page */ + lock_prdt_add_to_queue(lock->type_mode, new_block, + lock->index, lock->trx, + lock_prdt, false); + } + } +} + +/**************************************************************//** +Update predicate lock when page splits */ +void +lock_prdt_update_split( +/*===================*/ + buf_block_t* new_block, /*!< in/out: the new half page */ + lock_prdt_t* prdt, /*!< in: MBR on the old page */ + lock_prdt_t* new_prdt, /*!< in: MBR on the new page */ + const page_id_t page_id) /*!< in: page number */ +{ + LockMutexGuard g{SRW_LOCK_CALL}; + lock_prdt_update_split_low(new_block, prdt, new_prdt, + page_id, LOCK_PREDICATE); + + lock_prdt_update_split_low(new_block, NULL, NULL, + page_id, LOCK_PRDT_PAGE); +} + +/*********************************************************************//** +Initiate a Predicate Lock 
from a MBR */
+void
+lock_init_prdt_from_mbr(
+/*====================*/
+	lock_prdt_t*	prdt,	/*!< in/out: predicate to be initialized */
+	rtr_mbr_t*	mbr,	/*!< in: Minimum Bounding Rectangle */
+	ulint		mode,	/*!< in: Search mode */
+	mem_heap_t*	heap)	/*!< in: heap for allocating memory */
+{
+	memset(prdt, 0, sizeof(*prdt));
+
+	if (heap != NULL) {
+		prdt->data = mem_heap_dup(heap, mbr, sizeof *mbr);
+	} else {
+		prdt->data = static_cast<void*>(mbr);
+	}
+
+	prdt->op = static_cast<uint16>(mode);
+}
+
+/*********************************************************************//**
+Acquire a predicate lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_lock(
+/*===========*/
+	buf_block_t*	block,	/*!< in/out: buffer block of rec */
+	lock_prdt_t*	prdt,	/*!< in: Predicate for the lock */
+	dict_index_t*	index,	/*!< in: secondary index */
+	lock_mode	mode,	/*!< in: mode of the lock which
+				the read cursor should set on
+				records: LOCK_S or LOCK_X; the
+				latter is possible in
+				SELECT FOR UPDATE */
+	unsigned	type_mode,
+				/*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */
+	que_thr_t*	thr)	/*!< in: query thread
+				(can be NULL if BTR_NO_LOCKING_FLAG) */
+{
+	trx_t*		trx = thr_get_trx(thr);
+	dberr_t		err = DB_SUCCESS;
+	lock_rec_req_status	status = LOCK_REC_SUCCESS;
+
+	if (trx->read_only || index->table->is_temporary()) {
+		return(DB_SUCCESS);
+	}
+
+	ut_ad(!dict_index_is_clust(index));
+	ut_ad(!dict_index_is_online_ddl(index));
+	ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
+
+	auto& hash = lock_sys.prdt_hash_get(type_mode != LOCK_PREDICATE);
+	const page_id_t id{block->page.id()};
+
+	/* Another transaction cannot have an implicit lock on the record,
+	because when we come here, we already have modified the clustered
+	index record, and this would not have been possible if another active
+	transaction had modified this secondary index record.
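+	Thus, only the explicit predicate locks in lock_sys need to be
+	checked here.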
*/
+
+	LockGuard g{hash, id};
+
+	const unsigned	prdt_mode = type_mode | mode;
+	lock_t*		lock = lock_sys_t::get_first(g.cell(), id);
+
+	if (lock == NULL) {
+		lock = lock_rec_create(
+			NULL,
+			prdt_mode, block, PRDT_HEAPNO,
+			index, trx, FALSE);
+
+		status = LOCK_REC_SUCCESS_CREATED;
+	} else {
+		if (lock_rec_get_next_on_page(lock)
+		    || lock->trx != trx
+		    || lock->type_mode != prdt_mode
+		    || lock_rec_get_n_bits(lock) == 0
+		    || ((type_mode & LOCK_PREDICATE)
+			&& (!lock_prdt_consistent(
+				lock_get_prdt_from_lock(lock), prdt, 0)))) {
+			trx->mutex_lock();
+
+			lock = lock_prdt_has_lock(
+				mode, g.cell(), id, prdt, trx);
+
+			if (lock) {
+			} else if (lock_t* wait_for
+				   = lock_prdt_other_has_conflicting(
+					   prdt_mode, g.cell(), id, prdt,
+					   trx)) {
+				err = lock_rec_enqueue_waiting(
+					wait_for, prdt_mode, id,
+					block->page.frame, PRDT_HEAPNO,
+					index, thr, prdt);
+			} else {
+				lock_prdt_add_to_queue(
+					prdt_mode, block, index, trx,
+					prdt, true);
+			}
+
+			trx->mutex_unlock();
+		} else {
+			if (!lock_rec_get_nth_bit(lock, PRDT_HEAPNO)) {
+				lock_rec_set_nth_bit(lock, PRDT_HEAPNO);
+				status = LOCK_REC_SUCCESS_CREATED;
+			}
+		}
+	}
+
+	if (status == LOCK_REC_SUCCESS_CREATED && type_mode == LOCK_PREDICATE) {
+		/* Append the predicate in the lock record */
+		lock_prdt_set_prdt(lock, prdt);
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Acquire a "Page" lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_place_prdt_page_lock(
+	const page_id_t	page_id,	/*!< in: page identifier */
+	dict_index_t*	index,		/*!< in: secondary index */
+	que_thr_t*	thr)		/*!< in: query thread */
+{
+	ut_ad(thr != NULL);
+	ut_ad(!high_level_read_only);
+
+	ut_ad(index->is_spatial());
+	ut_ad(!dict_index_is_online_ddl(index));
+	if (index->table->is_temporary()) {
+		return DB_SUCCESS;
+	}
+
+	/* Another transaction cannot have an implicit lock on the record,
+	because when we come here, we already have modified the clustered
+	index record, and this would not have been possible if another active
+	transaction had modified this secondary index record. */
+
+	LockGuard g{lock_sys.prdt_page_hash, page_id};
+
+	const lock_t*	lock = lock_sys_t::get_first(g.cell(), page_id);
+	const ulint	mode = LOCK_S | LOCK_PRDT_PAGE;
+	trx_t*		trx = thr_get_trx(thr);
+
+	if (lock != NULL) {
+		/* Find a matching record lock owned by this transaction. */
+
+		while (lock != NULL && lock->trx != trx) {
+			lock = lock_rec_get_next_on_page_const(lock);
+		}
+
+		ut_ad(lock == NULL || lock->type_mode == mode);
+		ut_ad(lock == NULL || lock_rec_get_n_bits(lock) != 0);
+	}
+
+	if (lock == NULL) {
+		lock = lock_rec_create_low(
+			NULL,
+			mode, page_id, NULL, PRDT_HEAPNO,
+			index, trx, FALSE);
+
+#ifdef PRDT_DIAG
+		printf("GIS_DIAGNOSTIC: page lock %d\n",
+		       (int) page_id.page_no());
+#endif /* PRDT_DIAG */
+	}
+
+	return(DB_SUCCESS);
+}
+
+/** Check whether there are R-tree page locks on a page
+@param[in]	trx	trx to test the lock
+@param[in]	page_id	page identifier
+@return true if there is none */
+bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id)
+{
+  LockGuard g{lock_sys.prdt_page_hash, page_id};
+  lock_t *lock= lock_sys_t::get_first(g.cell(), page_id);
+  return !lock || trx == lock->trx;
+}
+
+/*************************************************************//**
+Moves the locks of a page to another page and resets the lock bits of
+the donating records.
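+The locks are re-enqueued on the receiving page with their original
+mode preserved.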
*/
+void
+lock_prdt_rec_move(
+/*===============*/
+	const buf_block_t*	receiver,	/*!< in: buffer block containing
+						the receiving record */
+	const page_id_t		donator)	/*!< in: page from which the
+						locks are moved */
+{
+	LockMultiGuard	g{lock_sys.prdt_hash, receiver->page.id(), donator};
+
+	for (lock_t *lock = lock_sys_t::get_first(g.cell2(), donator,
+						  PRDT_HEAPNO);
+	     lock;
+	     lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {
+
+		const auto type_mode = lock->type_mode;
+		lock_prdt_t*	lock_prdt = lock_get_prdt_from_lock(lock);
+
+		lock_rec_reset_nth_bit(lock, PRDT_HEAPNO);
+		if (type_mode & LOCK_WAIT) {
+			ut_ad(lock->trx->lock.wait_lock == lock);
+			lock->type_mode &= ~LOCK_WAIT;
+		}
+		lock_prdt_add_to_queue(
+			type_mode, receiver, lock->index, lock->trx,
+			lock_prdt, false);
+	}
+}
+
+/** Remove locks on a discarded SPATIAL INDEX page.
+@param id  page to be discarded
+@param all whether to discard also from lock_sys.prdt_hash */
+void lock_sys_t::prdt_page_free_from_discard(const page_id_t id, bool all)
+{
+  const auto id_fold= id.fold();
+  rd_lock(SRW_LOCK_CALL);
+  auto cell= prdt_page_hash.cell_get(id_fold);
+  auto latch= hash_table::latch(cell);
+  latch->acquire();
+
+  for (lock_t *lock= get_first(*cell, id), *next; lock; lock= next)
+  {
+    next= lock_rec_get_next_on_page(lock);
+    lock_rec_discard(prdt_page_hash, lock);
+  }
+
+  if (all)
+  {
+    latch->release();
+    cell= prdt_hash.cell_get(id_fold);
+    latch= hash_table::latch(cell);
+    latch->acquire();
+    for (lock_t *lock= get_first(*cell, id), *next; lock; lock= next)
+    {
+      next= lock_rec_get_next_on_page(lock);
+      lock_rec_discard(prdt_hash, lock);
+    }
+  }
+
+  latch->release();
+  cell= rec_hash.cell_get(id_fold);
+  latch= hash_table::latch(cell);
+  latch->acquire();
+
+  for (lock_t *lock= get_first(*cell, id), *next; lock; lock= next)
+  {
+    next= lock_rec_get_next_on_page(lock);
+    lock_rec_discard(rec_hash, lock);
+  }
+
+  latch->release();
+  /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+  rd_unlock();
+}
diff --git a/storage/innobase/log/log0crypt.cc b/storage/innobase/log/log0crypt.cc
new file mode 100644
index 00000000..8a771410
--- /dev/null
+++ b/storage/innobase/log/log0crypt.cc
@@ -0,0 +1,641 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (C) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file log0crypt.cc
+Innodb log encrypt/decrypt
+
+Created 11/25/2013 Minli Zhu Google
+Modified Jan Lindström jan.lindstrom@mariadb.com
+MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation.
+*******************************************************/
+#include <my_global.h>
+#include "log0crypt.h"
+#include <mysql/service_my_crypt.h>
+#include "assume_aligned.h"
+
+#include "log0crypt.h"
+#include "log0recv.h"	// for recv_sys
+#include "mach0data.h"
+
+/** Redo log encryption key ID */
+#define LOG_DEFAULT_ENCRYPTION_KEY 1
+
+struct crypt_info_t {
+	uint32_t	checkpoint_no; /*!< checkpoint no; 32 bits */
+	uint32_t	key_version;   /*!< key version */
+	/** random string for encrypting the key */
+	alignas(8) byte	crypt_msg[MY_AES_BLOCK_SIZE];
+	/** the secret key */
+	alignas(8) byte	crypt_key[MY_AES_BLOCK_SIZE];
+	/** a random string for the per-block initialization vector */
+	alignas(4) byte	crypt_nonce[4];
+};
+
+/** The crypt info */
+static crypt_info_t info;
+
+/** Initialization vector used for temporary files/tablespace */
+static byte tmp_iv[MY_AES_BLOCK_SIZE];
+
+/** Crypt info when upgrading from 10.1 */
+static crypt_info_t infos[5 * 2];
+/** First unused slot in infos[] */
+static size_t infos_used;
+
+/* Offsets of a log block header */
+#define LOG_BLOCK_HDR_NO	0	/* block number which must be > 0 and
+					is allowed to wrap around at 2G; the
+					highest bit is set to 1 if this is the
+					first log block in a log flush write
+					segment */
+#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL
+					/* mask used to get the highest bit in
+					the preceding field */
+#define LOG_BLOCK_HDR_DATA_LEN	4	/* number of bytes of log written to
+					this block */
+#define LOG_BLOCK_FIRST_REC_GROUP 6	/* offset of the first start of an
+					mtr log record group in this log block,
+					0 if none; if the value is the same
+					as LOG_BLOCK_HDR_DATA_LEN, it means
+					that the first rec group has not yet
+					been catenated to this log block, but
+					if it will, it will start at this
+					offset; an archive recovery can
+					start parsing the log records starting
+					from this offset in this log block,
+					if value not 0 */
+#define LOG_BLOCK_HDR_SIZE	12	/* size of the log block header in
+					bytes */
+
+#define LOG_BLOCK_KEY		4	/* encryption key version
+					before LOG_BLOCK_CHECKSUM;
+					after log_t::FORMAT_ENC_10_4 only */
+#define LOG_BLOCK_CHECKSUM	4	/* 4 byte checksum of the log block
+					contents; in InnoDB versions
+					< 3.23.52 this did not contain the
+					checksum but the same value as
+					LOG_BLOCK_HDR_NO */
+
+/*********************************************************************//**
+Get a log block's start lsn.
+@return a log block's start lsn */
+static inline
+lsn_t
+log_block_get_start_lsn(
+/*====================*/
+	lsn_t		lsn,		/*!< in: checkpoint lsn */
+	ulint		log_block_no)	/*!< in: log block number */
+{
+	lsn_t	start_lsn =
+		(lsn & (lsn_t)0xffffffff00000000ULL) |
+		(((log_block_no - 1) & (lsn_t)0x3fffffff) << 9);
+	return start_lsn;
+}
+
+/** Generate crypt key from crypt msg.
+@param[in,out]	info	encryption key
+@param[in]	upgrade	whether to use the key in MariaDB 10.1 format
+@return whether the operation was successful */
+static bool init_crypt_key(crypt_info_t* info, bool upgrade = false)
+{
+	byte	mysqld_key[MY_AES_MAX_KEY_LENGTH];
+	uint	keylen = sizeof mysqld_key;
+
+	compile_time_assert(16 == sizeof info->crypt_key);
+	compile_time_assert(16 == MY_AES_BLOCK_SIZE);
+
+	if (uint rc = encryption_key_get(LOG_DEFAULT_ENCRYPTION_KEY,
+					 info->key_version, mysqld_key,
+					 &keylen)) {
+		ib::error()
+			<< "Obtaining redo log encryption key version "
+			<< info->key_version << " failed (" << rc
+			<< ").
Maybe the key or the required encryption "
+			"key management plugin was not found.";
+		info->key_version = ENCRYPTION_KEY_VERSION_INVALID;
+		return false;
+	}
+
+	if (upgrade) {
+		while (keylen < sizeof mysqld_key) {
+			mysqld_key[keylen++] = 0;
+		}
+	}
+
+	uint dst_len;
+	int err= my_aes_crypt(MY_AES_ECB,
+			      ENCRYPTION_FLAG_NOPAD | ENCRYPTION_FLAG_ENCRYPT,
+			      info->crypt_msg, MY_AES_BLOCK_SIZE,
+			      info->crypt_key, &dst_len,
+			      mysqld_key, keylen, NULL, 0);
+
+	if (err != MY_AES_OK || dst_len != MY_AES_BLOCK_SIZE) {
+		ib::error() << "Getting redo log crypto key failed: err = "
+			    << err << ", len = " << dst_len;
+		info->key_version = ENCRYPTION_KEY_VERSION_INVALID;
+		return false;
+	}
+
+	return true;
+}
+
+static ulint log_block_get_hdr_no(const byte *log_block)
+{
+  static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility");
+  return mach_read_from_4(my_assume_aligned<4>(log_block)) &
+    ~LOG_BLOCK_FLUSH_BIT_MASK;
+}
+
+/** Decrypt log blocks.
+@param[in,out]	buf	log blocks to decrypt
+@param[in]	lsn	log sequence number of the start of the buffer
+@param[in]	size	size of the buffer, in bytes
+@return whether the operation succeeded */
+ATTRIBUTE_COLD bool log_decrypt(byte* buf, lsn_t lsn, ulint size)
+{
+	ut_ad(!(size & 511));
+	ut_ad(!(ulint(buf) & 511));
+	ut_a(info.key_version);
+
+	alignas(8) byte aes_ctr_iv[MY_AES_BLOCK_SIZE];
+
+#define LOG_CRYPT_HDR_SIZE 4
+	lsn &= ~lsn_t{511};
+
+	const bool has_encryption_key_rotation
+		= log_sys.format == log_t::FORMAT_ENC_10_4
+		|| log_sys.format == log_t::FORMAT_ENC_10_5;
+
+	for (const byte* const end = buf + size; buf != end;
+	     buf += 512, lsn += 512) {
+		alignas(4) byte dst[512 - LOG_CRYPT_HDR_SIZE
+				    - LOG_BLOCK_CHECKSUM];
+
+		/* The log block number is not encrypted. */
+		memcpy_aligned<4>(dst, buf + LOG_BLOCK_HDR_NO, 4);
+		memcpy_aligned<4>(aes_ctr_iv, buf + LOG_BLOCK_HDR_NO, 4);
+		*aes_ctr_iv &= byte(~(LOG_BLOCK_FLUSH_BIT_MASK >> 24));
+		static_assert(LOG_BLOCK_HDR_NO + 4 == LOG_CRYPT_HDR_SIZE,
+			      "compatibility");
+		memcpy_aligned<4>(aes_ctr_iv + 4, info.crypt_nonce, 4);
+		mach_write_to_8(my_assume_aligned<8>(aes_ctr_iv + 8), lsn);
+		ut_ad(log_block_get_start_lsn(lsn,
+					      log_block_get_hdr_no(buf))
+		      == lsn);
+		byte* key_ver = &buf[512 - LOG_BLOCK_KEY - LOG_BLOCK_CHECKSUM];
+
+		const size_t dst_size = has_encryption_key_rotation
+			? sizeof dst - LOG_BLOCK_KEY
+			: sizeof dst;
+		if (has_encryption_key_rotation) {
+			const auto key_version = info.key_version;
+			info.key_version = mach_read_from_4(key_ver);
+			if (key_version == info.key_version) {
+			} else if (!init_crypt_key(&info)) {
+				return false;
+#ifndef DBUG_OFF
+			} else {
+				DBUG_PRINT("ib_log", ("key_version: %x -> %x",
+						      key_version,
+						      info.key_version));
+#endif /* !DBUG_OFF */
+			}
+		}
+
+		ut_ad(LOG_CRYPT_HDR_SIZE + dst_size
+		      == 512 - LOG_BLOCK_CHECKSUM - LOG_BLOCK_KEY);
+
+		uint dst_len;
+		int rc = encryption_crypt(
+			buf + LOG_CRYPT_HDR_SIZE, static_cast<uint>(dst_size),
+			reinterpret_cast<byte*>(dst), &dst_len,
+			const_cast<byte*>(info.crypt_key),
+			MY_AES_BLOCK_SIZE,
+			aes_ctr_iv, sizeof aes_ctr_iv,
+			ENCRYPTION_FLAG_DECRYPT | ENCRYPTION_FLAG_NOPAD,
+			LOG_DEFAULT_ENCRYPTION_KEY,
+			info.key_version);
+		ut_a(rc == MY_AES_OK);
+		ut_a(dst_len == dst_size);
+		memcpy(buf + LOG_CRYPT_HDR_SIZE, dst, dst_size);
+	}
+
+	return true;
+}
+
+/** Initialize the redo log encryption key and random parameters
+when creating a new redo log.
+The random parameters will be persisted in the log checkpoint pages.
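+On failure, info.key_version will be 0 and redo log encryption will be
+unavailable.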
+@see log_crypt_write_header() +@see log_crypt_read_header() +@return whether the operation succeeded */ +bool log_crypt_init() +{ + info.key_version= + encryption_key_get_latest_version(LOG_DEFAULT_ENCRYPTION_KEY); + + if (info.key_version == ENCRYPTION_KEY_VERSION_INVALID) + ib::error() << "log_crypt_init(): cannot get key version"; + else if (my_random_bytes(tmp_iv, MY_AES_BLOCK_SIZE) != MY_AES_OK || + my_random_bytes(info.crypt_msg, sizeof info.crypt_msg) != + MY_AES_OK || + my_random_bytes(info.crypt_nonce, sizeof info.crypt_nonce) != + MY_AES_OK) + ib::error() << "log_crypt_init(): my_random_bytes() failed"; + else if (init_crypt_key(&info)) + goto func_exit; + + info.key_version= 0; +func_exit: + return info.key_version != 0; +} + +/** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info. +@param[in] buf checkpoint buffer +@return whether the operation was successful */ +ATTRIBUTE_COLD bool log_crypt_101_read_checkpoint(const byte* buf) +{ + buf += 20 + 32 * 9; + + const size_t n = *buf++ == 2 ? std::min(unsigned(*buf++), 5U) : 0; + + for (size_t i = 0; i < n; i++) { + struct crypt_info_t& info = infos[infos_used]; + unsigned checkpoint_no = mach_read_from_4(buf); + for (size_t j = 0; j < infos_used; j++) { + if (infos[j].checkpoint_no == checkpoint_no) { + /* Do not overwrite an existing slot. */ + goto next_slot; + } + } + if (infos_used >= UT_ARR_SIZE(infos)) { + ut_ad("too many checkpoint pages" == 0); + goto next_slot; + } + infos_used++; + info.checkpoint_no = checkpoint_no; + info.key_version = mach_read_from_4(buf + 4); + memcpy(info.crypt_msg, buf + 8, MY_AES_BLOCK_SIZE); + memcpy(info.crypt_nonce, buf + 24, sizeof info.crypt_nonce); + + if (!init_crypt_key(&info, true)) { + return false; + } +next_slot: + buf += 4 + 4 + 2 * MY_AES_BLOCK_SIZE; + } + + return true; +} + +/** Decrypt a MariaDB 10.1 redo log block. +@param[in,out] buf log block +@param[in] start_lsn server start LSN +@return whether the decryption was successful */ +ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn) +{ + const uint32_t checkpoint_no = mach_read_from_4(buf + 8); + const crypt_info_t* info = infos; + for (const crypt_info_t* const end = info + infos_used; info < end; + info++) { + if (info->key_version + && info->key_version != ENCRYPTION_KEY_VERSION_INVALID + && info->checkpoint_no == checkpoint_no) { + goto found; + } + } + + if (infos_used == 0) { + return false; + } + /* MariaDB Server 10.1 would use the first key if it fails to + find a key for the current checkpoint. */ + info = infos; + if (info->key_version == ENCRYPTION_KEY_VERSION_INVALID) { + return false; + } +found: + byte dst[512]; + uint dst_len; + byte aes_ctr_iv[MY_AES_BLOCK_SIZE]; + + const uint src_len = 512 - LOG_BLOCK_HDR_SIZE; + + ulint log_block_no = log_block_get_hdr_no(buf); + + /* The log block header is not encrypted. 
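+	(The whole 512-byte block is first copied to dst, so that the
+	unencrypted header will be preserved when dst is copied back.)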
*/
+	memcpy(dst, buf, 512);
+
+	memcpy(aes_ctr_iv, info->crypt_nonce, 3);
+	mach_write_to_8(aes_ctr_iv + 3,
+			log_block_get_start_lsn(start_lsn, log_block_no));
+	memcpy(aes_ctr_iv + 11, buf, 4);
+	aes_ctr_iv[11] &= byte(~(LOG_BLOCK_FLUSH_BIT_MASK >> 24));
+	aes_ctr_iv[15] = 0;
+
+	int rc = encryption_crypt(buf + LOG_BLOCK_HDR_SIZE, src_len,
+				  dst + LOG_BLOCK_HDR_SIZE, &dst_len,
+				  const_cast<byte*>(info->crypt_key),
+				  MY_AES_BLOCK_SIZE,
+				  aes_ctr_iv, MY_AES_BLOCK_SIZE,
+				  ENCRYPTION_FLAG_DECRYPT
+				  | ENCRYPTION_FLAG_NOPAD,
+				  LOG_DEFAULT_ENCRYPTION_KEY,
+				  info->key_version);
+
+	if (rc != MY_AES_OK || dst_len != src_len) {
+		return false;
+	}
+
+	memcpy(buf, dst, sizeof dst);
+	return true;
+}
+
+/** MariaDB 10.2.5 encrypted redo log encryption key version (32 bits)*/
+constexpr size_t LOG_CHECKPOINT_CRYPT_KEY= 32;
+/** MariaDB 10.2.5 encrypted redo log random nonce (32 bits) */
+constexpr size_t LOG_CHECKPOINT_CRYPT_NONCE= 36;
+/** MariaDB 10.2.5 encrypted redo log random message (MY_AES_BLOCK_SIZE) */
+constexpr size_t LOG_CHECKPOINT_CRYPT_MESSAGE= 40;
+
+/** Add the encryption information to the log header buffer.
+@param buf part of log header buffer */
+void log_crypt_write_header(byte *buf)
+{
+  ut_ad(info.key_version);
+  mach_write_to_4(my_assume_aligned<4>(buf), LOG_DEFAULT_ENCRYPTION_KEY);
+  mach_write_to_4(my_assume_aligned<4>(buf + 4), info.key_version);
+  memcpy_aligned<8>(buf + 8, info.crypt_msg, MY_AES_BLOCK_SIZE);
+  static_assert(MY_AES_BLOCK_SIZE == 16, "compatibility");
+  memcpy_aligned<4>(buf + 24, info.crypt_nonce, sizeof info.crypt_nonce);
+}
+
+/** Read the encryption information from a log header buffer.
+@param buf part of log header buffer
+@return whether the operation was successful */
+bool log_crypt_read_header(const byte *buf)
+{
+  MEM_UNDEFINED(&info.checkpoint_no, sizeof info.checkpoint_no);
+  MEM_NOACCESS(&info.checkpoint_no, sizeof info.checkpoint_no);
+  if (mach_read_from_4(my_assume_aligned<4>(buf)) !=
+      LOG_DEFAULT_ENCRYPTION_KEY)
+    return false;
+  info.key_version= mach_read_from_4(my_assume_aligned<4>(buf + 4));
+  memcpy_aligned<8>(info.crypt_msg, buf + 8, MY_AES_BLOCK_SIZE);
+  memcpy_aligned<4>(info.crypt_nonce, buf + 24, sizeof info.crypt_nonce);
+  return init_crypt_key(&info);
+}
+
+/** Read the checkpoint crypto (version, msg and iv) info.
+@param[in]	buf	checkpoint buffer
+@return whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_read_checkpoint_buf(const byte* buf)
+{
+	info.checkpoint_no = mach_read_from_4(buf + 4);
+	info.key_version = mach_read_from_4(buf + LOG_CHECKPOINT_CRYPT_KEY);
+
+#if MY_AES_BLOCK_SIZE != 16
+# error "MY_AES_BLOCK_SIZE != 16; redo log checkpoint format affected"
+#endif
+	compile_time_assert(16 == sizeof info.crypt_msg);
+	compile_time_assert(16 == MY_AES_BLOCK_SIZE);
+	compile_time_assert(LOG_CHECKPOINT_CRYPT_MESSAGE
+			    - LOG_CHECKPOINT_CRYPT_NONCE
+			    == sizeof info.crypt_nonce);
+
+	memcpy(info.crypt_msg, buf + LOG_CHECKPOINT_CRYPT_MESSAGE,
+	       MY_AES_BLOCK_SIZE);
+	memcpy(info.crypt_nonce, buf + LOG_CHECKPOINT_CRYPT_NONCE,
+	       sizeof info.crypt_nonce);
+
+	return init_crypt_key(&info);
+}
+
+/** Encrypt or decrypt a temporary file block.
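+The initialization vector consists of the block offset followed by the
+random tmp_iv that was generated by log_crypt_init().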
+@param[in]	src		block to encrypt or decrypt
+@param[in]	size		size of the block
+@param[out]	dst		destination block
+@param[in]	offs		offset to block
+@param[in]	encrypt		true=encrypt; false=decrypt
+@return whether the operation succeeded */
+bool log_tmp_block_encrypt(
+	const byte*	src,
+	ulint		size,
+	byte*		dst,
+	uint64_t	offs,
+	bool		encrypt)
+{
+	uint dst_len;
+	uint64_t iv[MY_AES_BLOCK_SIZE / sizeof(uint64_t)];
+	iv[0] = offs;
+	memcpy(iv + 1, tmp_iv, sizeof iv - sizeof *iv);
+
+	int rc = encryption_crypt(
+		src, uint(size), dst, &dst_len,
+		const_cast<byte*>(info.crypt_key), MY_AES_BLOCK_SIZE,
+		reinterpret_cast<byte*>(iv), uint(sizeof iv),
+		encrypt
+		? ENCRYPTION_FLAG_ENCRYPT|ENCRYPTION_FLAG_NOPAD
+		: ENCRYPTION_FLAG_DECRYPT|ENCRYPTION_FLAG_NOPAD,
+		LOG_DEFAULT_ENCRYPTION_KEY, info.key_version);
+
+	if (rc != MY_AES_OK) {
+		ib::error() << (encrypt ? "Encryption" : "Decryption")
+			    << " failed for temporary file: " << rc;
+	}
+
+	return rc == MY_AES_OK;
+}
+
+/** Decrypt part of a log record.
+@param iv   initialization vector
+@param buf  buffer for the decrypted data
+@param data the encrypted data
+@param len  length of the data, in bytes
+@return buf */
+byte *log_decrypt_buf(const byte *iv, byte *buf, const byte *data, uint len)
+{
+  ut_a(MY_AES_OK == encryption_crypt(data, len, buf, &len,
+                                     info.crypt_key, MY_AES_BLOCK_SIZE,
+                                     iv, MY_AES_BLOCK_SIZE,
+                                     ENCRYPTION_FLAG_DECRYPT |
+                                     ENCRYPTION_FLAG_NOPAD,
+                                     LOG_DEFAULT_ENCRYPTION_KEY,
+                                     info.key_version));
+  return buf;
+}
+
+#include "mtr0log.h"
+
+/** Encrypt a log snippet
+@param iv   initialization vector
+@param tmp  temporary buffer
+@param buf  buffer to be replaced with encrypted contents
+@param end  pointer past the end of buf
+@return encrypted data bytes that follow */
+static size_t log_encrypt_buf(byte iv[MY_AES_BLOCK_SIZE],
+                              byte *&tmp, byte *buf, const byte *const end)
+{
+  for (byte *l= buf; l != end; )
+  {
+    const byte b= *l++;
+    size_t rlen= b & 0xf;
+    if (!rlen)
+    {
+      const size_t lenlen= mlog_decode_varint_length(*l);
+      const uint32_t addlen= mlog_decode_varint(l);
+      ut_ad(addlen != MLOG_DECODE_ERROR);
+      rlen= addlen + 15 - lenlen;
+      l+= lenlen;
+    }
+
+    if (b < 0x80)
+    {
+      /* Add the page identifier to the initialization vector. */
+      size_t idlen= mlog_decode_varint_length(*l);
+      ut_ad(idlen <= 5);
+      ut_ad(idlen < rlen);
+      mach_write_to_4(my_assume_aligned<4>(iv + 8), mlog_decode_varint(l));
+      l+= idlen;
+      rlen-= idlen;
+      idlen= mlog_decode_varint_length(*l);
+      ut_ad(idlen <= 5);
+      ut_ad(idlen <= rlen);
+      mach_write_to_4(my_assume_aligned<4>(iv + 12), mlog_decode_varint(l));
+      l+= idlen;
+      rlen-= idlen;
+    }
+
+    uint len;
+
+    if (l + rlen > end)
+    {
+      if (size_t len= end - l)
+      {
+        /* Only WRITE or EXTENDED records may comprise multiple segments. */
+        static_assert((EXTENDED | 0x10) == WRITE, "compatibility");
+        ut_ad((b & 0x60) == EXTENDED);
+        ut_ad(l < end);
+        memcpy(tmp, l, len);
+        tmp+= len;
+        rlen-= len;
+      }
+      return rlen;
+    }
+
+    if (!rlen)
+      continue; /* FREE_PAGE and INIT_PAGE have no payload.
*/
+
+    len= static_cast<uint>(rlen);
+    ut_a(MY_AES_OK == encryption_crypt(l, len, tmp, &len,
+                                       info.crypt_key, MY_AES_BLOCK_SIZE,
+                                       iv, MY_AES_BLOCK_SIZE,
+                                       ENCRYPTION_FLAG_ENCRYPT |
+                                       ENCRYPTION_FLAG_NOPAD,
+                                       LOG_DEFAULT_ENCRYPTION_KEY,
+                                       info.key_version));
+    ut_ad(len == rlen);
+    memcpy(l, tmp, rlen);
+    l+= rlen;
+  }
+
+  return 0;
+}
+
+/** Encrypt the log */
+ATTRIBUTE_NOINLINE void mtr_t::encrypt()
+{
+  ut_ad(log_sys.format == log_t::FORMAT_ENC_10_8);
+  ut_ad(m_log.size());
+
+  alignas(8) byte iv[MY_AES_BLOCK_SIZE];
+
+  m_commit_lsn= log_sys.get_lsn();
+  ut_ad(m_commit_lsn);
+  byte *tmp= static_cast<byte*>(alloca(srv_page_size)), *t= tmp;
+  byte *dst= static_cast<byte*>(alloca(srv_page_size));
+  mach_write_to_8(iv, m_commit_lsn);
+  mtr_buf_t::block_t *start= nullptr;
+  size_t size= 0, start_size= 0;
+  m_crc= 0;
+
+  m_log.for_each_block([&](mtr_buf_t::block_t *b)
+  {
+    ut_ad(t - tmp + size <= srv_page_size);
+    byte *buf= b->begin();
+    if (!start)
+    {
+    parse:
+      ut_ad(t == tmp);
+      size= log_encrypt_buf(iv, t, buf, b->end());
+      if (!size)
+      {
+        ut_ad(t == tmp);
+        start_size= 0;
+      }
+      else
+      {
+        start= b;
+        start_size= t - tmp;
+      }
+      m_crc= my_crc32c(m_crc, buf, b->end() - buf - start_size);
+    }
+    else if (size > b->used())
+    {
+      ::memcpy(t, buf, b->used());
+      t+= b->used();
+      size-= b->used();
+    }
+    else
+    {
+      ::memcpy(t, buf, size);
+      t+= size;
+      buf+= size;
+      uint len= static_cast<uint>(t - tmp);
+      ut_a(MY_AES_OK == encryption_crypt(tmp, len, dst, &len,
+                                         info.crypt_key, MY_AES_BLOCK_SIZE,
+                                         iv, MY_AES_BLOCK_SIZE,
+                                         ENCRYPTION_FLAG_ENCRYPT |
+                                         ENCRYPTION_FLAG_NOPAD,
+                                         LOG_DEFAULT_ENCRYPTION_KEY,
+                                         info.key_version));
+      ut_ad(tmp + len == t);
+      m_crc= my_crc32c(m_crc, dst, len);
+      /* Copy the encrypted data back to the log snippets. */
+      ::memcpy(start->end() - start_size, dst, start_size);
+      t= dst + start_size;
+      for (ilist<mtr_buf_t::block_t>::iterator i(start); &*++i != b;)
+      {
+        const size_t l{i->used()};
+        ::memcpy(i->begin(), t, l);
+        t+= l;
+      }
+      ::memcpy(b->begin(), t, size);
+      ut_ad(t + size == dst + len);
+      t= tmp;
+      start= nullptr;
+      goto parse;
+    }
+    return true;
+  });
+
+  ut_ad(t == tmp);
+  ut_ad(!start);
+  ut_ad(!size);
+}
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
new file mode 100644
index 00000000..91999c81
--- /dev/null
+++ b/storage/innobase/log/log0log.cc
@@ -0,0 +1,1358 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0log.cc
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <debug_sync.h>
+#include <my_service_manager.h>
+
+#include "log0log.h"
+#include "log0crypt.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "fil0fil.h"
+#include "dict0stats_bg.h"
+#include "btr0defragment.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "srv0mon.h"
+#include "buf0dump.h"
+#include "log0sync.h"
+#include "log.h"
+
+/*
+General philosophy of InnoDB redo-logs:
+
+Every change to the contents of a data page must be done
+through mtr_t, and mtr_t::commit() will write log records
+to the InnoDB redo log. */
+
+alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+static group_commit_lock flush_lock;
+alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+static group_commit_lock write_lock;
+
+/** Redo log system */
+log_t	log_sys;
+
+/* Margins for free space in the log buffer after a log entry is catenated */
+#define LOG_BUF_FLUSH_RATIO	2
+#define LOG_BUF_FLUSH_MARGIN	((4 * 4096) /* cf. log_t::append_prepare() */ \
+				 + (4U << srv_page_size_shift))
+
+void log_t::set_capacity()
+{
+#ifndef SUX_LOCK_GENERIC
+	ut_ad(log_sys.latch.is_write_locked());
+#endif
+	/* Margin for the free space in the smallest log, before a new query
+	step which modifies the database, is started */
+
+	lsn_t smallest_capacity = srv_log_file_size - log_t::START_OFFSET;
+	/* Add extra safety */
+	smallest_capacity -= smallest_capacity / 10;
+
+	lsn_t margin = smallest_capacity - (48 << srv_page_size_shift);
+	margin -= margin / 10;	/* Add still some extra safety */
+
+	log_sys.log_capacity = smallest_capacity;
+
+	log_sys.max_modified_age_async = margin - margin / 8;
+	log_sys.max_checkpoint_age = margin;
+}
+
+#ifdef HAVE_PMEM
+void log_t::create_low()
+#else
+bool log_t::create()
+#endif
+{
+  ut_ad(this == &log_sys);
+  ut_ad(!is_initialised());
+
+  /* LSN 0 and 1 are reserved; @see buf_page_t::oldest_modification_ */
+  lsn.store(FIRST_LSN, std::memory_order_relaxed);
+  flushed_to_disk_lsn.store(FIRST_LSN, std::memory_order_relaxed);
+  write_lsn= FIRST_LSN;
+
+#ifndef HAVE_PMEM
+  buf= static_cast<byte*>(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME));
+  if (!buf)
+  {
+  alloc_fail:
+    sql_print_error("InnoDB: Cannot allocate memory;"
+                    " too large innodb_log_buffer_size?");
+    return false;
+  }
+  flush_buf= static_cast<byte*>(ut_malloc_dontdump(buf_size,
+                                                   PSI_INSTRUMENT_ME));
+  if (!flush_buf)
+  {
+    ut_free_dodump(buf, buf_size);
+    buf= nullptr;
+    goto alloc_fail;
+  }
+
+  TRASH_ALLOC(buf, buf_size);
+  TRASH_ALLOC(flush_buf, buf_size);
+  checkpoint_buf= static_cast<byte*>(aligned_malloc(4096, 4096));
+  memset_aligned<4096>(checkpoint_buf, 0, 4096);
+#else
+  ut_ad(!checkpoint_buf);
+  ut_ad(!buf);
+  ut_ad(!flush_buf);
+#endif
+
+  latch.SRW_LOCK_INIT(log_latch_key);
+  init_lsn_lock();
+
+  max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN;
+  set_check_flush_or_checkpoint();
+
+  last_checkpoint_lsn= FIRST_LSN;
+  log_capacity= 0;
+  max_modified_age_async= 0;
+  max_checkpoint_age= 0;
+  next_checkpoint_lsn= 0;
+  checkpoint_pending= false;
+
+  buf_free= 0;
+
+ ut_ad(is_initialised());
+#ifndef HAVE_PMEM
+  return true;
+#endif
+}
+
+dberr_t log_file_t::close() noexcept
+{
+  ut_a(is_opened());
+
+  if (!os_file_close_func(m_file))
+    return DB_ERROR;
+
+  m_file= OS_FILE_CLOSED;
+  return DB_SUCCESS;
+}
+
+__attribute__((warn_unused_result))
+dberr_t log_file_t::read(os_offset_t offset, span<byte> buf) noexcept
+{
+  ut_ad(is_opened());
+  return os_file_read(IORequestRead, m_file, buf.data(), offset, buf.size(),
+                      nullptr);
+}
+
+void log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept
+{
+  ut_ad(is_opened());
+  if (dberr_t err= os_file_write_func(IORequestWrite, "ib_logfile0", m_file,
+                                      buf.data(), offset, buf.size()))
+    ib::fatal() << "write(\"ib_logfile0\") returned " << err;
+}
+
+#ifdef HAVE_PMEM
+# include <libpmem.h>
+
+/** Attempt to memory map a file.
+@param file log file handle
+@param size file size
+@return pointer to memory mapping
+@retval MAP_FAILED if the memory cannot be mapped */
+static void *log_mmap(os_file_t file, os_offset_t size)
+{
+  void *ptr=
+    my_mmap(0, size_t(size),
+            srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE,
+            MAP_SHARED_VALIDATE | MAP_SYNC, file, 0);
+#ifdef __linux__
+  if (ptr == MAP_FAILED)
+  {
+    struct stat st;
+    if (!fstat(file, &st))
+    {
+      MSAN_STAT_WORKAROUND(&st);
+      const auto st_dev= st.st_dev;
+      if (!stat("/dev/shm", &st))
+      {
+        MSAN_STAT_WORKAROUND(&st);
+        if (st.st_dev == st_dev)
+          ptr= my_mmap(0, size_t(size),
+                       srv_read_only_mode
+                       ? PROT_READ : PROT_READ | PROT_WRITE,
+                       MAP_SHARED, file, 0);
+      }
+    }
+  }
+#endif /* __linux__ */
+  return ptr;
+}
+#endif
+
+#ifdef HAVE_PMEM
+bool log_t::attach(log_file_t file, os_offset_t size)
+#else
+void log_t::attach_low(log_file_t file, os_offset_t size)
+#endif
+{
+  log= file;
+  ut_ad(!size || size >= START_OFFSET + SIZE_OF_FILE_CHECKPOINT);
+  file_size= size;
+
+#ifdef HAVE_PMEM
+  ut_ad(!buf);
+  ut_ad(!flush_buf);
+  if (size && !(size_t(size) & 4095) && srv_operation != SRV_OPERATION_BACKUP)
+  {
+    void *ptr= log_mmap(log.m_file, size);
+    if (ptr != MAP_FAILED)
+    {
+      log.close();
+      mprotect(ptr, size_t(size), PROT_READ);
+      buf= static_cast<byte*>(ptr);
+# if defined __linux__ || defined _WIN32
+      set_block_size(CPU_LEVEL1_DCACHE_LINESIZE);
+# endif
+      log_maybe_unbuffered= true;
+      log_buffered= false;
+      return true;
+    }
+  }
+  buf= static_cast<byte*>(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME));
+  if (!buf)
+  {
+  alloc_fail:
+    max_buf_free= 0;
+    sql_print_error("InnoDB: Cannot allocate memory;"
+                    " too large innodb_log_buffer_size?");
+    return false;
+  }
+  flush_buf= static_cast<byte*>(ut_malloc_dontdump(buf_size,
+                                                   PSI_INSTRUMENT_ME));
+  if (!flush_buf)
+  {
+    ut_free_dodump(buf, buf_size);
+    buf= nullptr;
+    goto alloc_fail;
+  }
+
+  TRASH_ALLOC(buf, buf_size);
+  TRASH_ALLOC(flush_buf, buf_size);
+#endif
+
+#if defined __linux__ || defined _WIN32
+  sql_print_information("InnoDB: %s (block size=%u bytes)",
+                        log_buffered
+                        ? "Buffered log writes"
+                        : "File system buffers for log disabled",
+                        block_size);
+#endif
+
+#ifdef HAVE_PMEM
+  checkpoint_buf= static_cast<byte*>(aligned_malloc(block_size, block_size));
+  memset_aligned<64>(checkpoint_buf, 0, block_size);
+  return true;
+#endif
+}
+
+/** Write a log file header.
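+The first 508 bytes of the buffer will be protected by a CRC-32C
+checksum that is stored in the following 4 bytes.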
+@param buf       log header buffer
+@param lsn       log sequence number corresponding to log_sys.START_OFFSET
+@param encrypted whether the log is encrypted */
+void log_t::header_write(byte *buf, lsn_t lsn, bool encrypted)
+{
+  mach_write_to_4(my_assume_aligned<4>(buf) + LOG_HEADER_FORMAT,
+                  log_sys.FORMAT_10_8);
+  mach_write_to_8(my_assume_aligned<8>(buf + LOG_HEADER_START_LSN), lsn);
+
+#if defined __GNUC__ && __GNUC__ > 7
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wstringop-truncation"
+#endif
+  strncpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
+          "MariaDB " PACKAGE_VERSION,
+          LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR);
+#if defined __GNUC__ && __GNUC__ > 7
+# pragma GCC diagnostic pop
+#endif
+
+  if (encrypted)
+    log_crypt_write_header(buf + LOG_HEADER_CREATOR_END);
+  mach_write_to_4(my_assume_aligned<4>(508 + buf), my_crc32c(0, buf, 508));
+}
+
+void log_t::create(lsn_t lsn) noexcept
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(latch.is_write_locked());
+#endif
+  ut_ad(!recv_no_log_write);
+  ut_ad(is_latest());
+  ut_ad(this == &log_sys);
+
+  this->lsn.store(lsn, std::memory_order_relaxed);
+  this->flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed);
+  first_lsn= lsn;
+  write_lsn= lsn;
+
+  last_checkpoint_lsn= 0;
+
+#ifdef HAVE_PMEM
+  if (is_pmem())
+  {
+    mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
+    memset_aligned<4096>(buf, 0, 4096);
+    buf_free= START_OFFSET;
+  }
+  else
+#endif
+  {
+    buf_free= 0;
+    memset_aligned<4096>(flush_buf, 0, buf_size);
+    memset_aligned<4096>(buf, 0, buf_size);
+  }
+
+  log_sys.header_write(buf, lsn, is_encrypted());
+  DBUG_PRINT("ib_log", ("write header " LSN_PF, lsn));
+
+#ifdef HAVE_PMEM
+  if (is_pmem())
+    pmem_persist(buf, 512);
+  else
+#endif
+  {
+    log.write(0, {buf, 4096});
+    memset_aligned<512>(buf, 0, 512);
+  }
+}
+
+void log_t::close_file()
+{
+#ifdef HAVE_PMEM
+  if (is_pmem())
+  {
+    ut_ad(!is_opened());
+    ut_ad(!checkpoint_buf);
+    if (buf)
+    {
+      my_munmap(buf, file_size);
+      buf= nullptr;
+    }
+    return;
+  }
+
+  ut_free_dodump(buf, buf_size);
+  buf= nullptr;
+  ut_free_dodump(flush_buf, buf_size);
+  flush_buf= nullptr;
+  aligned_free(checkpoint_buf);
+  checkpoint_buf= nullptr;
+#endif
+  if (is_opened())
+    if (const dberr_t err= log.close())
+      ib::fatal() << "closing ib_logfile0 failed: " << err;
+}
+
+/** Acquire all latches that protect the log. */
+static void log_resize_acquire()
+{
+  if (!log_sys.is_pmem())
+  {
+    while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
+           group_commit_lock::ACQUIRED);
+    while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
+           group_commit_lock::ACQUIRED);
+  }
+
+  log_sys.latch.wr_lock(SRW_LOCK_CALL);
+}
+
+/** Release the latches that protect the log.
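+Any log writes that were requested while the latches were held will be
+completed before this function returns.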
*/
+void log_resize_release()
+{
+  log_sys.latch.wr_unlock();
+
+  if (!log_sys.is_pmem())
+  {
+    lsn_t lsn1= write_lock.release(write_lock.value());
+    lsn_t lsn2= flush_lock.release(flush_lock.value());
+    if (lsn1 || lsn2)
+      log_write_up_to(std::max(lsn1, lsn2), true, nullptr);
+  }
+}
+
+#if defined __linux__ || defined _WIN32
+/** Try to enable or disable file system caching (update log_buffered) */
+void log_t::set_buffered(bool buffered)
+{
+  if (!log_maybe_unbuffered || is_pmem() || high_level_read_only)
+    return;
+  log_resize_acquire();
+  if (!resize_in_progress() && is_opened() && bool(log_buffered) != buffered)
+  {
+    os_file_close_func(log.m_file);
+    log.m_file= OS_FILE_CLOSED;
+    std::string path{get_log_file_path()};
+    log_buffered= buffered;
+    bool success;
+    log.m_file= os_file_create_func(path.c_str(),
+                                    OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE,
+                                    false, &success);
+    ut_a(log.m_file != OS_FILE_CLOSED);
+    sql_print_information("InnoDB: %s (block size=%u bytes)",
+                          log_buffered
+                          ? "Buffered log writes"
+                          : "File system buffers for log disabled",
+                          block_size);
+  }
+  log_resize_release();
+}
+#endif
+
+/** Start resizing the log and release the exclusive latch.
+@param size requested new file_size
+@return whether the resizing was started successfully */
+log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept
+{
+  ut_ad(size >= 4U << 20);
+  ut_ad(!(size & 4095));
+  ut_ad(!srv_read_only_mode);
+
+  log_resize_acquire();
+
+  resize_start_status status= RESIZE_NO_CHANGE;
+  lsn_t start_lsn{0};
+
+  if (resize_in_progress())
+    status= RESIZE_IN_PROGRESS;
+  else if (size != file_size)
+  {
+    ut_ad(!resize_in_progress());
+    ut_ad(!resize_log.is_opened());
+    ut_ad(!resize_buf);
+    ut_ad(!resize_flush_buf);
+    std::string path{get_log_file_path("ib_logfile101")};
+    bool success;
+    resize_lsn.store(1, std::memory_order_relaxed);
+    resize_target= 0;
+    resize_log.m_file=
+      os_file_create_func(path.c_str(),
+                          OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
+                          OS_FILE_NORMAL, OS_LOG_FILE, false, &success);
+    if (success)
+    {
+      log_resize_release();
+
+      void *ptr= nullptr, *ptr2= nullptr;
+      success= os_file_set_size(path.c_str(), resize_log.m_file, size);
+      if (!success);
+#ifdef HAVE_PMEM
+      else if (is_pmem())
+      {
+        ptr= log_mmap(resize_log.m_file, size);
+        if (ptr == MAP_FAILED)
+          goto alloc_fail;
+      }
+#endif
+      else
+      {
+        ptr= ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME);
+        if (ptr)
+        {
+          TRASH_ALLOC(ptr, buf_size);
+          ptr2= ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME);
+          if (ptr2)
+            TRASH_ALLOC(ptr2, buf_size);
+          else
+          {
+            ut_free_dodump(ptr, buf_size);
+            ptr= nullptr;
+            goto alloc_fail;
+          }
+        }
+        else
+        alloc_fail:
+          success= false;
+      }
+
+      log_resize_acquire();
+
+      if (!success)
+      {
+        resize_log.close();
+        IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str()));
+      }
+      else
+      {
+        resize_target= size;
+        resize_buf= static_cast<byte*>(ptr);
+        resize_flush_buf= static_cast<byte*>(ptr2);
+        if (is_pmem())
+        {
+          resize_log.close();
+          start_lsn= get_lsn();
+        }
+        else
+        {
+          memcpy_aligned<16>(resize_buf, buf, (buf_free + 15) & ~15);
+          start_lsn= first_lsn +
+            (~lsn_t{get_block_size() - 1} & (write_lsn - first_lsn));
+        }
+      }
+      resize_lsn.store(start_lsn, std::memory_order_relaxed);
+      status= success ?
RESIZE_STARTED : RESIZE_FAILED; + } + } + + log_resize_release(); + + if (start_lsn) + { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_t target_lsn= buf_pool.get_oldest_modification(0); + if (start_lsn < target_lsn) + start_lsn= target_lsn + 1; + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_flush_ahead(start_lsn, false); + } + + return status; +} + +/** Abort log resizing. */ +void log_t::resize_abort() noexcept +{ + log_resize_acquire(); + + if (resize_in_progress() > 1) + { + if (!is_pmem()) + { + resize_log.close(); + ut_free_dodump(resize_buf, buf_size); + ut_free_dodump(resize_flush_buf, buf_size); + resize_flush_buf= nullptr; + } +#ifdef HAVE_PMEM + else + { + ut_ad(!resize_log.is_opened()); + ut_ad(!resize_flush_buf); + if (resize_buf) + my_munmap(resize_buf, resize_target); + } +#endif + resize_buf= nullptr; + resize_target= 0; + resize_lsn.store(0, std::memory_order_relaxed); + } + + log_resize_release(); +} + +/** Write an aligned buffer to ib_logfile0. +@param buf buffer to be written +@param len length of data to be written +@param offset log file offset */ +static void log_write_buf(const byte *buf, size_t len, lsn_t offset) +{ + ut_ad(write_lock.is_owner()); + ut_ad(!recv_no_log_write); + ut_d(const size_t block_size_1= log_sys.get_block_size() - 1); + ut_ad(!(offset & block_size_1)); + ut_ad(!(len & block_size_1)); + ut_ad(!(size_t(buf) & block_size_1)); + ut_ad(len); + + if (UNIV_LIKELY(offset + len <= log_sys.file_size)) + { +write: + log_sys.log.write(offset, {buf, len}); + return; + } + + const size_t write_len= size_t(log_sys.file_size - offset); + log_sys.log.write(offset, {buf, write_len}); + len-= write_len; + buf+= write_len; + ut_ad(log_sys.START_OFFSET + len < offset); + offset= log_sys.START_OFFSET; + goto write; +} + +/** Invoke commit_checkpoint_notify_ha() to notify that outstanding +log writes have been completed. */ +void log_flush_notify(lsn_t flush_lsn); + +#if 0 // Currently we overwrite the last log block until it is complete. +/** CRC-32C of pad messages using between 1 and 15 bytes of NUL bytes +in the payload */ +static const unsigned char pad_crc[15][4]= { + {0xA6,0x59,0xC1,0xDB}, {0xF2,0xAF,0x80,0x73}, {0xED,0x02,0xF1,0x90}, + {0x68,0x4E,0xA3,0xF3}, {0x5D,0x1B,0xEA,0x6A}, {0xE0,0x01,0x86,0xB9}, + {0xD1,0x06,0x86,0xF5}, {0xEB,0x20,0x12,0x33}, {0xBA,0x73,0xB2,0xA3}, + {0x5F,0xA2,0x08,0x03}, {0x70,0x03,0xD6,0x9D}, {0xED,0xB3,0x49,0x78}, + {0xFD,0xD6,0xB9,0x9C}, {0x25,0xF8,0xB1,0x2C}, {0xCD,0xAA,0xE7,0x10} +}; + +/** Pad the log with some dummy bytes +@param lsn desired log sequence number +@param pad number of bytes to append to the log +@param begin buffer to write 'pad' bytes to +@param extra buffer for additional pad bytes (up to 15 bytes) +@return additional bytes used in extra[] */ +ATTRIBUTE_NOINLINE +static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra) +{ + ut_ad(!(size_t(begin + pad) & (log_sys.get_block_size() - 1))); + byte *b= begin; + const byte seq{log_sys.get_sequence_bit(lsn)}; + /* The caller should never request padding such that the + file would wrap around to the beginning. That is, the sequence + bit must be the same for all records. */ + ut_ad(seq == log_sys.get_sequence_bit(lsn + pad)); + + if (log_sys.is_encrypted()) + { + /* The lengths of our pad messages vary between 15 and 29 bytes + (FILE_CHECKPOINT byte, 1 to 15 NUL bytes, sequence byte, + 4 bytes checksum, 8 NUL bytes nonce). 
*/ + if (pad < 15) + { + extra[0]= FILE_CHECKPOINT | 1; + extra[1]= 0; + extra[2]= seq; + memcpy(extra + 3, pad_crc[0], 4); + memset(extra + 7, 0, 8); + memcpy(b, extra, pad); + memmove(extra, extra + pad, 15 - pad); + return 15 - pad; + } + + /* Pad first with 29-byte messages until the remaining size is + less than 29+15 bytes, and then write 1 or 2 shorter messages. */ + const byte *const end= begin + pad; + for (; b + (29 + 15) < end; b+= 29) + { + b[0]= FILE_CHECKPOINT | 15; + memset(b + 1, 0, 15); + b[16]= seq; + memcpy(b + 17, pad_crc[14], 4); + memset(b + 21, 0, 8); + } + if (b + 29 < end) + { + b[0]= FILE_CHECKPOINT | 1; + b[1]= 0; + b[2]= seq; + memcpy(b + 3, pad_crc[0], 4); + memset(b + 7, 0, 8); + b+= 15; + } + const size_t last_pad(end - b); + ut_ad(last_pad >= 15); + ut_ad(last_pad <= 29); + b[0]= FILE_CHECKPOINT | byte(last_pad - 14); + memset(b + 1, 0, last_pad - 14); + b[last_pad - 13]= seq; + memcpy(b + last_pad - 12, pad_crc[last_pad - 15], 4); + memset(b + last_pad - 8, 0, 8); + } + else + { + /* The lengths of our pad messages vary between 7 and 21 bytes + (FILE_CHECKPOINT byte, 1 to 15 NUL bytes, sequence byte, + 4 bytes checksum). */ + if (pad < 7) + { + extra[0]= FILE_CHECKPOINT | 1; + extra[1]= 0; + extra[2]= seq; + memcpy(extra + 3, pad_crc[0], 4); + memcpy(b, extra, pad); + memmove(extra, extra + pad, 7 - pad); + return 7 - pad; + } + + /* Pad first with 21-byte messages until the remaining size is + less than 21+7 bytes, and then write 1 or 2 shorter messages. */ + const byte *const end= begin + pad; + for (; b + (21 + 7) < end; b+= 21) + { + b[0]= FILE_CHECKPOINT | 15; + memset(b + 1, 0, 15); + b[16]= seq; + memcpy(b + 17, pad_crc[14], 4); + } + if (b + 21 < end) + { + b[0]= FILE_CHECKPOINT | 1; + b[1]= 0; + b[2]= seq; + memcpy(b + 3, pad_crc[0], 4); + b+= 7; + } + const size_t last_pad(end - b); + ut_ad(last_pad >= 7); + ut_ad(last_pad <= 21); + b[0]= FILE_CHECKPOINT | byte(last_pad - 6); + memset(b + 1, 0, last_pad - 6); + b[last_pad - 5]= seq; + memcpy(b + last_pad - 4, pad_crc[last_pad - 7], 4); + } + + return 0; +} +#endif + +#ifdef HAVE_PMEM +/** Persist the log. +@param lsn desired new value of flushed_to_disk_lsn */ +inline void log_t::persist(lsn_t lsn) noexcept +{ + ut_ad(is_pmem()); + ut_ad(!write_lock.is_owner()); + ut_ad(!flush_lock.is_owner()); + + lsn_t old= flushed_to_disk_lsn.load(std::memory_order_relaxed); + + if (old >= lsn) + return; + + const lsn_t resizing{resize_in_progress()}; + if (UNIV_UNLIKELY(resizing)) + latch.rd_lock(SRW_LOCK_CALL); + const size_t start(calc_lsn_offset(old)); + const size_t end(calc_lsn_offset(lsn)); + + if (UNIV_UNLIKELY(end < start)) + { + pmem_persist(log_sys.buf + start, log_sys.file_size - start); + pmem_persist(log_sys.buf + log_sys.START_OFFSET, + end - log_sys.START_OFFSET); + } + else + pmem_persist(log_sys.buf + start, end - start); + + old= flushed_to_disk_lsn.load(std::memory_order_relaxed); + + if (old < lsn) + { + while (!flushed_to_disk_lsn.compare_exchange_weak + (old, lsn, std::memory_order_release, std::memory_order_relaxed)) + if (old >= lsn) + break; + + log_flush_notify(lsn); + DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); + } + + if (UNIV_UNLIKELY(resizing)) + latch.rd_unlock(); +} +#endif + +/** Write resize_buf to resize_log. 
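+If the write position would exceed resize_target, the copying wraps
+around to START_OFFSET and resize_lsn is reset to the current write
+position, effectively restarting the copy.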
+@param length the used length of resize_buf */
+ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept
+{
+  const size_t block_size_1= get_block_size() - 1;
+  ut_ad(!(resize_target & block_size_1));
+  ut_ad(!(length & block_size_1));
+  ut_ad(length > block_size_1);
+  ut_ad(length <= resize_target);
+  const lsn_t resizing{resize_in_progress()};
+  ut_ad(resizing <= write_lsn);
+  lsn_t offset= START_OFFSET +
+    ((write_lsn - resizing) & ~lsn_t{block_size_1}) %
+    (resize_target - START_OFFSET);
+
+  if (UNIV_UNLIKELY(offset + length > resize_target))
+  {
+    offset= START_OFFSET;
+    resize_lsn.store(first_lsn +
+                     (~lsn_t{block_size_1} & (write_lsn - first_lsn)),
+                     std::memory_order_relaxed);
+  }
+
+  ut_a(os_file_write_func(IORequestWrite, "ib_logfile101", resize_log.m_file,
+                          resize_flush_buf, offset, length) == DB_SUCCESS);
+}
+
+/** Write buf to ib_logfile0.
+@tparam release_latch whether to invoke latch.wr_unlock()
+@return the current log sequence number */
+template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(latch.is_write_locked());
+#endif
+  ut_ad(!srv_read_only_mode);
+  ut_ad(!is_pmem());
+
+  const lsn_t lsn{get_lsn(std::memory_order_relaxed)};
+
+  if (write_lsn >= lsn)
+  {
+    if (release_latch)
+      latch.wr_unlock();
+    ut_ad(write_lsn == lsn);
+  }
+  else
+  {
+    ut_ad(!recv_no_log_write);
+    write_lock.set_pending(lsn);
+    ut_ad(write_lsn >= get_flushed_lsn());
+    const size_t block_size_1{get_block_size() - 1};
+    lsn_t offset{calc_lsn_offset(write_lsn) & ~lsn_t{block_size_1}};
+
+    DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF,
+                          write_lsn, lsn, offset));
+    const byte *write_buf{buf};
+    size_t length{buf_free};
+    ut_ad(length >= (calc_lsn_offset(write_lsn) & block_size_1));
+    const size_t new_buf_free{length & block_size_1};
+    buf_free= new_buf_free;
+    ut_ad(new_buf_free == ((lsn - first_lsn) & block_size_1));
+
+    if (new_buf_free)
+    {
+#if 0 /* TODO: Pad the last log block with dummy records. */
+      buf_free= log_pad(lsn, get_block_size() - new_buf_free,
+                        buf + new_buf_free, flush_buf);
+      ... /* TODO: Update the LSN and adjust other code. */
+#else
+      /* The rest of the block will be written as garbage.
+      (We want to avoid memset() while holding mutex.)
+      This block will be overwritten later, once records beyond
+      the current LSN are generated.
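+      The incomplete last block is also copied to flush_buf, which will
+      become the active buffer in the std::swap() below, so that it can
+      be completed and rewritten once more records arrive.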
*/
+# ifdef HAVE_valgrind
+      MEM_MAKE_DEFINED(buf + length, get_block_size() - new_buf_free);
+      if (UNIV_LIKELY_NULL(resize_flush_buf))
+        MEM_MAKE_DEFINED(resize_buf + length, get_block_size() - new_buf_free);
+# endif
+      buf[length]= 0; /* allow recovery to catch EOF faster */
+      length&= ~block_size_1;
+      memcpy_aligned<16>(flush_buf, buf + length, (new_buf_free + 15) & ~15);
+      if (UNIV_LIKELY_NULL(resize_flush_buf))
+        memcpy_aligned<16>(resize_flush_buf, resize_buf + length,
+                           (new_buf_free + 15) & ~15);
+      length+= get_block_size();
+#endif
+    }
+
+    std::swap(buf, flush_buf);
+    std::swap(resize_buf, resize_flush_buf);
+    write_to_log++;
+    if (release_latch)
+      latch.wr_unlock();
+
+    if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
+    {
+      service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+                                     "InnoDB log write: " LSN_PF, write_lsn);
+    }
+
+    /* Do the write to the log file */
+    log_write_buf(write_buf, length, offset);
+    if (UNIV_LIKELY_NULL(resize_buf))
+      resize_write_buf(length);
+    write_lsn= lsn;
+  }
+
+  return lsn;
+}
+
+bool log_t::flush(lsn_t lsn) noexcept
+{
+  ut_ad(lsn >= get_flushed_lsn());
+  flush_lock.set_pending(lsn);
+  const bool success{srv_file_flush_method == SRV_O_DSYNC || log.flush()};
+  if (UNIV_LIKELY(success))
+  {
+    flushed_to_disk_lsn.store(lsn, std::memory_order_release);
+    log_flush_notify(lsn);
+  }
+  return success;
+}
+
+/** Ensure that previous log writes are durable.
+@param lsn previously written LSN
+@return new durable lsn target
+@retval 0 if there are no pending callbacks on flush_lock
+ or there is another group commit lead.
+*/
+static lsn_t log_flush(lsn_t lsn)
+{
+  ut_ad(!log_sys.is_pmem());
+  ut_a(log_sys.flush(lsn));
+  DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE(););
+  return flush_lock.release(lsn);
+}
+
+static const completion_callback dummy_callback{[](void *) {}, nullptr};
+
+/** Ensure that the log has been written to the log file up to a given
+log entry (such as that of a transaction commit). Start a new write, or
+wait and check if an already running write is covering the request.
+@param lsn      log sequence number that should be included in the file write
+@param durable  whether the write needs to be durable
+@param callback log write completion callback */
+void log_write_up_to(lsn_t lsn, bool durable,
+                     const completion_callback *callback)
+{
+  ut_ad(!srv_read_only_mode);
+  ut_ad(lsn != LSN_MAX);
+
+  if (UNIV_UNLIKELY(recv_no_ibuf_operations))
+  {
+    /* A non-final batch of recovery is active; no writes to the log
+    are allowed yet. */
+    ut_a(!callback);
+    return;
+  }
+
+  ut_ad(lsn <= log_sys.get_lsn());
+
+#ifdef HAVE_PMEM
+  if (log_sys.is_pmem())
+  {
+    ut_ad(!callback);
+    if (durable)
+      log_sys.persist(lsn);
+    return;
+  }
+#endif
+
+repeat:
+  if (durable)
+  {
+    if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED)
+      return;
+    flush_lock.set_pending(log_sys.get_lsn());
+  }
+
+  lsn_t pending_write_lsn= 0, pending_flush_lsn= 0;
+
+  if (write_lock.acquire(lsn, durable ? nullptr : callback) ==
+      group_commit_lock::ACQUIRED)
+  {
+    log_sys.latch.wr_lock(SRW_LOCK_CALL);
+    pending_write_lsn= write_lock.release(log_sys.write_buf<true>());
+  }
+
+  if (durable)
+  {
+    pending_flush_lsn= log_flush(write_lock.value());
+  }
+
+  if (pending_write_lsn || pending_flush_lsn)
+  {
+    /* There is no new group commit lead; some async waiters could stall. */
+    callback= &dummy_callback;
+    lsn= std::max(pending_write_lsn, pending_flush_lsn);
+    goto repeat;
+  }
+}
+
+/** Write to the log file up to the last log entry.
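+This is a convenience wrapper that invokes log_write_up_to() with the
+current log sequence number.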
+@param durable whether to wait for a durable write to complete */
+void log_buffer_flush_to_disk(bool durable)
+{
+  ut_ad(!srv_read_only_mode);
+  log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable);
+}
+
+/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */
+ATTRIBUTE_COLD void log_write_and_flush_prepare()
+{
+  if (log_sys.is_pmem())
+    return;
+
+  while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
+         group_commit_lock::ACQUIRED);
+  while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
+         group_commit_lock::ACQUIRED);
+}
+
+/** Durably write the log up to log_sys.get_lsn(). */
+ATTRIBUTE_COLD void log_write_and_flush()
+{
+  ut_ad(!srv_read_only_mode);
+  if (!log_sys.is_pmem())
+  {
+    const lsn_t lsn{log_sys.write_buf<false>()};
+    write_lock.release(lsn);
+    log_flush(lsn);
+  }
+#ifdef HAVE_PMEM
+  else
+    log_sys.persist(log_sys.get_lsn());
+#endif
+}
+
+/********************************************************************
+
+Tries to establish a big enough margin of free space in the log buffer, such
+that a new log entry can be catenated without an immediate need for a flush. */
+ATTRIBUTE_COLD static void log_flush_margin()
+{
+  if (log_sys.buf_free > log_sys.max_buf_free)
+    log_buffer_flush_to_disk(false);
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log, such
+that a new log entry can be catenated without an immediate need for a
+checkpoint. NOTE: this function may only be called if the calling thread
+owns no synchronization objects! */
+ATTRIBUTE_COLD static void log_checkpoint_margin()
+{
+  while (log_sys.check_flush_or_checkpoint())
+  {
+    log_sys.latch.rd_lock(SRW_LOCK_CALL);
+    ut_ad(!recv_no_log_write);
+
+    if (!log_sys.check_flush_or_checkpoint())
+    {
+func_exit:
+      log_sys.latch.rd_unlock();
+      return;
+    }
+
+    const lsn_t lsn= log_sys.get_lsn();
+    const lsn_t checkpoint= log_sys.last_checkpoint_lsn;
+    const lsn_t sync_lsn= checkpoint + log_sys.max_checkpoint_age;
+
+    if (lsn <= sync_lsn)
+    {
+#ifndef DBUG_OFF
+    skip_checkpoint:
+#endif
+      log_sys.set_check_flush_or_checkpoint(false);
+      goto func_exit;
+    }
+
+    DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", goto skip_checkpoint;);
+    log_sys.latch.rd_unlock();
+
+    /* We must wait to prevent the tail of the log overwriting the head. */
+    buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20)));
+    /* Sleep to avoid a thundering herd */
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  }
+}
+
+/**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+ATTRIBUTE_COLD void log_check_margins()
+{
+  do
+  {
+    log_flush_margin();
+    log_checkpoint_margin();
+    ut_ad(!recv_no_log_write);
+  }
+  while (log_sys.check_flush_or_checkpoint());
+}
+
+/** Wait for a log checkpoint if needed.
+NOTE that this function may only be called while not holding
+any synchronization objects except dict_sys.latch. */
+void log_free_check()
+{
+  ut_ad(!lock_sys.is_writer());
+  if (log_sys.check_flush_or_checkpoint())
+    log_check_margins();
+}
+
+extern void buf_resize_shutdown();
+
+/** Make a checkpoint at the latest lsn on shutdown.
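+This waits for all background tasks and active transactions to finish,
+because the algorithm requires an idle server.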
*/ +ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown() +{ + lsn_t lsn; + ulint count = 0; + + ib::info() << "Starting shutdown..."; + + /* Wait until the master thread and all other operations are idle: our + algorithm only works if the server is idle at shutdown */ + bool do_srv_shutdown = false; + if (srv_master_timer) { + do_srv_shutdown = srv_fast_shutdown < 2; + srv_master_timer.reset(); + } + + /* Wait for the end of the buffer resize task.*/ + buf_resize_shutdown(); + dict_stats_shutdown(); + btr_defragment_shutdown(); + + srv_shutdown_state = SRV_SHUTDOWN_CLEANUP; + + if (srv_buffer_pool_dump_at_shutdown && + !srv_read_only_mode && srv_fast_shutdown < 2) { + buf_dump_start(); + } + srv_monitor_timer.reset(); + + if (do_srv_shutdown) { + srv_shutdown(srv_fast_shutdown == 0); + } + + +loop: + ut_ad(lock_sys.is_initialised() || !srv_was_started); + ut_ad(log_sys.is_initialised() || !srv_was_started); + ut_ad(fil_system.is_initialised() || !srv_was_started); + +#define COUNT_INTERVAL 600U +#define CHECK_INTERVAL 100000U + std::this_thread::sleep_for(std::chrono::microseconds(CHECK_INTERVAL)); + + count++; + + /* Check that there are no longer transactions, except for + PREPARED ones. We need this wait even for the 'very fast' + shutdown, because the InnoDB layer may have committed or + prepared transactions and we don't want to lose them. */ + + if (ulint total_trx = srv_was_started && !srv_read_only_mode + && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO + ? trx_sys.any_active_transactions() : 0) { + + if (srv_print_verbose_log && count > COUNT_INTERVAL) { + service_manager_extend_timeout( + COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2, + "Waiting for %lu active transactions to finish", + (ulong) total_trx); + ib::info() << "Waiting for " << total_trx << " active" + << " transactions to finish"; + + count = 0; + } + + goto loop; + } + + /* We need these threads to stop early in shutdown. */ + const char* thread_name = srv_fast_shutdown != 2 + && trx_rollback_is_active + ? "rollback of recovered transactions" : nullptr; + + if (thread_name) { + ut_ad(!srv_read_only_mode); +wait_suspend_loop: + service_manager_extend_timeout( + COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2, + "Waiting for %s to exit", thread_name); + if (srv_print_verbose_log && count > COUNT_INTERVAL) { + ib::info() << "Waiting for " << thread_name + << " to exit"; + count = 0; + } + goto loop; + } + + /* Check that the background threads are suspended */ + + ut_ad(!srv_any_background_activity()); + if (srv_n_fil_crypt_threads_started) { + fil_crypt_threads_signal(true); + thread_name = "fil_crypt_thread"; + goto wait_suspend_loop; + } + + if (buf_page_cleaner_is_active) { + thread_name = "page cleaner thread"; + pthread_cond_signal(&buf_pool.do_flush_list); + goto wait_suspend_loop; + } + + buf_load_dump_end(); + + if (!buf_pool.is_initialised()) { + ut_ad(!srv_was_started); + } else { + buf_flush_buffer_pool(); + } + + if (srv_fast_shutdown == 2 || !srv_was_started) { + if (!srv_read_only_mode && srv_was_started) { + sql_print_information( + "InnoDB: Executing innodb_fast_shutdown=2." + " Next startup will execute crash recovery!"); + + /* In this fastest shutdown we do not flush the + buffer pool: + + it is essentially a 'crash' of the InnoDB server. + Make sure that the log is all flushed to disk, so + that we can recover all committed transactions in + a crash recovery. 
*/ + log_buffer_flush_to_disk(); + } + + srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; + return; + } + + if (!srv_read_only_mode) { + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "ensuring dirty buffer pool are written to log"); + log_make_checkpoint(); + + const auto sizeof_cp = log_sys.is_encrypted() + ? SIZE_OF_FILE_CHECKPOINT + 8 + : SIZE_OF_FILE_CHECKPOINT; + + log_sys.latch.rd_lock(SRW_LOCK_CALL); + + lsn = log_sys.get_lsn(); + + const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn + && lsn != log_sys.last_checkpoint_lsn + sizeof_cp; + ut_ad(lsn >= log_sys.last_checkpoint_lsn); + + log_sys.latch.rd_unlock(); + + if (lsn_changed) { + goto loop; + } + } else { + lsn = recv_sys.lsn; + } + + srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; + + /* Make some checks that the server really is quiet */ + ut_ad(!srv_any_background_activity()); + + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Free innodb buffer pool"); + ut_d(buf_pool.assert_all_freed()); + + ut_a(lsn == log_sys.get_lsn() + || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); + + if (UNIV_UNLIKELY(lsn < recv_sys.lsn)) { + sql_print_error("InnoDB: Shutdown LSN=" LSN_PF + " is less than start LSN=" LSN_PF, + lsn, recv_sys.lsn); + } + + srv_shutdown_lsn = lsn; + + /* Make some checks that the server really is quiet */ + ut_ad(!srv_any_background_activity()); + + ut_a(lsn == log_sys.get_lsn() + || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); +} + +/******************************************************//** +Prints info of the log. */ +void +log_print( +/*======*/ + FILE* file) /*!< in: file where to print */ +{ + log_sys.latch.rd_lock(SRW_LOCK_CALL); + + const lsn_t lsn= log_sys.get_lsn(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const lsn_t pages_flushed = buf_pool.get_oldest_modification(lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + fprintf(file, + "Log sequence number " LSN_PF "\n" + "Log flushed up to " LSN_PF "\n" + "Pages flushed up to " LSN_PF "\n" + "Last checkpoint at " LSN_PF "\n", + lsn, + log_sys.get_flushed_lsn(), + pages_flushed, + lsn_t{log_sys.last_checkpoint_lsn}); + + log_sys.latch.rd_unlock(); +} + +/** Shut down the redo log subsystem. */ +void log_t::close() +{ + ut_ad(this == &log_sys); + if (!is_initialised()) return; + close_file(); + +#ifndef HAVE_PMEM + ut_free_dodump(buf, buf_size); + buf= nullptr; + ut_free_dodump(flush_buf, buf_size); + flush_buf= nullptr; + aligned_free(checkpoint_buf); + checkpoint_buf= nullptr; +#else + ut_ad(!checkpoint_buf); + ut_ad(!buf); + ut_ad(!flush_buf); +#endif + + latch.destroy(); + destroy_lsn_lock(); + + recv_sys.close(); + + max_buf_free= 0; +} + +std::string get_log_file_path(const char *filename) +{ + const size_t size= strlen(srv_log_group_home_dir) + /* path separator */ 1 + + strlen(filename) + /* longest suffix */ 3; + std::string path; + path.reserve(size); + path.assign(srv_log_group_home_dir); + + switch (path.back()) { +#ifdef _WIN32 + case '\\': +#endif + case '/': + break; + default: + path.push_back('/'); + } + path.append(filename); + + return path; +} diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc new file mode 100644 index 00000000..3c3fe41e --- /dev/null +++ b/storage/innobase/log/log0recv.cc @@ -0,0 +1,4870 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0recv.cc
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+
+#include <map>
+#include <string>
+#include <my_service_manager.h>
+
+#include "log0recv.h"
+
+#ifdef HAVE_MY_AES_H
+#include <my_aes.h>
+#endif
+
+#include "log0crypt.h"
+#include "mem0mem.h"
+#include "buf0buf.h"
+#include "buf0dblwr.h"
+#include "buf0flu.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "trx0undo.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "trx0rec.h"
+#include "fil0fil.h"
+#include "buf0rea.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "fil0pagecompress.h"
+#include "log.h"
+
+/** The recovery system */
+recv_sys_t recv_sys;
+/** TRUE when recv_init_crash_recovery() has been called. */
+bool recv_needed_recovery;
+#ifdef UNIV_DEBUG
+/** TRUE if writing to the redo log (mtr_commit) is forbidden.
+Protected by log_sys.latch. */
+bool recv_no_log_write = false;
+#endif /* UNIV_DEBUG */
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by
+recv_recovery_from_checkpoint_start(). */
+bool recv_lsn_checks_on;
+
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this becomes TRUE if
+the log record hash table becomes too full, and log records must be merged
+to file pages already before the recovery is finished: in this case no
+ibuf operations are allowed, as they could modify the pages read in the
+buffer pool before the pages have been recovered to the up-to-date state.
+
+true means that recovery is running and no operations on the log file
+are allowed yet: the variable name is misleading. */
+bool recv_no_ibuf_operations;
+
+/** The maximum lsn we see for a page during the recovery process. If this
+is bigger than the lsn we are able to scan up to, that is an indication that
+the recovery failed and the database may be corrupt.
*/
+static lsn_t recv_max_page_lsn;
+
+/** Stored physical log record */
+struct log_phys_t : public log_rec_t
+{
+  /** start LSN of the mini-transaction (not necessarily of this record) */
+  const lsn_t start_lsn;
+private:
+  /** @return the start of length and data */
+  const byte *start() const
+  {
+    return my_assume_aligned<sizeof(size_t)>
+      (reinterpret_cast<const byte*>(&start_lsn + 1));
+  }
+  /** @return the start of length and data */
+  byte *start()
+  { return const_cast<byte*>(const_cast<const log_phys_t*>(this)->start()); }
+  /** @return the length of the following record */
+  uint16_t len() const { uint16_t i; memcpy(&i, start(), 2); return i; }
+
+  /** @return start of the log records */
+  byte *begin() { return start() + 2; }
+  /** @return end of the log records */
+  byte *end() { byte *e= begin() + len(); ut_ad(!*e); return e; }
+public:
+  /** @return start of the log records */
+  const byte *begin() const { return const_cast<log_phys_t*>(this)->begin(); }
+  /** @return end of the log records */
+  const byte *end() const { return const_cast<log_phys_t*>(this)->end(); }
+
+  /** Determine the allocated size of the object.
+  @param len length of recs, excluding terminating NUL byte
+  @return the total allocation size */
+  static inline size_t alloc_size(size_t len);
+
+  /** Constructor.
+  @param start_lsn start LSN of the mini-transaction
+  @param lsn mtr_t::commit_lsn() of the mini-transaction
+  @param recs the first log record for the page in the mini-transaction
+  @param size length of recs, in bytes, excluding terminating NUL byte */
+  log_phys_t(lsn_t start_lsn, lsn_t lsn, const byte *recs, size_t size) :
+    log_rec_t(lsn), start_lsn(start_lsn)
+  {
+    ut_ad(start_lsn);
+    ut_ad(start_lsn < lsn);
+    const uint16_t len= static_cast<uint16_t>(size);
+    ut_ad(len == size);
+    memcpy(start(), &len, 2);
+    reinterpret_cast<byte*>(memcpy(begin(), recs, size))[size]= 0;
+  }
+
+  /** Append a record to the log.
+  @param recs log to append
+  @param size size of the log, in bytes */
+  void append(const byte *recs, size_t size)
+  {
+    ut_ad(start_lsn < lsn);
+    uint16_t l= len();
+    reinterpret_cast<byte*>(memcpy(end(), recs, size))[size]= 0;
+    l= static_cast<uint16_t>(l + size);
+    memcpy(start(), &l, 2);
+  }
+
+  /** Apply an UNDO_APPEND record.
+  @see mtr_t::undo_append()
+  @param block undo log page
+  @param data undo log record
+  @param len length of the undo log record
+  @return whether the operation failed (inconsistency was noticed) */
+  static bool undo_append(const buf_block_t &block, const byte *data,
+                          size_t len)
+  {
+    ut_ad(len > 2);
+    byte *free_p= my_assume_aligned<2>
+      (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.page.frame);
+    const uint16_t free= mach_read_from_2(free_p);
+    if (UNIV_UNLIKELY(free < TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE ||
+                      free + len + 6 >= srv_page_size - FIL_PAGE_DATA_END))
+    {
+      ib::error() << "Not applying UNDO_APPEND due to corruption on "
+                  << block.page.id();
+      return true;
+    }
+
+    byte *p= block.page.frame + free;
+    mach_write_to_2(free_p, free + 4 + len);
+    memcpy(p, free_p, 2);
+    p+= 2;
+    memcpy(p, data, len);
+    p+= len;
+    mach_write_to_2(p, free);
+    return false;
+  }
+
+  /** Check an OPT_PAGE_CHECKSUM record.
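// How a log_phys_t is laid out in memory, reduced to a stand-alone model:
// a fixed header is followed by a 2-byte native-endian length, the record
// bytes, and a terminating NUL, which is exactly what alloc_size() budgets
// for (len + 1 + 2 + sizeof header). All names here are invented stand-ins.
#include <cstdint>
#include <cstdlib>
#include <cstring>

struct phys_rec_model
{
  uint64_t lsn;        // ~ log_rec_t::lsn, the mtr commit LSN
  uint64_t start_lsn;  // ~ log_phys_t::start_lsn

  static size_t alloc_size(size_t len)
  { return len + (1 + 2 + sizeof(phys_rec_model)); }

  unsigned char *payload() { return reinterpret_cast<unsigned char*>(this + 1); }
};

// The caller owns the returned buffer and must free() it.
static phys_rec_model *make(uint64_t start_lsn, uint64_t lsn,
                            const unsigned char *recs, uint16_t size)
{
  phys_rec_model *r=
    static_cast<phys_rec_model*>(malloc(phys_rec_model::alloc_size(size)));
  r->lsn= lsn;
  r->start_lsn= start_lsn;
  memcpy(r->payload(), &size, 2);        // 2-byte length prefix
  memcpy(r->payload() + 2, recs, size);  // the record bytes themselves
  r->payload()[2 + size]= 0;             // terminating NUL, checked by end()
  return r;
}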
+ @see mtr_t::page_checksum() + @param block buffer page + @param l pointer to checksum + @return whether an unrecoverable mismatch was found */ + static bool page_checksum(const buf_block_t &block, const byte *l) + { + size_t size; + const byte *page= block.page.zip.data; + if (UNIV_LIKELY_NULL(page)) + size= (UNIV_ZIP_SIZE_MIN >> 1) << block.page.zip.ssize; + else + { + page= block.page.frame; + size= srv_page_size; + } + if (UNIV_LIKELY(my_crc32c(my_crc32c(my_crc32c(0, page + FIL_PAGE_OFFSET, + FIL_PAGE_LSN - + FIL_PAGE_OFFSET), + page + FIL_PAGE_TYPE, 2), + page + FIL_PAGE_SPACE_ID, + size - (FIL_PAGE_SPACE_ID + 8)) == + mach_read_from_4(l))) + return false; + + ib::error() << "OPT_PAGE_CHECKSUM mismatch on " << block.page.id(); + return !srv_force_recovery; + } + + /** The status of apply() */ + enum apply_status { + /** The page was not affected */ + APPLIED_NO= 0, + /** The page was modified */ + APPLIED_YES, + /** The page was modified, affecting the encryption parameters */ + APPLIED_TO_ENCRYPTION, + /** The page was modified, affecting the tablespace header */ + APPLIED_TO_FSP_HEADER, + /** The page was found to be corrupted */ + APPLIED_CORRUPTED, + }; + + /** Apply log to a page frame. + @param[in,out] block buffer block + @param[in,out] last_offset last byte offset, for same_page records + @return whether any log was applied to the page */ + apply_status apply(const buf_block_t &block, uint16_t &last_offset) const + { + const byte * const recs= begin(); + byte *const frame= block.page.zip.data + ? block.page.zip.data : block.page.frame; + const size_t size= block.physical_size(); + apply_status applied= APPLIED_NO; + + for (const byte *l= recs;;) + { + const byte b= *l++; + if (!b) + return applied; + ut_ad((b & 0x70) != RESERVED); + size_t rlen= b & 0xf; + if (!rlen) + { + const size_t lenlen= mlog_decode_varint_length(*l); + const uint32_t addlen= mlog_decode_varint(l); + ut_ad(addlen != MLOG_DECODE_ERROR); + rlen= addlen + 15 - lenlen; + l+= lenlen; + } + if (!(b & 0x80)) + { + /* Skip the page identifier. It has already been validated. 
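// The checksum verified above chains CRC-32C over three byte ranges so that
// the FIL_PAGE_LSN field (offset 16) and the trailing 8 checksum/LSN bytes
// are excluded. A stand-alone model using the standard InnoDB page offsets
// (FIL_PAGE_OFFSET=4, FIL_PAGE_TYPE=24, FIL_PAGE_SPACE_ID=34) and a plain
// bitwise CRC-32C in place of my_crc32c():
#include <cstddef>
#include <cstdint>

static uint32_t crc32c(uint32_t crc, const unsigned char *p, size_t n)
{
  crc= ~crc;
  while (n--)
  {
    crc^= *p++;
    for (int i= 0; i < 8; i++)
      crc= (crc >> 1) ^ (0x82f63b78U & (0U - (crc & 1U)));
  }
  return ~crc;
}

static uint32_t opt_page_checksum(const unsigned char *page, size_t size)
{
  uint32_t c= crc32c(0, page + 4, 16 - 4);      // page number, prev, next
  c= crc32c(c, page + 24, 2);                   // FIL_PAGE_TYPE
  return crc32c(c, page + 34, size - (34 + 8)); // body, minus the last 8 bytes
}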
*/ + size_t idlen= mlog_decode_varint_length(*l); + ut_ad(idlen <= 5); + ut_ad(idlen < rlen); + ut_ad(mlog_decode_varint(l) == block.page.id().space()); + l+= idlen; + rlen-= idlen; + idlen= mlog_decode_varint_length(*l); + ut_ad(idlen <= 5); + ut_ad(idlen <= rlen); + ut_ad(mlog_decode_varint(l) == block.page.id().page_no()); + l+= idlen; + rlen-= idlen; + last_offset= 0; + } + + switch (b & 0x70) { + case FREE_PAGE: + ut_ad(last_offset == 0); + goto next_not_same_page; + case INIT_PAGE: + if (UNIV_LIKELY(rlen == 0)) + { + memset_aligned(frame, 0, size); + mach_write_to_4(frame + FIL_PAGE_OFFSET, block.page.id().page_no()); + memset_aligned<8>(FIL_PAGE_PREV + frame, 0xff, 8); + mach_write_to_4(frame + FIL_PAGE_SPACE_ID, block.page.id().space()); + last_offset= FIL_PAGE_TYPE; + next_after_applying: + if (applied == APPLIED_NO) + applied= APPLIED_YES; + } + else + { + record_corrupted: + if (!srv_force_recovery) + { + recv_sys.set_corrupt_log(); + return applied; + } + next_not_same_page: + last_offset= 1; /* the next record must not be same_page */ + } + l+= rlen; + continue; + case OPTION: + ut_ad(rlen == 5); + ut_ad(*l == OPT_PAGE_CHECKSUM); + if (page_checksum(block, l + 1)) + { +page_corrupted: + sql_print_error("InnoDB: Set innodb_force_recovery=1" + " to ignore corruption."); + return APPLIED_CORRUPTED; + } + goto next_after_applying; + } + + ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) == + block.page.id().page_no()); + ut_ad(mach_read_from_4(frame + FIL_PAGE_SPACE_ID) == + block.page.id().space()); + ut_ad(last_offset <= 1 || last_offset > 8); + ut_ad(last_offset <= size); + + switch (b & 0x70) { + case EXTENDED: + if (UNIV_UNLIKELY(block.page.id().page_no() < 3 || + block.page.zip.ssize)) + goto record_corrupted; + static_assert(INIT_ROW_FORMAT_REDUNDANT == 0, "compatiblity"); + static_assert(INIT_ROW_FORMAT_DYNAMIC == 1, "compatibility"); + if (UNIV_UNLIKELY(!rlen)) + goto record_corrupted; + switch (const byte subtype= *l) { + uint8_t ll; + size_t prev_rec, hdr_size; + default: + goto record_corrupted; + case INIT_ROW_FORMAT_REDUNDANT: + case INIT_ROW_FORMAT_DYNAMIC: + if (UNIV_UNLIKELY(rlen != 1)) + goto record_corrupted; + page_create_low(&block, *l != INIT_ROW_FORMAT_REDUNDANT); + break; + case UNDO_INIT: + if (UNIV_UNLIKELY(rlen != 1)) + goto record_corrupted; + trx_undo_page_init(block); + break; + case UNDO_APPEND: + if (UNIV_UNLIKELY(rlen <= 3)) + goto record_corrupted; + if (undo_append(block, ++l, --rlen) && !srv_force_recovery) + goto page_corrupted; + break; + case INSERT_HEAP_REDUNDANT: + case INSERT_REUSE_REDUNDANT: + case INSERT_HEAP_DYNAMIC: + case INSERT_REUSE_DYNAMIC: + if (UNIV_UNLIKELY(rlen < 2)) + goto record_corrupted; + rlen--; + ll= mlog_decode_varint_length(*++l); + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + prev_rec= mlog_decode_varint(l); + ut_ad(prev_rec != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + static_assert(INSERT_HEAP_REDUNDANT == 4, "compatibility"); + static_assert(INSERT_REUSE_REDUNDANT == 5, "compatibility"); + static_assert(INSERT_HEAP_DYNAMIC == 6, "compatibility"); + static_assert(INSERT_REUSE_DYNAMIC == 7, "compatibility"); + if (subtype & 2) + { + size_t shift= 0; + if (subtype & 1) + { + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + shift= mlog_decode_varint(l); + ut_ad(shift != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + } + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + size_t enc_hdr_l= 
mlog_decode_varint(l); + ut_ad(enc_hdr_l != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + size_t hdr_c= mlog_decode_varint(l); + ut_ad(hdr_c != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 3 || ll > rlen)) + goto record_corrupted; + size_t data_c= mlog_decode_varint(l); + ut_ad(data_c != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + if (page_apply_insert_dynamic(block, subtype & 1, prev_rec, + shift, enc_hdr_l, hdr_c, data_c, + l, rlen) && !srv_force_recovery) + goto page_corrupted; + } + else + { + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + size_t header= mlog_decode_varint(l); + ut_ad(header != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + size_t hdr_c= mlog_decode_varint(l); + ut_ad(hdr_c != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll > rlen)) + goto record_corrupted; + size_t data_c= mlog_decode_varint(l); + rlen-= ll; + l+= ll; + if (page_apply_insert_redundant(block, subtype & 1, prev_rec, + header, hdr_c, data_c, + l, rlen) && !srv_force_recovery) + goto page_corrupted; + } + break; + case DELETE_ROW_FORMAT_REDUNDANT: + if (UNIV_UNLIKELY(rlen < 2 || rlen > 4)) + goto record_corrupted; + rlen--; + ll= mlog_decode_varint_length(*++l); + if (UNIV_UNLIKELY(ll != rlen)) + goto record_corrupted; + if (page_apply_delete_redundant(block, mlog_decode_varint(l)) && + !srv_force_recovery) + goto page_corrupted; + break; + case DELETE_ROW_FORMAT_DYNAMIC: + if (UNIV_UNLIKELY(rlen < 2)) + goto record_corrupted; + rlen--; + ll= mlog_decode_varint_length(*++l); + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + prev_rec= mlog_decode_varint(l); + ut_ad(prev_rec != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + hdr_size= mlog_decode_varint(l); + ut_ad(hdr_size != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 3 || ll != rlen)) + goto record_corrupted; + if (page_apply_delete_dynamic(block, prev_rec, hdr_size, + mlog_decode_varint(l)) && + !srv_force_recovery) + goto page_corrupted; + break; + } + last_offset= FIL_PAGE_TYPE; + goto next_after_applying; + case WRITE: + case MEMSET: + case MEMMOVE: + if (UNIV_UNLIKELY(last_offset == 1)) + goto record_corrupted; + const size_t olen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3)) + goto record_corrupted; + const uint32_t offset= mlog_decode_varint(l); + ut_ad(offset != MLOG_DECODE_ERROR); + static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); + if (UNIV_UNLIKELY(offset >= size)) + goto record_corrupted; + if (UNIV_UNLIKELY(offset + last_offset < 8 || + offset + last_offset >= size)) + goto record_corrupted; + last_offset= static_cast(last_offset + offset); + l+= olen; + rlen-= olen; + size_t llen= rlen; + if ((b & 0x70) == WRITE) + { + if (UNIV_UNLIKELY(rlen + last_offset > size)) + goto record_corrupted; + memcpy(frame + last_offset, l, llen); + if (UNIV_LIKELY(block.page.id().page_no())); + else if (llen == 11 + MY_AES_BLOCK_SIZE && + last_offset == FSP_HEADER_OFFSET + MAGIC_SZ + + fsp_header_get_encryption_offset(block.zip_size())) + applied= APPLIED_TO_ENCRYPTION; + else if 
(last_offset < FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN + 4 && + last_offset + llen >= FSP_HEADER_OFFSET + FSP_SIZE) + applied= APPLIED_TO_FSP_HEADER; + next_after_applying_write: + ut_ad(llen + last_offset <= size); + last_offset= static_cast(last_offset + llen); + goto next_after_applying; + } + llen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(llen > rlen || llen > 3)) + goto record_corrupted; + const uint32_t len= mlog_decode_varint(l); + ut_ad(len != MLOG_DECODE_ERROR); + if (UNIV_UNLIKELY(len + last_offset > size)) + goto record_corrupted; + l+= llen; + rlen-= llen; + llen= len; + if ((b & 0x70) == MEMSET) + { + ut_ad(rlen <= llen); + if (UNIV_UNLIKELY(rlen != 1)) + { + size_t s; + for (s= 0; s < llen; s+= rlen) + memcpy(frame + last_offset + s, l, rlen); + memcpy(frame + last_offset + s, l, llen - s); + } + else + memset(frame + last_offset, *l, llen); + goto next_after_applying_write; + } + const size_t slen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(slen != rlen || slen > 3)) + goto record_corrupted; + uint32_t s= mlog_decode_varint(l); + ut_ad(slen != MLOG_DECODE_ERROR); + if (s & 1) + s= last_offset - (s >> 1) - 1; + else + s= last_offset + (s >> 1) + 1; + if (UNIV_LIKELY(s >= 8 && s + llen <= size)) + { + memmove(frame + last_offset, frame + s, llen); + goto next_after_applying_write; + } + } + goto record_corrupted; + } + } +}; + + +inline size_t log_phys_t::alloc_size(size_t len) +{ + return len + (1 + 2 + sizeof(log_phys_t)); +} + + +/** Tablespace item during recovery */ +struct file_name_t { + /** Tablespace file name (FILE_MODIFY) */ + std::string name; + /** Tablespace object (NULL if not valid or not found) */ + fil_space_t* space = nullptr; + + /** Tablespace status. */ + enum fil_status { + /** Normal tablespace */ + NORMAL, + /** Deleted tablespace */ + DELETED, + /** Missing tablespace */ + MISSING + }; + + /** Status of the tablespace */ + fil_status status; + + /** FSP_SIZE of tablespace */ + uint32_t size = 0; + + /** Freed pages of tablespace */ + range_set freed_ranges; + + /** Dummy flags before they have been read from the .ibd file */ + static constexpr uint32_t initial_flags = FSP_FLAGS_FCRC32_MASK_MARKER; + /** FSP_SPACE_FLAGS of tablespace */ + uint32_t flags = initial_flags; + + /** Constructor */ + file_name_t(std::string name_, bool deleted) + : name(std::move(name_)), status(deleted ? 
DELETED: NORMAL) {}
+
+  /** Add the freed pages */
+  void add_freed_page(uint32_t page_no) { freed_ranges.add_value(page_no); }
+
+  /** Remove the freed pages */
+  void remove_freed_page(uint32_t page_no)
+  {
+    if (freed_ranges.empty()) return;
+    freed_ranges.remove_value(page_no);
+  }
+};
+
+/** Map of dirty tablespaces during recovery */
+typedef std::map<
+	uint32_t,
+	file_name_t,
+	std::less<uint32_t>,
+	ut_allocator<std::pair<const uint32_t, file_name_t> > > recv_spaces_t;
+
+static recv_spaces_t recv_spaces;
+
+/** The last parsed FILE_RENAME records */
+static std::map<uint32_t, std::string> renamed_spaces;
+
+/** Files for which fil_ibd_load() returned FIL_LOAD_DEFER */
+static struct
+{
+  /** Maintains the last opened defer file name along with lsn */
+  struct item
+  {
+    /** Log sequence number of latest add() called by fil_name_process() */
+    lsn_t lsn;
+    /** File name from the FILE_ record */
+    std::string file_name;
+    /** whether a FILE_DELETE record was encountered */
+    mutable bool deleted;
+  };
+
+  using map= std::map<uint32_t, item, std::less<uint32_t>,
+                      ut_allocator<std::pair<const uint32_t, item> > >;
+
+  /** Map of defer tablespaces */
+  map defers;
+
+  /** Add the deferred space only if it is the latest one
+  @param space space identifier
+  @param f_name file name
+  @param lsn log sequence number of the FILE_ record */
+  void add(uint32_t space, const std::string &f_name, lsn_t lsn)
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    const char *filename= f_name.c_str();
+
+    if (srv_operation == SRV_OPERATION_RESTORE)
+    {
+      /* Replace absolute DATA DIRECTORY file paths with
+      short names relative to the backup directory. */
+      if (const char *name= strrchr(filename, '/'))
+      {
+        while (--name > filename && *name != '/');
+        if (name > filename)
+          filename= name + 1;
+      }
+    }
+
+    char *fil_path= fil_make_filepath(nullptr, {filename, strlen(filename)},
+                                      IBD, false);
+    const item defer{lsn, fil_path, false};
+    ut_free(fil_path);
+
+    /* The file name must be unique. Keep the one with the latest LSN. */
+    auto d= defers.begin();
+
+    while (d != defers.end())
+    {
+      if (d->second.file_name != defer.file_name)
+        ++d;
+      else if (d->first == space)
+      {
+        /* Neither the file name nor the tablespace ID changed.
+        Update the LSN if needed. */
+        if (d->second.lsn < lsn)
+          d->second.lsn= lsn;
+        return;
+      }
+      else if (d->second.lsn < lsn)
+      {
+        /* Reset the old tablespace name in recovered spaces list */
+        recv_spaces_t::iterator it{recv_spaces.find(d->first)};
+        if (it != recv_spaces.end() &&
+            it->second.name == d->second.file_name)
+          it->second.name = "";
+        defers.erase(d++);
+      }
+      else
+      {
+        ut_ad(d->second.lsn != lsn);
+        return; /* A later tablespace already has this name. */
+      }
+    }
+
+    auto p= defers.emplace(space, defer);
+    if (!p.second && p.first->second.lsn <= lsn)
+    {
+      p.first->second.lsn= lsn;
+      p.first->second.file_name= defer.file_name;
+    }
+    /* Add the newly added deferred space and change the file name */
+    recv_spaces_t::iterator it{recv_spaces.find(space)};
+    if (it != recv_spaces.end())
+      it->second.name = defer.file_name;
+  }
+
+  void remove(uint32_t space)
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    defers.erase(space);
+  }
+
+  /** Look up a tablespace that was found corrupted during recovery.
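// The conflict handling in add() above boils down to "one entry per file
// name, newest LSN wins". A stand-alone sketch of that rule (invented
// names; the real code also renames entries in recv_spaces):
#include <cstdint>
#include <map>
#include <string>

struct defer_entry { uint64_t lsn; std::string file_name; };
static std::map<uint32_t, defer_entry> deferred;

static void add_deferred(uint32_t space_id, const std::string &name,
                         uint64_t lsn)
{
  auto p= deferred.try_emplace(space_id, defer_entry{lsn, name});
  if (!p.second && p.first->second.lsn <= lsn)
    p.first->second= defer_entry{lsn, name}; // a newer FILE_ record wins
}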
+ @param id tablespace id + @return tablespace whose creation was deferred + @retval nullptr if no such tablespace was found */ + item *find(uint32_t id) + { + mysql_mutex_assert_owner(&recv_sys.mutex); + auto it= defers.find(id); + if (it != defers.end()) + return &it->second; + return nullptr; + } + + void clear() + { + mysql_mutex_assert_owner(&recv_sys.mutex); + defers.clear(); + } + + /** Initialize all deferred tablespaces. + @return whether any deferred initialization failed */ + bool reinit_all() + { +retry: + log_sys.latch.wr_unlock(); + fil_space_t *space= fil_system.sys_space; + buf_block_t *free_block= buf_LRU_get_free_block(false); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&recv_sys.mutex); + + for (auto d= defers.begin(); d != defers.end(); ) + { + const uint32_t space_id{d->first}; + recv_sys_t::map::iterator p{recv_sys.pages.lower_bound({space_id,0})}; + + if (d->second.deleted || + p == recv_sys.pages.end() || p->first.space() != space_id) + { + /* We found a FILE_DELETE record for the tablespace, or + there were no buffered records. Either way, we must create a + dummy tablespace with the latest known name, + for dict_drop_index_tree(). */ + recv_sys.pages_it_invalidate(space_id); + while (p != recv_sys.pages.end() && p->first.space() == space_id) + { + ut_ad(!p->second.being_processed); + recv_sys_t::map::iterator r= p++; + recv_sys.erase(r); + } + recv_spaces_t::iterator it{recv_spaces.find(space_id)}; + if (it != recv_spaces.end()) + { + const std::string *name= &d->second.file_name; + if (d->second.deleted) + { + const auto r= renamed_spaces.find(space_id); + if (r != renamed_spaces.end()) + name= &r->second; + bool exists; + os_file_type_t ftype; + if (!os_file_status(name->c_str(), &exists, &ftype) || !exists) + goto processed; + } + if (create(it, *name, static_cast + (1U << FSP_FLAGS_FCRC32_POS_MARKER | + FSP_FLAGS_FCRC32_PAGE_SSIZE()), nullptr, 0)) + mysql_mutex_unlock(&fil_system.mutex); + } + } + else + space= recv_sys.recover_deferred(p, d->second.file_name, free_block); +processed: + auto e= d++; + defers.erase(e); + if (!space) + break; + if (space != fil_system.sys_space) + space->release(); + if (free_block) + continue; + mysql_mutex_unlock(&recv_sys.mutex); + goto retry; + } + + clear(); + mysql_mutex_unlock(&recv_sys.mutex); + if (free_block) + buf_pool.free_block(free_block); + return !space; + } + + /** Create tablespace metadata for a data file that was initially + found corrupted during recovery. 
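// reinit_all() above acquires its spare buffer block while no latch is
// held, re-acquires the latches, and restarts the whole pass whenever the
// spare was consumed. That allocate-outside-the-latch retry shape, as a
// stand-alone toy (all names invented):
#include <mutex>
#include <vector>

static std::mutex latch;                 // ~ log_sys.latch / recv_sys.mutex
static std::vector<int> work{1, 2, 3};   // ~ deferred tablespaces

static int *get_spare() { return new int(0); } // may sleep; called latch-free

int main()
{
  for (;;)
  {
    int *spare= get_spare();             // allocate before locking
    std::lock_guard<std::mutex> g(latch);
    while (!work.empty() && spare)
    {
      work.pop_back();                   // process one deferred item
      delete spare;                      // pretend the item consumed it
      spare= nullptr;
    }
    if (work.empty())
    {
      delete spare;
      return 0;
    }
    // spare ran out with work remaining: drop the latch and retry
  }
}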
+ @param it tablespace iterator + @param name latest file name + @param flags FSP_SPACE_FLAGS + @param crypt_data encryption metadata + @param size tablespace size in pages + @return tablespace; the caller must release fil_system.mutex + @retval nullptr if crypt_data is invalid */ + static fil_space_t *create(const recv_spaces_t::const_iterator &it, + const std::string &name, uint32_t flags, + fil_space_crypt_t *crypt_data, uint32_t size) + { + if (crypt_data && !fil_crypt_check(crypt_data, name.c_str())) + return nullptr; + mysql_mutex_lock(&fil_system.mutex); + fil_space_t *space= fil_space_t::create(it->first, flags, + FIL_TYPE_TABLESPACE, crypt_data); + ut_ad(space); + const char *filename= name.c_str(); + if (srv_operation == SRV_OPERATION_RESTORE) + { + if (const char *tbl_name= strrchr(filename, '/')) + { + while (--tbl_name > filename && *tbl_name != '/'); + if (tbl_name > filename) + filename= tbl_name + 1; + } + } + space->add(filename, OS_FILE_CLOSED, size, false, false); + space->recv_size= it->second.size; + space->size_in_header= size; + return space; + } + + /** Attempt to recover pages from the doublewrite buffer. + This is invoked if we found neither a valid first page in the + data file nor redo log records that would initialize the first + page. */ + void deferred_dblwr() + { + for (auto d= defers.begin(); d != defers.end(); ) + { + if (d->second.deleted) + { + next_item: + d++; + continue; + } + const page_id_t page_id{d->first, 0}; + const byte *page= recv_sys.dblwr.find_page(page_id); + if (!page) + goto next_item; + const uint32_t space_id= mach_read_from_4(page + FIL_PAGE_SPACE_ID); + const uint32_t flags= fsp_header_get_flags(page); + const uint32_t page_no= mach_read_from_4(page + FIL_PAGE_OFFSET); + const uint32_t size= fsp_header_get_field(page, FSP_SIZE); + + if (page_no == 0 && space_id == d->first && size >= 4 && + fil_space_t::is_valid_flags(flags, space_id) && + fil_space_t::logical_size(flags) == srv_page_size) + { + recv_spaces_t::iterator it {recv_spaces.find(d->first)}; + ut_ad(it != recv_spaces.end()); + + fil_space_t *space= create( + it, d->second.file_name.c_str(), flags, + fil_space_read_crypt_data(fil_space_t::zip_size(flags), page), + size); + + if (!space) + goto next_item; + + space->free_limit= fsp_header_get_field(page, FSP_FREE_LIMIT); + space->free_len= flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + page); + fil_node_t *node= UT_LIST_GET_FIRST(space->chain); + mysql_mutex_unlock(&fil_system.mutex); + if (!space->acquire()) + { +free_space: + fil_space_free(it->first, false); + goto next_item; + } + if (os_file_write(IORequestWrite, node->name, node->handle, + page, 0, fil_space_t::physical_size(flags)) != + DB_SUCCESS) + { + space->release(); + goto free_space; + } + space->release(); + it->second.space= space; + defers.erase(d++); + continue; + } + goto next_item; + } + } +} +deferred_spaces; + +/** Report an operation to create, delete, or rename a file during backup. +@param[in] space_id tablespace identifier +@param[in] type redo log type +@param[in] name file name (not NUL-terminated) +@param[in] len length of name, in bytes +@param[in] new_name new file name (NULL if not rename) +@param[in] new_len length of new_name, in bytes (0 if NULL) */ +void (*log_file_op)(uint32_t space_id, int type, + const byte* name, ulint len, + const byte* new_name, ulint new_len); + +void (*undo_space_trunc)(uint32_t space_id); + +void (*first_page_init)(uint32_t space_id); + +/** Information about initializing page contents during redo log processing. 
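// deferred_dblwr() above trusts a doublewrite copy of page 0 only after a
// set of sanity checks. A stand-alone sketch of the cheap field checks,
// using the standard page offsets (FIL_PAGE_OFFSET=4, FIL_PAGE_SPACE_ID=34,
// FSP_HEADER_OFFSET+FSP_SIZE=38+8); read_u32() mimics mach_read_from_4():
#include <cstdint>

static uint32_t read_u32(const unsigned char *p)
{
  return uint32_t{p[0]} << 24 | uint32_t{p[1]} << 16 |
         uint32_t{p[2]} << 8 | p[3];
}

static bool first_page_plausible(const unsigned char *page,
                                 uint32_t expected_space)
{
  return read_u32(page + 4) == 0 &&                // must be page number 0
         read_u32(page + 34) == expected_space &&  // FIL_PAGE_SPACE_ID matches
         read_u32(page + 46) >= 4;                 // FSP_SIZE: at least 4 pages
}
// (The real code additionally validates FSP_SPACE_FLAGS and the page size.)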
+FIXME: Rely on recv_sys.pages! */
+class mlog_init_t
+{
+  using map= std::map<const page_id_t, recv_init,
+                      std::less<const page_id_t>,
+                      ut_allocator<std::pair<const page_id_t, recv_init>>>;
+  /** Map of page initialization operations.
+  FIXME: Merge this to recv_sys.pages! */
+  map inits;
+
+  /** Iterator to the last add() or will_avoid_read(), for speeding up
+  will_avoid_read(). */
+  map::iterator i;
+public:
+  /** Constructor */
+  mlog_init_t() : i(inits.end()) {}
+
+  /** Record that a page will be initialized by the redo log.
+  @param page_id page identifier
+  @param lsn log sequence number
+  @return whether the state was changed */
+  bool add(const page_id_t page_id, lsn_t lsn)
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    const recv_init init = { lsn, false };
+    std::pair<map::iterator, bool> p=
+      inits.insert(map::value_type(page_id, init));
+    ut_ad(!p.first->second.created);
+    if (p.second) return true;
+    if (p.first->second.lsn >= lsn) return false;
+    p.first->second = init;
+    i = p.first;
+    return true;
+  }
+
+  /** Get the last stored lsn of the page id and its respective
+  init/load operation.
+  @param page_id page identifier
+  @return the latest page initialization;
+  not valid after releasing recv_sys.mutex. */
+  recv_init &last(page_id_t page_id)
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    return inits.find(page_id)->second;
+  }
+
+  /** Determine if a page will be initialized or freed after a time.
+  @param page_id page identifier
+  @param lsn log sequence number
+  @return whether page_id will be freed or initialized after lsn */
+  bool will_avoid_read(page_id_t page_id, lsn_t lsn)
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    if (i != inits.end() && i->first == page_id)
+      return i->second.lsn > lsn;
+    i = inits.lower_bound(page_id);
+    return i != inits.end() && i->first == page_id && i->second.lsn > lsn;
+  }
+
+  /** At the end of each recovery batch, reset the 'created' flags. */
+  void reset()
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    ut_ad(recv_no_ibuf_operations);
+    for (map::value_type &i : inits)
+      i.second.created= false;
+  }
+
+  /** During the last recovery batch, mark whether there exist
+  buffered changes for the pages that were initialized
+  by buf_page_create() and still reside in the buffer pool.
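// will_avoid_read() above caches the iterator of the previous hit so that
// repeated queries for the same page skip the O(log n) lower_bound(). The
// same memoization, as a stand-alone sketch (invented names):
#include <cstdint>
#include <map>

struct init_map_model
{
  std::map<uint32_t, uint64_t> inits;              // page -> init LSN
  std::map<uint32_t, uint64_t>::iterator i= inits.end();

  bool will_avoid_read(uint32_t page, uint64_t lsn)
  {
    if (i != inits.end() && i->first == page)
      return i->second > lsn;                      // fast path: cached hit
    i= inits.lower_bound(page);
    return i != inits.end() && i->first == page && i->second > lsn;
  }
};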
*/ + void mark_ibuf_exist() + { + mysql_mutex_assert_owner(&recv_sys.mutex); + + for (const map::value_type &i : inits) + if (i.second.created) + { + auto &chain= buf_pool.page_hash.cell_get(i.first.fold()); + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + + hash_lock.lock_shared(); + buf_block_t *block= reinterpret_cast + (buf_pool.page_hash.get(i.first, chain)); + bool got_latch= block && block->page.lock.x_lock_try(); + hash_lock.unlock_shared(); + + if (!block) + continue; + + uint32_t state; + + if (!got_latch) + { + mysql_mutex_lock(&buf_pool.mutex); + block= reinterpret_cast + (buf_pool.page_hash.get(i.first, chain)); + if (!block) + { + mysql_mutex_unlock(&buf_pool.mutex); + continue; + } + + state= block->page.fix(); + mysql_mutex_unlock(&buf_pool.mutex); + if (state < buf_page_t::UNFIXED) + { + block->page.unfix(); + continue; + } + block->page.lock.x_lock(); + state= block->page.unfix(); + ut_ad(state < buf_page_t::READ_FIX); + if (state >= buf_page_t::UNFIXED && block->page.id() == i.first) + goto check_ibuf; + } + else + { + state= block->page.state(); + ut_ad(state >= buf_page_t::FREED); + ut_ad(state < buf_page_t::READ_FIX); + + if (state >= buf_page_t::UNFIXED) + { + check_ibuf: + mysql_mutex_unlock(&recv_sys.mutex); + if (ibuf_page_exists(block->page.id(), block->zip_size())) + block->page.set_ibuf_exist(); + mysql_mutex_lock(&recv_sys.mutex); + } + } + + block->page.lock.x_unlock(); + } + } + + /** Clear the data structure */ + void clear() { inits.clear(); i = inits.end(); } +}; + +static mlog_init_t mlog_init; + +/** Try to recover a tablespace that was not readable earlier +@param p iterator to the page +@param name tablespace file name +@param free_block spare buffer block +@return recovered tablespace +@retval nullptr if recovery failed */ +fil_space_t *recv_sys_t::recover_deferred(const recv_sys_t::map::iterator &p, + const std::string &name, + buf_block_t *&free_block) +{ + mysql_mutex_assert_owner(&mutex); + + ut_ad(p->first.space()); + + recv_spaces_t::iterator it{recv_spaces.find(p->first.space())}; + ut_ad(it != recv_spaces.end()); + + if (!p->first.page_no() && p->second.skip_read) + { + mtr_t mtr; + ut_ad(!p->second.being_processed); + p->second.being_processed= 1; + init &init= mlog_init.last(p->first); + mysql_mutex_unlock(&mutex); + buf_block_t *block= recover_low(p, mtr, free_block, init); + mysql_mutex_lock(&mutex); + p->second.being_processed= -1; + ut_ad(block == free_block || block == reinterpret_cast(-1)); + free_block= nullptr; + if (UNIV_UNLIKELY(!block || block == reinterpret_cast(-1))) + goto fail; + const byte *page= UNIV_LIKELY_NULL(block->page.zip.data) + ? 
block->page.zip.data + : block->page.frame; + const uint32_t space_id= mach_read_from_4(page + FIL_PAGE_SPACE_ID); + const uint32_t flags= fsp_header_get_flags(page); + const uint32_t page_no= mach_read_from_4(page + FIL_PAGE_OFFSET); + const uint32_t size= fsp_header_get_field(page, FSP_SIZE); + + if (page_id_t{space_id, page_no} == p->first && size >= 4 && + fil_space_t::is_valid_flags(flags, space_id) && + fil_space_t::logical_size(flags) == srv_page_size) + { + fil_space_t *space= deferred_spaces.create(it, name, flags, + fil_space_read_crypt_data + (fil_space_t::zip_size(flags), + page), size); + if (!space) + goto release_and_fail; + space->free_limit= fsp_header_get_field(page, FSP_FREE_LIMIT); + space->free_len= flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + page); + fil_node_t *node= UT_LIST_GET_FIRST(space->chain); + node->deferred= true; + mysql_mutex_unlock(&fil_system.mutex); + if (!space->acquire()) + goto release_and_fail; + fil_names_dirty(space); + const bool is_compressed= fil_space_t::is_compressed(flags); +#ifdef _WIN32 + const bool is_sparse= is_compressed; + if (is_compressed) + os_file_set_sparse_win32(node->handle); +#else + const bool is_sparse= is_compressed && + DB_SUCCESS == os_file_punch_hole(node->handle, 0, 4096) && + !my_test_if_thinly_provisioned(node->handle); +#endif + /* Mimic fil_node_t::read_page0() in case the file exists and + has already been extended to a larger size. */ + ut_ad(node->size == size); + const os_offset_t file_size= os_file_get_size(node->handle); + if (file_size != os_offset_t(-1)) + { + const uint32_t n_pages= + uint32_t(file_size / fil_space_t::physical_size(flags)); + if (n_pages > size) + { + mysql_mutex_lock(&fil_system.mutex); + space->size= node->size= n_pages; + space->set_committed_size(); + mysql_mutex_unlock(&fil_system.mutex); + goto size_set; + } + } + if (!os_file_set_size(node->name, node->handle, + (size * fil_space_t::physical_size(flags)) & + ~4095ULL, is_sparse)) + { + space->release(); + goto release_and_fail; + } + size_set: + node->deferred= false; + it->second.space= space; + block->page.lock.x_unlock(); + p->second.being_processed= -1; + return space; + } + + release_and_fail: + block->page.lock.x_unlock(); + } + +fail: + ib::error() << "Cannot apply log to " << p->first + << " of corrupted file '" << name << "'"; + return nullptr; +} + +/** Process a record that indicates that a tablespace is +being shrunk in size. +@param page_id first page identifier that is not in the file +@param lsn log sequence number of the shrink operation */ +inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn) +{ + DBUG_ENTER("recv_sys_t::trim"); + DBUG_LOG("ib_log", "discarding log beyond end of tablespace " + << page_id << " before LSN " << lsn); + mysql_mutex_assert_owner(&mutex); + if (pages_it != pages.end() && pages_it->first.space() == page_id.space()) + pages_it= pages.end(); + for (recv_sys_t::map::iterator p = pages.lower_bound(page_id); + p != pages.end() && p->first.space() == page_id.space();) + { + recv_sys_t::map::iterator r = p++; + if (r->second.trim(lsn)) + { + ut_ad(!r->second.being_processed); + pages.erase(r); + } + } + DBUG_VOID_RETURN; +} + +inline dberr_t recv_sys_t::read(os_offset_t total_offset, span buf) +{ + size_t file_idx= static_cast(total_offset / log_sys.file_size); + os_offset_t offset= total_offset % log_sys.file_size; + return file_idx + ? 
recv_sys.files[file_idx].read(offset, buf)
+    : log_sys.log.read(offset, buf);
+}
+
+inline size_t recv_sys_t::files_size()
+{
+  ut_ad(!files.empty());
+  return files.size();
+}
+
+/** Process a file name from a FILE_* record.
+@param[in]	name		file name
+@param[in]	len		length of the file name
+@param[in]	space_id	the tablespace ID
+@param[in]	ftype		FILE_MODIFY, FILE_DELETE, or FILE_RENAME
+@param[in]	lsn		lsn of the redo log
+@param[in]	if_exists	whether to check if the tablespace exists */
+static void fil_name_process(const char *name, ulint len, uint32_t space_id,
+			     mfile_type_t ftype, lsn_t lsn, bool if_exists)
+{
+	ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED
+	      || srv_operation == SRV_OPERATION_RESTORE
+	      || srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+
+	/* We will also insert space=NULL into the map, so that
+	further checks can ensure that a FILE_MODIFY record was
+	scanned before applying any page records for the space_id. */
+
+	const bool deleted{ftype == FILE_DELETE};
+	const file_name_t fname(std::string(name, len), deleted);
+	std::pair<recv_spaces_t::iterator, bool> p = recv_spaces.emplace(
+		space_id, fname);
+	ut_ad(p.first->first == space_id);
+
+	file_name_t&	f = p.first->second;
+
+	if (auto d = deferred_spaces.find(space_id)) {
+		if (deleted) {
+			d->deleted = true;
+			goto got_deleted;
+		}
+		goto reload;
+	}
+
+	if (deleted) {
+got_deleted:
+		/* Got FILE_DELETE */
+		if (!p.second && f.status != file_name_t::DELETED) {
+			f.status = file_name_t::DELETED;
+			if (f.space != NULL) {
+				fil_space_free(space_id, false);
+				f.space = NULL;
+			}
+		}
+
+		ut_ad(f.space == NULL);
+	} else if (p.second // the first FILE_MODIFY or FILE_RENAME
+		   || f.name != fname.name) {
+reload:
+		fil_space_t*	space;
+
+		/* Check if the tablespace file exists and contains
+		the space_id. If not, ignore the file after displaying
+		a note. Abort if there are multiple files with the
+		same space_id. */
+		switch (fil_ibd_load(space_id, fname.name.c_str(), space)) {
+		case FIL_LOAD_OK:
+			ut_ad(space != NULL);
+
+			deferred_spaces.remove(space_id);
+			if (!f.space) {
+				if (f.size
+				    || f.flags != f.initial_flags) {
+					fil_space_set_recv_size_and_flags(
+						space->id, f.size, f.flags);
+				}
+
+				f.space = space;
+				goto same_space;
+			} else if (f.space == space) {
+same_space:
+				f.name = fname.name;
+				f.status = file_name_t::NORMAL;
+			} else {
+				sql_print_error("InnoDB: Tablespace " UINT32PF
+						" has been found"
+						" in two places:"
+						" '%.*s' and '%.*s'."
+						" You must delete"
+						" one of them.",
+						space_id,
+						int(f.name.size()),
+						f.name.data(),
+						int(fname.name.size()),
+						fname.name.data());
+				recv_sys.set_corrupt_fs();
+			}
+			break;
+
+		case FIL_LOAD_ID_CHANGED:
+			ut_ad(space == NULL);
+			break;
+
+		case FIL_LOAD_NOT_FOUND:
+			/* No matching tablespace was found; maybe it
+			was renamed, and we will find a subsequent
+			FILE_* record. */
+			ut_ad(space == NULL);
+
+			if (srv_force_recovery) {
+				/* Without innodb_force_recovery,
+				missing tablespaces will only be
+				reported in
+				recv_init_crash_recovery_spaces().
+				Enable some more diagnostics when
+				forcing recovery.
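// The arithmetic in recv_sys_t::read() above: a logical offset into a set
// of equally sized log files is split into (file index, offset in file);
// index 0 is ib_logfile0, which is read via log_sys.log. Stand-alone sketch:
#include <cstddef>
#include <cstdint>

struct log_pos { size_t file_idx; uint64_t offset; };

static log_pos locate(uint64_t total_offset, uint64_t file_size)
{
  return {static_cast<size_t>(total_offset / file_size),
          total_offset % file_size};
}
// e.g. with file_size=1048576, total_offset=2621440 -> {2, 524288}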
*/ + + sql_print_information( + "InnoDB: At LSN: " LSN_PF + ": unable to open file %.*s" + " for tablespace " UINT32PF, + recv_sys.lsn, + int(fname.name.size()), + fname.name.data(), space_id); + } + break; + + case FIL_LOAD_DEFER: + /** Skip the deferred spaces + when lsn is already processed */ + if (!if_exists) { + deferred_spaces.add( + space_id, fname.name.c_str(), lsn); + } + break; + case FIL_LOAD_INVALID: + ut_ad(space == NULL); + if (srv_force_recovery == 0) { + sql_print_error("InnoDB: Recovery cannot access" + " file %.*s (tablespace " + UINT32PF ")", int(len), name, + space_id); + sql_print_information("InnoDB: You may set " + "innodb_force_recovery=1" + " to ignore this and" + " possibly get a" + " corrupted database."); + recv_sys.set_corrupt_fs(); + break; + } + + sql_print_warning("InnoDB: Ignoring changes to" + " file %.*s (tablespace " + UINT32PF ")" + " due to innodb_force_recovery", + int(len), name, space_id); + } + } +} + +void recv_sys_t::close_files() +{ + for (auto &file : files) + if (file.is_opened()) + file.close(); + files.clear(); + files.shrink_to_fit(); +} + +/** Clean up after recv_sys_t::create() */ +void recv_sys_t::close() +{ + ut_ad(this == &recv_sys); + + if (is_initialised()) + { + dblwr.pages.clear(); + ut_d(mysql_mutex_lock(&mutex)); + clear(); + deferred_spaces.clear(); + ut_d(mysql_mutex_unlock(&mutex)); + + scanned_lsn= 0; + mysql_mutex_destroy(&mutex); + } + + recv_spaces.clear(); + renamed_spaces.clear(); + mlog_init.clear(); + close_files(); +} + +/** Initialize the redo log recovery subsystem. */ +void recv_sys_t::create() +{ + ut_ad(this == &recv_sys); + ut_ad(!is_initialised()); + mysql_mutex_init(recv_sys_mutex_key, &mutex, nullptr); + + apply_log_recs = false; + + len = 0; + offset = 0; + lsn = 0; + scanned_lsn = 1; + found_corrupt_log = false; + found_corrupt_fs = false; + file_checkpoint = 0; + + progress_time = time(NULL); + ut_ad(pages.empty()); + pages_it = pages.end(); + recv_max_page_lsn = 0; + + memset(truncated_undo_spaces, 0, sizeof truncated_undo_spaces); + UT_LIST_INIT(blocks, &buf_block_t::unzip_LRU); +} + +/** Clear a fully processed set of stored redo log records. */ +void recv_sys_t::clear() +{ + mysql_mutex_assert_owner(&mutex); + apply_log_recs= false; + ut_ad(!after_apply || found_corrupt_fs || !UT_LIST_GET_LAST(blocks)); + pages.clear(); + pages_it= pages.end(); + + for (buf_block_t *block= UT_LIST_GET_LAST(blocks); block; ) + { + buf_block_t *prev_block= UT_LIST_GET_PREV(unzip_LRU, block); + ut_ad(block->page.state() == buf_page_t::MEMORY); + UT_LIST_REMOVE(blocks, block); + MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size); + buf_block_free(block); + block= prev_block; + } +} + +/** Free most recovery data structures. */ +void recv_sys_t::debug_free() +{ + ut_ad(this == &recv_sys); + ut_ad(is_initialised()); + mysql_mutex_lock(&mutex); + + recovery_on= false; + pages.clear(); + pages_it= pages.end(); + + mysql_mutex_unlock(&mutex); +} + + +/** Free a redo log snippet. +@param data buffer allocated in add() */ +inline void recv_sys_t::free(const void *data) +{ + ut_ad(!ut_align_offset(data, ALIGNMENT)); + data= page_align(data); + mysql_mutex_assert_owner(&mutex); + + /* MDEV-14481 FIXME: To prevent race condition with buf_pool.resize(), + we must acquire and hold the buffer pool mutex here. 
*/ + ut_ad(!buf_pool.resize_in_progress()); + + auto *chunk= buf_pool.chunks; + for (auto i= buf_pool.n_chunks; i--; chunk++) + { + if (data < chunk->blocks->page.frame) + continue; + const size_t offs= (reinterpret_cast(data) - + chunk->blocks->page.frame) >> srv_page_size_shift; + if (offs >= chunk->size) + continue; + buf_block_t *block= &chunk->blocks[offs]; + ut_ad(block->page.frame == data); + ut_ad(block->page.state() == buf_page_t::MEMORY); + ut_ad(static_cast(block->page.access_time - 1) < + srv_page_size); + unsigned a= block->page.access_time; + ut_ad(a >= 1U << 16); + a-= 1U << 16; + block->page.access_time= a; + if (!(a >> 16)) + { + UT_LIST_REMOVE(blocks, block); + MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size); + buf_block_free(block); + } + return; + } + ut_ad(0); +} + + +/** @return whether a log_t::FORMAT_10_5 log block checksum matches */ +static bool recv_check_log_block(const byte *buf) +{ + return mach_read_from_4(my_assume_aligned<4>(508 + buf)) == + my_crc32c(0, buf, 508); +} + +/** Calculate the checksum for a log block using the pre-10.2.2 algorithm. */ +inline uint32_t log_block_calc_checksum_format_0(const byte *b) +{ + uint32_t sum= 1; + const byte *const end= &b[512 - 4]; + + for (uint32_t sh= 0; b < end; ) + { + sum&= 0x7FFFFFFFUL; + sum+= uint32_t{*b} << sh++; + sum+= *b++; + if (sh > 24) + sh= 0; + } + + return sum; +} + +/** Determine if a redo log from before MariaDB 10.2.2 is clean. +@return error code +@retval DB_SUCCESS if the redo log is clean +@retval DB_CORRUPTION if the redo log is corrupted +@retval DB_ERROR if the redo log is not empty */ +ATTRIBUTE_COLD static dberr_t recv_log_recover_pre_10_2() +{ + uint64_t max_no= 0; + + ut_ad(log_sys.format == 0); + + /** Offset of the first checkpoint checksum */ + constexpr uint CHECKSUM_1= 288; + /** Offset of the second checkpoint checksum */ + constexpr uint CHECKSUM_2= CHECKSUM_1 + 4; + /** the checkpoint LSN field */ + constexpr uint CHECKPOINT_LSN= 8; + /** Most significant bits of the checkpoint offset */ + constexpr uint OFFS_HI= CHECKSUM_2 + 12; + /** Least significant bits of the checkpoint offset */ + constexpr uint OFFS_LO= 16; + + lsn_t source_offset= 0; + const lsn_t log_size{(log_sys.file_size - 2048) * recv_sys.files_size()}; + for (size_t field= 512; field < 2048; field+= 1024) + { + const byte *buf= log_sys.buf + field; + + if (static_cast(ut_fold_binary(buf, CHECKSUM_1)) != + mach_read_from_4(buf + CHECKSUM_1) || + static_cast(ut_fold_binary(buf + CHECKPOINT_LSN, + CHECKSUM_2 - CHECKPOINT_LSN)) != + mach_read_from_4(buf + CHECKSUM_2)) + { + DBUG_PRINT("ib_log", ("invalid pre-10.2.2 checkpoint %zu", field)); + continue; + } + + if (!log_crypt_101_read_checkpoint(buf)) + { + sql_print_error("InnoDB: Decrypting checkpoint failed"); + continue; + } + + const uint64_t checkpoint_no= mach_read_from_8(buf); + + DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF " found", + checkpoint_no, + mach_read_from_8(buf + CHECKPOINT_LSN))); + + if (checkpoint_no < max_no) + continue; + + const lsn_t o= lsn_t{mach_read_from_4(buf + OFFS_HI)} << 32 | + mach_read_from_4(buf + OFFS_LO); + if (o >= 0x80c && (o & ~511) + 512 < log_size) + { + max_no= checkpoint_no; + log_sys.next_checkpoint_lsn= mach_read_from_8(buf + CHECKPOINT_LSN); + source_offset= o; + } + } + + const char *uag= srv_operation == SRV_OPERATION_NORMAL + ? "InnoDB: Upgrade after a crash is not supported." 
+ : "mariadb-backup --prepare is not possible."; + + if (!log_sys.next_checkpoint_lsn) + { + sql_print_error("%s" + " This redo log was created before MariaDB 10.2.2," + " and we did not find a valid checkpoint." + " Please follow the instructions at" + " https://mariadb.com/kb/en/library/upgrading/", uag); + return DB_ERROR; + } + + static const char pre_10_2[]= + " This redo log was created before MariaDB 10.2.2"; + + byte *buf= const_cast(field_ref_zero); + + if (source_offset < (log_sys.is_pmem() ? log_sys.file_size : 4096)) + memcpy_aligned<512>(buf, &log_sys.buf[source_offset & ~511], 512); + else + if (dberr_t err= recv_sys.read(source_offset & ~511, {buf, 512})) + return err; + + if (log_block_calc_checksum_format_0(buf) != + mach_read_from_4(my_assume_aligned<4>(buf + 508)) && + !log_crypt_101_read_block(buf, log_sys.next_checkpoint_lsn)) + { + sql_print_error("%s%s, and it appears corrupted.", uag, pre_10_2); + return DB_CORRUPTION; + } + + if (mach_read_from_2(buf + 4) == (source_offset & 511)) + return DB_SUCCESS; + + if (buf[20 + 32 * 9] == 2) + sql_print_error("InnoDB: Cannot decrypt log for upgrading." + " The encrypted log was created before MariaDB 10.2.2."); + else + sql_print_error("%s%s. You must start up and shut down" + " MariaDB 10.1 or MySQL 5.6 or earlier" + " on the data directory.", + uag, pre_10_2); + + return DB_ERROR; +} + +/** Determine if a redo log from MariaDB 10.2.2, 10.3, 10.4, or 10.5 is clean. +@param lsn_offset checkpoint LSN offset +@return error code +@retval DB_SUCCESS if the redo log is clean +@retval DB_CORRUPTION if the redo log is corrupted +@retval DB_ERROR if the redo log is not empty */ +static dberr_t recv_log_recover_10_5(lsn_t lsn_offset) +{ + byte *buf= const_cast(field_ref_zero); + + if (lsn_offset < (log_sys.is_pmem() ? log_sys.file_size : 4096)) + memcpy_aligned<512>(buf, &log_sys.buf[lsn_offset & ~511], 512); + else + { + if (dberr_t err= recv_sys.read(lsn_offset & ~lsn_t{4095}, {buf, 4096})) + return err; + buf+= lsn_offset & 0xe00; + } + + if (!recv_check_log_block(buf)) + { + sql_print_error("InnoDB: Invalid log header checksum"); + return DB_CORRUPTION; + } + + if (log_sys.is_encrypted() && + !log_decrypt(buf, log_sys.next_checkpoint_lsn & ~511, 512)) + return DB_ERROR; + + /* On a clean shutdown, the redo log will be logically empty + after the checkpoint lsn. 
*/ + + if (mach_read_from_2(my_assume_aligned<2>(buf + 4)) != (lsn_offset & 511)) + return DB_ERROR; + + return DB_SUCCESS; +} + +dberr_t recv_sys_t::find_checkpoint() +{ + bool wrong_size= false; + byte *buf; + + ut_ad(pages.empty()); + pages_it= pages.end(); + + if (files.empty()) + { + file_checkpoint= 0; + std::string path{get_log_file_path()}; + bool success; + os_file_t file{os_file_create_func(path.c_str(), + OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_NORMAL, OS_LOG_FILE, + srv_read_only_mode, &success)}; + if (file == OS_FILE_CLOSED) + return DB_ERROR; + const os_offset_t size{os_file_get_size(file)}; + if (!size) + { + if (srv_operation != SRV_OPERATION_NORMAL) + goto too_small; + } + else if (size < log_t::START_OFFSET + SIZE_OF_FILE_CHECKPOINT) + { + too_small: + sql_print_error("InnoDB: File %.*s is too small", + int(path.size()), path.data()); + err_exit: + os_file_close(file); + return DB_ERROR; + } + else if (!log_sys.attach(file, size)) + goto err_exit; + else + file= OS_FILE_CLOSED; + + recv_sys.files.emplace_back(file); + for (int i= 1; i < 101; i++) + { + path= get_log_file_path(LOG_FILE_NAME_PREFIX).append(std::to_string(i)); + file= os_file_create_func(path.c_str(), + OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT | + OS_FILE_ON_ERROR_SILENT, + OS_FILE_NORMAL, OS_LOG_FILE, true, &success); + if (file == OS_FILE_CLOSED) + break; + const os_offset_t sz{os_file_get_size(file)}; + if (size != sz) + { + sql_print_error("InnoDB: Log file %.*s is of different size " UINT64PF + " bytes than other log files " UINT64PF " bytes!", + int(path.size()), path.data(), sz, size); + wrong_size= true; + } + recv_sys.files.emplace_back(file); + } + + if (!size) + { + if (wrong_size) + return DB_CORRUPTION; + if (log_sys.next_checkpoint_lsn < 8204) + { + /* Before MDEV-14425, InnoDB had a minimum LSN of 8192+12=8204. + Likewise, mariadb-backup --prepare would create an empty + ib_logfile0 after applying the log. We will allow an upgrade + from such an empty log. + + If a user replaces the redo log with an empty file and the + FIL_PAGE_FILE_FLUSH_LSN field was zero in the system + tablespace (see SysTablespace::read_lsn_and_check_flags()) we + must refuse to start up. */ + sql_print_error("InnoDB: ib_logfile0 is empty, and LSN is unknown."); + return DB_CORRUPTION; + } + lsn= log_sys.next_checkpoint_lsn; + log_sys.format= log_t::FORMAT_3_23; + goto upgrade; + } + } + else + ut_ad(srv_operation == SRV_OPERATION_BACKUP); + log_sys.next_checkpoint_lsn= 0; + lsn= 0; + buf= my_assume_aligned<4096>(log_sys.buf); + if (!log_sys.is_pmem()) + if (dberr_t err= log_sys.log.read(0, {buf, 4096})) + return err; + /* Check the header page checksum. There was no + checksum in the first redo log format (version 0). */ + log_sys.format= mach_read_from_4(buf + LOG_HEADER_FORMAT); + if (log_sys.format == log_t::FORMAT_3_23) + { + if (wrong_size) + return DB_CORRUPTION; + if (dberr_t err= recv_log_recover_pre_10_2()) + return err; + upgrade: + memset_aligned<4096>(const_cast(field_ref_zero), 0, 4096); + /* Mark the redo log for upgrading. 
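// find_checkpoint() above accepts a multi-file redo log only for the old
// formats: ib_logfile0 plus ib_logfile1..ib_logfile100, stopping at the
// first file that does not open, and requiring every file to match the
// size of ib_logfile0. A sketch of just the name enumeration (invented
// helper, not InnoDB code):
#include <string>
#include <vector>

static std::vector<std::string> log_file_names(const std::string &dir,
                                               int upto= 100)
{
  std::vector<std::string> names{dir + "/ib_logfile0"};
  for (int i= 1; i <= upto; i++)
    names.emplace_back(dir + "/ib_logfile" + std::to_string(i));
  return names;
}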
*/ + log_sys.last_checkpoint_lsn= log_sys.next_checkpoint_lsn; + log_sys.set_recovered_lsn(log_sys.next_checkpoint_lsn); + lsn= file_checkpoint= log_sys.next_checkpoint_lsn; + log_sys.next_checkpoint_no= 0; + return DB_SUCCESS; + } + + if (!recv_check_log_block(buf)) + { + sql_print_error("InnoDB: Invalid log header checksum"); + return DB_CORRUPTION; + } + + const lsn_t first_lsn{mach_read_from_8(buf + LOG_HEADER_START_LSN)}; + log_sys.set_first_lsn(first_lsn); + char creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR + 1]; + memcpy(creator, buf + LOG_HEADER_CREATOR, sizeof creator); + /* Ensure that the string is NUL-terminated. */ + creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR]= 0; + + lsn_t lsn_offset= 0; + + switch (log_sys.format) { + default: + sql_print_error("InnoDB: Unsupported redo log format." + " The redo log was created with %s.", creator); + return DB_ERROR; + case log_t::FORMAT_10_8: + if (files.size() != 1) + { + sql_print_error("InnoDB: Expecting only ib_logfile0"); + return DB_CORRUPTION; + } + + if (*reinterpret_cast(buf + LOG_HEADER_FORMAT + 4) || + first_lsn < log_t::FIRST_LSN) + { + sql_print_error("InnoDB: Invalid ib_logfile0 header block;" + " the log was created with %s.", creator); + return DB_CORRUPTION; + } + + if (!mach_read_from_4(buf + LOG_HEADER_CREATOR_END)); + else if (!log_crypt_read_header(buf + LOG_HEADER_CREATOR_END)) + { + sql_print_error("InnoDB: Reading log encryption info failed;" + " the log was created with %s.", creator); + return DB_ERROR; + } + else + log_sys.format= log_t::FORMAT_ENC_10_8; + + for (size_t field= log_t::CHECKPOINT_1; field <= log_t::CHECKPOINT_2; + field+= log_t::CHECKPOINT_2 - log_t::CHECKPOINT_1) + { + if (log_sys.is_pmem()) + buf= log_sys.buf + field; + else + if (dberr_t err= log_sys.log.read(field, + {buf, log_sys.get_block_size()})) + return err; + const lsn_t checkpoint_lsn{mach_read_from_8(buf)}; + const lsn_t end_lsn{mach_read_from_8(buf + 8)}; + if (checkpoint_lsn < first_lsn || end_lsn < checkpoint_lsn || + memcmp(buf + 16, field_ref_zero, 60 - 16) || + my_crc32c(0, buf, 60) != mach_read_from_4(buf + 60)) + { + DBUG_PRINT("ib_log", ("invalid checkpoint at %zu", field)); + continue; + } + + if (checkpoint_lsn >= log_sys.next_checkpoint_lsn) + { + log_sys.next_checkpoint_lsn= checkpoint_lsn; + log_sys.next_checkpoint_no= field == log_t::CHECKPOINT_1; + lsn= end_lsn; + } + } + if (!log_sys.next_checkpoint_lsn) + goto got_no_checkpoint; + if (!memcmp(creator, "Backup ", 7)) + srv_start_after_restore= true; + return DB_SUCCESS; + case log_t::FORMAT_10_5: + case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED: + if (files.size() != 1) + { + sql_print_error("InnoDB: Expecting only ib_logfile0"); + return DB_CORRUPTION; + } + /* fall through */ + case log_t::FORMAT_10_2: + case log_t::FORMAT_10_2 | log_t::FORMAT_ENCRYPTED: + case log_t::FORMAT_10_3: + case log_t::FORMAT_10_3 | log_t::FORMAT_ENCRYPTED: + case log_t::FORMAT_10_4: + case log_t::FORMAT_10_4 | log_t::FORMAT_ENCRYPTED: + uint64_t max_no= 0; + const lsn_t log_size{(log_sys.file_size - 2048) * files.size()}; + for (size_t field= 512; field < 2048; field += 1024) + { + const byte *b = buf + field; + + if (!recv_check_log_block(b)) + { + DBUG_PRINT("ib_log", ("invalid checkpoint checksum at %zu", field)); + continue; + } + + if (log_sys.is_encrypted() && !log_crypt_read_checkpoint_buf(b)) + { + sql_print_error("InnoDB: Reading checkpoint encryption info failed."); + continue; + } + + const uint64_t checkpoint_no= mach_read_from_8(b); + const lsn_t 
checkpoint_lsn= mach_read_from_8(b + 8);
+      DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF " found",
+                            checkpoint_no, checkpoint_lsn));
+      const lsn_t o{mach_read_from_8(b + 16)};
+      if (checkpoint_no >= max_no && o >= 0x80c && (o & ~511) + 512 < log_size)
+      {
+        max_no= checkpoint_no;
+        log_sys.next_checkpoint_lsn= checkpoint_lsn;
+        log_sys.next_checkpoint_no= field == 512;
+        lsn_offset= mach_read_from_8(b + 16);
+      }
+    }
+  }
+
+  if (!log_sys.next_checkpoint_lsn)
+  {
+  got_no_checkpoint:
+    sql_print_error("InnoDB: No valid checkpoint was found;"
+                    " the log was created with %s.", creator);
+    return DB_ERROR;
+  }
+
+  if (wrong_size)
+    return DB_CORRUPTION;
+
+  if (dberr_t err= recv_log_recover_10_5(lsn_offset))
+  {
+    const char *msg1, *msg2, *msg3;
+    msg1= srv_operation == SRV_OPERATION_NORMAL
+      ? "InnoDB: Upgrade after a crash is not supported."
+      : "mariadb-backup --prepare is not possible.";
+
+    if (err == DB_ERROR)
+    {
+      msg2= srv_operation == SRV_OPERATION_NORMAL
+        ? ". You must start up and shut down MariaDB "
+        : ". You must use mariadb-backup ";
+      msg3= (log_sys.format & ~log_t::FORMAT_ENCRYPTED) == log_t::FORMAT_10_5
+        ? "10.7 or earlier." : "10.4 or earlier.";
+    }
+    else
+      msg2= ", and it appears corrupted.", msg3= "";
+
+    sql_print_error("%s The redo log was created with %s%s%s",
+                    msg1, creator, msg2, msg3);
+    return err;
+  }
+
+  goto upgrade;
+}
+
+/** Trim old log records for a page.
+@param start_lsn oldest log sequence number to preserve
+@return whether all the log for the page was trimmed */
+inline bool page_recv_t::trim(lsn_t start_lsn)
+{
+  while (log.head)
+  {
+    if (log.head->lsn > start_lsn) return false;
+    last_offset= 1; /* the next record must not be same_page */
+    log_rec_t *next= log.head->next;
+    recv_sys.free(log.head);
+    log.head= next;
+  }
+  log.tail= nullptr;
+  return true;
+}
+
+
+void page_recv_t::recs_t::rewind(lsn_t start_lsn)
+{
+  mysql_mutex_assert_owner(&recv_sys.mutex);
+  log_phys_t *trim= static_cast<log_phys_t*>(head);
+  ut_ad(trim);
+  while (log_phys_t *next= static_cast<log_phys_t*>(trim->next))
+  {
+    ut_ad(trim->start_lsn < start_lsn);
+    if (next->start_lsn == start_lsn)
+      break;
+    trim= next;
+  }
+  tail= trim;
+  log_rec_t *l= tail->next;
+  tail->next= nullptr;
+  while (l)
+  {
+    log_rec_t *next= l->next;
+    recv_sys.free(l);
+    l= next;
+  }
+}
+
+
+void page_recv_t::recs_t::clear()
+{
+  mysql_mutex_assert_owner(&recv_sys.mutex);
+  for (const log_rec_t *l= head; l; )
+  {
+    const log_rec_t *next= l->next;
+    recv_sys.free(l);
+    l= next;
+  }
+  head= tail= nullptr;
+}
+
+/** Ignore any earlier redo log records for this page. */
+inline void page_recv_t::will_not_read()
+{
+  ut_ad(!being_processed);
+  skip_read= true;
+  log.clear();
+}
+
+void recv_sys_t::erase(map::iterator p)
+{
+  ut_ad(p->second.being_processed <= 0);
+  p->second.log.clear();
+  pages.erase(p);
+}
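+
+/* Clarifying note (added here; inferred from the surrounding code, not an
+upstream comment): page_recv_t::being_processed acts as a small state
+machine that erase() above and garbage_collect() below rely on. An entry
+starts at 0 (records merely buffered), is set to 1 while a recovery batch
+is applying the records, and finally to -1 once they have been applied or
+discarded, at which point the map entry may be garbage-collected. */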
+
+/** Free log for processed pages. */
+void recv_sys_t::garbage_collect()
+{
+  mysql_mutex_assert_owner(&mutex);
+
+  if (pages_it != pages.end() && pages_it->second.being_processed < 0)
+    pages_it= pages.end();
+
+  for (map::iterator p= pages.begin(); p != pages.end(); )
+  {
+    if (p->second.being_processed < 0)
+    {
+      map::iterator r= p++;
+      erase(r);
+    }
+    else
+      p++;
+  }
+}
+
+/** Allocate a block from the buffer pool for recv_sys.pages */
+ATTRIBUTE_COLD buf_block_t *recv_sys_t::add_block()
+{
+  for (bool freed= false;;)
+  {
+    const auto rs= UT_LIST_GET_LEN(blocks) * 2;
+    mysql_mutex_lock(&buf_pool.mutex);
+    const auto bs=
+      UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU);
+    if (UNIV_LIKELY(bs > BUF_LRU_MIN_LEN || rs < bs))
+    {
+      buf_block_t *block= buf_LRU_get_free_block(true);
+      mysql_mutex_unlock(&buf_pool.mutex);
+      return block;
+    }
+    /* out of memory: redo log occupies more than 1/3 of buf_pool
+    and there are fewer than BUF_LRU_MIN_LEN pages left */
+    mysql_mutex_unlock(&buf_pool.mutex);
+    if (freed)
+      return nullptr;
+    freed= true;
+    garbage_collect();
+  }
+}
+
+/** Wait for buffer pool to become available. */
+ATTRIBUTE_COLD void recv_sys_t::wait_for_pool(size_t pages)
+{
+  mysql_mutex_unlock(&mutex);
+  os_aio_wait_until_no_pending_reads(false);
+  mysql_mutex_lock(&mutex);
+  garbage_collect();
+  mysql_mutex_lock(&buf_pool.mutex);
+  bool need_more= UT_LIST_GET_LEN(buf_pool.free) < pages;
+  mysql_mutex_unlock(&buf_pool.mutex);
+  if (need_more)
+    buf_flush_sync_batch(lsn);
+}
+
+/** Register a redo log snippet for a page.
+@param it        page iterator
+@param start_lsn start LSN of the mini-transaction
+@param lsn       @see mtr_t::commit_lsn()
+@param l         redo log snippet
+@param len       length of l, in bytes
+@return whether we ran out of memory */
+ATTRIBUTE_NOINLINE
+bool recv_sys_t::add(map::iterator it, lsn_t start_lsn, lsn_t lsn,
+                     const byte *l, size_t len)
+{
+  mysql_mutex_assert_owner(&mutex);
+  page_recv_t &recs= it->second;
+  buf_block_t *block;
+
+  switch (*l & 0x70) {
+  case FREE_PAGE: case INIT_PAGE:
+    recs.will_not_read();
+    mlog_init.add(it->first, start_lsn); /* FIXME: remove this!
*/
+    /* fall through */
+  default:
+    log_phys_t *tail= static_cast<log_phys_t*>(recs.log.last());
+    if (!tail)
+      break;
+    if (tail->start_lsn != start_lsn)
+      break;
+    ut_ad(tail->lsn == lsn);
+    block= UT_LIST_GET_LAST(blocks);
+    ut_ad(block);
+    const size_t used= static_cast<uint16_t>(block->page.access_time - 1) + 1;
+    ut_ad(used >= ALIGNMENT);
+    const byte *end= const_cast<const log_phys_t*>(tail)->end();
+    if (!((reinterpret_cast<size_t>(end + len) ^
+           reinterpret_cast<size_t>(end)) & ~(ALIGNMENT - 1)))
+    {
+      /* Use already allocated 'padding' bytes */
+append:
+      MEM_MAKE_ADDRESSABLE(end + 1, len);
+      /* Append to the preceding record for the page */
+      tail->append(l, len);
+      return false;
+    }
+    if (end <= &block->page.frame[used - ALIGNMENT] ||
+        &block->page.frame[used] >= end)
+      break; /* Not the last allocated record in the page */
+    const size_t new_used= static_cast<size_t>
+      (end - block->page.frame + len + 1);
+    ut_ad(new_used > used);
+    if (new_used > srv_page_size)
+      break;
+    block->page.access_time= (block->page.access_time & ~0U << 16) |
+      ut_calc_align(static_cast<uint16_t>(new_used), ALIGNMENT);
+    goto append;
+  }
+
+  const size_t size{log_phys_t::alloc_size(len)};
+  ut_ad(size <= srv_page_size);
+  void *buf;
+  block= UT_LIST_GET_FIRST(blocks);
+  if (UNIV_UNLIKELY(!block))
+  {
+  create_block:
+    block= add_block();
+    if (UNIV_UNLIKELY(!block))
+      return true;
+    block->page.access_time= 1U << 16 |
+      ut_calc_align(static_cast<uint16_t>(size), ALIGNMENT);
+    static_assert(ut_is_2pow(ALIGNMENT), "ALIGNMENT must be a power of 2");
+    UT_LIST_ADD_FIRST(blocks, block);
+    MEM_MAKE_ADDRESSABLE(block->page.frame, size);
+    MEM_NOACCESS(block->page.frame + size, srv_page_size - size);
+    buf= block->page.frame;
+  }
+  else
+  {
+    size_t free_offset= static_cast<uint16_t>(block->page.access_time);
+    ut_ad(!ut_2pow_remainder(free_offset, ALIGNMENT));
+    if (UNIV_UNLIKELY(!free_offset))
+    {
+      ut_ad(srv_page_size == 65536);
+      goto create_block;
+    }
+    ut_ad(free_offset <= srv_page_size);
+    free_offset+= size;
+
+    if (free_offset > srv_page_size)
+      goto create_block;
+
+    block->page.access_time= ((block->page.access_time >> 16) + 1) << 16 |
+      ut_calc_align(static_cast<uint16_t>(free_offset), ALIGNMENT);
+    MEM_MAKE_ADDRESSABLE(block->page.frame + free_offset - size, size);
+    buf= block->page.frame + free_offset - size;
+  }
+
+  recs.log.append(new (my_assume_aligned<ALIGNMENT>(buf))
+                  log_phys_t{start_lsn, lsn, l, len});
+  return false;
+}
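+
+/* Illustrative sketch (added; not upstream code): recv_sys_t::add() above
+repurposes buf_page_t::access_time of the recovery-owned blocks as two
+16-bit fields: the high half counts the records allocated from the block,
+and the low half holds the used size in bytes, rounded up to ALIGNMENT.
+A minimal model of that packing, under those assumptions: */
+#if 0
+struct recv_block_usage
+{
+  uint32_t packed= 0;                /* plays the role of page.access_time */
+  uint16_t records() const { return uint16_t(packed >> 16); }
+  uint16_t used_bytes() const { return uint16_t(packed); }
+  /* account for one more record, ending at new_used (ALIGNMENT-rounded) */
+  void allocate(uint16_t new_used)
+  { packed= (uint32_t(records() + 1) << 16) | new_used; }
+};
+#endif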
+
+/** Store/remove the freed pages in fil_name_t of recv_spaces.
+@param[in]	page_id		freed or init page_id
+@param[in]	freed		TRUE if page is freed */
+static void store_freed_or_init_rec(page_id_t page_id, bool freed)
+{
+  uint32_t space_id= page_id.space();
+  uint32_t page_no= page_id.page_no();
+  if (is_predefined_tablespace(space_id))
+  {
+    if (!srv_immediate_scrub_data_uncompressed)
+      return;
+    fil_space_t *space;
+    if (space_id == TRX_SYS_SPACE)
+      space= fil_system.sys_space;
+    else
+      space= fil_space_get(space_id);
+
+    space->free_page(page_no, freed);
+    return;
+  }
+
+  recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id);
+  if (i != recv_spaces.end() && i->first == space_id)
+  {
+    if (freed)
+      i->second.add_freed_page(page_no);
+    else
+      i->second.remove_freed_page(page_no);
+  }
+}
+
+/** Wrapper for log_sys.buf[] between recv_sys.offset and recv_sys.len */
+struct recv_buf
+{
+  bool is_pmem() const noexcept { return log_sys.is_pmem(); }
+
+  const byte *ptr;
+
+  constexpr recv_buf(const byte *ptr) : ptr(ptr) {}
+  constexpr bool operator==(const recv_buf other) const
+  { return ptr == other.ptr; }
+
+  static const byte *end() { return &log_sys.buf[recv_sys.len]; }
+
+  const char *get_filename(byte*, size_t) const noexcept
+  { return reinterpret_cast<const char*>(ptr); }
+
+  bool is_eof(size_t len= 0) const noexcept { return ptr + len >= end(); }
+
+  byte operator*() const noexcept
+  {
+    ut_ad(ptr >= log_sys.buf);
+    ut_ad(ptr < end());
+    return *ptr;
+  }
+  byte operator[](size_t size) const noexcept { return *(*this + size); }
+  recv_buf operator+(size_t len) const noexcept
+  { recv_buf r{*this}; return r+= len; }
+  recv_buf &operator++() noexcept { return *this+= 1; }
+  recv_buf &operator+=(size_t len) noexcept { ptr+= len; return *this; }
+
+  size_t operator-(const recv_buf start) const noexcept
+  {
+    ut_ad(ptr >= start.ptr);
+    return size_t(ptr - start.ptr);
+  }
+
+  uint32_t crc32c(const recv_buf start) const noexcept
+  {
+    return my_crc32c(0, start.ptr, ptr - start.ptr);
+  }
+
+  void *memcpy(void *buf, size_t size) const noexcept
+  {
+    ut_ad(size);
+    ut_ad(!is_eof(size - 1));
+    return ::memcpy(buf, ptr, size);
+  }
+
+  bool is_zero(size_t size) const noexcept
+  {
+    ut_ad(!is_eof(size));
+    return !memcmp(ptr, field_ref_zero, size);
+  }
+
+  uint64_t read8() const noexcept
+  { ut_ad(!is_eof(7)); return mach_read_from_8(ptr); }
+  uint32_t read4() const noexcept
+  { ut_ad(!is_eof(3)); return mach_read_from_4(ptr); }
+
+  /** Update the pointer if the new pointer is within the buffer. */
+  bool set_if_contains(const byte *pos) noexcept
+  {
+    if (pos > end() || pos < ptr)
+      return false;
+    ptr= pos;
+    return true;
+  }
+
+  /** Get the contiguous, unencrypted buffer.
+  @param buf          return value of copy_if_needed()
+  @param start        start of the mini-transaction
+  @param decrypt_buf  possibly, a copy of the mini-transaction
+  @return contiguous, non-encrypted buffer */
+  const byte *get_buf(const byte *buf, const recv_buf start,
+                      const byte *decrypt_buf) const noexcept
+  { return ptr == buf ? start.ptr : decrypt_buf; }
+
+  /** Copy and decrypt a log record if needed.
+  @param iv    initialization vector
+  @param tmp   buffer for the decrypted log record
+  @param start un-encrypted start of the log record
+  @param len   length of the possibly encrypted part, in bytes */
+  const byte *copy_if_needed(const byte *iv, byte *tmp, recv_buf start,
+                             size_t len)
+  {
+    ut_ad(*this - start + len <= srv_page_size);
+    if (!len || !log_sys.is_encrypted())
+      return ptr;
+    const size_t s(*this - start);
+    start.memcpy(tmp, s);
+    return log_decrypt_buf(iv, tmp + s, ptr, static_cast<uint>(len));
+  }
+};
+
+#ifdef HAVE_PMEM
+/** Ring buffer wrapper for log_sys.buf[]; recv_sys.len == log_sys.file_size */
+struct recv_ring : public recv_buf
+{
+  static constexpr bool is_pmem() { return true; }
+
+  constexpr recv_ring(const byte *ptr) : recv_buf(ptr) {}
+
+  constexpr static bool is_eof() { return false; }
+  constexpr static bool is_eof(size_t) { return false; }
+
+  byte operator*() const noexcept
+  {
+    ut_ad(ptr >= &log_sys.buf[log_sys.START_OFFSET]);
+    ut_ad(ptr < end());
+    return *ptr;
+  }
+  byte operator[](size_t size) const noexcept { return *(*this + size); }
+  recv_ring operator+(size_t len) const noexcept
+  { recv_ring r{*this}; return r+= len; }
+  recv_ring &operator++() noexcept { return *this+= 1; }
+  recv_ring &operator+=(size_t len) noexcept
+  {
+    ut_ad(ptr < end());
+    ut_ad(ptr >= &log_sys.buf[log_sys.START_OFFSET]);
+    ut_ad(len < recv_sys.MTR_SIZE_MAX * 2);
+    ptr+= len;
+    if (ptr >= end())
+    {
+      ptr-= recv_sys.len - log_sys.START_OFFSET;
+      ut_ad(ptr >= &log_sys.buf[log_sys.START_OFFSET]);
+      ut_ad(ptr < end());
+    }
+    return *this;
+  }
+  size_t operator-(const recv_ring start) const noexcept
+  {
+    auto s= ptr - start.ptr;
+    return s >= 0
+      ? size_t(s)
+      : size_t(s + recv_sys.len - log_sys.START_OFFSET);
+  }
+
+  uint32_t crc32c(const recv_ring start) const noexcept
+  {
+    return ptr >= start.ptr
+      ? my_crc32c(0, start.ptr, ptr - start.ptr)
+      : my_crc32c(my_crc32c(0, start.ptr, end() - start.ptr),
+                  &log_sys.buf[log_sys.START_OFFSET],
+                  ptr - &log_sys.buf[log_sys.START_OFFSET]);
+  }
+
+  void *memcpy(void *buf, size_t size) const noexcept
+  {
+    ut_ad(size);
+    ut_ad(size < srv_page_size);
+
+    auto s= ptr + size - end();
+    if (s <= 0)
+      return ::memcpy(buf, ptr, size);
+    ::memcpy(buf, ptr, size - s);
+    ::memcpy(static_cast<byte*>(buf) + size - s,
+             &log_sys.buf[log_sys.START_OFFSET], s);
+    return buf;
+  }
+
+  bool is_zero(size_t size) const noexcept
+  {
+    auto s= ptr + size - end();
+    if (s <= 0)
+      return !memcmp(ptr, field_ref_zero, size);
+    return !memcmp(ptr, field_ref_zero, size - s) &&
+      !memcmp(&log_sys.buf[log_sys.START_OFFSET], field_ref_zero, s);
+  }
+
+  uint64_t read8() const noexcept
+  {
+    if (UNIV_LIKELY(ptr + 8 <= end()))
+      return mach_read_from_8(ptr);
+    byte b[8];
+    return mach_read_from_8(static_cast<const byte*>(memcpy(b, 8)));
+  }
+  uint32_t read4() const noexcept
+  {
+    if (UNIV_LIKELY(ptr + 4 <= end()))
+      return mach_read_from_4(ptr);
+    byte b[4];
+    return mach_read_from_4(static_cast<const byte*>(memcpy(b, 4)));
+  }
+
+  /** Get the contiguous, unencrypted buffer.
+  @param buf          return value of copy_if_needed()
+  @param start        start of the mini-transaction
+  @param decrypt_buf  possibly, a copy of the mini-transaction
+  @return contiguous, non-encrypted buffer */
+  const byte *get_buf(const byte *buf, const recv_ring start,
+                      const byte *decrypt_buf) const noexcept
+  { return ptr == buf && start.ptr < ptr ? start.ptr : decrypt_buf; }
+
+  const char *get_filename(byte* buf, size_t rlen) const noexcept
+  {
+    return UNIV_LIKELY(ptr + rlen <= end())
+      ? reinterpret_cast<const char*>(ptr)
+      : static_cast<const char*>(memcpy(buf, rlen));
+  }
+
+  /** Copy and decrypt a log record if needed.
+  @param iv    initialization vector
+  @param tmp   buffer for the decrypted log record
+  @param start un-encrypted start of the log record
+  @param len   length of the possibly encrypted part, in bytes */
+  const byte *copy_if_needed(const byte *iv, byte *tmp, recv_ring start,
+                             size_t len)
+  {
+    const size_t s(*this - start);
+    ut_ad(s + len <= srv_page_size);
+    if (!log_sys.is_encrypted())
+    {
+      if (start.ptr + s == ptr && ptr + len <= end())
+        return ptr;
+      start.memcpy(tmp, s + len);
+      return tmp + s;
+    }
+
+    start.memcpy(tmp, s);
+
+    const byte *b= ptr;
+    if (ptr + len > end())
+      b= static_cast<byte*>(memcpy(alloca(len), len));
+    return log_decrypt_buf(iv, tmp + s, b, static_cast<uint>(len));
+  }
+};
+#endif
+
+template<typename source>
+void recv_sys_t::rewind(source &l, source &begin) noexcept
+{
+  ut_ad(srv_operation != SRV_OPERATION_BACKUP);
+  mysql_mutex_assert_owner(&mutex);
+
+  const source end= l;
+  uint32_t rlen;
+  for (l= begin; !(l == end); l+= rlen)
+  {
+    const source recs{l};
+    ++l;
+    const byte b= *recs;
+
+    ut_ad(b > 1);
+    ut_ad(UNIV_LIKELY((b & 0x70) != RESERVED) || srv_force_recovery);
+
+    rlen= b & 0xf;
+    if (!rlen)
+    {
+      const uint32_t lenlen= mlog_decode_varint_length(*l);
+      const uint32_t addlen= mlog_decode_varint(l);
+      ut_ad(addlen != MLOG_DECODE_ERROR);
+      rlen= addlen + 15 - lenlen;
+      l+= lenlen;
+    }
+    ut_ad(!l.is_eof(rlen));
+    if (b & 0x80)
+      continue;
+
+    uint32_t idlen= mlog_decode_varint_length(*l);
+    if (UNIV_UNLIKELY(idlen > 5 || idlen >= rlen))
+      continue;
+    const uint32_t space_id= mlog_decode_varint(l);
+    if (UNIV_UNLIKELY(space_id == MLOG_DECODE_ERROR))
+      continue;
+    l+= idlen;
+    rlen-= idlen;
+    idlen= mlog_decode_varint_length(*l);
+    if (UNIV_UNLIKELY(idlen > 5 || idlen > rlen))
+      continue;
+    const uint32_t page_no= mlog_decode_varint(l);
+    if (UNIV_UNLIKELY(page_no == MLOG_DECODE_ERROR))
+      continue;
+    const page_id_t id{space_id, page_no};
+    if (pages_it == pages.end() || pages_it->first != id)
+    {
+      pages_it= pages.find(id);
+      if (pages_it == pages.end())
+        continue;
+    }
+
+    ut_ad(!pages_it->second.being_processed);
+    const log_phys_t *head=
+      static_cast<const log_phys_t*>(*pages_it->second.log.begin());
+    if (!head || head->start_lsn == lsn)
+    {
+      erase(pages_it);
+      pages_it= pages.end();
+    }
+    else
+      pages_it->second.log.rewind(lsn);
+  }
+
+  l= begin;
+  pages_it= pages.end();
+}
+
+/** Parse and register one log_t::FORMAT_10_8 mini-transaction.
+@tparam store     whether to store the records
+@param l          log data source
+@param if_exists  if store: whether to check if the tablespace exists */
+template<bool store,typename source>
+inline
+recv_sys_t::parse_mtr_result recv_sys_t::parse(source &l, bool if_exists)
+  noexcept
+{
+restart:
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_write_locked() ||
+        srv_operation == SRV_OPERATION_BACKUP ||
+        srv_operation == SRV_OPERATION_BACKUP_NO_DEFER);
+#endif
+  mysql_mutex_assert_owner(&mutex);
+  ut_ad(log_sys.next_checkpoint_lsn);
+  ut_ad(log_sys.is_latest());
+  ut_ad(store || !if_exists);
+  ut_ad(store ||
+        srv_operation != SRV_OPERATION_BACKUP ||
+        srv_operation != SRV_OPERATION_BACKUP_NO_DEFER);
+
+  alignas(8) byte iv[MY_AES_BLOCK_SIZE];
+  byte *decrypt_buf= static_cast<byte*>(alloca(srv_page_size));
+
+  const lsn_t start_lsn{lsn};
+
+  /* Check that the entire mini-transaction is included within the buffer */
+  if (l.is_eof(0))
+    return PREMATURE_EOF;
+
+  if (*l <= 1)
+    return GOT_EOF; /* We should never write an empty mini-transaction. */
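+
+  /* Added summary (inferred from the loops below, not an upstream
+  comment): every record begins with a flag byte b. (b & 0x70) selects
+  the type (FREE_PAGE, INIT_PAGE, EXTENDED, WRITE, MEMSET, MEMMOVE,
+  RESERVED or OPTION), b & 0x80 is the same_page flag, and b & 0xf is
+  the payload length; a zero length nibble means that a variable-length
+  integer holding (length - 15) follows. Unless same_page is set, the
+  payload then starts with the tablespace identifier and the page
+  number, both encoded by variable-length integers. */
+#if 0
+  /* minimal flag-byte decoder, under the assumptions above */
+  const byte flag= *l;
+  const byte type= flag & 0x70;        /* record type */
+  const bool same_page= flag & 0x80;   /* reuses the previous page id */
+  size_t payload_len= flag & 0xf;      /* 0: varint of (len - 15) follows */
+#endif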
+
+  source begin{l};
+  uint32_t rlen;
+  for (uint32_t total_len= 0; !l.is_eof(); l+= rlen, total_len+= rlen)
+  {
+    if (total_len >= MTR_SIZE_MAX)
+      return GOT_EOF;
+    if (*l <= 1)
+      goto eom_found;
+    rlen= *l & 0xf;
+    ++l;
+    if (!rlen)
+    {
+      if (l.is_eof(0))
+        break;
+      rlen= mlog_decode_varint_length(*l);
+      if (l.is_eof(rlen))
+        break;
+      const uint32_t addlen= mlog_decode_varint(l);
+      if (UNIV_UNLIKELY(addlen >= MTR_SIZE_MAX))
+        return GOT_EOF;
+      rlen= addlen + 15;
+    }
+  }
+
+  /* Not the entire mini-transaction was present. */
+  return PREMATURE_EOF;
+
+eom_found:
+  if (*l != log_sys.get_sequence_bit((l - begin) + lsn))
+    return GOT_EOF;
+
+  if (l.is_eof(4))
+    return PREMATURE_EOF;
+
+  uint32_t crc{l.crc32c(begin)};
+
+  if (log_sys.is_encrypted())
+  {
+    if (l.is_eof(8 + 4))
+      return PREMATURE_EOF;
+    (l + 1).memcpy(iv, 8);
+    l+= 8;
+    crc= my_crc32c(crc, iv, 8);
+  }
+
+  DBUG_EXECUTE_IF("log_intermittent_checksum_mismatch",
+                  {
+                    static int c;
+                    if (!c++)
+                    {
+                      sql_print_information("Invalid log block checksum");
+                      return GOT_EOF;
+                    }
+                  });
+
+  if (crc != (l + 1).read4())
+    return GOT_EOF;
+
+  l+= 5;
+  ut_d(const source el{l});
+  lsn+= l - begin;
+  offset= l.ptr - log_sys.buf;
+  if (!l.is_pmem());
+  else if (offset == log_sys.file_size)
+    offset= log_sys.START_OFFSET;
+  else
+    ut_ad(offset < log_sys.file_size);
+
+  ut_d(std::set<page_id_t> freed);
+#if 0 && defined UNIV_DEBUG /* MDEV-21727 FIXME: enable this */
+  /* Pages that have been modified in this mini-transaction.
+  If a mini-transaction writes INIT_PAGE for a page, it should not have
+  written any log records for the page. Unfortunately, this does not
+  hold for ROW_FORMAT=COMPRESSED pages, because page_zip_compress()
+  can be invoked in a pessimistic operation, even after log has
+  been written for other pages. */
+  ut_d(std::set<page_id_t> modified);
+#endif
+
+  uint32_t space_id= 0, page_no= 0, last_offset= 0;
+  bool got_page_op= false;
+
+  for (l= begin;; l+= rlen)
+  {
+    const source recs{l};
+    ++l;
+    const byte b= *recs;
+
+    if (b <= 1)
+      break;
+
+    if (UNIV_LIKELY((b & 0x70) != RESERVED));
+    else if (srv_force_recovery)
+      sql_print_warning("InnoDB: Ignoring unknown log record at LSN " LSN_PF,
+                        lsn);
+    else
+    {
+      sql_print_error("InnoDB: Unknown log record at LSN " LSN_PF, lsn);
+    corrupted:
+      found_corrupt_log= true;
+      return GOT_EOF;
+    }
+
+    rlen= b & 0xf;
+    if (!rlen)
+    {
+      const uint32_t lenlen= mlog_decode_varint_length(*l);
+      const uint32_t addlen= mlog_decode_varint(l);
+      ut_ad(addlen != MLOG_DECODE_ERROR);
+      rlen= addlen + 15 - lenlen;
+      l+= lenlen;
+    }
+    ut_ad(!l.is_eof(rlen));
+
+    uint32_t idlen;
+    if ((b & 0x80) && got_page_op)
+    {
+      /* This record is for the same page as the previous one.
*/ + if (UNIV_UNLIKELY((b & 0x70) <= INIT_PAGE)) + { + record_corrupted: + /* FREE_PAGE,INIT_PAGE cannot be with same_page flag */ + if (!srv_force_recovery) + { + malformed: + sql_print_error("InnoDB: Malformed log record at LSN " LSN_PF + "; set innodb_force_recovery=1 to ignore.", lsn); + goto corrupted; + } + sql_print_warning("InnoDB: Ignoring malformed log record at LSN " + LSN_PF, lsn); + last_offset= 1; /* the next record must not be same_page */ + continue; + } + if (srv_operation == SRV_OPERATION_BACKUP) + continue; + DBUG_PRINT("ib_log", + ("scan " LSN_PF ": rec %x len %zu page %u:%u", + lsn, b, l - recs + rlen, space_id, page_no)); + goto same_page; + } + last_offset= 0; + idlen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(idlen > 5 || idlen >= rlen)) + { + if (!*l && b == FILE_CHECKPOINT + 1) + continue; + page_id_corrupted: + if (!srv_force_recovery) + { + sql_print_error("InnoDB: Corrupted page identifier at " LSN_PF + "; set innodb_force_recovery=1 to ignore the record.", + lsn); + goto corrupted; + } + sql_print_warning("InnoDB: Ignoring corrupted page identifier at LSN " + LSN_PF, lsn); + continue; + } + space_id= mlog_decode_varint(l); + if (UNIV_UNLIKELY(space_id == MLOG_DECODE_ERROR)) + goto page_id_corrupted; + l+= idlen; + rlen-= idlen; + idlen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(idlen > 5 || idlen > rlen)) + goto page_id_corrupted; + page_no= mlog_decode_varint(l); + if (UNIV_UNLIKELY(page_no == MLOG_DECODE_ERROR)) + goto page_id_corrupted; + l+= idlen; + rlen-= idlen; + mach_write_to_4(iv + 8, space_id); + mach_write_to_4(iv + 12, page_no); + got_page_op= !(b & 0x80); + if (!got_page_op); + else if (!store && srv_operation == SRV_OPERATION_BACKUP) + { + if (page_no == 0 && first_page_init && (b & 0x10)) + first_page_init(space_id); + continue; + } + else if (store && file_checkpoint && !is_predefined_tablespace(space_id)) + { + recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id); + if (i != recv_spaces.end() && i->first == space_id); + else if (lsn < file_checkpoint) + /* We have not seen all records between the checkpoint and + FILE_CHECKPOINT. There should be a FILE_DELETE for this + tablespace later. 
*/ + recv_spaces.emplace_hint(i, space_id, file_name_t("", false)); + else + { + const page_id_t id(space_id, page_no); + if (!srv_force_recovery) + { + ib::error() << "Missing FILE_DELETE or FILE_MODIFY for " << id + << " at " << lsn + << "; set innodb_force_recovery=1 to ignore the record."; + goto corrupted; + } + ib::warn() << "Ignoring record for " << id << " at " << lsn; + continue; + } + } + DBUG_PRINT("ib_log", + ("scan " LSN_PF ": rec %x len %zu page %u:%u", + lsn, b, l - recs + rlen, space_id, page_no)); + if (got_page_op) + { + same_page: + const byte *cl= l.ptr; + if (!rlen); + else if (UNIV_UNLIKELY(l - recs + rlen > srv_page_size)) + goto record_corrupted; + const page_id_t id{space_id, page_no}; + ut_d(if ((b & 0x70) == INIT_PAGE || (b & 0x70) == OPTION) + freed.erase(id)); + ut_ad(freed.find(id) == freed.end()); + switch (b & 0x70) { + case FREE_PAGE: + ut_ad(freed.emplace(id).second); + last_offset= 1; /* the next record must not be same_page */ + goto free_or_init_page; + case INIT_PAGE: + last_offset= FIL_PAGE_TYPE; + free_or_init_page: + if (store) + store_freed_or_init_rec(id, (b & 0x70) == FREE_PAGE); + if (UNIV_UNLIKELY(rlen != 0)) + goto record_corrupted; + copy_if_needed: + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + break; + case EXTENDED: + if (UNIV_UNLIKELY(!rlen)) + goto record_corrupted; + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + if (rlen == 1 && *cl == TRIM_PAGES) + { +#if 0 /* For now, we can only truncate an undo log tablespace */ + if (UNIV_UNLIKELY(!space_id || !page_no)) + goto record_corrupted; +#else + if (!srv_is_undo_tablespace(space_id) || + page_no != SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) + goto record_corrupted; + static_assert(UT_ARR_SIZE(truncated_undo_spaces) == + TRX_SYS_MAX_UNDO_SPACES, "compatibility"); + /* The entire undo tablespace will be reinitialized by + innodb_undo_log_truncate=ON. Discard old log for all pages. 
*/ + trim({space_id, 0}, start_lsn); + truncated_undo_spaces[space_id - srv_undo_space_id_start]= + { start_lsn, page_no }; + if (!store && undo_space_trunc) + undo_space_trunc(space_id); +#endif + last_offset= 1; /* the next record must not be same_page */ + continue; + } + last_offset= FIL_PAGE_TYPE; + break; + case OPTION: + if (rlen == 5 && *l == OPT_PAGE_CHECKSUM) + goto copy_if_needed; + /* fall through */ + case RESERVED: + continue; + case WRITE: + case MEMMOVE: + case MEMSET: + if (UNIV_UNLIKELY(rlen == 0 || last_offset == 1)) + goto record_corrupted; + ut_d(const source payload{l}); + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + const uint32_t olen= mlog_decode_varint_length(*cl); + if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3)) + goto record_corrupted; + const uint32_t offset= mlog_decode_varint(cl); + ut_ad(offset != MLOG_DECODE_ERROR); + static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); + if (UNIV_UNLIKELY(offset >= srv_page_size)) + goto record_corrupted; + last_offset+= offset; + if (UNIV_UNLIKELY(last_offset < 8 || last_offset >= srv_page_size)) + goto record_corrupted; + cl+= olen; + rlen-= olen; + if ((b & 0x70) == WRITE) + { + if (UNIV_UNLIKELY(rlen + last_offset > srv_page_size)) + goto record_corrupted; + if (store && UNIV_UNLIKELY(!page_no) && file_checkpoint) + { + const bool has_size= last_offset <= FSP_HEADER_OFFSET + FSP_SIZE && + last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SIZE + 4; + const bool has_flags= last_offset <= + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS && + last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + 4; + if (has_size || has_flags) + { + recv_spaces_t::iterator it= recv_spaces.find(space_id); + const uint32_t size= has_size + ? mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + cl - + last_offset) + : 0; + const uint32_t flags= has_flags + ? 
mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + cl - + last_offset) + : file_name_t::initial_flags; + if (it == recv_spaces.end()) + ut_ad(!file_checkpoint || space_id == TRX_SYS_SPACE || + srv_is_undo_tablespace(space_id)); + else if (!it->second.space) + { + if (has_size) + it->second.size= size; + if (has_flags) + it->second.flags= flags; + } + fil_space_set_recv_size_and_flags(space_id, size, flags); + } + } + parsed_ok: + last_offset+= rlen; + ut_ad(l == payload); + if (!l.set_if_contains(cl)) + (l= recs)+= cl - decrypt_buf; + break; + } + uint32_t llen= mlog_decode_varint_length(*cl); + if (UNIV_UNLIKELY(llen > rlen || llen > 3)) + goto record_corrupted; + const uint32_t len= mlog_decode_varint(cl); + ut_ad(len != MLOG_DECODE_ERROR); + if (UNIV_UNLIKELY(last_offset + len > srv_page_size)) + goto record_corrupted; + cl+= llen; + rlen-= llen; + llen= len; + if ((b & 0x70) == MEMSET) + { + if (UNIV_UNLIKELY(rlen > llen)) + goto record_corrupted; + goto parsed_ok; + } + const uint32_t slen= mlog_decode_varint_length(*cl); + if (UNIV_UNLIKELY(slen != rlen || slen > 3)) + goto record_corrupted; + uint32_t s= mlog_decode_varint(cl); + ut_ad(slen != MLOG_DECODE_ERROR); + if (s & 1) + s= last_offset - (s >> 1) - 1; + else + s= last_offset + (s >> 1) + 1; + if (UNIV_UNLIKELY(s < 8 || s + llen > srv_page_size)) + goto record_corrupted; + goto parsed_ok; + } +#if 0 && defined UNIV_DEBUG + switch (b & 0x70) { + case RESERVED: + ut_ad(0); /* we did "continue" earlier */ + break; + case OPTION: + case FREE_PAGE: + break; + default: + ut_ad(modified.emplace(id).second || (b & 0x70) != INIT_PAGE); + } +#endif + if (store) + { + if (if_exists) + { + if (fil_space_t *space= fil_space_t::get(space_id)) + { + const auto size= space->get_size(); + space->release(); + if (!size) + continue; + } + else if (!deferred_spaces.find(space_id)) + continue; + } + if (!mlog_init.will_avoid_read(id, start_lsn)) + { + if (pages_it == pages.end() || pages_it->first != id) + pages_it= pages.emplace(id, page_recv_t{}).first; + if (UNIV_UNLIKELY(add(pages_it, start_lsn, lsn, + l.get_buf(cl, recs, decrypt_buf), + l - recs + rlen))) + { + lsn= start_lsn; + log_sys.set_recovered_lsn(start_lsn); + l+= rlen; + offset= begin.ptr - log_sys.buf; + rewind(l, begin); + if (if_exists) + { + apply(false); + if (is_corrupt_fs()) + return GOT_EOF; + goto restart; + } + sql_print_information("InnoDB: Multi-batch recovery needed at LSN " + LSN_PF, lsn); + return GOT_OOM; + } + } + } + else if ((b & 0x70) <= INIT_PAGE) + { + mlog_init.add(id, start_lsn); + if (pages_it == pages.end() || pages_it->first != id) + { + pages_it= pages.find(id); + if (pages_it == pages.end()) + continue; + } + map::iterator r= pages_it++; + erase(r); + } + } + else if (rlen) + { + switch (b & 0xf0) { + case FILE_CHECKPOINT: + if (space_id || page_no || l[rlen] > 1); + else if (rlen != 8) + { + if (rlen < UNIV_PAGE_SIZE_MAX && !l.is_zero(rlen)) + continue; + } + else if (store) + { + ut_ad(file_checkpoint); + continue; + } + else if (const lsn_t c= l.read8()) + { + if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) + fprintf(stderr, "FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF "\n", + c, c != log_sys.next_checkpoint_lsn + ? "ignored" : file_checkpoint ? "reread" : "read", lsn); + + DBUG_PRINT("ib_log", + ("FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF, + c, c != log_sys.next_checkpoint_lsn + ? "ignored" : file_checkpoint ? "reread" : "read", lsn)); + + if (c == log_sys.next_checkpoint_lsn) + { + /* There can be multiple FILE_CHECKPOINT for the same LSN. 
*/
+          if (file_checkpoint)
+            continue;
+          file_checkpoint= lsn;
+          return GOT_EOF;
+        }
+        continue;
+      }
+      else
+        continue;
+      /* fall through */
+    default:
+      if (!srv_force_recovery)
+        goto malformed;
+      sql_print_warning("InnoDB: Ignoring malformed log record at LSN "
+                        LSN_PF, lsn);
+      continue;
+    case FILE_DELETE:
+    case FILE_MODIFY:
+    case FILE_RENAME:
+      if (UNIV_UNLIKELY(page_no != 0))
+      {
+      file_rec_error:
+        if (!srv_force_recovery)
+        {
+          sql_print_error("InnoDB: Corrupted file-level record;"
+                          " set innodb_force_recovery=1 to ignore.");
+          goto corrupted;
+        }
+
+        sql_print_warning("InnoDB: Ignoring corrupted file-level record"
+                          " at LSN " LSN_PF, lsn);
+        continue;
+      }
+      /* fall through */
+    case FILE_CREATE:
+      if (UNIV_UNLIKELY(!space_id || page_no))
+        goto file_rec_error;
+      /* There is no terminating NUL character. Names must end in .ibd.
+      For FILE_RENAME, there is a NUL between the two file names. */
+
+      const char * const fn= l.get_filename(decrypt_buf, rlen);
+      const char *fn2= static_cast<const char*>(memchr(fn, 0, rlen));
+
+      if (UNIV_UNLIKELY((fn2 == nullptr) == ((b & 0xf0) == FILE_RENAME)))
+        goto file_rec_error;
+
+      const char * const fnend= fn2 ? fn2 : fn + rlen;
+      const char * const fn2end= fn2 ? fn + rlen : nullptr;
+
+      if (fn2)
+      {
+        fn2++;
+        if (memchr(fn2, 0, fn2end - fn2))
+          goto file_rec_error;
+        if (fn2end - fn2 < 4 || memcmp(fn2end - 4, DOT_IBD, 4))
+          goto file_rec_error;
+      }
+
+      if (is_predefined_tablespace(space_id))
+        goto file_rec_error;
+      if (fnend - fn < 4 || memcmp(fnend - 4, DOT_IBD, 4))
+        goto file_rec_error;
+
+      if (UNIV_UNLIKELY(!recv_needed_recovery && srv_read_only_mode))
+        continue;
+
+      if (!store &&
+          (srv_operation == SRV_OPERATION_BACKUP ||
+           srv_operation == SRV_OPERATION_BACKUP_NO_DEFER))
+      {
+        if ((b & 0xf0) < FILE_CHECKPOINT && log_file_op)
+          log_file_op(space_id, b & 0xf0,
+                      reinterpret_cast<const byte*>(fn),
+                      static_cast<ulint>(fnend - fn),
+                      reinterpret_cast<const byte*>(fn2),
+                      fn2 ? static_cast<ulint>(fn2end - fn2) : 0);
+        continue;
+      }
+
+      fil_name_process(fn, fnend - fn, space_id,
+                       (b & 0xf0) == FILE_DELETE ? FILE_DELETE : FILE_MODIFY,
+                       start_lsn, if_exists);
+
+      if (fn2)
+      {
+        fil_name_process(fn2, fn2end - fn2, space_id,
+                         FILE_RENAME, start_lsn, if_exists);
+        if (file_checkpoint)
+        {
+          const size_t len= fn2end - fn2;
+          auto r= renamed_spaces.emplace(space_id, std::string{fn2, len});
+          if (!r.second)
+            r.first->second= std::string{fn2, len};
+        }
+      }
+
+      if (is_corrupt_fs())
+        return GOT_EOF;
+    }
+    }
+    else if (b == FILE_CHECKPOINT + 2 && !space_id && !page_no);
+    else
+      goto malformed;
+  }
+
+  l+= log_sys.is_encrypted() ? 4U + 8U : 4U;
+  ut_ad(l == el);
+  return OK;
+}
+
+template<bool store>
+recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(bool if_exists) noexcept
+{
+  recv_buf s{&log_sys.buf[recv_sys.offset]};
+  return recv_sys.parse<store>(s, if_exists);
+}
+
+/** for mariadb-backup; @see xtrabackup_copy_logfile() */
+template
+recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr<false>(bool) noexcept;
+
+#ifdef HAVE_PMEM
+template<bool store>
+recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(bool if_exists) noexcept
+{
+  recv_sys_t::parse_mtr_result r{parse_mtr<store>(if_exists)};
+  if (UNIV_LIKELY(r != PREMATURE_EOF) || !log_sys.is_pmem())
+    return r;
+  ut_ad(recv_sys.len == log_sys.file_size);
+  ut_ad(recv_sys.offset >= log_sys.START_OFFSET);
+  ut_ad(recv_sys.offset <= recv_sys.len);
+  recv_ring s
+    {recv_sys.offset == recv_sys.len
+     ? &log_sys.buf[log_sys.START_OFFSET]
+     : &log_sys.buf[recv_sys.offset]};
+  return recv_sys.parse<store>(s, if_exists);
+}
+#endif
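+
+/* Added usage note (inferred; not an upstream comment): the parser is
+instantiated in two flavours. parse_mtr<false>() above only scans and
+validates the log, which is what mariadb-backup needs while copying
+ib_logfile0; the server's crash recovery additionally uses the store=true
+flavour, in which parse() buffers each record in recv_sys.pages so that a
+later apply() batch can replay it against the data pages. */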
+
+/** Apply the hashed log records to the page, if the page lsn is less than the
+lsn of a log record.
+@param[in,out]	block	buffer pool page
+@param[in,out]	mtr	mini-transaction
+@param[in,out]	recs	log records to apply
+@param[in,out]	space	tablespace, or NULL if not looked up yet
+@param[in,out]	init	page initialization operation, or NULL
+@return the recovered page
+@retval nullptr on failure */
+static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr,
+				      page_recv_t &recs,
+				      fil_space_t *space,
+				      recv_init *init)
+{
+	mysql_mutex_assert_not_owner(&recv_sys.mutex);
+	ut_ad(recv_sys.apply_log_recs);
+	ut_ad(recv_needed_recovery);
+	ut_ad(!init || init->created);
+	ut_ad(!init || init->lsn);
+	ut_ad(recs.being_processed == 1);
+	ut_ad(!space || space->id == block->page.id().space());
+	ut_ad(log_sys.is_latest());
+
+	if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
+		ib::info() << "Applying log to page " << block->page.id();
+	}
+
+	DBUG_PRINT("ib_log", ("Applying log to page %u:%u",
+			      block->page.id().space(),
+			      block->page.id().page_no()));
+
+	byte *frame = UNIV_LIKELY_NULL(block->page.zip.data)
+		? block->page.zip.data
+		: block->page.frame;
+	const lsn_t page_lsn = init
+		? 0
+		: mach_read_from_8(frame + FIL_PAGE_LSN);
+	bool free_page = false;
+	lsn_t start_lsn = 0, end_lsn = 0;
+	ut_d(lsn_t recv_start_lsn = 0);
+	const lsn_t init_lsn = init ? init->lsn : 0;
+
+	bool skipped_after_init = false;
+
+	for (const log_rec_t* recv : recs.log) {
+		const log_phys_t* l = static_cast<const log_phys_t*>(recv);
+		ut_ad(l->lsn);
+		ut_ad(end_lsn <= l->lsn);
+		ut_ad(l->lsn <= recv_sys.lsn);
+
+		ut_ad(l->start_lsn);
+		ut_ad(recv_start_lsn <= l->start_lsn);
+		ut_d(recv_start_lsn = l->start_lsn);
+
+		if (l->start_lsn < page_lsn) {
+			/* This record has already been applied. */
+			DBUG_PRINT("ib_log", ("apply skip %u:%u LSN " LSN_PF
+					      " < " LSN_PF,
+					      block->page.id().space(),
+					      block->page.id().page_no(),
+					      l->start_lsn, page_lsn));
+			skipped_after_init = true;
+			end_lsn = l->lsn;
+			continue;
+		}
+
+		if (l->start_lsn < init_lsn) {
+			DBUG_PRINT("ib_log", ("init skip %u:%u LSN " LSN_PF
+					      " < " LSN_PF,
+					      block->page.id().space(),
+					      block->page.id().page_no(),
+					      l->start_lsn, init_lsn));
+			skipped_after_init = false;
+			end_lsn = l->lsn;
+			continue;
+		}
+
+		/* There is no need to check LSN for just initialized pages. */
+		if (skipped_after_init) {
+			skipped_after_init = false;
+			ut_ad(end_lsn == page_lsn);
+			if (end_lsn != page_lsn) {
+				sql_print_warning(
+					"InnoDB: The last skipped log record"
+					" LSN " LSN_PF
+					" is not equal to page LSN " LSN_PF,
+					end_lsn, page_lsn);
+			}
+		}
+
+		end_lsn = l->lsn;
+
+		if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
+			ib::info() << "apply " << l->start_lsn
+				   << ": " << block->page.id();
+		}
+
+		DBUG_PRINT("ib_log", ("apply " LSN_PF ": %u:%u",
+				      l->start_lsn,
+				      block->page.id().space(),
+				      block->page.id().page_no()));
+
+		log_phys_t::apply_status a= l->apply(*block, recs.last_offset);
+
+		switch (a) {
+		case log_phys_t::APPLIED_NO:
+			ut_ad(!mtr.has_modifications());
+			free_page = true;
+			start_lsn = 0;
+			continue;
+		case log_phys_t::APPLIED_YES:
+		case log_phys_t::APPLIED_CORRUPTED:
+			goto set_start_lsn;
+		case log_phys_t::APPLIED_TO_FSP_HEADER:
+		case log_phys_t::APPLIED_TO_ENCRYPTION:
+			break;
+		}
+
+		if (fil_space_t* s = space
+		    ?
space + : fil_space_t::get(block->page.id().space())) { + switch (a) { + case log_phys_t::APPLIED_TO_FSP_HEADER: + s->flags = mach_read_from_4( + FSP_HEADER_OFFSET + + FSP_SPACE_FLAGS + frame); + s->size_in_header = mach_read_from_4( + FSP_HEADER_OFFSET + FSP_SIZE + + frame); + s->free_limit = mach_read_from_4( + FSP_HEADER_OFFSET + + FSP_FREE_LIMIT + frame); + s->free_len = mach_read_from_4( + FSP_HEADER_OFFSET + FSP_FREE + + FLST_LEN + frame); + break; + default: + byte* b= frame + + fsp_header_get_encryption_offset( + block->zip_size()) + + FSP_HEADER_OFFSET; + if (memcmp(b, CRYPT_MAGIC, MAGIC_SZ)) { + break; + } + b += MAGIC_SZ; + if (*b != CRYPT_SCHEME_UNENCRYPTED + && *b != CRYPT_SCHEME_1) { + break; + } + if (b[1] != MY_AES_BLOCK_SIZE) { + break; + } + if (b[2 + MY_AES_BLOCK_SIZE + 4 + 4] + > FIL_ENCRYPTION_OFF) { + break; + } + fil_crypt_parse(s, b); + } + + if (!space) { + s->release(); + } + } + +set_start_lsn: + if ((a == log_phys_t::APPLIED_CORRUPTED + || recv_sys.is_corrupt_log()) && !srv_force_recovery) { + if (init) { + init->created = false; + } + + mtr.discard_modifications(); + mtr.commit(); + + buf_pool.corrupted_evict(&block->page, + block->page.state() & + buf_page_t::LRU_MASK); + block = nullptr; + goto done; + } + + if (!start_lsn) { + start_lsn = l->start_lsn; + } + } + + if (start_lsn) { + ut_ad(end_lsn >= start_lsn); + ut_ad(!block->page.oldest_modification()); + mach_write_to_8(FIL_PAGE_LSN + frame, end_lsn); + if (UNIV_LIKELY(!block->page.zip.data)) { + mach_write_to_8(srv_page_size + - FIL_PAGE_END_LSN_OLD_CHKSUM + + frame, end_lsn); + } else { + buf_zip_decompress(block, false); + } + /* The following is adapted from + buf_pool_t::insert_into_flush_list() */ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.flush_list_bytes+= block->physical_size(); + block->page.set_oldest_modification(start_lsn); + UT_LIST_ADD_FIRST(buf_pool.flush_list, &block->page); + buf_pool.page_cleaner_wakeup(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } else if (free_page && init) { + /* There have been no operations that modify the page. + Any buffered changes must not be merged. A subsequent + buf_page_create() from a user thread should discard + any buffered changes. */ + init->created = false; + ut_ad(!mtr.has_modifications()); + block->page.set_freed(block->page.state()); + } + + /* Make sure that committing mtr does not change the modification + lsn values of page */ + + mtr.discard_modifications(); + mtr.commit(); + +done: + /* FIXME: do this in page read, protected with recv_sys.mutex! */ + if (recv_max_page_lsn < page_lsn) { + recv_max_page_lsn = page_lsn; + } + + return block; +} + +/** Remove records for a corrupted page. +This function should only be called when innodb_force_recovery is set. 
+@param page_id corrupted page identifier */ +ATTRIBUTE_COLD void recv_sys_t::free_corrupted_page(page_id_t page_id) +{ + if (!recovery_on) + return; + + mysql_mutex_lock(&mutex); + map::iterator p= pages.find(page_id); + if (p == pages.end()) + { + mysql_mutex_unlock(&mutex); + return; + } + + p->second.being_processed= -1; + if (!srv_force_recovery) + set_corrupt_fs(); + mysql_mutex_unlock(&mutex); + + ib::error_or_warn(!srv_force_recovery) + << "Unable to apply log to corrupted page " << page_id; +} + +ATTRIBUTE_COLD void recv_sys_t::set_corrupt_log() +{ + mysql_mutex_lock(&mutex); + found_corrupt_log= true; + mysql_mutex_unlock(&mutex); +} + +ATTRIBUTE_COLD void recv_sys_t::set_corrupt_fs() +{ + mysql_mutex_assert_owner(&mutex); + if (!srv_force_recovery) + sql_print_information("InnoDB: Set innodb_force_recovery=1" + " to ignore corrupted pages."); + found_corrupt_fs= true; +} + +/** Apply any buffered redo log to a page. +@param space tablespace +@param bpage buffer pool page +@return whether the page was recovered correctly */ +bool recv_recover_page(fil_space_t* space, buf_page_t* bpage) +{ + mtr_t mtr; + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + + ut_ad(bpage->frame); + /* Move the ownership of the x-latch on the page to this OS thread, + so that we can acquire a second x-latch on it. This is needed for + the operations to the page to pass the debug checks. */ + bpage->lock.claim_ownership(); + bpage->lock.x_lock_recursive(); + bpage->fix_on_recovery(); + mtr.memo_push(reinterpret_cast(bpage), MTR_MEMO_PAGE_X_FIX); + + buf_block_t *success= reinterpret_cast(bpage); + + mysql_mutex_lock(&recv_sys.mutex); + if (recv_sys.apply_log_recs) + { + const page_id_t id{bpage->id()}; + recv_sys_t::map::iterator p= recv_sys.pages.find(id); + if (p == recv_sys.pages.end()); + else if (p->second.being_processed < 0) + { + recv_sys.pages_it_invalidate(p); + recv_sys.erase(p); + } + else + { + p->second.being_processed= 1; + recv_sys_t::init *init= nullptr; + if (p->second.skip_read) + (init= &mlog_init.last(id))->created= true; + mysql_mutex_unlock(&recv_sys.mutex); + success= recv_recover_page(success, mtr, p->second, space, init); + p->second.being_processed= -1; + goto func_exit; + } + } + + mysql_mutex_unlock(&recv_sys.mutex); + mtr.commit(); +func_exit: + ut_ad(mtr.has_committed()); + return success; +} + +void IORequest::fake_read_complete(os_offset_t offset) const +{ + ut_ad(node); + ut_ad(is_read()); + ut_ad(bpage); + ut_ad(bpage->frame); + ut_ad(recv_recovery_is_on()); + ut_ad(offset); + + mtr_t mtr; + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + + ut_ad(bpage->frame); + /* Move the ownership of the x-latch on the page to this OS thread, + so that we can acquire a second x-latch on it. This is needed for + the operations to the page to pass the debug checks. 
*/
+  bpage->lock.claim_ownership();
+  bpage->lock.x_lock_recursive();
+  bpage->fix_on_recovery();
+  mtr.memo_push(reinterpret_cast<buf_block_t*>(bpage), MTR_MEMO_PAGE_X_FIX);
+
+  page_recv_t &recs= *reinterpret_cast<page_recv_t*>(slot);
+  ut_ad(recs.being_processed == 1);
+  recv_init &init= *reinterpret_cast<recv_init*>(offset);
+  ut_ad(init.lsn > 1);
+  init.created= true;
+
+  if (recv_recover_page(reinterpret_cast<buf_block_t*>(bpage),
+                        mtr, recs, node->space, &init))
+  {
+    ut_ad(bpage->oldest_modification() || bpage->is_freed());
+    bpage->lock.x_unlock(true);
+  }
+  recs.being_processed= -1;
+  ut_ad(mtr.has_committed());
+
+  node->space->release();
+}
+
+/** @return whether a page has been freed */
+inline bool fil_space_t::is_freed(uint32_t page)
+{
+  std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+  return freed_ranges.contains(page);
+}
+
+bool recv_sys_t::report(time_t time)
+{
+  if (time - progress_time < 15)
+    return false;
+  progress_time= time;
+  return true;
+}
+
+ATTRIBUTE_COLD
+void recv_sys_t::report_progress() const
+{
+  mysql_mutex_assert_owner(&mutex);
+  const size_t n{pages.size()};
+  if (recv_sys.scanned_lsn == recv_sys.lsn)
+  {
+    sql_print_information("InnoDB: To recover: %zu pages", n);
+    service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+                                   "To recover: %zu pages", n);
+  }
+  else
+  {
+    sql_print_information("InnoDB: To recover: LSN " LSN_PF
+                          "/" LSN_PF "; %zu pages",
+                          recv_sys.lsn, recv_sys.scanned_lsn, n);
+    service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+                                   "To recover: LSN " LSN_PF
+                                   "/" LSN_PF "; %zu pages",
+                                   recv_sys.lsn, recv_sys.scanned_lsn, n);
+  }
+}
+
+/** Apply a recovery batch.
+@param space_id   current tablespace identifier
+@param space      current tablespace
+@param free_block spare buffer block
+@param last_batch whether it is possible to write more redo log
+@return whether the caller must provide a new free_block */
+bool recv_sys_t::apply_batch(uint32_t space_id, fil_space_t *&space,
+                             buf_block_t *&free_block, bool last_batch)
+{
+  mysql_mutex_assert_owner(&mutex);
+  ut_ad(pages_it != pages.end());
+  ut_ad(!pages_it->second.log.empty());
+
+  mysql_mutex_lock(&buf_pool.mutex);
+  size_t n= 0, max_n= std::min<size_t>(BUF_LRU_MIN_LEN,
+                                       UT_LIST_GET_LEN(buf_pool.LRU) +
+                                       UT_LIST_GET_LEN(buf_pool.free));
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  map::iterator begin= pages.end();
+  page_id_t begin_id{~0ULL};
+
+  while (pages_it != pages.end() && n < max_n)
+  {
+    ut_ad(!buf_dblwr.is_inside(pages_it->first));
+    if (!pages_it->second.being_processed)
+    {
+      if (space_id != pages_it->first.space())
+      {
+        space_id= pages_it->first.space();
+        if (space)
+          space->release();
+        space= fil_space_t::get(space_id);
+        if (!space)
+        {
+          auto d= deferred_spaces.defers.find(space_id);
+          if (d == deferred_spaces.defers.end() || d->second.deleted)
+            /* For deleted files we preserve the deferred_spaces entry */;
+          else if (!free_block)
+            return true;
+          else
+          {
+            space= recover_deferred(pages_it, d->second.file_name, free_block);
+            deferred_spaces.defers.erase(d);
+            if (!space && !srv_force_recovery)
+            {
+              set_corrupt_fs();
+              return false;
+            }
+          }
+        }
+      }
+      if (!space || space->is_freed(pages_it->first.page_no()))
+        pages_it->second.being_processed= -1;
+      else if (!n++)
+      {
+        begin= pages_it;
+        begin_id= pages_it->first;
+      }
+    }
+    pages_it++;
+  }
+
+  if (!last_batch)
+    log_sys.latch.wr_unlock();
+
+  pages_it= begin;
+
+  if (report(time(nullptr)))
+    report_progress();
+
+  if (!n)
+    goto wait;
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  if (UNIV_UNLIKELY(UT_LIST_GET_LEN(buf_pool.free) < n))
+  {
mysql_mutex_unlock(&buf_pool.mutex); + wait: + wait_for_pool(n); + if (n); + else if (!last_batch) + goto unlock_relock; + else + goto get_last; + pages_it= pages.lower_bound(begin_id); + ut_ad(pages_it != pages.end()); + } + else + mysql_mutex_unlock(&buf_pool.mutex); + + while (pages_it != pages.end()) + { + ut_ad(!buf_dblwr.is_inside(pages_it->first)); + if (!pages_it->second.being_processed) + { + const page_id_t id{pages_it->first}; + + if (space_id != id.space()) + { + space_id= id.space(); + if (space) + space->release(); + space= fil_space_t::get(space_id); + } + if (!space) + { + const auto it= deferred_spaces.defers.find(space_id); + if (it != deferred_spaces.defers.end() && !it->second.deleted) + /* The records must be processed after recover_deferred(). */ + goto next; + goto space_not_found; + } + else if (space->is_freed(id.page_no())) + { + space_not_found: + pages_it->second.being_processed= -1; + goto next; + } + else + { + page_recv_t &recs= pages_it->second; + ut_ad(!recs.log.empty()); + recs.being_processed= 1; + init *init= recs.skip_read ? &mlog_init.last(id) : nullptr; + mysql_mutex_unlock(&mutex); + buf_read_recover(space, id, recs, init); + } + + if (!--n) + { + if (last_batch) + goto relock_last; + goto relock; + } + mysql_mutex_lock(&mutex); + pages_it= pages.lower_bound(id); + } + else + next: + pages_it++; + } + + if (!last_batch) + { + unlock_relock: + mysql_mutex_unlock(&mutex); + relock: + log_sys.latch.wr_lock(SRW_LOCK_CALL); + relock_last: + mysql_mutex_lock(&mutex); + get_last: + pages_it= pages.lower_bound(begin_id); + } + + return false; +} + +/** Attempt to initialize a page based on redo log records. +@param p iterator +@param mtr mini-transaction +@param b pre-allocated buffer pool block +@param init page initialization +@return the recovered block +@retval nullptr if the page cannot be initialized based on log records +@retval -1 if the page cannot be recovered due to corruption */ +inline buf_block_t *recv_sys_t::recover_low(const map::iterator &p, mtr_t &mtr, + buf_block_t *b, init &init) +{ + mysql_mutex_assert_not_owner(&mutex); + page_recv_t &recs= p->second; + ut_ad(recs.skip_read); + ut_ad(recs.being_processed == 1); + buf_block_t* block= nullptr; + const lsn_t end_lsn= recs.log.last()->lsn; + if (end_lsn < init.lsn) + DBUG_LOG("ib_log", "skip log for page " << p->first + << " LSN " << end_lsn << " < " << init.lsn); + fil_space_t *space= fil_space_t::get(p->first.space()); + + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + + ulint zip_size= space ? space->zip_size() : 0; + + if (!space) + { + if (p->first.page_no() != 0) + { + nothing_recoverable: + mtr.commit(); + return nullptr; + } + auto it= recv_spaces.find(p->first.space()); + ut_ad(it != recv_spaces.end()); + uint32_t flags= it->second.flags; + zip_size= fil_space_t::zip_size(flags); + block= buf_page_create_deferred(p->first.space(), zip_size, &mtr, b); + ut_ad(block == b); + block->page.lock.x_lock_recursive(); + } + else + { + block= buf_page_create(space, p->first.page_no(), zip_size, &mtr, b); + + if (UNIV_UNLIKELY(block != b)) + { + /* The page happened to exist in the buffer pool, or it + was just being read in. Before the exclusive page latch was acquired by + buf_page_create(), all changes to the page must have been applied. 
*/
+      ut_d(mysql_mutex_lock(&mutex));
+      ut_ad(pages.find(p->first) == pages.end());
+      ut_d(mysql_mutex_unlock(&mutex));
+      space->release();
+      goto nothing_recoverable;
+    }
+  }
+
+  ut_d(mysql_mutex_lock(&mutex));
+  ut_ad(&recs == &pages.find(p->first)->second);
+  ut_d(mysql_mutex_unlock(&mutex));
+  init.created= true;
+  block= recv_recover_page(block, mtr, recs, space, &init);
+  ut_ad(mtr.has_committed());
+
+  if (space)
+    space->release();
+
+  return block ? block : reinterpret_cast<buf_block_t*>(-1);
+}
+
+/** Attempt to initialize a page based on redo log records.
+@param page_id  page identifier
+@return recovered block
+@retval nullptr if the page cannot be initialized based on log records */
+ATTRIBUTE_COLD buf_block_t *recv_sys_t::recover_low(const page_id_t page_id)
+{
+  mysql_mutex_lock(&mutex);
+  map::iterator p= pages.find(page_id);
+
+  if (p != pages.end() && !p->second.being_processed && p->second.skip_read)
+  {
+    p->second.being_processed= 1;
+    init &init= mlog_init.last(page_id);
+    mysql_mutex_unlock(&mutex);
+    buf_block_t *free_block= buf_LRU_get_free_block(false);
+    mtr_t mtr;
+    buf_block_t *block= recover_low(p, mtr, free_block, init);
+    p->second.being_processed= -1;
+    ut_ad(!block || block == reinterpret_cast<buf_block_t*>(-1) ||
+          block == free_block);
+    if (UNIV_UNLIKELY(!block))
+      buf_pool.free_block(free_block);
+    return block;
+  }
+
+  mysql_mutex_unlock(&mutex);
+  return nullptr;
+}
+
+inline fil_space_t *fil_system_t::find(const char *path) const
+{
+  mysql_mutex_assert_owner(&mutex);
+  for (fil_space_t &space : fil_system.space_list)
+    if (space.chain.start && !strcmp(space.chain.start->name, path))
+      return &space;
+  return nullptr;
+}
+
+/** Thread-safe function which sorts flush_list by oldest_modification */
+static void log_sort_flush_list()
+{
+  /* Ensure that oldest_modification() cannot change during std::sort() */
+  {
+    const double pct_lwm= srv_max_dirty_pages_pct_lwm;
+    /* Disable "idle" flushing in order to minimize the wait time below. */
+    srv_max_dirty_pages_pct_lwm= 0.0;
+
+    for (;;)
+    {
+      os_aio_wait_until_no_pending_writes(false);
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      if (buf_pool.page_cleaner_active())
+        my_cond_wait(&buf_pool.done_flush_list,
+                     &buf_pool.flush_list_mutex.m_mutex);
+      else if (!os_aio_pending_writes())
+        break;
+      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+    }
+
+    srv_max_dirty_pages_pct_lwm= pct_lwm;
+  }
+
+  const size_t size= UT_LIST_GET_LEN(buf_pool.flush_list);
+  std::unique_ptr<buf_page_t *[]> list(new buf_page_t *[size]);
+
+  /* Copy the dirty blocks from buf_pool.flush_list to an array for sorting. */
+  size_t idx= 0;
+  for (buf_page_t *p= UT_LIST_GET_FIRST(buf_pool.flush_list); p; )
+  {
+    const lsn_t lsn{p->oldest_modification()};
+    ut_ad(lsn > 2 || lsn == 1);
+    buf_page_t *n= UT_LIST_GET_NEXT(list, p);
+    if (lsn > 1)
+      list.get()[idx++]= p;
+    else
+      buf_pool.delete_from_flush_list(p);
+    p= n;
+  }
+
+  std::sort(list.get(), list.get() + idx,
+            [](const buf_page_t *lhs, const buf_page_t *rhs) {
+              const lsn_t l{lhs->oldest_modification()};
+              const lsn_t r{rhs->oldest_modification()};
+              DBUG_ASSERT(l > 2); DBUG_ASSERT(r > 2);
+              return r < l;
+            });
+
+  UT_LIST_INIT(buf_pool.flush_list, &buf_page_t::list);
+
+  for (size_t i= 0; i < idx; i++)
+  {
+    UT_LIST_ADD_LAST(buf_pool.flush_list, list[i]);
+    DBUG_ASSERT(list[i]->oldest_modification() > 2);
+  }
+
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+}
+
+/** Apply buffered log to persistent data pages.
+@param last_batch whether it is possible to write more redo log */ +void recv_sys_t::apply(bool last_batch) +{ + ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT); + + mysql_mutex_assert_owner(&mutex); + + garbage_collect(); + + if (!pages.empty()) + { + recv_no_ibuf_operations = !last_batch || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT; + ut_ad(!last_batch || lsn == scanned_lsn); + progress_time= time(nullptr); + report_progress(); + + apply_log_recs= true; + + for (auto id= srv_undo_tablespaces_open; id--;) + { + const trunc& t= truncated_undo_spaces[id]; + if (t.lsn) + { + /* The entire undo tablespace will be reinitialized by + innodb_undo_log_truncate=ON. Discard old log for all pages. + Even though we recv_sys_t::parse() already invoked trim(), + this will be needed in case recovery consists of multiple batches + (there was an invocation with !last_batch). */ + trim({id + srv_undo_space_id_start, 0}, t.lsn); + if (fil_space_t *space = fil_space_get(id + srv_undo_space_id_start)) + { + ut_ad(UT_LIST_GET_LEN(space->chain) == 1); + ut_ad(space->recv_size >= t.pages); + fil_node_t *file= UT_LIST_GET_FIRST(space->chain); + ut_ad(file->is_open()); + os_file_truncate(file->name, file->handle, + os_offset_t{space->recv_size} << + srv_page_size_shift, true); + } + } + } + + fil_system.extend_to_recv_size(); + + fil_space_t *space= nullptr; + uint32_t space_id= ~0; + buf_block_t *free_block= nullptr; + + for (pages_it= pages.begin(); pages_it != pages.end(); + pages_it= pages.begin()) + { + if (!free_block) + { + if (!last_batch) + log_sys.latch.wr_unlock(); + wait_for_pool(1); + pages_it= pages.begin(); + mysql_mutex_unlock(&mutex); + /* We must release log_sys.latch and recv_sys.mutex before + invoking buf_LRU_get_free_block(). Allocating a block may initiate + a redo log write and therefore acquire log_sys.latch. To avoid + deadlocks, log_sys.latch must not be acquired while holding + recv_sys.mutex. */ + free_block= buf_LRU_get_free_block(false); + if (!last_batch) + log_sys.latch.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&mutex); + pages_it= pages.begin(); + } + + while (pages_it != pages.end()) + { + if (is_corrupt_fs() || is_corrupt_log()) + { + if (space) + space->release(); + if (free_block) + { + mysql_mutex_unlock(&mutex); + mysql_mutex_lock(&buf_pool.mutex); + buf_LRU_block_free_non_file_page(free_block); + mysql_mutex_unlock(&buf_pool.mutex); + mysql_mutex_lock(&mutex); + } + return; + } + if (apply_batch(space_id, space, free_block, last_batch)) + break; + } + } + + if (space) + space->release(); + + if (free_block) + { + mysql_mutex_lock(&buf_pool.mutex); + buf_LRU_block_free_non_file_page(free_block); + mysql_mutex_unlock(&buf_pool.mutex); + } + } + + if (last_batch) + { + if (!recv_no_ibuf_operations) + /* We skipped this in buf_page_create(). */ + mlog_init.mark_ibuf_exist(); + mlog_init.clear(); + } + else + { + mlog_init.reset(); + log_sys.latch.wr_unlock(); + } + + mysql_mutex_unlock(&mutex); + + if (!last_batch) + { + buf_flush_sync_batch(lsn); + buf_pool_invalidate(); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + } + else if (srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT) + buf_flush_sync_batch(lsn); + else + /* Instead of flushing, last_batch sorts the buf_pool.flush_list + in ascending order of buf_page_t::oldest_modification. 
*/ + log_sort_flush_list(); + +#ifdef HAVE_PMEM + if (last_batch && log_sys.is_pmem()) + mprotect(log_sys.buf, len, PROT_READ | PROT_WRITE); +#endif + + mysql_mutex_lock(&mutex); + + ut_d(after_apply= true); + clear(); +} + +/** Scan log_t::FORMAT_10_8 log store records to the parsing buffer. +@param last_phase whether changes can be applied to the tablespaces +@return whether rescan is needed (not everything was stored) */ +static bool recv_scan_log(bool last_phase) +{ + DBUG_ENTER("recv_scan_log"); + + ut_ad(log_sys.is_latest()); + const size_t block_size_1{log_sys.get_block_size() - 1}; + + mysql_mutex_lock(&recv_sys.mutex); + ut_d(recv_sys.after_apply= last_phase); + if (!last_phase) + recv_sys.clear(); + else + ut_ad(recv_sys.file_checkpoint); + + bool store{recv_sys.file_checkpoint != 0}; + size_t buf_size= log_sys.buf_size; +#ifdef HAVE_PMEM + if (log_sys.is_pmem()) + { + recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn)); + buf_size= size_t(log_sys.file_size); + recv_sys.len= size_t(log_sys.file_size); + } + else +#endif + { + recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) & + block_size_1; + recv_sys.len= 0; + } + + lsn_t rewound_lsn= 0; + for (ut_d(lsn_t source_offset= 0);;) + { +#ifndef SUX_LOCK_GENERIC + ut_ad(log_sys.latch.is_write_locked()); +#endif +#ifdef UNIV_DEBUG + const bool wrap{source_offset + recv_sys.len == log_sys.file_size}; +#endif + if (size_t size= buf_size - recv_sys.len) + { +#ifndef UNIV_DEBUG + lsn_t +#endif + source_offset= + log_sys.calc_lsn_offset(recv_sys.lsn + recv_sys.len - recv_sys.offset); + ut_ad(!wrap || source_offset == log_t::START_OFFSET); + source_offset&= ~block_size_1; + + if (source_offset + size > log_sys.file_size) + size= static_cast(log_sys.file_size - source_offset); + + if (dberr_t err= log_sys.log.read(source_offset, + {log_sys.buf + recv_sys.len, size})) + { + mysql_mutex_unlock(&recv_sys.mutex); + ib::error() << "Failed to read log at " << source_offset + << ": " << err; + recv_sys.set_corrupt_log(); + mysql_mutex_lock(&recv_sys.mutex); + } + else + recv_sys.len+= size; + } + + if (recv_sys.report(time(nullptr))) + { + sql_print_information("InnoDB: Read redo log up to LSN=" LSN_PF, + recv_sys.lsn); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Read redo log up to LSN=" LSN_PF, + recv_sys.lsn); + } + + recv_sys_t::parse_mtr_result r; + + if (UNIV_UNLIKELY(!recv_needed_recovery)) + { + ut_ad(!last_phase); + ut_ad(recv_sys.lsn >= log_sys.next_checkpoint_lsn); + + if (!store) + { + ut_ad(!recv_sys.file_checkpoint); + for (;;) + { + const byte& b{log_sys.buf[recv_sys.offset]}; + r= recv_sys.parse_pmem(false); + switch (r) { + case recv_sys_t::PREMATURE_EOF: + goto read_more; + default: + ut_ad(r == recv_sys_t::GOT_EOF); + break; + case recv_sys_t::OK: + if (b == FILE_CHECKPOINT + 2 + 8 || (b & 0xf0) == FILE_MODIFY) + continue; + } + + const lsn_t end{recv_sys.file_checkpoint}; + ut_ad(!end || end == recv_sys.lsn); + mysql_mutex_unlock(&recv_sys.mutex); + + if (!end) + { + recv_sys.set_corrupt_log(); + sql_print_error("InnoDB: Missing FILE_CHECKPOINT(" LSN_PF + ") at " LSN_PF, log_sys.next_checkpoint_lsn, + recv_sys.lsn); + } + DBUG_RETURN(true); + } + } + else + { + ut_ad(recv_sys.file_checkpoint != 0); + switch ((r= recv_sys.parse_pmem(false))) { + case recv_sys_t::PREMATURE_EOF: + goto read_more; + case recv_sys_t::GOT_EOF: + break; + default: + ut_ad(r == recv_sys_t::OK); + recv_needed_recovery= true; + if (srv_read_only_mode) + { + mysql_mutex_unlock(&recv_sys.mutex); + 
DBUG_RETURN(false); + } + sql_print_information("InnoDB: Starting crash recovery from" + " checkpoint LSN=" LSN_PF, + log_sys.next_checkpoint_lsn); + } + } + } + + if (!store) + skip_the_rest: + while ((r= recv_sys.parse_pmem(false)) == recv_sys_t::OK); + else + { + uint16_t count= 0; + while ((r= recv_sys.parse_pmem(last_phase)) == recv_sys_t::OK) + if (!++count && recv_sys.report(time(nullptr))) + { + const size_t n= recv_sys.pages.size(); + sql_print_information("InnoDB: Parsed redo log up to LSN=" LSN_PF + "; to recover: %zu pages", recv_sys.lsn, n); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Parsed redo log up to LSN=" LSN_PF + "; to recover: %zu pages", + recv_sys.lsn, n); + } + if (r == recv_sys_t::GOT_OOM) + { + ut_ad(!last_phase); + rewound_lsn= recv_sys.lsn; + store= false; + if (recv_sys.scanned_lsn <= 1) + goto skip_the_rest; + ut_ad(recv_sys.file_checkpoint); + goto func_exit; + } + } + + if (r != recv_sys_t::PREMATURE_EOF) + { + ut_ad(r == recv_sys_t::GOT_EOF); + got_eof: + ut_ad(recv_sys.is_initialised()); + if (recv_sys.scanned_lsn > 1) + { + ut_ad(recv_sys.scanned_lsn == recv_sys.lsn); + break; + } + recv_sys.scanned_lsn= recv_sys.lsn; + sql_print_information("InnoDB: End of log at LSN=" LSN_PF, recv_sys.lsn); + break; + } + + read_more: +#ifdef HAVE_PMEM + if (log_sys.is_pmem()) + break; +#endif + if (recv_sys.is_corrupt_log()) + break; + + if (recv_sys.offset < log_sys.get_block_size() && + recv_sys.lsn == recv_sys.scanned_lsn) + goto got_eof; + + if (recv_sys.offset > buf_size / 4 || + (recv_sys.offset > block_size_1 && + recv_sys.len >= buf_size - recv_sys.MTR_SIZE_MAX)) + { + const size_t ofs{recv_sys.offset & ~block_size_1}; + memmove_aligned<64>(log_sys.buf, log_sys.buf + ofs, recv_sys.len - ofs); + recv_sys.len-= ofs; + recv_sys.offset&= block_size_1; + } + } + + if (last_phase) + { + ut_ad(!rewound_lsn); + ut_ad(recv_sys.lsn >= recv_sys.file_checkpoint); + log_sys.set_recovered_lsn(recv_sys.lsn); + } + else if (rewound_lsn) + { + ut_ad(!store); + ut_ad(recv_sys.file_checkpoint); + recv_sys.lsn= rewound_lsn; + } +func_exit: + mysql_mutex_unlock(&recv_sys.mutex); + DBUG_RETURN(!store); +} + +/** Report a missing tablespace for which page-redo log exists. +@param[in] err previous error code +@param[in] i tablespace descriptor +@return new error code */ +static +dberr_t +recv_init_missing_space(dberr_t err, const recv_spaces_t::const_iterator& i) +{ + switch (srv_operation) { + default: + break; + case SRV_OPERATION_RESTORE: + case SRV_OPERATION_RESTORE_EXPORT: + if (i->second.name.find("/#sql") != std::string::npos) { + sql_print_warning("InnoDB: Tablespace " UINT32PF + " was not found at %.*s when" + " restoring a (partial?) backup." + " All redo log" + " for this file will be ignored!", + i->first, int(i->second.name.size()), + i->second.name.data()); + } + return(err); + } + + if (srv_force_recovery == 0) { + sql_print_error("InnoDB: Tablespace " UINT32PF " was not" + " found at %.*s.", i->first, + int(i->second.name.size()), + i->second.name.data()); + + if (err == DB_SUCCESS) { + sql_print_information( + "InnoDB: Set innodb_force_recovery=1 to" + " ignore this and to permanently lose" + " all changes to the tablespace."); + err = DB_TABLESPACE_NOT_FOUND; + } + } else { + sql_print_warning("InnoDB: Tablespace " UINT32PF + " was not found at %.*s" + ", and innodb_force_recovery was set." 
+ " All redo log for this tablespace" + " will be ignored!", + i->first, int(i->second.name.size()), + i->second.name.data()); + } + + return(err); +} + +/** Report the missing tablespace and discard the redo logs for the deleted +tablespace. +@param[in] rescan rescan of redo logs is needed + if hash table ran out of memory +@param[out] missing_tablespace missing tablespace exists or not +@return error code or DB_SUCCESS. */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +recv_validate_tablespace(bool rescan, bool& missing_tablespace) +{ + dberr_t err = DB_SUCCESS; + + mysql_mutex_lock(&recv_sys.mutex); + + for (recv_sys_t::map::iterator p = recv_sys.pages.begin(); + p != recv_sys.pages.end();) { + ut_ad(!p->second.log.empty()); + const uint32_t space = p->first.space(); + if (is_predefined_tablespace(space)) { +next: + p++; + continue; + } + + recv_spaces_t::iterator i = recv_spaces.find(space); + ut_ad(i != recv_spaces.end()); + + if (deferred_spaces.find(static_cast(space))) { + /* Skip redo logs belonging to + incomplete tablespaces */ + goto next; + } + + switch (i->second.status) { + case file_name_t::NORMAL: + goto next; + case file_name_t::MISSING: + err = recv_init_missing_space(err, i); + i->second.status = file_name_t::DELETED; + /* fall through */ + case file_name_t::DELETED: + recv_sys_t::map::iterator r = p++; + recv_sys.pages_it_invalidate(r); + recv_sys.erase(r); + continue; + } + ut_ad(0); + } + + if (err != DB_SUCCESS) { +func_exit: + mysql_mutex_unlock(&recv_sys.mutex); + return(err); + } + + /* When rescan is not needed, recv_sys.pages will contain the + entire redo log. If rescan is needed or innodb_force_recovery + is set, we can ignore missing tablespaces. */ + for (const recv_spaces_t::value_type& rs : recv_spaces) { + if (UNIV_LIKELY(rs.second.status != file_name_t::MISSING)) { + continue; + } + + if (deferred_spaces.find(static_cast(rs.first))) { + continue; + } + + if (srv_force_recovery) { + sql_print_warning("InnoDB: Tablespace " UINT32PF + " was not found at %.*s," + " and innodb_force_recovery was set." + " All redo log for this tablespace" + " will be ignored!", + rs.first, int(rs.second.name.size()), + rs.second.name.data()); + continue; + } + + if (!rescan) { + sql_print_information("InnoDB: Tablespace " UINT32PF + " was not found at '%.*s'," + " but there were" + " no modifications either.", + rs.first, + int(rs.second.name.size()), + rs.second.name.data()); + } else { + missing_tablespace = true; + } + } + + goto func_exit; +} + +/** Check if all tablespaces were found for crash recovery. +@param[in] rescan rescan of redo logs is needed +@param[out] missing_tablespace missing table exists +@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace) +{ + bool flag_deleted = false; + + ut_ad(!srv_read_only_mode); + ut_ad(recv_needed_recovery); + + for (recv_spaces_t::value_type& rs : recv_spaces) { + ut_ad(!is_predefined_tablespace(rs.first)); + ut_ad(rs.second.status != file_name_t::DELETED + || !rs.second.space); + + if (rs.second.status == file_name_t::DELETED) { + /* The tablespace was deleted, + so we can ignore any redo log for it. */ + flag_deleted = true; + } else if (rs.second.space != NULL) { + /* The tablespace was found, and there + are some redo log records for it. 
*/
+			fil_names_dirty(rs.second.space);
+
+			/* Add the freed page ranges in the respective
+			tablespace */
+			if (!rs.second.freed_ranges.empty()
+			    && (srv_immediate_scrub_data_uncompressed
+				|| rs.second.space->is_compressed())) {
+
+				rs.second.space->add_free_ranges(
+					std::move(rs.second.freed_ranges));
+			}
+		} else if (rs.second.name == "") {
+			sql_print_error("InnoDB: Missing FILE_CREATE,"
+					" FILE_DELETE or FILE_MODIFY"
+					" before FILE_CHECKPOINT"
+					" for tablespace " UINT32PF, rs.first);
+			recv_sys.set_corrupt_log();
+			return(DB_CORRUPTION);
+		} else {
+			rs.second.status = file_name_t::MISSING;
+			flag_deleted = true;
+		}
+
+		ut_ad(rs.second.status == file_name_t::DELETED
+		      || rs.second.name != "");
+	}
+
+	if (flag_deleted) {
+		return recv_validate_tablespace(rescan, missing_tablespace);
+	}
+
+	return DB_SUCCESS;
+}
+
+/** Apply any FILE_RENAME records */
+static dberr_t recv_rename_files()
+{
+  mysql_mutex_assert_owner(&recv_sys.mutex);
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_write_locked());
+#endif
+
+  dberr_t err= DB_SUCCESS;
+
+  for (auto i= renamed_spaces.begin(); i != renamed_spaces.end(); )
+  {
+    const auto &r= *i;
+    const uint32_t id= r.first;
+    fil_space_t *space= fil_space_t::get(id);
+    if (!space)
+    {
+      i++;
+      continue;
+    }
+    ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
+    char *old= space->chain.start->name;
+    if (r.second != old)
+    {
+      bool exists;
+      os_file_type_t ftype;
+      const char *new_name= r.second.c_str();
+      mysql_mutex_lock(&fil_system.mutex);
+      const fil_space_t *other= nullptr;
+      if (!space->chain.start->is_open() && space->chain.start->deferred &&
+          (other= fil_system.find(new_name)) &&
+          (other->chain.start->is_open() || !other->chain.start->deferred))
+        other= nullptr;
+
+      if (other)
+      {
+        /* Multiple tablespaces use the same file name. This should
+        only be possible if the recovery of both files was deferred
+        (no valid page 0 is contained in either file). We shall not
+        rename the file, just rename the metadata. */
+        sql_print_information("InnoDB: Renaming tablespace metadata " UINT32PF
+                              " from '%s' to '%s' that is also associated"
+                              " with tablespace " UINT32PF,
+                              id, old, new_name, other->id);
+        space->chain.start->name= mem_strdup(new_name);
+        ut_free(old);
+      }
+      else if (!os_file_status(new_name, &exists, &ftype) || exists)
+      {
+        sql_print_error("InnoDB: Cannot replay rename of tablespace " UINT32PF
+                        " from '%s' to '%s'%s",
+                        id, old, new_name, exists ?
+                        " because the target file exists" : "");
+        err= DB_TABLESPACE_EXISTS;
+      }
+      else
+      {
+        mysql_mutex_unlock(&fil_system.mutex);
+        err= space->rename(new_name, false);
+        if (err != DB_SUCCESS)
+          sql_print_error("InnoDB: Cannot replay rename of tablespace "
+                          UINT32PF " to '%s': %s",
+                          id, new_name, ut_strerr(err));
+        goto done;
+      }
+      mysql_mutex_unlock(&fil_system.mutex);
+    }
+done:
+    space->release();
+    if (err != DB_SUCCESS)
+    {
+      recv_sys.set_corrupt_fs();
+      break;
+    }
+    renamed_spaces.erase(i++);
+  }
+  return err;
+}
+
+/** Start recovering from a redo log checkpoint.
+of first system tablespace page +@return error code or DB_SUCCESS */ +dberr_t recv_recovery_from_checkpoint_start() +{ + bool rescan = false; + + ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED + || srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_EXPORT); + ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex)); + ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0); + ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); + ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex)); + + if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { + sql_print_information("InnoDB: innodb_force_recovery=6" + " skips redo log apply"); + return(DB_SUCCESS); + } + + recv_sys.recovery_on = true; + + log_sys.latch.wr_lock(SRW_LOCK_CALL); + + dberr_t err = recv_sys.find_checkpoint(); + if (err != DB_SUCCESS) { +early_exit: + log_sys.latch.wr_unlock(); + return err; + } + + log_sys.set_capacity(); + + /* Start reading the log from the checkpoint lsn. The variable + contiguous_lsn contains an lsn up to which the log is known to + be contiguously written. */ + + ut_ad(recv_sys.pages.empty()); + + if (log_sys.format == log_t::FORMAT_3_23) { + goto early_exit; + } + + if (log_sys.is_latest()) { + const bool rewind = recv_sys.lsn + != log_sys.next_checkpoint_lsn; + log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn; + + recv_scan_log(false); + if (recv_needed_recovery) { +read_only_recovery: + sql_print_warning("InnoDB: innodb_read_only" + " prevents crash recovery"); + err = DB_READ_ONLY; + goto early_exit; + } + if (recv_sys.is_corrupt_log()) { + sql_print_error("InnoDB: Log scan aborted at LSN " + LSN_PF, recv_sys.lsn); + goto err_exit; + } + ut_ad(recv_sys.file_checkpoint); + if (rewind) { + recv_sys.lsn = log_sys.next_checkpoint_lsn; + recv_sys.offset = 0; + recv_sys.len = 0; + } + ut_ad(!recv_max_page_lsn); + rescan = recv_scan_log(false); + + if (srv_read_only_mode && recv_needed_recovery) { + goto read_only_recovery; + } + + if ((recv_sys.is_corrupt_log() && !srv_force_recovery) + || recv_sys.is_corrupt_fs()) { + goto err_exit; + } + } + + log_sys.set_recovered_lsn(recv_sys.lsn); + + if (recv_needed_recovery) { + bool missing_tablespace = false; + + err = recv_init_crash_recovery_spaces( + rescan, missing_tablespace); + + if (err != DB_SUCCESS) { + goto early_exit; + } + + if (missing_tablespace) { + ut_ad(rescan); + /* If any tablespaces seem to be missing, + validate the remaining log records. */ + + do { + rescan = recv_scan_log(false); + ut_ad(!recv_sys.is_corrupt_fs()); + + if (recv_sys.is_corrupt_log()) { + goto err_exit; + } + + missing_tablespace = false; + + err = recv_validate_tablespace( + rescan, missing_tablespace); + + if (err != DB_SUCCESS) { + goto early_exit; + } + } while (missing_tablespace); + + rescan = true; + /* Because in the loop above we overwrote the + initially stored recv_sys.pages, we must + restart parsing the log from the very beginning. */ + + /* FIXME: Use a separate loop for checking for + tablespaces (not individual pages), while retaining + the initial recv_sys.pages. 
*/ + mysql_mutex_lock(&recv_sys.mutex); + recv_sys.clear(); + recv_sys.lsn = log_sys.next_checkpoint_lsn; + mysql_mutex_unlock(&recv_sys.mutex); + } + + if (srv_operation <= SRV_OPERATION_EXPORT_RESTORED) { + deferred_spaces.deferred_dblwr(); + buf_dblwr.recover(); + } + + ut_ad(srv_force_recovery <= SRV_FORCE_NO_UNDO_LOG_SCAN); + + if (rescan) { + recv_scan_log(true); + if ((recv_sys.is_corrupt_log() + && !srv_force_recovery) + || recv_sys.is_corrupt_fs()) { + goto err_exit; + } + + /* In case of multi-batch recovery, + redo log for the last batch is not + applied yet. */ + ut_d(recv_sys.after_apply = false); + } + } else { + ut_ad(recv_sys.pages.empty()); + } + + if (log_sys.is_latest() + && (recv_sys.lsn < log_sys.next_checkpoint_lsn + || recv_sys.lsn < recv_max_page_lsn)) { + + sql_print_error("InnoDB: We scanned the log up to " LSN_PF "." + " A checkpoint was at " LSN_PF + " and the maximum LSN on a database page was " + LSN_PF ". It is possible that the" + " database is now corrupt!", + recv_sys.lsn, + log_sys.next_checkpoint_lsn, + recv_max_page_lsn); + } + + if (recv_sys.lsn < log_sys.next_checkpoint_lsn) { +err_exit: + err = DB_ERROR; + goto early_exit; + } + + if (!srv_read_only_mode && log_sys.is_latest()) { + ut_ad(log_sys.get_flushed_lsn() == log_sys.get_lsn()); + ut_ad(recv_sys.lsn == log_sys.get_lsn()); + if (!log_sys.is_pmem()) { + const size_t bs_1{log_sys.get_block_size() - 1}; + const size_t ro{recv_sys.offset}; + recv_sys.offset &= bs_1; + memmove_aligned<64>(log_sys.buf, + log_sys.buf + (ro & ~bs_1), + log_sys.get_block_size()); +#ifdef HAVE_PMEM + } else { + mprotect(log_sys.buf, size_t(log_sys.file_size), + PROT_READ | PROT_WRITE); +#endif + } + log_sys.buf_free = recv_sys.offset; + if (recv_needed_recovery + && srv_operation <= SRV_OPERATION_EXPORT_RESTORED) { + /* Write a FILE_CHECKPOINT marker as the first thing, + before generating any other redo log. This ensures + that subsequent crash recovery will be possible even + if the server were killed soon after this. */ + fil_names_clear(log_sys.next_checkpoint_lsn); + } + } + + mysql_mutex_lock(&recv_sys.mutex); + if (UNIV_UNLIKELY(recv_sys.scanned_lsn != recv_sys.lsn) + && log_sys.is_latest()) { + ut_ad("log parsing error" == 0); + mysql_mutex_unlock(&recv_sys.mutex); + err = DB_CORRUPTION; + goto early_exit; + } + recv_sys.apply_log_recs = true; + recv_no_ibuf_operations = false; + ut_d(recv_no_log_write = srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_EXPORT); + if (srv_operation == SRV_OPERATION_NORMAL) { + err = recv_rename_files(); + } + mysql_mutex_unlock(&recv_sys.mutex); + + recv_lsn_checks_on = true; + + /* The database is now ready to start almost normal processing of user + transactions: transaction rollbacks and the application of the log + records in the hash table can be run in background. 
*/ + if (err == DB_SUCCESS && deferred_spaces.reinit_all() + && !srv_force_recovery) { + err = DB_CORRUPTION; + } + + log_sys.latch.wr_unlock(); + return err; +} + +bool recv_dblwr_t::validate_page(const page_id_t page_id, + const byte *page, + const fil_space_t *space, + byte *tmp_buf) +{ + if (page_id.page_no() == 0) + { + uint32_t flags= fsp_header_get_flags(page); + if (!fil_space_t::is_valid_flags(flags, page_id.space())) + { + uint32_t cflags= fsp_flags_convert_from_101(flags); + if (cflags == UINT32_MAX) + { + ib::warn() << "Ignoring a doublewrite copy of page " << page_id + << "due to invalid flags " << ib::hex(flags); + return false; + } + + flags= cflags; + } + + /* Page 0 is never page_compressed or encrypted. */ + return !buf_page_is_corrupted(true, page, flags); + } + + ut_ad(tmp_buf); + byte *tmp_frame= tmp_buf; + byte *tmp_page= tmp_buf + srv_page_size; + const uint16_t page_type= mach_read_from_2(page + FIL_PAGE_TYPE); + const bool expect_encrypted= space->crypt_data && + space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED; + + if (space->full_crc32()) + return !buf_page_is_corrupted(true, page, space->flags); + + if (expect_encrypted && + mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)) + { + if (!fil_space_verify_crypt_checksum(page, space->zip_size())) + return false; + if (page_type != FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) + return true; + if (space->zip_size()) + return false; + memcpy(tmp_page, page, space->physical_size()); + if (!fil_space_decrypt(space, tmp_frame, tmp_page)) + return false; + } + + switch (page_type) { + case FIL_PAGE_PAGE_COMPRESSED: + memcpy(tmp_page, page, space->physical_size()); + /* fall through */ + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + if (space->zip_size()) + return false; /* ROW_FORMAT=COMPRESSED cannot be page_compressed */ + ulint decomp= fil_page_decompress(tmp_frame, tmp_page, space->flags); + if (!decomp) + return false; /* decompression failed */ + if (decomp == srv_page_size) + return false; /* the page was not compressed (invalid page type) */ + return !buf_page_is_corrupted(true, tmp_page, space->flags); + } + + return !buf_page_is_corrupted(true, page, space->flags); +} + +byte *recv_dblwr_t::find_page(const page_id_t page_id, + const fil_space_t *space, byte *tmp_buf) +{ + byte *result= NULL; + lsn_t max_lsn= 0; + + for (byte *page : pages) + { + if (page_get_page_no(page) != page_id.page_no() || + page_get_space_id(page) != page_id.space()) + continue; + if (page_id.page_no() == 0) + { + uint32_t flags= mach_read_from_4( + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page); + if (!fil_space_t::is_valid_flags(flags, page_id.space())) + continue; + } + + const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN); + if (lsn <= max_lsn || + !validate_page(page_id, page, space, tmp_buf)) + { + /* Mark processed for subsequent iterations in buf_dblwr_t::recover() */ + memset(page + FIL_PAGE_LSN, 0, 8); + continue; + } + + ut_a(page_get_page_no(page) == page_id.page_no()); + max_lsn= lsn; + result= page; + } + + return result; +} + +bool recv_dblwr_t::restore_first_page(uint32_t space_id, const char *name, + os_file_t file) +{ + const page_id_t page_id(space_id, 0); + const byte* page= find_page(page_id); + if (!page) + { + /* If the first page of the given user tablespace is not there + in the doublewrite buffer, then the recovery is going to fail + now. Hence this is treated as error. 
*/
+    ib::error()
+      << "Corrupted page " << page_id << " of datafile '"
+      << name << "' could not be found in the doublewrite buffer.";
+    return true;
+  }
+
+  ulint physical_size= fil_space_t::physical_size(
+    mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS));
+  ib::info() << "Restoring page " << page_id << " of datafile '"
+             << name << "' from the doublewrite buffer. Writing "
+             << physical_size << " bytes into file '" << name << "'";
+
+  return os_file_write(
+    IORequestWrite, name, file, page, 0, physical_size) !=
+    DB_SUCCESS;
+}
diff --git a/storage/innobase/log/log0sync.cc b/storage/innobase/log/log0sync.cc
new file mode 100644
index 00000000..6b14d1d3
--- /dev/null
+++ b/storage/innobase/log/log0sync.cc
@@ -0,0 +1,404 @@
+/*****************************************************************************
+Copyright (c) 2020 MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*
+The group commit synchronization used in log_write_up_to()
+works as follows.
+
+For simplicity, let's consider only the write operation; synchronization
+of the flush operation works the same way.
+
+Rules of the game
+
+A thread enters log_write_up_to() with the lsn of the current transaction.
+1. If the last written lsn is greater than the wait lsn (another thread
+   already wrote the log buffer), then there is no need to do anything.
+2. If no other thread is currently writing, write the log buffer,
+   and update the last written lsn.
+3. Otherwise, wait, and go to step 1.
+
+Synchronization can be done in different ways, e.g.
+
+a) A simple mutex locking the entire check and write operation.
+Its disadvantage is that threads that could continue after the last
+written lsn is updated still wait.
+
+b) A spinlock, with periodic checks for the last written lsn.
+Fixes a), but burns CPU unnecessarily.
+
+c) A mutex / condition variable combo.
+
+The condition variable notifies (broadcasts to) all waiters whenever
+the last written lsn is changed.
+
+This has the disadvantage of many spurious wakeups, stress on the OS
+scheduler, and mutex contention.
+
+d) Something else.
+Make use of the waiter's lsn parameter, and only wake up the "right"
+waiting threads.
+
+We chose d). Even if the implementation is more complicated than the
+alternatives due to the need to maintain a list of waiters, it provides
+the best performance.
+
+See the group_commit_lock implementation for details.
+
+Note that if the write operation is very fast, a) or b) can be fine as
+alternatives.
+*/
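+
+/*
+For illustration only: a minimal sketch of alternative c) under the rules
+above. The names mtx, cv, last_written_lsn, writer_active and write_log()
+are hypothetical, not part of the actual implementation:
+
+  std::mutex mtx;
+  std::condition_variable cv;
+  lsn_t last_written_lsn= 0;
+  bool writer_active= false;
+
+  lsn_t write_log();                // writes the log buffer, returns end lsn
+
+  void write_up_to(lsn_t lsn)
+  {
+    std::unique_lock<std::mutex> lk(mtx);
+    while (last_written_lsn < lsn)
+    {
+      if (!writer_active)
+      {
+        writer_active= true;
+        lk.unlock();
+        lsn_t written= write_log();  // write on behalf of all waiters
+        lk.lock();
+        writer_active= false;
+        last_written_lsn= written;
+        cv.notify_all();             // wakes every waiter, including those
+      }                              // whose lsn is not yet written
+      else
+        cv.wait(lk);
+    }
+  }
+
+The broadcast wakes every waiter on every completed write; the waiter-list
+design below wakes only threads whose wait lsn has been reached.
+*/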
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#ifdef __linux__
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#endif
+
+#include <atomic>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include
+
+#include
+#include "log0sync.h"
+#include
+#include
+/**
+  Helper class, used in the group commit lock.
+
+  A binary semaphore, or (the same thing) an auto-reset event.
+  It has a state (signalled or not), and provides 2 operations:
+  wait() and wake().
+
+  The implementation uses efficient locking primitives on Linux and Windows,
+  or a mutex/condition variable combo elsewhere.
+*/
+class binary_semaphore
+{
+public:
+  /** Wait until the semaphore becomes signalled, and atomically reset the
+  state to non-signalled */
+  void wait();
+  /** Signal the semaphore */
+  void wake();
+
+private:
+#if defined(__linux__) || defined (_WIN32)
+  std::atomic<int> m_signalled;
+  static constexpr std::memory_order mem_order= std::memory_order_acq_rel;
+public:
+  binary_semaphore() :m_signalled(0) {}
+#else
+  std::mutex m_mtx{};
+  std::condition_variable m_cv{};
+  bool m_signalled = false;
+#endif
+};
+
+#if defined (__linux__) || defined (_WIN32)
+void binary_semaphore::wait()
+{
+  for (;;)
+  {
+    if (m_signalled.exchange(0, mem_order) == 1)
+    {
+      break;
+    }
+#ifdef _WIN32
+    int zero = 0;
+    WaitOnAddress(&m_signalled, &zero, sizeof(m_signalled), INFINITE);
+#else
+    syscall(SYS_futex, &m_signalled, FUTEX_WAIT_PRIVATE, 0, NULL, NULL, 0);
+#endif
+  }
+}
+
+void binary_semaphore::wake()
+{
+  if (m_signalled.exchange(1, mem_order) == 0)
+  {
+#ifdef _WIN32
+    WakeByAddressSingle(&m_signalled);
+#else
+    syscall(SYS_futex, &m_signalled, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
+#endif
+  }
+}
+#else
+void binary_semaphore::wait()
+{
+  std::unique_lock<std::mutex> lk(m_mtx);
+  while (!m_signalled)
+    m_cv.wait(lk);
+  m_signalled = false;
+}
+void binary_semaphore::wake()
+{
+  std::unique_lock<std::mutex> lk(m_mtx);
+  m_signalled = true;
+  m_cv.notify_one();
+}
+#endif
+
+/* A thread helper structure, used in the group commit lock below */
+struct group_commit_waiter_t
+{
+  lsn_t m_value=0;
+  binary_semaphore m_sema{};
+  group_commit_waiter_t* m_next= nullptr;
+  bool m_group_commit_leader=false;
+};
+
+group_commit_lock::group_commit_lock() :
+  m_mtx(), m_value(0), m_pending_value(0), m_lock(false), m_waiters_list()
+{
+}
+
+group_commit_lock::value_type group_commit_lock::value() const
+{
+  return m_value.load(std::memory_order::memory_order_relaxed);
+}
+
+group_commit_lock::value_type group_commit_lock::pending() const
+{
+  return m_pending_value.load(std::memory_order::memory_order_relaxed);
+}
+
+void group_commit_lock::set_pending(group_commit_lock::value_type num)
+{
+  ut_a(num >= value());
+  m_pending_value.store(num, std::memory_order::memory_order_relaxed);
+}
+
+const unsigned int MAX_SPINS = 1; /** max spins in acquire */
+thread_local group_commit_waiter_t thread_local_waiter;
+
+static inline void do_completion_callback(const completion_callback* cb)
+{
+  if (cb)
+    cb->m_callback(cb->m_param);
+}
+
+group_commit_lock::lock_return_code group_commit_lock::acquire(value_type num, const completion_callback *callback)
+{
+  unsigned int spins = MAX_SPINS;
+
+  for(;;)
+  {
+    if (num <= value())
+    {
+      /* No need to wait.*/
+      do_completion_callback(callback);
+      return lock_return_code::EXPIRED;
+    }
+
+    if(spins-- == 0)
+      break;
+    if (num > pending())
+    {
+      /* A longer wait is expected (longer than the currently running
+      operation), don't spin.*/
+      break;
+    }
+    ut_delay(1);
+  }
+
+  thread_local_waiter.m_value = num;
+  thread_local_waiter.m_group_commit_leader= false;
+  std::unique_lock<std::mutex> lk(m_mtx, std::defer_lock);
+  while (num > value() || thread_local_waiter.m_group_commit_leader)
+  {
+    lk.lock();
+
+    /* Re-read the current value after acquiring the lock */
+    if (num <= value() &&
+        (!thread_local_waiter.m_group_commit_leader || m_lock))
+    {
+      lk.unlock();
+      do_completion_callback(callback);
+      return lock_return_code::EXPIRED;
+    }
+
+    if (!m_lock)
+    {
+      /* Take the lock, become the group commit leader.*/
+      m_lock = true;
+#ifndef DBUG_OFF
+      m_owner_id = std::this_thread::get_id();
+#endif
+      if (callback)
+        m_pending_callbacks.push_back({num,*callback});
+      return lock_return_code::ACQUIRED;
+    }
+
+    if (callback && (m_waiters_list || num <= pending()))
+    {
+      /*
+      If num > pending(), we have a good candidate for the next group
+      commit lead, that will be taking over the lock after the current
+      owner releases it. We put the current thread into the waiters list
+      so that it sleeps and can be signaled and marked as the group
+      commit lead during lock release.
+
+      For this to work well, pending() must deliver a good approximation
+      for N in the next call to group_commit_lock::release(N).
+      */
+      m_pending_callbacks.push_back({num, *callback});
+      return lock_return_code::CALLBACK_QUEUED;
+    }
+
+    /* Add yourself to the waiters list.*/
+    thread_local_waiter.m_group_commit_leader= false;
+    thread_local_waiter.m_next = m_waiters_list;
+    m_waiters_list = &thread_local_waiter;
+    lk.unlock();
+
+    /* Sleep until woken in release().*/
+    thd_wait_begin(0,THD_WAIT_GROUP_COMMIT);
+    thread_local_waiter.m_sema.wait();
+    thd_wait_end(0);
+
+  }
+  do_completion_callback(callback);
+  return lock_return_code::EXPIRED;
+}
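+
+/*
+For illustration only: the asynchronous path through acquire(). The names
+lock, trx and wake_trx are hypothetical, not the actual callers:
+
+  static void wake_trx(void *param);   // continuation to run once durable
+
+  completion_callback cb{wake_trx, trx};
+  switch (lock.acquire(lsn, &cb))
+  {
+  case group_commit_lock::ACQUIRED:
+    // this thread became the group commit leader; it performs the write
+    // and then calls lock.release(written_lsn), which also runs the
+    // queued callbacks
+    break;
+  case group_commit_lock::EXPIRED:
+    // lsn was already covered; cb has been executed already
+    break;
+  case group_commit_lock::CALLBACK_QUEUED:
+    // another thread owns the write; cb will run during its release()
+    break;
+  }
+*/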
+group_commit_lock::value_type group_commit_lock::release(value_type num)
+{
+  completion_callback callbacks[1000];
+  size_t callback_count = 0;
+  value_type ret = 0;
+  std::unique_lock<std::mutex> lk(m_mtx);
+  m_lock = false;
+
+  /* Update the current value. */
+  ut_a(num >= value());
+  m_value.store(num, std::memory_order_relaxed);
+
+  /*
+  Wake the waiters for value <= current value.
+  Wake one more waiter, who will become the group commit lead.
+  */
+  group_commit_waiter_t* cur, * prev, * next;
+  group_commit_waiter_t* wakeup_list = nullptr;
+  for (auto& c : m_pending_callbacks)
+  {
+    if (c.first <= num)
+    {
+      if (callback_count < array_elements(callbacks))
+        callbacks[callback_count++] = c.second;
+      else
+        c.second.m_callback(c.second.m_param);
+    }
+  }
+
+  for (prev= nullptr, cur= m_waiters_list; cur; cur= next)
+  {
+    next= cur->m_next;
+    if (cur->m_value <= num)
+    {
+      /* Move the current waiter to wakeup_list */
+
+      if (!prev)
+      {
+        /* Remove from the start of the list.*/
+        m_waiters_list = next;
+      }
+      else
+      {
+        /* Remove from the middle of the list.*/
+        prev->m_next= cur->m_next;
+      }
+
+      /* Append the entry to the wakeup list.*/
+      cur->m_next = wakeup_list;
+      wakeup_list = cur;
+    }
+    else
+    {
+      prev= cur;
+    }
+  }
+
+  auto it= std::remove_if(
+    m_pending_callbacks.begin(), m_pending_callbacks.end(),
+    [num](const pending_cb &c) { return c.first <= num; });
+
+  m_pending_callbacks.erase(it, m_pending_callbacks.end());
+
+  if (m_pending_callbacks.size() || m_waiters_list)
+  {
+    /*
+    Ensure that after this thread has released the lock,
+    there is a new group commit leader.
+    We take this leader from the waiters list or the wakeup list. It
+    might look like a spurious wake, but in fact we just ensure that
+    the waiters do not wait for eternity.
+    */
+    if (m_waiters_list)
+    {
+      /* Move one waiter to the wakeup list */
+      auto e= m_waiters_list;
+      m_waiters_list= m_waiters_list->m_next;
+      e->m_next= wakeup_list;
+      e->m_group_commit_leader= true;
+      wakeup_list = e;
+    }
+    else if (wakeup_list)
+    {
+      wakeup_list->m_group_commit_leader=true;
+    }
+    else
+    {
+      /* Tell the caller that some pending callbacks are left, and that
+      it should do something to prevent stalls. This should be a rare
+      situation.*/
+      ret= m_pending_callbacks[0].first;
+    }
+  }
+
+  lk.unlock();
+
+  /*
+  Release the designated next group commit lead first,
+  to minimize spurious wakeups.
+  */
+  if (wakeup_list && wakeup_list->m_group_commit_leader)
+  {
+    next = wakeup_list->m_next;
+    wakeup_list->m_sema.wake();
+    wakeup_list= next;
+  }
+
+  for (size_t i = 0; i < callback_count; i++)
+    callbacks[i].m_callback(callbacks[i].m_param);
+
+  for (cur= wakeup_list; cur; cur= next)
+  {
+    next= cur->m_next;
+    cur->m_sema.wake();
+  }
+  return ret;
+}
+
+#ifndef DBUG_OFF
+bool group_commit_lock::is_owner()
+{
+  return m_lock && std::this_thread::get_id() == m_owner_id;
+}
+#endif
+
diff --git a/storage/innobase/log/log0sync.h b/storage/innobase/log/log0sync.h
new file mode 100644
index 00000000..00686d39
--- /dev/null
+++ b/storage/innobase/log/log0sync.h
@@ -0,0 +1,99 @@
+/*****************************************************************************
+Copyright (c) 2020 MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#include
+#include
+#include
+#include
+
+struct group_commit_waiter_t;
+struct completion_callback
+{
+  void (*m_callback)(void*);
+  void* m_param;
+};
+
+/**
+Special synchronization primitive, which is helpful for
+performing group commit.
+
+It has a state consisting of
+ - locked (bool)
+ - current value (number). This value is always increasing.
+ - pending value (number). The current value can soon become this number.
+   This is only used for optimization; it does not have to be exact.
+
+Operations supported on this semaphore:
+
+1. acquire(num, callback):
+- if running synchronously (callback is nullptr), waits until the
+  current value exceeds num, or until the lock is granted
+- returns EXPIRED if current_value >= num,
+  or ACQUIRED, if current_value < num and the lock is granted,
+  or CALLBACK_QUEUED, if callback was not nullptr, and the function
+  would otherwise have to wait
+
+2. release(num)
+- releases the lock
+- sets the new current value to max(num, current_value)
+- releases some threads waiting in acquire()
+- executes some callbacks
+- might return some lsn, meaning there are some pending
+  callbacks left, and there is no new group commit lead
+  (i.e. the caller must do something to flush those pending callbacks)
+
+3. value()
+- read the current value
+
+4. pending()
+- read the pending value
+
+5. set_pending(num)
+- set the pending value
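+
+A typical group-commit write sequence, as a minimal sketch (write_lock,
+write_log() and log_buffer_end_lsn() are illustrative names only, not
+the actual callers):
+
+  group_commit_lock write_lock;
+
+  void write_up_to(lsn_t lsn)
+  {
+    if (write_lock.acquire(lsn, nullptr)
+        == group_commit_lock::ACQUIRED)
+    {
+      write_lock.set_pending(log_buffer_end_lsn());
+      lsn_t written= write_log();   // write on behalf of all waiters
+      write_lock.release(written);  // wakes waiters with value <= written
+    }
+    // EXPIRED means another thread already wrote past lsn
+  }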
set_pending_value() +*/ +class group_commit_lock +{ + using value_type = lsn_t; +#ifndef DBUG_OFF + std::thread::id m_owner_id{}; +#endif + std::mutex m_mtx; + std::atomic m_value; + std::atomic m_pending_value; + bool m_lock; + group_commit_waiter_t* m_waiters_list; + + typedef std::pair pending_cb; + std::vector m_pending_callbacks; + +public: + group_commit_lock(); + enum lock_return_code + { + ACQUIRED, + EXPIRED, + CALLBACK_QUEUED + }; + lock_return_code acquire(value_type num, const completion_callback *cb); + value_type release(value_type num); + value_type value() const; + value_type pending() const; + void set_pending(value_type num); +#ifndef DBUG_OFF + bool is_owner(); +#endif +}; diff --git a/storage/innobase/mem/mem0mem.cc b/storage/innobase/mem/mem0mem.cc new file mode 100644 index 00000000..5e8587bf --- /dev/null +++ b/storage/innobase/mem/mem0mem.cc @@ -0,0 +1,436 @@ +/***************************************************************************** + +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file mem/mem0mem.cc +The memory management + +Created 6/9/1994 Heikki Tuuri +*************************************************************************/ + +#include "mem0mem.h" +#include "buf0buf.h" +#include "srv0srv.h" +#include + +/**********************************************************************//** +Concatenate two strings and return the result, using a memory heap. +@return own: the result */ +char* +mem_heap_strcat( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* s1, /*!< in: string 1 */ + const char* s2) /*!< in: string 2 */ +{ + char* s; + ulint s1_len = strlen(s1); + ulint s2_len = strlen(s2); + + s = static_cast(mem_heap_alloc(heap, s1_len + s2_len + 1)); + + memcpy(s, s1, s1_len); + memcpy(s + s1_len, s2, s2_len); + + s[s1_len + s2_len] = '\0'; + + return(s); +} + + +/****************************************************************//** +Helper function for mem_heap_printf. +@return length of formatted string, including terminating NUL */ +static +ulint +mem_heap_printf_low( +/*================*/ + char* buf, /*!< in/out: buffer to store formatted string + in, or NULL to just calculate length */ + const char* format, /*!< in: format string */ + va_list ap) /*!< in: arguments */ +{ + ulint len = 0; + + while (*format) { + + /* Does this format specifier have the 'l' length modifier. */ + ibool is_long = FALSE; + + /* Length of one parameter. */ + size_t plen; + + if (*format++ != '%') { + /* Non-format character. 
+
+			len++;
+
+			if (buf) {
+				*buf++ = *(format - 1);
+			}
+
+			continue;
+		}
+
+		if (*format == 'l') {
+			is_long = TRUE;
+			format++;
+		}
+
+		switch (*format++) {
+		case 's':
+			/* string */
+			{
+				char*	s = va_arg(ap, char*);
+
+				/* "%ls" is a non-sensical format specifier. */
+				ut_a(!is_long);
+
+				plen = strlen(s);
+				len += plen;
+
+				if (buf) {
+					memcpy(buf, s, plen);
+					buf += plen;
+				}
+			}
+
+			break;
+
+		case 'u':
+			/* unsigned int */
+			{
+				char		tmp[32];
+				unsigned long	val;
+
+				/* We only support 'long' values for now. */
+				ut_a(is_long);
+
+				val = va_arg(ap, unsigned long);
+
+				plen = size_t(sprintf(tmp, "%lu", val));
+				len += plen;
+
+				if (buf) {
+					memcpy(buf, tmp, plen);
+					buf += plen;
+				}
+			}
+
+			break;
+
+		case '%':
+
+			/* "%l%" is a non-sensical format specifier. */
+			ut_a(!is_long);
+
+			len++;
+
+			if (buf) {
+				*buf++ = '%';
+			}
+
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	/* For the NUL character. */
+	len++;
+
+	if (buf) {
+		*buf = '\0';
+	}
+
+	return(len);
+}
+
+/****************************************************************//**
+A simple sprintf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type).
+@return heap-allocated formatted string */
+char*
+mem_heap_printf(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	const char*	format,	/*!< in: format string */
+	...)
+{
+	va_list		ap;
+	char*		str;
+	ulint		len;
+
+	/* Calculate the length of the string */
+	len = 0;
+	va_start(ap, format);
+	len = mem_heap_printf_low(NULL, format, ap);
+	va_end(ap);
+
+	/* Now create it for real. */
+	str = static_cast<char*>(mem_heap_alloc(heap, len));
+	va_start(ap, format);
+	mem_heap_printf_low(str, format, ap);
+	va_end(ap);
+
+	return(str);
+}
+
+#ifdef UNIV_DEBUG
+/** Validates the contents of a memory heap.
+Checks a memory heap for consistency, prints the contents if any error
+is detected. A fatal error is logged if an error is detected.
+@param[in]	heap	Memory heap to validate. */
+void
+mem_heap_validate(
+	const mem_heap_t*	heap)
+{
+	ulint	size = 0;
+
+	for (const mem_block_t* block = heap;
+	     block != NULL;
+	     block = UT_LIST_GET_NEXT(list, block)) {
+
+		switch (block->type) {
+		case MEM_HEAP_DYNAMIC:
+			break;
+		case MEM_HEAP_BUFFER:
+		case MEM_HEAP_BUFFER | MEM_HEAP_BTR_SEARCH:
+			ut_ad(block->len <= srv_page_size);
+			break;
+		default:
+			ut_error;
+		}
+
+		size += block->len;
+	}
+
+	ut_ad(size == heap->total_size);
+}
+
+/** Copy the tail of a string (at most size - 1 bytes of src, plus the
+NUL terminator).
+@param[in,out]	dst	destination buffer
+@param[in]	src	string whose tail to copy
+@param[in]	size	size of dst buffer, in bytes, including NUL terminator */
+static void ut_strlcpy_rev(char* dst, const char* src, ulint size)
+{
+	size_t	src_size = strlen(src), n = std::min(src_size, size - 1);
+	memcpy(dst, src + src_size - n, n + 1);
+}
+#endif /* UNIV_DEBUG */
+
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_create_block_func(
+/*=======================*/
+	mem_heap_t*	heap,	/*!< in: memory heap or NULL if first block
+				should be created */
+	ulint		n,	/*!< in: number of bytes needed for user data */
+#ifdef UNIV_DEBUG
+	const char*	file_name,/*!< in: file name where created */
+	unsigned	line,	/*!< in: line where created */
+#endif /* UNIV_DEBUG */
+	ulint		type)	/*!< in: type of heap: MEM_HEAP_DYNAMIC or
+				MEM_HEAP_BUFFER */
{
+	buf_block_t*	buf_block = NULL;
+	mem_block_t*	block;
+	ulint		len;
+
+	ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+	      || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+	if (heap != NULL) {
+		ut_d(mem_heap_validate(heap));
+	}
+
+	/* In dynamic allocation, calculate the size: block header + data. */
+	len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n);
+
+	if (type == MEM_HEAP_DYNAMIC || len < srv_page_size / 2) {
+
+		ut_ad(type == MEM_HEAP_DYNAMIC || n <= MEM_MAX_ALLOC_IN_BUF);
+
+		block = static_cast<mem_block_t*>(ut_malloc_nokey(len));
+	} else {
+		len = srv_page_size;
+
+		if ((type & MEM_HEAP_BTR_SEARCH) && heap) {
+			/* We cannot allocate the block from the
+			buffer pool, but must get the free block from
+			the heap header free block field */
+
+			buf_block = static_cast<buf_block_t*>(heap->free_block);
+			heap->free_block = NULL;
+
+			if (UNIV_UNLIKELY(!buf_block)) {
+
+				return(NULL);
+			}
+		} else {
+			buf_block = buf_block_alloc();
+		}
+
+		block = (mem_block_t*) buf_block->page.frame;
+	}
+
+	if (block == NULL) {
+		ib::fatal() << "Unable to allocate memory of size "
+			<< len << ".";
+	}
+
+	block->buf_block = buf_block;
+	block->free_block = NULL;
+
+	ut_d(ut_strlcpy_rev(block->file_name, file_name,
+			    sizeof(block->file_name)));
+	ut_d(block->line = line);
+
+	mem_block_set_len(block, len);
+	mem_block_set_type(block, type);
+	mem_block_set_free(block, MEM_BLOCK_HEADER_SIZE);
+	mem_block_set_start(block, MEM_BLOCK_HEADER_SIZE);
+
+	if (UNIV_UNLIKELY(heap == NULL)) {
+		/* This is the first block of the heap. The field
+		total_size should be initialized here */
+		block->total_size = len;
+	} else {
+		/* Not the first allocation for the heap. This block's
+		total_size field should be set to undefined. */
+		ut_d(block->total_size = ULINT_UNDEFINED);
+		MEM_UNDEFINED(&block->total_size, sizeof block->total_size);
+
+		heap->total_size += len;
+	}
+
+	/* Poison all available memory. Individual chunks will be unpoisoned on
+	every mem_heap_alloc() call. */
+	compile_time_assert(MEM_BLOCK_HEADER_SIZE >= sizeof *block);
+	MEM_NOACCESS(block + 1, len - sizeof *block);
+
+	ut_ad((ulint)MEM_BLOCK_HEADER_SIZE < len);
+
+	return(block);
+}
+
+/***************************************************************//**
+Adds a new block to a memory heap.
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_add_block(
+/*===============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n)	/*!< in: number of bytes user needs */
+{
+	mem_block_t*	block;
+	mem_block_t*	new_block;
+	ulint		new_size;
+
+	block = UT_LIST_GET_LAST(heap->base);
+
+	/* We have to allocate a new block. The size is always at least
+	doubled until the standard size is reached. After that the size
+	stays the same, except in cases where the caller needs more space.
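+	For example (an illustrative sketch with made-up numbers): if the
+	last block was 512 bytes, the next blocks are 1024, 2048, ... bytes
+	until MEM_BLOCK_STANDARD_SIZE (or MEM_MAX_ALLOC_IN_BUF for
+	buffer-pool backed heaps) caps the doubling; a single larger
+	request still gets a block sized to fit it.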
+	*/
+
+	new_size = 2 * mem_block_get_len(block);
+
+	if (heap->type != MEM_HEAP_DYNAMIC) {
+		/* From the buffer pool we allocate buffer frames */
+		ut_a(n <= MEM_MAX_ALLOC_IN_BUF);
+
+		if (new_size > MEM_MAX_ALLOC_IN_BUF) {
+			new_size = MEM_MAX_ALLOC_IN_BUF;
+		}
+	} else if (new_size > MEM_BLOCK_STANDARD_SIZE) {
+
+		new_size = MEM_BLOCK_STANDARD_SIZE;
+	}
+
+	if (new_size < n) {
+		new_size = n;
+	}
+
+	new_block = mem_heap_create_block(heap, new_size, heap->type,
+					  heap->file_name, heap->line);
+	if (new_block == NULL) {
+
+		return(NULL);
+	}
+
+	/* Add the new block as the last block */
+
+	UT_LIST_INSERT_AFTER(heap->base, block, new_block);
+
+	return(new_block);
+}
+
+/******************************************************************//**
+Frees a block from a memory heap. */
+void
+mem_heap_block_free(
+/*================*/
+	mem_heap_t*	heap,	/*!< in: heap */
+	mem_block_t*	block)	/*!< in: block to free */
+{
+	ulint		type;
+	ulint		len;
+	buf_block_t*	buf_block;
+
+	buf_block = static_cast<buf_block_t*>(block->buf_block);
+
+	UT_LIST_REMOVE(heap->base, block);
+
+	ut_ad(heap->total_size >= block->len);
+	heap->total_size -= block->len;
+
+	type = heap->type;
+	len = block->len;
+
+	if (type == MEM_HEAP_DYNAMIC || len < srv_page_size / 2) {
+		ut_ad(!buf_block);
+		ut_free(block);
+	} else {
+		ut_ad(type & MEM_HEAP_BUFFER);
+		buf_block_free(buf_block);
+	}
+}
+
+/******************************************************************//**
+Frees the free_block field from a memory heap. */
+void
+mem_heap_free_block_free(
+/*=====================*/
+	mem_heap_t*	heap)	/*!< in: heap */
+{
+	if (UNIV_LIKELY_NULL(heap->free_block)) {
+
+		buf_block_free(static_cast<buf_block_t*>(heap->free_block));
+
+		heap->free_block = NULL;
+	}
+}
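+
+/*
+A minimal sketch of typical heap usage, tying the helpers above together
+(the size and the strings are illustrative only):
+
+  mem_heap_t*	heap = mem_heap_create(256);
+  char*		path = mem_heap_strcat(heap, "db1", "/t1");
+  char*		msg = mem_heap_printf(heap, "space %lu", 42UL);
+  ...
+  mem_heap_free(heap);	// frees all blocks, and every allocation, at once
+*/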
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
new file mode 100644
index 00000000..1834a164
--- /dev/null
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -0,0 +1,1667 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file mtr/mtr0mtr.cc
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "page0types.h"
+#include "log0crypt.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "btr0sea.h"
+#else
+# include "btr0cur.h"
+#endif
+#include "srv0start.h"
+#include "log.h"
+#include "mariadb_stats.h"
+
+void mtr_memo_slot_t::release() const
+{
+  ut_ad(object);
+
+  switch (type) {
+  case MTR_MEMO_S_LOCK:
+    static_cast<index_lock*>(object)->s_unlock();
+    break;
+  case MTR_MEMO_X_LOCK:
+  case MTR_MEMO_SX_LOCK:
+    static_cast<index_lock*>(object)->
+      u_or_x_unlock(type == MTR_MEMO_SX_LOCK);
+    break;
+  case MTR_MEMO_SPACE_X_LOCK:
+    static_cast<fil_space_t*>(object)->set_committed_size();
+    static_cast<fil_space_t*>(object)->x_unlock();
+    break;
+  default:
+    buf_page_t *bpage= static_cast<buf_page_t*>(object);
+    ut_d(const auto s=)
+    bpage->unfix();
+    ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+    switch (type) {
+    case MTR_MEMO_PAGE_S_FIX:
+      bpage->lock.s_unlock();
+      break;
+    case MTR_MEMO_BUF_FIX:
+      break;
+    default:
+      ut_ad(type == MTR_MEMO_PAGE_SX_FIX ||
+            type == MTR_MEMO_PAGE_X_FIX ||
+            type == MTR_MEMO_PAGE_SX_MODIFY ||
+            type == MTR_MEMO_PAGE_X_MODIFY);
+      bpage->lock.u_or_x_unlock(type & MTR_MEMO_PAGE_SX_FIX);
+    }
+  }
+}
+
+/** Prepare to insert a modified block into flush_list.
+@param lsn start LSN of the mini-transaction
+@return insert position for insert_into_flush_list() */
+inline buf_page_t *buf_pool_t::prepare_insert_into_flush_list(lsn_t lsn)
+  noexcept
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(recv_recovery_is_on() || log_sys.latch.is_locked());
+#endif
+  ut_ad(lsn >= log_sys.last_checkpoint_lsn);
+  mysql_mutex_assert_owner(&flush_list_mutex);
+  static_assert(log_t::FIRST_LSN >= 2, "compatibility");
+
+rescan:
+  buf_page_t *prev= UT_LIST_GET_FIRST(flush_list);
+  if (prev)
+  {
+    lsn_t om= prev->oldest_modification();
+    if (om == 1)
+    {
+      delete_from_flush_list(prev);
+      goto rescan;
+    }
+    ut_ad(om > 2);
+    if (om <= lsn)
+      return nullptr;
+    while (buf_page_t *next= UT_LIST_GET_NEXT(list, prev))
+    {
+      om= next->oldest_modification();
+      if (om == 1)
+      {
+        delete_from_flush_list(next);
+        continue;
+      }
+      ut_ad(om > 2);
+      if (om <= lsn)
+        break;
+      prev= next;
+    }
+    flush_hp.adjust(prev);
+  }
+  return prev;
+}
+
+/** Insert a modified block into the flush list.
+@param prev insert position (from prepare_insert_into_flush_list())
+@param block modified block
+@param lsn start LSN of the mini-transaction that modified the block */
+inline void buf_pool_t::insert_into_flush_list(buf_page_t *prev,
+                                               buf_block_t *block, lsn_t lsn)
+  noexcept
+{
+  ut_ad(!fsp_is_system_temporary(block->page.id().space()));
+  mysql_mutex_assert_owner(&flush_list_mutex);
+
+  MEM_CHECK_DEFINED(block->page.zip.data
+                    ?
block->page.zip.data : block->page.frame, + block->physical_size()); + + if (const lsn_t old= block->page.oldest_modification()) + { + if (old > 1) + return; + flush_hp.adjust(&block->page); + UT_LIST_REMOVE(flush_list, &block->page); + } + else + flush_list_bytes+= block->physical_size(); + + ut_ad(flush_list_bytes <= curr_pool_size); + + if (prev) + UT_LIST_INSERT_AFTER(flush_list, prev, &block->page); + else + UT_LIST_ADD_FIRST(flush_list, &block->page); + + block->page.set_oldest_modification(lsn); +} + +mtr_t::mtr_t()= default; +mtr_t::~mtr_t()= default; + +/** Start a mini-transaction. */ +void mtr_t::start() +{ + ut_ad(m_memo.empty()); + ut_ad(!m_freed_pages); + ut_ad(!m_freed_space); + MEM_UNDEFINED(this, sizeof *this); + MEM_MAKE_DEFINED(&m_memo, sizeof m_memo); + MEM_MAKE_DEFINED(&m_freed_space, sizeof m_freed_space); + MEM_MAKE_DEFINED(&m_freed_pages, sizeof m_freed_pages); + + ut_d(m_start= true); + ut_d(m_commit= false); + ut_d(m_freeing_tree= false); + + m_last= nullptr; + m_last_offset= 0; + + new(&m_log) mtr_buf_t(); + + m_made_dirty= false; + m_latch_ex= false; + m_inside_ibuf= false; + m_modifications= false; + m_log_mode= MTR_LOG_ALL; + ut_d(m_user_space_id= TRX_SYS_SPACE); + m_user_space= nullptr; + m_commit_lsn= 0; + m_trim_pages= false; +} + +/** Release the resources */ +inline void mtr_t::release_resources() +{ + ut_ad(is_active()); + ut_ad(m_memo.empty()); + m_log.erase(); + ut_d(m_commit= true); +} + +/** Handle any pages that were freed during the mini-transaction. */ +void mtr_t::process_freed_pages() +{ + if (m_freed_pages) + { + ut_ad(!m_freed_pages->empty()); + ut_ad(m_freed_space); + ut_ad(m_freed_space->is_owner()); + ut_ad(is_named_space(m_freed_space)); + + /* Update the last freed lsn */ + m_freed_space->freed_range_mutex.lock(); + m_freed_space->update_last_freed_lsn(m_commit_lsn); + if (!m_trim_pages) + for (const auto &range : *m_freed_pages) + m_freed_space->add_free_range(range); + else + m_freed_space->clear_freed_ranges(); + m_freed_space->freed_range_mutex.unlock(); + + delete m_freed_pages; + m_freed_pages= nullptr; + m_freed_space= nullptr; + /* mtr_t::start() will reset m_trim_pages */ + } + else + ut_ad(!m_freed_space); +} + +ATTRIBUTE_COLD __attribute__((noinline)) +/** Insert a modified block into buf_pool.flush_list on IMPORT TABLESPACE. */ +static void insert_imported(buf_block_t *block) +{ + if (block->page.oldest_modification() <= 1) + { + log_sys.latch.rd_lock(SRW_LOCK_CALL); + const lsn_t lsn= log_sys.last_checkpoint_lsn; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.insert_into_flush_list + (buf_pool.prepare_insert_into_flush_list(lsn), block, lsn); + log_sys.latch.rd_unlock(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } +} + +/** Release modified pages when no log was written. 
*/ +void mtr_t::release_unlogged() +{ + ut_ad(m_log_mode == MTR_LOG_NO_REDO); + ut_ad(m_log.size() == 0); + + process_freed_pages(); + + for (auto it= m_memo.rbegin(); it != m_memo.rend(); it++) + { + mtr_memo_slot_t &slot= *it; + ut_ad(slot.object); + switch (slot.type) { + case MTR_MEMO_S_LOCK: + static_cast(slot.object)->s_unlock(); + break; + case MTR_MEMO_SPACE_X_LOCK: + static_cast(slot.object)->set_committed_size(); + static_cast(slot.object)->x_unlock(); + break; + case MTR_MEMO_X_LOCK: + case MTR_MEMO_SX_LOCK: + static_cast(slot.object)-> + u_or_x_unlock(slot.type == MTR_MEMO_SX_LOCK); + break; + default: + buf_block_t *block= static_cast(slot.object); + ut_d(const auto s=) block->page.unfix(); + ut_ad(s >= buf_page_t::FREED); + ut_ad(s < buf_page_t::READ_FIX); + + if (slot.type & MTR_MEMO_MODIFY) + { + ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY || + slot.type == MTR_MEMO_PAGE_SX_MODIFY); + ut_ad(block->page.id() < end_page_id); + insert_imported(block); + } + + switch (slot.type) { + case MTR_MEMO_PAGE_S_FIX: + block->page.lock.s_unlock(); + break; + case MTR_MEMO_BUF_FIX: + break; + default: + ut_ad(slot.type == MTR_MEMO_PAGE_SX_FIX || + slot.type == MTR_MEMO_PAGE_X_FIX || + slot.type == MTR_MEMO_PAGE_SX_MODIFY || + slot.type == MTR_MEMO_PAGE_X_MODIFY); + block->page.lock.u_or_x_unlock(slot.type & MTR_MEMO_PAGE_SX_FIX); + } + } + } + + m_memo.clear(); +} + +void mtr_t::release() +{ + for (auto it= m_memo.rbegin(); it != m_memo.rend(); it++) + it->release(); + m_memo.clear(); +} + +/** Commit a mini-transaction. */ +void mtr_t::commit() +{ + ut_ad(is_active()); + ut_ad(!is_inside_ibuf()); + + /* This is a dirty read, for debugging. */ + ut_ad(!m_modifications || !recv_no_log_write); + ut_ad(!m_modifications || m_log_mode != MTR_LOG_NONE); + ut_ad(!m_latch_ex); + + if (m_modifications && (m_log_mode == MTR_LOG_NO_REDO || !m_log.empty())) + { + if (UNIV_UNLIKELY(!is_logged())) + { + release_unlogged(); + goto func_exit; + } + + ut_ad(!srv_read_only_mode); + std::pair lsns{do_write()}; + process_freed_pages(); + size_t modified= 0; + + if (m_made_dirty) + { + auto it= m_memo.rbegin(); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + buf_page_t *const prev= + buf_pool.prepare_insert_into_flush_list(lsns.first); + + while (it != m_memo.rend()) + { + const mtr_memo_slot_t &slot= *it++; + if (slot.type & MTR_MEMO_MODIFY) + { + ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY || + slot.type == MTR_MEMO_PAGE_SX_MODIFY); + modified++; + buf_block_t *b= static_cast(slot.object); + ut_ad(b->page.id() < end_page_id); + ut_d(const auto s= b->page.state()); + ut_ad(s > buf_page_t::FREED); + ut_ad(s < buf_page_t::READ_FIX); + ut_ad(mach_read_from_8(b->page.frame + FIL_PAGE_LSN) <= + m_commit_lsn); + mach_write_to_8(b->page.frame + FIL_PAGE_LSN, m_commit_lsn); + if (UNIV_LIKELY_NULL(b->page.zip.data)) + memcpy_aligned<8>(FIL_PAGE_LSN + b->page.zip.data, + FIL_PAGE_LSN + b->page.frame, 8); + buf_pool.insert_into_flush_list(prev, b, lsns.first); + } + } + + ut_ad(modified); + buf_pool.flush_list_requests+= modified; + buf_pool.page_cleaner_wakeup(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (m_latch_ex) + { + log_sys.latch.wr_unlock(); + m_latch_ex= false; + } + else + log_sys.latch.rd_unlock(); + + release(); + } + else + { + if (m_latch_ex) + { + log_sys.latch.wr_unlock(); + m_latch_ex= false; + } + else + log_sys.latch.rd_unlock(); + + for (auto it= m_memo.rbegin(); it != m_memo.rend(); ) + { + const mtr_memo_slot_t &slot= *it++; + ut_ad(slot.object); + switch (slot.type) { + case 
MTR_MEMO_S_LOCK: + static_cast(slot.object)->s_unlock(); + break; + case MTR_MEMO_SPACE_X_LOCK: + static_cast(slot.object)->set_committed_size(); + static_cast(slot.object)->x_unlock(); + break; + case MTR_MEMO_X_LOCK: + case MTR_MEMO_SX_LOCK: + static_cast(slot.object)-> + u_or_x_unlock(slot.type == MTR_MEMO_SX_LOCK); + break; + default: + buf_page_t *bpage= static_cast(slot.object); + const auto s= bpage->unfix(); + if (slot.type & MTR_MEMO_MODIFY) + { + ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY || + slot.type == MTR_MEMO_PAGE_SX_MODIFY); + ut_ad(bpage->oldest_modification() > 1); + ut_ad(bpage->oldest_modification() < m_commit_lsn); + ut_ad(bpage->id() < end_page_id); + ut_ad(s >= buf_page_t::FREED); + ut_ad(s < buf_page_t::READ_FIX); + ut_ad(mach_read_from_8(bpage->frame + FIL_PAGE_LSN) <= + m_commit_lsn); + if (s >= buf_page_t::UNFIXED) + { + mach_write_to_8(bpage->frame + FIL_PAGE_LSN, m_commit_lsn); + if (UNIV_LIKELY_NULL(bpage->zip.data)) + memcpy_aligned<8>(FIL_PAGE_LSN + bpage->zip.data, + FIL_PAGE_LSN + bpage->frame, 8); + } + modified++; + } + switch (auto latch= slot.type & ~MTR_MEMO_MODIFY) { + case MTR_MEMO_PAGE_S_FIX: + bpage->lock.s_unlock(); + continue; + case MTR_MEMO_PAGE_SX_FIX: + case MTR_MEMO_PAGE_X_FIX: + bpage->lock.u_or_x_unlock(latch == MTR_MEMO_PAGE_SX_FIX); + continue; + default: + ut_ad(latch == MTR_MEMO_BUF_FIX); + } + } + } + + buf_pool.add_flush_list_requests(modified); + m_memo.clear(); + } + + mariadb_increment_pages_updated(modified); + + if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO)) + buf_flush_ahead(m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC); + } + else + { + if (m_freed_pages) + { + ut_ad(!m_freed_pages->empty()); + ut_ad(m_freed_space == fil_system.temp_space); + ut_ad(!m_trim_pages); + for (const auto &range : *m_freed_pages) + m_freed_space->add_free_range(range); + delete m_freed_pages; + m_freed_pages= nullptr; + m_freed_space= nullptr; + } + release(); + } + +func_exit: + release_resources(); +} + +void mtr_t::rollback_to_savepoint(ulint begin, ulint end) +{ + ut_ad(end <= m_memo.size()); + ut_ad(begin <= end); + ulint s= end; + + while (s-- > begin) + { + const mtr_memo_slot_t &slot= m_memo[s]; + ut_ad(slot.object); + /* This is intended for releasing latches on indexes or unmodified + buffer pool pages. */ + ut_ad(slot.type <= MTR_MEMO_SX_LOCK); + ut_ad(!(slot.type & MTR_MEMO_MODIFY)); + slot.release(); + } + + m_memo.erase(m_memo.begin() + begin, m_memo.begin() + end); +} + +/** Commit a mini-transaction that is shrinking a tablespace. +@param space tablespace that is being shrunk */ +void mtr_t::commit_shrink(fil_space_t &space) +{ + ut_ad(is_active()); + ut_ad(!is_inside_ibuf()); + ut_ad(!high_level_read_only); + ut_ad(m_modifications); + ut_ad(m_made_dirty); + ut_ad(!m_memo.empty()); + ut_ad(!recv_recovery_is_on()); + ut_ad(m_log_mode == MTR_LOG_ALL); + ut_ad(!m_freed_pages); + ut_ad(UT_LIST_GET_LEN(space.chain) == 1); + + log_write_and_flush_prepare(); + m_latch_ex= true; + log_sys.latch.wr_lock(SRW_LOCK_CALL); + + const lsn_t start_lsn= do_write().first; + ut_d(m_log.erase()); + + /* Durably write the reduced FSP_SIZE before truncating the data file. 
+
+/** Commit a mini-transaction that is shrinking a tablespace.
+@param space   tablespace that is being shrunk */
+void mtr_t::commit_shrink(fil_space_t &space)
+{
+  ut_ad(is_active());
+  ut_ad(!is_inside_ibuf());
+  ut_ad(!high_level_read_only);
+  ut_ad(m_modifications);
+  ut_ad(m_made_dirty);
+  ut_ad(!m_memo.empty());
+  ut_ad(!recv_recovery_is_on());
+  ut_ad(m_log_mode == MTR_LOG_ALL);
+  ut_ad(!m_freed_pages);
+  ut_ad(UT_LIST_GET_LEN(space.chain) == 1);
+
+  log_write_and_flush_prepare();
+  m_latch_ex= true;
+  log_sys.latch.wr_lock(SRW_LOCK_CALL);
+
+  const lsn_t start_lsn= do_write().first;
+  ut_d(m_log.erase());
+
+  /* Durably write the reduced FSP_SIZE before truncating the data file. */
+  log_write_and_flush();
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_write_locked());
+#endif
+
+  os_file_truncate(space.chain.start->name, space.chain.start->handle,
+                   os_offset_t{space.size} << srv_page_size_shift, true);
+
+  space.clear_freed_ranges();
+
+  const page_id_t high{space.id, space.size};
+  size_t modified= 0;
+  auto it= m_memo.rbegin();
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+  buf_page_t *const prev= buf_pool.prepare_insert_into_flush_list(start_lsn);
+
+  while (it != m_memo.rend())
+  {
+    mtr_memo_slot_t &slot= *it++;
+
+    ut_ad(slot.object);
+    if (slot.type == MTR_MEMO_SPACE_X_LOCK)
+      ut_ad(high.space() == static_cast<fil_space_t*>(slot.object)->id);
+    else
+    {
+      ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY ||
+            slot.type == MTR_MEMO_PAGE_SX_MODIFY ||
+            slot.type == MTR_MEMO_PAGE_X_FIX ||
+            slot.type == MTR_MEMO_PAGE_SX_FIX);
+      buf_block_t *b= static_cast<buf_block_t*>(slot.object);
+      const page_id_t id{b->page.id()};
+      const auto s= b->page.state();
+      ut_ad(s > buf_page_t::FREED);
+      ut_ad(s < buf_page_t::READ_FIX);
+      ut_ad(b->page.frame);
+      ut_ad(mach_read_from_8(b->page.frame + FIL_PAGE_LSN) <= m_commit_lsn);
+      ut_ad(!b->page.zip.data); // we do not shrink ROW_FORMAT=COMPRESSED
+
+      if (id < high)
+      {
+        ut_ad(id.space() == high.space() ||
+              (id == page_id_t{0, TRX_SYS_PAGE_NO} &&
+               srv_is_undo_tablespace(high.space())));
+        if (slot.type & MTR_MEMO_MODIFY)
+        {
+          modified++;
+          mach_write_to_8(b->page.frame + FIL_PAGE_LSN, m_commit_lsn);
+          buf_pool.insert_into_flush_list(prev, b, start_lsn);
+        }
+      }
+      else
+      {
+        ut_ad(id.space() == high.space());
+        if (s >= buf_page_t::UNFIXED)
+          b->page.set_freed(s);
+        if (b->page.oldest_modification() > 1)
+          b->page.reset_oldest_modification();
+        slot.type= mtr_memo_type_t(slot.type & ~MTR_MEMO_MODIFY);
+      }
+    }
+  }
+
+  ut_ad(modified);
+  buf_pool.flush_list_requests+= modified;
+  buf_pool.page_cleaner_wakeup();
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  log_sys.latch.wr_unlock();
+  m_latch_ex= false;
+
+  mysql_mutex_lock(&fil_system.mutex);
+  ut_ad(space.is_being_truncated);
+  ut_ad(space.is_stopping_writes());
+  space.clear_stopping();
+  space.is_being_truncated= false;
+  mysql_mutex_unlock(&fil_system.mutex);
+
+  release();
+  release_resources();
+}
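+
+/* Rationale for the ordering in commit_shrink() above: the log record
+carrying the reduced FSP_SIZE is made durable by log_write_and_flush()
+before os_file_truncate() shrinks the data file, so that a crash between
+the two steps is replayed as a tablespace shrink rather than surfacing as
+an unexpectedly short file.  Pages at or beyond the new end of the
+tablespace (id >= high) can never be written back; they are marked freed
+and detached from the flush list instead of being flushed. */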
+
+/** Commit a mini-transaction that is deleting or renaming a file.
+@param space   tablespace that is being renamed or deleted
+@param name    new file name (nullptr=the file will be deleted)
+@return whether the operation succeeded */
+bool mtr_t::commit_file(fil_space_t &space, const char *name)
+{
+  ut_ad(is_active());
+  ut_ad(!is_inside_ibuf());
+  ut_ad(!high_level_read_only);
+  ut_ad(m_modifications);
+  ut_ad(!m_made_dirty);
+  ut_ad(!recv_recovery_is_on());
+  ut_ad(m_log_mode == MTR_LOG_ALL);
+  ut_ad(UT_LIST_GET_LEN(space.chain) == 1);
+  ut_ad(!m_latch_ex);
+
+  m_latch_ex= true;
+
+  log_write_and_flush_prepare();
+
+  log_sys.latch.wr_lock(SRW_LOCK_CALL);
+
+  size_t size= m_log.size() + 5;
+
+  if (log_sys.is_encrypted())
+  {
+    /* We will not encrypt any FILE_ records, but we will reserve
+    a nonce at the end. */
+    size+= 8;
+    m_commit_lsn= log_sys.get_lsn();
+  }
+  else
+    m_commit_lsn= 0;
+
+  m_crc= 0;
+  m_log.for_each_block([this](const mtr_buf_t::block_t *b)
+  { m_crc= my_crc32c(m_crc, b->begin(), b->used()); return true; });
+  finish_write(size);
+
+  if (!name && space.max_lsn)
+  {
+    ut_d(space.max_lsn= 0);
+    fil_system.named_spaces.remove(space);
+  }
+
+  /* Block log_checkpoint(). */
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+  /* Durably write the log for the file system operation. */
+  log_write_and_flush();
+
+  log_sys.latch.wr_unlock();
+  m_latch_ex= false;
+
+  char *old_name= space.chain.start->name;
+  bool success= true;
+
+  if (name)
+  {
+    char *new_name= mem_strdup(name);
+    mysql_mutex_lock(&fil_system.mutex);
+    success= os_file_rename(innodb_data_file_key, old_name, name);
+    if (success)
+      space.chain.start->name= new_name;
+    else
+      old_name= new_name;
+    mysql_mutex_unlock(&fil_system.mutex);
+    ut_free(old_name);
+  }
+
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  release_resources();
+
+  return success;
+}
+
+/** Commit a mini-transaction that did not modify any pages,
+but generated some redo log on a higher level, such as
+FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
+The caller must hold exclusive log_sys.latch.
+This is to be used at log_checkpoint().
+@param checkpoint_lsn   the log sequence number of a checkpoint, or 0
+@return current LSN */
+lsn_t mtr_t::commit_files(lsn_t checkpoint_lsn)
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_write_locked());
+#endif
+  ut_ad(is_active());
+  ut_ad(!is_inside_ibuf());
+  ut_ad(m_log_mode == MTR_LOG_ALL);
+  ut_ad(!m_made_dirty);
+  ut_ad(m_memo.empty());
+  ut_ad(!srv_read_only_mode);
+  ut_ad(!m_freed_space);
+  ut_ad(!m_freed_pages);
+  ut_ad(!m_user_space);
+  ut_ad(!m_latch_ex);
+
+  m_latch_ex= true;
+
+  if (checkpoint_lsn)
+  {
+    byte *ptr= m_log.push<byte*>(3 + 8);
+    *ptr= FILE_CHECKPOINT | (2 + 8);
+    ::memset(ptr + 1, 0, 2);
+    mach_write_to_8(ptr + 3, checkpoint_lsn);
+  }
+
+  size_t size= m_log.size() + 5;
+
+  if (log_sys.is_encrypted())
+  {
+    /* We will not encrypt any FILE_ records, but we will reserve
+    a nonce at the end. */
+    size+= 8;
+    m_commit_lsn= log_sys.get_lsn();
+  }
+  else
+    m_commit_lsn= 0;
+
+  m_crc= 0;
+  m_log.for_each_block([this](const mtr_buf_t::block_t *b)
+  { m_crc= my_crc32c(m_crc, b->begin(), b->used()); return true; });
+  finish_write(size);
+  release_resources();
+
+  if (checkpoint_lsn)
+    DBUG_PRINT("ib_log",
+               ("FILE_CHECKPOINT(" LSN_PF ") written at " LSN_PF,
+                checkpoint_lsn, m_commit_lsn));
+
+  return m_commit_lsn;
+}
+
+#ifdef UNIV_DEBUG
+/** Check if a tablespace is associated with the mini-transaction
+(needed for generating a FILE_MODIFY record)
+@param[in]  space   tablespace
+@return whether the mini-transaction is associated with the space */
+bool
+mtr_t::is_named_space(uint32_t space) const
+{
+  ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE);
+  return !is_logged() || m_user_space_id == space ||
+    is_predefined_tablespace(space);
+}
+/** Check if a tablespace is associated with the mini-transaction
+(needed for generating a FILE_MODIFY record)
+@param[in]  space   tablespace
+@return whether the mini-transaction is associated with the space */
+bool mtr_t::is_named_space(const fil_space_t* space) const
+{
+  ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE);
+
+  return !is_logged() || m_user_space == space ||
+    is_predefined_tablespace(space->id);
+}
+#endif /* UNIV_DEBUG */
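+
+/* The latching helpers below cache the current user tablespace in
+m_user_space, so that a repeated x_lock_space(space_id) for the same
+tablespace avoids a fil_space_get() lookup, while memo_contains()
+ensures that the tablespace X-latch is acquired at most once per
+mini-transaction. */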
+
+/** Acquire a tablespace X-latch.
+@param[in]  space_id    tablespace ID
+@return the tablespace object (never NULL) */
+fil_space_t *mtr_t::x_lock_space(uint32_t space_id)
+{
+  fil_space_t*  space;
+
+  ut_ad(is_active());
+
+  if (space_id == TRX_SYS_SPACE) {
+    space = fil_system.sys_space;
+  } else if ((space = m_user_space) && space_id == space->id) {
+  } else {
+    space = fil_space_get(space_id);
+    ut_ad(m_log_mode != MTR_LOG_NO_REDO
+          || space->purpose == FIL_TYPE_TEMPORARY
+          || space->purpose == FIL_TYPE_IMPORT);
+  }
+
+  ut_ad(space);
+  ut_ad(space->id == space_id);
+  x_lock_space(space);
+  return(space);
+}
+
+/** Acquire an exclusive tablespace latch.
+@param space   tablespace */
+void mtr_t::x_lock_space(fil_space_t *space)
+{
+  ut_ad(space->purpose == FIL_TYPE_TEMPORARY ||
+        space->purpose == FIL_TYPE_IMPORT ||
+        space->purpose == FIL_TYPE_TABLESPACE);
+  if (!memo_contains(*space))
+  {
+    memo_push(space, MTR_MEMO_SPACE_X_LOCK);
+    space->x_lock();
+  }
+}
+
+void mtr_t::release(const void *object)
+{
+  ut_ad(is_active());
+
+  auto it=
+    std::find_if(m_memo.begin(), m_memo.end(),
+                 [object](const mtr_memo_slot_t& slot)
+                 { return slot.object == object; });
+  ut_ad(it != m_memo.end());
+  ut_ad(!(it->type & MTR_MEMO_MODIFY));
+  it->release();
+  m_memo.erase(it, it + 1);
+  ut_ad(std::find_if(m_memo.begin(), m_memo.end(),
+                     [object](const mtr_memo_slot_t& slot)
+                     { return slot.object == object; }) == m_memo.end());
+}
+
+static time_t log_close_warn_time;
+
+/** Display a warning that the log tail is overwriting the head,
+making the server crash-unsafe. */
+ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t lsn)
+{
+  if (log_sys.overwrite_warned)
+    return;
+
+  time_t t= time(nullptr);
+  if (difftime(t, log_close_warn_time) < 15)
+    return;
+
+  if (!log_sys.overwrite_warned)
+    log_sys.overwrite_warned= lsn;
+  log_close_warn_time= t;
+
+  sql_print_error("InnoDB: Crash recovery is broken due to"
+                  " insufficient innodb_log_file_size;"
+                  " last checkpoint LSN=" LSN_PF ", current LSN=" LSN_PF
+                  "%s.",
+                  lsn_t{log_sys.last_checkpoint_lsn}, lsn,
+                  srv_shutdown_state > SRV_SHUTDOWN_INITIATED
+                  ? ". Shutdown is in progress" : "");
+}
+
+/** Wait in append_prepare() for buffer to become available
+@param ex   whether log_sys.latch is exclusively locked */
+ATTRIBUTE_COLD void log_t::append_prepare_wait(bool ex) noexcept
+{
+  log_sys.waits++;
+  log_sys.unlock_lsn();
+
+  if (ex)
+    log_sys.latch.wr_unlock();
+  else
+    log_sys.latch.rd_unlock();
+
+  DEBUG_SYNC_C("log_buf_size_exceeded");
+  log_buffer_flush_to_disk(log_sys.is_pmem());
+
+  if (ex)
+    log_sys.latch.wr_lock(SRW_LOCK_CALL);
+  else
+    log_sys.latch.rd_lock(SRW_LOCK_CALL);
+
+  log_sys.lock_lsn();
+}
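+
+/* The fast path of append_prepare() below claims log space by advancing
+lsn and buf_free while holding lock_lsn(); the slow path, taken when the
+log buffer is full (or, on PMEM, when the tail would overtake the flushed
+head of the circular buffer), backs off through append_prepare_wait(),
+which temporarily releases the latch and writes out the log buffer.
+A minimal sketch of the reservation step, with all locking and the PMEM
+wrap-around omitted (illustrative only, not the upstream code):
+
+     lsn_t start= lsn;        // current end of the log
+     lsn+= size;              // claim [start, start + size)
+     byte *position= &buf[buf_free];
+     buf_free+= size;         // append() will copy the record here
+
+The caller then copies the record bytes to the returned position. */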
+
+/** Reserve space in the log buffer for appending data.
+@tparam pmem   log_sys.is_pmem()
+@param size    total length of the data to append(), in bytes
+@param ex      whether log_sys.latch is exclusively locked
+@return the start LSN and the buffer position for append() */
+template<bool pmem>
+inline
+std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(latch.is_locked());
+# ifndef _WIN32 // there is no accurate is_write_locked() on SRWLOCK
+  ut_ad(ex == latch.is_write_locked());
+# endif
+#endif
+  ut_ad(pmem == is_pmem());
+  const lsn_t checkpoint_margin{last_checkpoint_lsn + log_capacity - size};
+  const size_t avail{(pmem ? size_t(capacity()) : buf_size) - size};
+  lock_lsn();
+  write_to_buf++;
+
+  for (ut_d(int count= 50);
+       UNIV_UNLIKELY((pmem
+                      ? size_t(get_lsn() -
+                               get_flushed_lsn(std::memory_order_relaxed))
+                      : size_t{buf_free}) > avail); )
+  {
+    append_prepare_wait(ex);
+    ut_ad(count--);
+  }
+
+  const lsn_t l{lsn.load(std::memory_order_relaxed)};
+  lsn.store(l + size, std::memory_order_relaxed);
+  const size_t b{buf_free};
+  size_t new_buf_free{b};
+  new_buf_free+= size;
+  if (pmem && new_buf_free >= file_size)
+    new_buf_free-= size_t(capacity());
+  buf_free= new_buf_free;
+  unlock_lsn();
+
+  if (UNIV_UNLIKELY(l > checkpoint_margin) ||
+      (!pmem && b >= max_buf_free))
+    set_check_flush_or_checkpoint();
+
+  return {l, &buf[b]};
+}
+
+/** Finish appending data to the log.
+@param lsn   the end LSN of the log record
+@return whether buf_flush_ahead() will have to be invoked */
+static mtr_t::page_flush_ahead log_close(lsn_t lsn) noexcept
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_locked());
+#endif
+
+  const lsn_t checkpoint_age= lsn - log_sys.last_checkpoint_lsn;
+
+  if (UNIV_UNLIKELY(checkpoint_age >= log_sys.log_capacity) &&
+      /* silence message on create_log_file() after the log had been deleted */
+      checkpoint_age != lsn)
+    log_overwrite_warning(lsn);
+  else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_modified_age_async))
+    return mtr_t::PAGE_FLUSH_NO;
+  else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_checkpoint_age))
+    return mtr_t::PAGE_FLUSH_ASYNC;
+
+  log_sys.set_check_flush_or_checkpoint();
+  return mtr_t::PAGE_FLUSH_SYNC;
+}
+
+inline void mtr_t::page_checksum(const buf_page_t &bpage)
+{
+  const byte *page= bpage.frame;
+  size_t size= srv_page_size;
+
+  if (UNIV_LIKELY_NULL(bpage.zip.data))
+  {
+    size= (UNIV_ZIP_SIZE_MIN >> 1) << bpage.zip.ssize;
+    switch (fil_page_get_type(bpage.zip.data)) {
+    case FIL_PAGE_TYPE_ALLOCATED:
+    case FIL_PAGE_INODE:
+    case FIL_PAGE_IBUF_BITMAP:
+    case FIL_PAGE_TYPE_FSP_HDR:
+    case FIL_PAGE_TYPE_XDES:
+      /* These are essentially uncompressed pages. */
+      break;
+    default:
+      page= bpage.zip.data;
+    }
+  }
+
+  /* We have to exclude from the checksum the normal
+  page checksum that is written by buf_flush_init_for_writing()
+  and FIL_PAGE_LSN, which would be updated once we have actually
+  allocated the LSN.
+
+  Unfortunately, we cannot access fil_space_t easily here. In order to
+  be compatible with encrypted tablespaces in the pre-full_crc32
+  format we will unconditionally exclude the 8 bytes at
+  FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+  a.k.a. FIL_RTREE_SPLIT_SEQ_NUM. */
+  const uint32_t checksum=
+    my_crc32c(my_crc32c(my_crc32c(0, page + FIL_PAGE_OFFSET,
+                                  FIL_PAGE_LSN - FIL_PAGE_OFFSET),
+                        page + FIL_PAGE_TYPE, 2),
+              page + FIL_PAGE_SPACE_ID, size - (FIL_PAGE_SPACE_ID + 8));
+
+  byte *l= log_write